In [None]:
import os
import pickle
import random
import re

import pandas as pd

In [None]:
def open_book(filename, directory="../../Resources/Cleaned/"):
    """Read a cleaned book's text file and return its full contents.

    Parameters
    ----------
    filename : str
        File name without the ``.txt`` extension; the extension is appended
        here.
    directory : str, optional
        Folder that holds the file. Defaults to ``"../../Resources/Cleaned/"``,
        which is resolved relative to the current working directory.

    Returns
    -------
    str
        The entire file, decoded as UTF-8.

    Raises
    ------
    FileNotFoundError
        If the resulting path does not exist.
    """
    # os.path.join copes with ``directory`` given with or without a trailing
    # separator, unlike plain string concatenation.
    path = os.path.join(directory, filename + ".txt")
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def add_book_to_df(book, book_title):
    """Turn a book's raw text into a two-column DataFrame.

    The text is split on single newline characters, so every non-blank line
    becomes one row. Each kept line is stripped and has every run of
    whitespace collapsed to a single space.

    Parameters
    ----------
    book : str
        Full text of the book.
    book_title : str
        Label repeated in the ``Title`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``Text``, one row per non-blank line.
    """
    whitespace_run = re.compile(r'\s+')

    cleaned_lines = []
    for raw_line in book.split('\n'):
        trimmed = raw_line.strip()
        if not trimmed:
            # Blank or whitespace-only lines carry no text; drop them.
            continue
        cleaned_lines.append(whitespace_run.sub(' ', trimmed))

    titles = [book_title] * len(cleaned_lines)
    return pd.DataFrame({'Title': titles, 'Text': cleaned_lines})

def create_title_text_dict(df):
    """Group the ``Text`` values of a DataFrame by their ``Title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Title`` and ``Text``.

    Returns
    -------
    dict
        Maps each distinct title to the list of its texts. Titles appear in
        order of first occurrence and texts keep their row order.

    Raises
    ------
    KeyError
        If ``df`` lacks a ``Title`` or ``Text`` column.
    """
    title_text_dict = {}

    # Walk the two columns in lockstep instead of using DataFrame.iterrows(),
    # which builds a Series for every row and may upcast values across columns.
    for title, text in zip(df['Title'], df['Text']):
        # setdefault creates the list the first time a title is seen.
        title_text_dict.setdefault(title, []).append(text)

    return title_text_dict

def display_random_text(dictionary):
    """Print one randomly chosen text together with its title.

    A title is drawn with ``random.choice`` from the keys of ``dictionary``,
    then a text is drawn with ``random.choice`` from that title's list. The
    pair is printed as ``<title> : <text>``; nothing is returned.

    Parameters
    ----------
    dictionary : dict
        Maps each title to a non-empty sequence of texts.
    """
    titles = list(dictionary.keys())
    chosen_title = random.choice(titles)
    chosen_text = random.choice(dictionary[chosen_title])
    print(f"{chosen_title} : {chosen_text}")
    
def export_dictionary_pickle(dictionary, filename):
    """Serialize ``dictionary`` to ``filename`` with pickle.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. Pickle's default protocol is used.

    Parameters
    ----------
    dictionary : object
        The object to serialize (a dict in this notebook).
    filename : str
        Destination path of the pickle file.
    """
    with open(filename, 'wb') as sink:
        # Pickler(file).dump(obj) is the object form of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(dictionary)


In [None]:
# Read the raw text of every cleaned book. The file stems are listed in the
# order the files are read; each one lands in the matching name on the left.
book1, book2, book3, book5, book4, book6 = [
    open_book(stem)
    for stem in (
        "cosmos_cleaned",
        "into_thin_air_cleaned",
        "tom_sawyer_cleaned",
        "stardust_cleaned",
        "1984_cleaned",
        "androids_cleaned",
    )
]

# Build one Title/Text DataFrame per book, pairing each text with its label.
cosmos_df, into_thin_air_df, tom_sawyer_df, df_1984, android_df, stardust_df = [
    add_book_to_df(text, label)
    for text, label in (
        (book1, '"Cosmos" by Carl Sagan'),
        (book2, '"Into Thin Air" by Jon Krakauer'),
        (book3, '"The Adventures of Tom Sawyer" by Mark Twain'),
        (book4, '"1984" by George Orwell'),
        (book6, '"Do Androids Dream of Electric Sheep?" by Philip K. Dick'),
        (book5, '"Stardust" by Neil Gaiman'),
    )
]


In [None]:
# Stack the per-book frames into a single corpus; ignore_index=True discards
# each frame's own row labels and numbers the combined rows from 0.
corpus = pd.concat(
    [
        cosmos_df,
        into_thin_air_df,
        android_df,
        tom_sawyer_df,
        df_1984,
        stardust_df,
    ],
    ignore_index=True,
)

In [None]:
# Sanity check: list the distinct titles present in the corpus.
corpus['Title'].unique().tolist()

In [None]:
# Group the corpus texts by title into a plain dict of lists.
book_dict = create_title_text_dict(df=corpus)

In [None]:
# Spot-check the grouping by printing one random title/text pair.
display_random_text(dictionary=book_dict)



In [None]:
# Persist the title -> texts mapping as a pickle file in the working directory.
export_dictionary_pickle(dictionary=book_dict, filename='book_dictionary.pkl')