In [1]:
import os
import re
from collections import Counter
from collections import defaultdict

import nltk
import pandas as pd
import spacy
import stanza
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
from tqdm import tqdm

tqdm.pandas()

In [2]:
# Functions

def open_book(filename, base_dir="../../Resources/Cleaned/"):
    """Read a book's text file and return its contents as a single string.

    Args:
        filename: Name of the file without the ".txt" extension.
        base_dir: Directory that contains the file. Defaults to the
            notebook's cleaned-resources folder, so existing one-argument
            calls behave as before.

    Returns:
        The full file contents, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # os.path.join tolerates a base_dir with or without a trailing separator.
    path = os.path.join(base_dir, filename + ".txt")
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def add_book_to_df(book, book_title):
    """Build a two-column DataFrame with one row per non-blank line of a book.

    The text is split on newline characters; lines that are empty after
    trimming are dropped, and every run of whitespace inside a kept line is
    collapsed to a single space.

    Args:
        book: The book text as one string.
        book_title: Value repeated in the 'Title' column for every row.

    Returns:
        A DataFrame with columns 'Title' and 'Text'.
    """
    cleaned_lines = []
    for raw_line in book.split('\n'):
        trimmed = raw_line.strip()
        if not trimmed:
            # Blank (or whitespace-only) lines carry no text.
            continue
        cleaned_lines.append(re.sub(r'\s+', ' ', trimmed))

    titles = [book_title] * len(cleaned_lines)
    return pd.DataFrame({'Title': titles, 'Text': cleaned_lines})

def correct_entity_type(entity_text, correction_dict):
    """Look up the category whose name list contains the given entity text.

    Both the entity text and the dictionary names are compared in a
    normalized form: whitespace runs collapsed to one space, lowercased, and
    leading/trailing spaces and quote characters removed.

    Args:
        entity_text: The entity string to look up.
        correction_dict: Mapping of category -> iterable of entity names.

    Returns:
        The first category (in dictionary order) containing a matching name,
        or None when no category matches.
    """
    def _canonical(raw):
        return re.sub(r'\s+', ' ', raw).lower().strip(" '\"")

    wanted = _canonical(entity_text)
    return next(
        (
            category
            for category, names in correction_dict.items()
            if wanted in {_canonical(candidate) for candidate in names}
        ),
        None,
    )

def find_entities_in_paragraph(paragraph, entities):
    """Return the distinct entities whose text occurs in a paragraph.

    Matching is a plain substring test, so an entity text also matches when
    it appears inside a longer word.

    Args:
        paragraph: The paragraph text to search.
        entities: Iterable of (entity_text, entity_type) pairs.

    Returns:
        A list of unique (entity_text, entity_type) tuples, in the order they
        first appear in ``entities``.
    """
    # A dict keeps insertion order, so de-duplicating through it gives a
    # reproducible result; list(set(...)) would vary with string hashing.
    matches = {}
    for ent_text, ent_type in entities:
        # The empty string is a substring of every paragraph; skip it so a
        # blank entity is not reported everywhere.
        if ent_text and ent_text in paragraph:
            matches[(ent_text, ent_type)] = None
    return list(matches)

def dialogue_to_df(text):
    """Collect every span enclosed in straight double quotes into a DataFrame.

    Args:
        text: The text to scan.

    Returns:
        A DataFrame with a single 'Dialogue' column holding the quoted spans
        (without the surrounding quote characters), in order of appearance.
    """
    quoted_spans = [match.group(1) for match in re.finditer(r'"([^"]*)"', text)]
    return pd.DataFrame(quoted_spans, columns=['Dialogue'])

def key_phrase_extractor(text, n=1, top_n=100):
    """Count the most frequent stopword-free n-grams in a text.

    Args:
        text: The text to analyse.
        n: Size of the n-grams (1 = single words, 2 = bigrams, ...).
        top_n: How many of the most frequent n-grams to keep. Defaults to
            100; None keeps all of them.

    Returns:
        A DataFrame with columns 'phrase' and 'count', ordered from most to
        least frequent.
    """
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    # Tokenize the raw text BEFORE stripping punctuation. The clitic entries
    # above ("'s", "n't", ...) only exist as tokens while the apostrophe is
    # still present; removing punctuation first fuses contractions into
    # words like "dont" that slip past the stopword filter.
    content_words = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in custom_stopwords:
            continue
        # Drop remaining punctuation; pure-punctuation tokens become empty.
        word = re.sub(r'[^\w\s]', '', token)
        if word and word not in custom_stopwords:
            content_words.append(word)

    # Generate n-grams over the filtered word sequence
    n_grams = [' '.join(grams) for grams in ngrams(content_words, n)]

    # Count the frequency of each n-gram and keep the most common ones
    key_phrases = Counter(n_grams).most_common(top_n)

    # Create a DataFrame from the top key phrases
    return pd.DataFrame(key_phrases, columns=['phrase', 'count'])

def is_character(entity):
    """Tell whether an entity's type label (its second item) is 'PERSON'.

    Args:
        entity: Indexable whose item at position 1 is the entity type.

    Returns:
        True when the type label is 'PERSON', otherwise False.
    """
    person_labels = {'PERSON'}
    label = entity[1]
    return label in person_labels

def is_location(entity):
    """Tell whether an entity's type label (its second item) is 'LOC'.

    Args:
        entity: Indexable whose item at position 1 is the entity type.

    Returns:
        True when the type label is 'LOC', otherwise False.
    """
    place_labels = {'LOC'}
    label = entity[1]
    return label in place_labels


def df_to_csv(df, filename, base_dir="../../Resources/Cleaned/"):
    """Write a DataFrame to a CSV file, omitting the index column.

    Args:
        df: The DataFrame to save.
        filename: Name of the output file without the ".csv" extension.
        base_dir: Directory to write into. Defaults to the notebook's
            cleaned-resources folder, so existing two-argument calls behave
            as before.
    """
    # os.path.join tolerates a base_dir with or without a trailing separator.
    df.to_csv(os.path.join(base_dir, filename + ".csv"), index=False)

In [3]:
# Load each cleaned book; book1..book7 follow the order of the file stems below.
(book1, book2, book3, book4, book5, book6, book7) = (
    open_book(stem)
    for stem in (
        "cosmos_cleaned",
        "into_thin_air_cleaned",
        "tom_sawyer_cleaned",
        "1984_cleaned",
        "killing_machine_cleaned",
        "androids_cleaned",
        "stardust_cleaned",
    )
)

In [4]:
# Build one Title/Text frame per book, pairing each loaded text with its display title.
(
    cosmos_para,
    into_thin_air_para,
    tom_sawyer_para,
    para_1984,
    killing_machine_para,
    android_para,
    stardust_para,
) = (
    add_book_to_df(book_text, title)
    for book_text, title in (
        (book1, "Cosmos"),
        (book2, "Into Thin Air"),
        (book3, "Tom Sawyer"),
        (book4, "1984"),
        (book5, "Killing Machine"),
        (book6, "Android"),
        (book7, "Stardust"),
    )
)

In [5]:
# Display the Killing Machine frame (the bare last expression renders it).
# NOTE(review): the rendered rows show what looks like an extraction artifact
# ("W^hy" in row 632) — the cleaned source text may need another pass.
killing_machine_para

Unnamed: 0,Title,Text
0,Killing Machine,"From ""How the Planets Trade,"" by Ignace Wodleckt:"
1,Killing Machine,"Cosmopolis, September, 1509:"
2,Killing Machine,"In all commercial communities, the prevalence ..."
3,Killing Machine,Regarding the 'quality of authenticity' there ...
4,Killing Machine,Gersen first encountered Kokor Hekkus at the a...
...,...,...
629,Killing Machine,"""There is nothing to keep us. Tomorrow we shal..."
630,Killing Machine,"""As you wish,"" said Gersen. Alusz Iphigenia cl..."
631,Killing Machine,Patch Engineering and Construction Company Pat...
632,Killing Machine,"Alusz Iphigenia looked up. ""W^hy do you laugh?..."


In [6]:
# Display the 1984 frame (the bare last expression renders it).
# NOTE(review): several rendered rows start mid-sentence (e.g. rows 1-4), so
# the newline split apparently does not line up with real paragraph
# boundaries for this file — confirm against the cleaned text.
para_1984

Unnamed: 0,Title,Text
0,1984,"It was a bright cold day in April, and the clo..."
1,1984,corner. There was one on the house-front immed...
2,1984,Appendix.]--was startlingly different from any...
3,1984,"smell, as of Chinese rice-spirit. Winston pour..."
4,1984,possession. The thing that he was about to do ...
...,...,...
241,1984,"of Truth, for example, the Records Department,..."
242,1984,"ideologically neutral, as nearly as possible i..."
243,1984,It was of course possible to utter heresies of...
244,1984,"action, or was already orthodox (GOODTHINKFUL ..."


In [7]:
# Display the Tom Sawyer frame (the bare last expression renders it).
# NOTE(review): every rendered row begins with a "Page N , Adventures of Tom
# Sawyer, The - Mark ..." header, so rows look like whole pages with the page
# header still attached rather than paragraphs — confirm and clean upstream.
tom_sawyer_para

Unnamed: 0,Title,Text
0,Tom Sawyer,"Page 2 , Adventures of Tom Sawyer, The - Mark ..."
1,Tom Sawyer,"Page 3 , Adventures of Tom Sawyer, The - Mark ..."
2,Tom Sawyer,"Page 4 , Adventures of Tom Sawyer, The - Mark ..."
3,Tom Sawyer,"Page 5 , Adventures of Tom Sawyer, The - Mark ..."
4,Tom Sawyer,"Page 6 , Adventures of Tom Sawyer, The - Mark ..."
...,...,...
218,Tom Sawyer,"Page 220 , Adventures of Tom Sawyer, The - Mar..."
219,Tom Sawyer,"Page 221 , Adventures of Tom Sawyer, The - Mar..."
220,Tom Sawyer,"Page 222 , Adventures of Tom Sawyer, The - Mar..."
221,Tom Sawyer,"Page 223 , Adventures of Tom Sawyer, The - Mar..."


In [8]:
# Display the Cosmos frame (the bare last expression renders it).
# NOTE(review): some rendered rows start mid-sentence (e.g. rows 1, 2 and 4),
# so the newline split apparently does not line up with real paragraph
# boundaries for this file — confirm against the cleaned text.
cosmos_para

Unnamed: 0,Title,Text
0,Cosmos,Today we have discovered a powerful and elegan...
1,Cosmos,other. But books and television series have so...
2,Cosmos,"by Frederick Reines, of the University of Cali..."
3,Cosmos,CHAPTER I The Shores of the Cosmic Ocean The f...
4,Cosmos,although they may trouble whatever gods may be...
...,...,...
189,Cosmos,"Some 3.6 million years ago, in what is now nor..."
190,Cosmos,APPENDIX I Reductio ad Absurdum and the Square...
191,Cosmos,"Dividing both sides of the last equality by 2,..."
192,Cosmos,APPENDIX 2 The Five Pythagorean Solids A regul...


In [9]:
# Save every paragraph frame as a CSV, one file per book.
for _para_df, _csv_stem in (
    (cosmos_para, "cosmos_paragraphs"),
    (into_thin_air_para, "into_thin_air_paragraphs"),
    (tom_sawyer_para, "tom_sawyer_paragraphs"),
    (para_1984, "1984_paragraphs"),
    (killing_machine_para, "killing_machine_paragraphs"),
    (android_para, "android_paragraphs"),
    (stardust_para, "stardust_paragraphs"),
):
    df_to_csv(_para_df, _csv_stem)