In [8]:
import spacy
import pandas as pd
import re
import stanza
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from collections import Counter
from textblob import TextBlob

tqdm.pandas()

In [9]:
# Functions

def open_book(filename):
    with open("../../Resources/Cleaned/"+filename+".txt", 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def add_book_to_df(book, book_title):
    # Split the book text into paragraphs
    paragraphs = book.split('\n')
    
    # Clean each paragraph by removing extra whitespace and trimming
    paragraphs = [re.sub(r'\s+', ' ', para.strip()) for para in paragraphs if para.strip()]

    # Create a DataFrame with two columns: book title and the paragraph text
    df = pd.DataFrame({'Title': [book_title] * len(paragraphs), 'Text': paragraphs})
    return df

def correct_entity_type(entity_text, correction_dict):
    # Normalize the entity text (lowercase, remove extra spaces, handle special chars)
    entity_text_normalized = re.sub(r'\s+', ' ', entity_text).lower().strip(" '\"")

    for category, names in correction_dict.items():
        # Normalize and prepare the names in the dictionary
        normalized_names = [re.sub(r'\s+', ' ', name).lower().strip(" '\"") for name in names]
        
        if entity_text_normalized in normalized_names:
            return category
    return None

def find_entities_in_paragraph(paragraph, entities):
    entities_in_paragraph = set()
    for ent_text, ent_type in entities:
        if ent_text in paragraph:
            entities_in_paragraph.add((ent_text, ent_type))
    return list(entities_in_paragraph)

def dialogue_to_df(text):
    pattern = r'"([^"]*)"'
    dialogues = re.findall(pattern, text)
    df_dialogues = pd.DataFrame(dialogues, columns=['Dialogue'])
    return df_dialogues

def key_phrase_extractor(text, n=1):
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    # Tokenize the text into words, remove punctuation with regex
    words = word_tokenize(re.sub(r'[^\w\s]', '', text))

    # Remove stop words and convert to lowercase
    words_without_stopwords = [word.lower() for word in words if word.lower() not in custom_stopwords]

    # Generate n-grams
    n_grams = ngrams(words_without_stopwords, n)
    n_grams = [' '.join(grams) for grams in n_grams]

    # Count the frequency of each n-gram
    frequency = Counter(n_grams)

    # Get the top N key phrases
    N = 100
    key_phrases = frequency.most_common(N)

    # Create a DataFrame from the top key phrases
    df = pd.DataFrame(key_phrases, columns=['phrase', 'count'])

    return df

def is_character(entity):
    character_types = {'PERSON'}
    return entity[1] in character_types

def is_location(entity):
    location_types = {'LOC'}
    return entity[1] in location_types


def df_to_csv(df, filename):
    df.to_csv("../../Resources/Cleaned/"+filename+".csv", index=False)

In [10]:
book = open_book("dying_earth2_cleaned")

In [11]:
de2_df_para = add_book_to_df(book, "The Eyes of the Overworld")

In [12]:
de2_df_para

Unnamed: 0,Title,Text
0,The Eyes of the Overworld,The Overworld! ON THE HEIGHTS above the river ...
1,The Eyes of the Overworld,"Behind the manse and across the valley, low hi..."
2,The Eyes of the Overworld,"Cugel was a man of many capabilities, with a d..."
3,The Eyes of the Overworld,"These, stamped with appropriate seals and rune..."
4,The Eyes of the Overworld,"Unfortunately for Cugel, not twenty paces from..."
...,...,...
682,The Eyes of the Overworld,"Cugel uttered a harsh laugh. ""I seem to have p..."
683,The Eyes of the Overworld,"I shall not do so a second time. Iucounu, your..."
684,The Eyes of the Overworld,"""The destination is as before: to the shore of..."
685,The Eyes of the Overworld,"For a day and a night the demon dew, grumbling..."


In [7]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp(book)

NameError: name 'stanza' is not defined

In [8]:
entities = []
for ent in doc.entities:
    entities.append((ent.text, ent.type))
entities

[('Overworld', 'PERSON'),
 ('Xzan', 'LOC'),
 ('Iucounu the Laughing Magician', 'PERSON'),
 ('three', 'CARDINAL'),
 ('Xzan', 'GPE'),
 ('the Old Forest', 'LOC'),
 ('Almery', 'GPE'),
 ('three', 'CARDINAL'),
 ('Scaum', 'FAC'),
 ('Azenomei', 'GPE'),
 ('Azenomei Fair Cugel', 'ORG'),
 ('Cugel', 'PERSON'),
 ('the Azenomei Fair', 'EVENT'),
 ('Cugel', 'PERSON'),
 ('twenty', 'CARDINAL'),
 ('Fianosther', 'ORG'),
 ('Cugel', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('the third day', 'DATE'),
 ('Cugel', 'ORG'),
 ('only four', 'CARDINAL'),
 ('Fianosther', 'ORG'),
 ('Cugel', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('Cugel', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('the night', 'TIME'),
 ('Cugel', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('Cugel', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('Iucounu the Laughing Magician', 'PERSON'),
 ('Fianosther', 'PERSON'),
 ('Dibarcas Maior', 'PERSON'),
 ('Great Phandaal', 'LOC'),
 ('at least three hours', 'TIME'),
 ('Cugel', 'PERSON'),
 ('Iucounu',

In [None]:
ent_df = pd.DataFrame(entities, columns=["Entity_Name", "Entity_Type"])
ent_df['Entity_Name'] = ent_df['Entity_Name'].str.title()

# Remove duplicates
de_ent_df = ent_df.drop_duplicates().reset_index(drop=True)

In [None]:
de_ent_df["Entity_Type"].unique().tolist()

In [None]:
de_ent_df

In [13]:
df_to_csv(de2_df_para, "dying_earth2_paragraphs")