In [1]:
# Imports
import spacy
import pandas as pd
import re
import stanza
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from collections import Counter
tqdm.pandas()


In [2]:
# Global Variables

# Titles of books and songs; entities matching one of these are relabelled
# BOOK_SONG through correction_dict.
# NOTE(review): "Attractive and Detractive Hyperordnets" is listed twice, and
# the entry with a leading apostrophe normalizes to the same string as the one
# before it once correct_entity_type strips quotes. Harmless for membership
# tests, but redundant.
books_songs =  [
    "Attractive and Detractive Hyperordnets",
    "the Lost Book of Kells",
    "The Opal, the Pearl and the Peacock",
    "'The Opal, the Pearl and the Peacock",
    "Demonlands",
    "Killings and Mortefactions",
    "Attractive and Detractive Hyperordnets",  # duplicate of the first entry
    "Procedural Suggestions in Time of Risk",
    "the Tomes of Kae",
    "the Word of Pansiu"
]

# Named artifacts and objects; relabelled ARTIFACT_OBJECT through
# correction_dict. Variants with and without a leading "the" (and shortened
# forms such as "Boots") are listed separately because correct_entity_type
# compares whole normalized strings, not substrings.
artifacts_objects = [
    "Live Boots",
    "the Live Boots",
    "Cyclopedia",
    "the Expansible Egg",
    "Chair of Knowledge",
    "Scintillant Dagger",
    "Boots",
    "Mechanismus",
    "Rune",
    "Egg",
    "Sphere"
]

# Spell names; relabelled SPELL through correction_dict.
# NOTE(review): "the Call to the Violent Cloud" is listed twice.
spells = [
    "the Omnipotent Sphere",
    "the Call to the Violent Cloud",
    "the Excellent Prismatic Spray",
    "Mantle of Stealth",
    "the Spell of the Slow Hour",
    "Four Directions",
    "Second Hypnotic Spell",
    "The Charm of Untiring Nourishment",
    "Critique of the Chill",
    "Gyrator",
    "Lumen",
    "the Call to the Violent Cloud",  # duplicate of the entry above
    "the Spell of the Omnipotent Sphere"
]

# Character names; relabelled PERSON through correction_dict.
# NOTE(review): "Kandive the Golden" is listed twice. "Ulan Dhor" and
# "Melantine" also appear in `locations`, and "Saponid" also appears in
# `norps`; because PERSON comes before LOC and NORP in correction_dict,
# correct_entity_type resolves all three to PERSON.
characters = [
    "Pansiu's",  # possessive form, listed in addition to plain "Pansiu"
    "Guyal",
    "Kandive",
    "Kandive the Golden",
    "Guyal of Sfere", 
    "Liane the Wayfarer", 
    "Mazirian", 
    "Turjan", 
    "T'sais", 
    "Ulan Dhor", 
    "Elai", 
    "Etarr", 
    "Prince Kandive", 
    "Pandelume", 
    "Rogol Domedonfors", 
    "Shierl", 
    "T'sain",
    "Cazdal",
    "Javanne",
    "Kerlin",
    "the Lake Lord",
    "the Arch-Necromancer Phandaal",
    "Pansiu",
    "Melantine",
    "Voyevode",
    "Kandive the Golden",  # duplicate of the entry above
    "Blikdak",
    "Laccodel",
    "Mad King Shin",
    "Lycurgat",
    "Saponid"
]

# Place names; relabelled LOC through correction_dict.
# NOTE(review): "Ampridatvir", "Kaiin" and "Ascolais" are each listed twice.
# "Ulan Dhor" and "Melantine" are also in `characters`, which is checked first
# by correct_entity_type, so they are never returned as LOC from here.
# "Carchasel"/"Carchesel" look like spelling variants of one name, and "Liane"
# is listed as a location while "Liane the Wayfarer" is a character - confirm
# both are intended.
locations = [
    "Ampridatvir",
    "Erze Damath",
    "Kaiin",
    "Sanctuary of the Pelerines",
    "Ascolais",
    "The Scaum Valley",
    "The Forest of Tantrevalles",
    "Ruins of Old Romarth",
    "The Cleft of the Earth",
    "Overworld",
    "Azenomei",
    "Ulan Dhor",
    "Almery",
    "Embelyon",
    "the Land of the Falling Wall",
    "Sfere",
    "Thamber",
    "Kaiin",  # duplicate
    "Miir",
    "Ascolais",  # duplicate
    "Efred",
    "Jeldred",
    "Saponce",
    "Maurenron Range",
    "Porphiron Scar",
    "Omona Gap",
    "East Almery",
    "Bautiku",
    "Tenebrosa",
    "Kalu",
    "Fauvune",
    "Cansapara",
    "South Almery",
    "Ariventa",
    "Sanreale",
    "Tanvilkat",
    "the Old Town",
    "Ampridatvir",  # duplicate
    "Mel-Palusas",
    "Fer Aquila",
    "Carchasel",
    "Derna",
    "Regatta",
    "Carchesel",
    "Scaum",
    "Liane",
    "Thorsingol",
    "Peilvemchal Torrent",
    "the Porphiron Scar",
    "the River Scaum",
    "the Ide of Kauchique",
    "the Cape of Sad Remembrance",
    "Thamber Meadow",
    "the Lake of Dreams",
    "G'Vasan",
    "Melantine"
]

# Buildings and other constructed places; relabelled FAC through
# correction_dict. "the Museum of Man" and "Museum of Man" are both listed
# because matching is done on the whole normalized string.
facilities = [
    "Mansion of Chun the Unavoidable",
    "the Place of Whispers",
    "the Tower of Fate",
    "the Tower of the Screaming Ghost",
    "the Tower of Trumpets",
    "the Museum of Man",
    "the Cognative Repository",
    "Temple",
    "Caseboard",
    "Museum of Man"
]

# Named events; relabelled EVENT through correction_dict.
events = [
    "the Black Sabbath",
    "the Dance of the Fourteen Silken Movements",
    "Dawn"
]

# Groups, peoples and factions; relabelled NORP through correction_dict.
# NOTE(review): "Saponid" is also in `characters`, which correct_entity_type
# checks first, so that entry is never returned as NORP from here.
norps = [
    "the Signs of the Aumoklopelastianic Cabal",
    "Ghost-takers",
    "Norns",
    "Gaun",
    "The Green Legion of Valdaran the Just",
    "the Grays of Ampridatvir",
    "Saponids",
    "Saponid",
    "the Saponids of Saponce",
    "Ampridatvians",
    "Grays",
    "Raiders",
    "the Green Legion",
    "Green Legion",
    "the Forty Kades",
    "the Sherit Empire",
    "Merioneth",
    "the Gray Sorcerers"
]

# Creature names (singular and plural forms listed separately); relabelled
# CREATURE through correction_dict.
creatures = [
    "Deodand",
    "Vile Green Demon",
    "Thrang",
    "Deodands"
    
]

# Strings that are relabelled OTHER through correction_dict.
# NOTE(review): these look like fragments and false positives from the NER
# model (e.g. "Aye", "Curator or Museum") - confirm that is the intent.
other = [
    "Poh",
    "Mark",
    "Green",
    "Lethargy",
    "Golden",
    "Aye",
    "Pulchritude",
    "the Mechanismus sixty",
    "The Curator guards the Museum of Man",
    "Curator or Museum",
    "Gap",
    "Wayfarer"
]

# Maps each entity label to the list of surface forms that should receive it.
# correct_entity_type walks this dict in insertion order and returns the first
# label whose list contains the entity, so when a name is present in several
# lists the label declared earlier here wins (e.g. PERSON before LOC and NORP).
correction_dict = {
    "BOOK_SONG": books_songs,
    "ARTIFACT_OBJECT": artifacts_objects,
    "SPELL": spells,
    "PERSON": characters,
    "LOC": locations,
    "FAC": facilities,
    "EVENT": events,
    "NORP": norps,
    "CREATURE": creatures,
    "OTHER": other
}

In [3]:
# Functions

def open_book(filename):
    """Return the full contents of a cleaned book file as one string.

    Args:
        filename: Base name of the book, without directory or extension. The
            file is read from "../../Resources/Cleaned/<filename>.txt",
            resolved against the current working directory.

    Returns:
        The UTF-8 decoded text of the file.
    """
    book_path = "../../Resources/Cleaned/" + filename + ".txt"
    with open(book_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def add_book_to_df(book, book_title):
    """Turn a book's raw text into a one-row-per-paragraph DataFrame.

    The text is split on newline characters. Lines that are empty or contain
    only whitespace are dropped; every remaining line is trimmed and has each
    run of whitespace collapsed to a single space.

    Args:
        book: Full text of the book.
        book_title: Value repeated in the 'Title' column for every row.

    Returns:
        DataFrame with columns 'Title' and 'Text'.
    """
    cleaned_paragraphs = []
    for raw_line in book.split('\n'):
        trimmed = raw_line.strip()
        if not trimmed:
            # Blank separator line between paragraphs.
            continue
        cleaned_paragraphs.append(re.sub(r'\s+', ' ', trimmed))

    titles = [book_title] * len(cleaned_paragraphs)
    return pd.DataFrame({'Title': titles, 'Text': cleaned_paragraphs})

def _normalize_entity_text(raw_text):
    """Collapse whitespace runs, lowercase, and strip spaces/quotes from both ends."""
    return re.sub(r'\s+', ' ', raw_text).lower().strip(" '\"")


def correct_entity_type(entity_text, correction_dict):
    """Look up the corrected label for an entity, if one is configured.

    Both the entity text and every candidate name are normalized the same way
    (whitespace collapsed, lowercased, surrounding spaces and straight quotes
    removed) and compared as whole strings.

    Args:
        entity_text: Surface text of the entity.
        correction_dict: Mapping of label -> iterable of names. Labels are
            tried in the mapping's iteration order and the first match wins.

    Returns:
        The matching label, or None when no list contains the entity.
    """
    wanted = _normalize_entity_text(entity_text)

    for label, candidate_names in correction_dict.items():
        if any(_normalize_entity_text(candidate) == wanted for candidate in candidate_names):
            return label
    return None

def find_entities_in_paragraph(paragraph, entities):
    """Return the distinct (text, type) entity pairs whose text occurs in a paragraph.

    Matching is a plain, case-sensitive substring test (`ent_text in paragraph`),
    so a short entity such as "Egg" also matches inside a longer word.

    Args:
        paragraph: Text to search.
        entities: Iterable of (entity_text, entity_type) pairs.

    Returns:
        List of unique (entity_text, entity_type) tuples, in the order they are
        first encountered in `entities`.
    """
    # A dict keeps insertion order, so de-duplicating through its keys gives a
    # reproducible result. Collecting into a set made the output order depend
    # on string hashing, which varies between interpreter runs.
    matches = dict.fromkeys(
        (ent_text, ent_type)
        for ent_text, ent_type in entities
        if ent_text in paragraph
    )
    return list(matches)

def dialogue_to_df(text):
    """Collect every passage enclosed in straight double quotes.

    Args:
        text: Text to scan. Only the ASCII `"` character is treated as a
            delimiter; a quoted passage may span several lines.

    Returns:
        DataFrame with a single 'Dialogue' column holding the quoted passages
        (without the quotes) in order of appearance.
    """
    quoted_passages = [match.group(1) for match in re.finditer(r'"([^"]*)"', text)]
    return pd.DataFrame(quoted_passages, columns=['Dialogue'])

def key_phrase_extractor(text, n=1, top_n=100):
    """Count the most frequent n-grams in a text after stop-word removal.

    The text is tokenized with NLTK's `word_tokenize` *before* punctuation is
    removed. The tokenizer splits contractions into pieces such as "n't" and
    "'s", and those pieces are only recognisable as stop words while their
    apostrophe is still present. Stripping punctuation first (as was done
    previously) turned "don't" into "dont", which slipped past the filter.

    Args:
        text: Text to analyse.
        n: Size of the n-grams (1 = single words, 2 = bigrams, ...).
        top_n: How many of the most frequent n-grams to keep. Defaults to 100;
            None keeps all of them.

    Returns:
        DataFrame with columns 'phrase' and 'count', most frequent first.
    """
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    words_without_stopwords = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # First pass: drop stop words while contraction pieces still carry
        # their apostrophe.
        if lowered in custom_stopwords:
            continue
        # Remove punctuation inside the token; pure-punctuation tokens become
        # empty and are discarded.
        cleaned = re.sub(r'[^\w\s]', '', lowered)
        # Second pass: a token such as "'the" only becomes a stop word once
        # its punctuation is gone.
        if cleaned and cleaned not in custom_stopwords:
            words_without_stopwords.append(cleaned)

    # Build the n-grams as space-joined strings and count them.
    n_grams = [' '.join(grams) for grams in ngrams(words_without_stopwords, n)]
    frequency = Counter(n_grams)

    key_phrases = frequency.most_common(top_n)
    return pd.DataFrame(key_phrases, columns=['phrase', 'count'])

def is_character(entity):
    """Return True when the entity's label (its second element) is 'PERSON'.

    Args:
        entity: Indexable pair whose item at index 1 is the entity label.
    """
    entity_label = entity[1]
    return entity_label in {'PERSON'}

def is_location(entity):
    """Return True when the entity's label (its second element) is 'LOC'.

    Args:
        entity: Indexable pair whose item at index 1 is the entity label.
    """
    entity_label = entity[1]
    return entity_label in {'LOC'}


def df_to_csv(df, filename):
    """Write a DataFrame to the cleaned-resources folder as CSV, without the index.

    Args:
        df: DataFrame to save.
        filename: Base name of the output file, without directory or
            extension. The file is written to
            "../../Resources/Cleaned/<filename>.csv", resolved against the
            current working directory.
    """
    csv_path = "../../Resources/Cleaned/" + filename + ".csv"
    df.to_csv(csv_path, index=False)

In [None]:
# Run spaCy named-entity recognition over the book text and tabulate the
# distinct (entity text, label) pairs it finds.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): `text` is not assigned in any cell visible above, so this cell
# raises NameError on a fresh kernel. Presumably it should come from
# open_book(...) - confirm and define it before this line.
doc = nlp(text)

entities = []

# Record every recognised span together with its predicted label.
for ent in doc.ents:
    entities.append((ent.text, ent.label_))

df = pd.DataFrame(entities, columns=["Entity", "Entity Type"])

# Keep one row per distinct (Entity, Entity Type) combination.
df = df.drop_duplicates()