In [None]:
# Imports
import spacy
import pandas as pd
import re
import stanza
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from collections import Counter
tqdm.pandas()


In [None]:
# Global Variables

# Titles of books and songs; correction_dict maps this list to the "BOOK_SONG" label.
# Exact duplicate entries have been removed: the list is only used for
# membership tests in correct_entity_type, so repeats add nothing.
books_songs = [
    "Attractive and Detractive Hyperordnets",
    "the Lost Book of Kells",
    "The Opal, the Pearl and the Peacock",
    # Variant with a stray leading quote, kept as written in the original list
    "'The Opal, the Pearl and the Peacock",
    "Demonlands",
    "Killings and Mortefactions",
    "Procedural Suggestions in Time of Risk",
    "the Tomes of Kae",
    "the Word of Pansiu"
]

# Artifact and object names; correction_dict maps this list to the "ARTIFACT_OBJECT" label.
artifacts_objects = [
    "Live Boots",
    "the Live Boots",
    "Cyclopedia",
    "the Expansible Egg",
    "Chair of Knowledge",
    "Scintillant Dagger",
    "Boots",
    "Mechanismus",
    "Rune",
    "Egg",
    "Sphere"
]

# Spell names; correction_dict maps this list to the "SPELL" label.
# Exact duplicate entries have been removed: the list is only used for
# membership tests in correct_entity_type, so repeats add nothing.
spells = [
    "the Omnipotent Sphere",
    "the Call to the Violent Cloud",
    "the Excellent Prismatic Spray",
    "Mantle of Stealth",
    "the Spell of the Slow Hour",
    "Four Directions",
    "Second Hypnotic Spell",
    "The Charm of Untiring Nourishment",
    "Critique of the Chill",
    "Gyrator",
    "Lumen",
    "the Spell of the Omnipotent Sphere"
]

# Character names; correction_dict maps this list to the "PERSON" label.
# Exact duplicate entries have been removed: the list is only used for
# membership tests in correct_entity_type, so repeats add nothing.
characters = [
    "Pansiu's",
    "Guyal",
    "Kandive",
    "Kandive the Golden",
    "Guyal of Sfere",
    "Liane the Wayfarer",
    "Mazirian",
    "Turjan",
    "T'sais",
    "Ulan Dhor",
    "Elai",
    "Etarr",
    "Prince Kandive",
    "Pandelume",
    "Rogol Domedonfors",
    "Shierl",
    "T'sain",
    "Cazdal",
    "Javanne",
    "Kerlin",
    "the Lake Lord",
    "the Arch-Necromancer Phandaal",
    "Pansiu",
    "Melantine",
    "Voyevode",
    "Blikdak",
    "Laccodel",
    "Mad King Shin",
    "Lycurgat",
    "Saponid"
]

# Place names; correction_dict maps this list to the "LOC" label.
# Exact duplicate entries have been removed: the list is only used for
# membership tests in correct_entity_type, so repeats add nothing.
# Near-duplicates that differ in spelling or by a leading "the" are kept as-is.
locations = [
    "Ampridatvir",
    "Erze Damath",
    "Kaiin",
    "Sanctuary of the Pelerines",
    "Ascolais",
    "The Scaum Valley",
    "The Forest of Tantrevalles",
    "Ruins of Old Romarth",
    "The Cleft of the Earth",
    "Overworld",
    "Azenomei",
    "Ulan Dhor",
    "Almery",
    "Embelyon",
    "the Land of the Falling Wall",
    "Sfere",
    "Thamber",
    "Miir",
    "Efred",
    "Jeldred",
    "Saponce",
    "Maurenron Range",
    "Porphiron Scar",
    "Omona Gap",
    "East Almery",
    "Bautiku",
    "Tenebrosa",
    "Kalu",
    "Fauvune",
    "Cansapara",
    "South Almery",
    "Ariventa",
    "Sanreale",
    "Tanvilkat",
    "the Old Town",
    "Mel-Palusas",
    "Fer Aquila",
    "Carchasel",
    "Derna",
    "Regatta",
    "Carchesel",
    "Scaum",
    "Liane",
    "Thorsingol",
    "Peilvemchal Torrent",
    "the Porphiron Scar",
    "the River Scaum",
    "the Ide of Kauchique",
    "the Cape of Sad Remembrance",
    "Thamber Meadow",
    "the Lake of Dreams",
    "G'Vasan",
    "Melantine"
]

# Building and facility names; correction_dict maps this list to the "FAC" label.
facilities = [
    "Mansion of Chun the Unavoidable",
    "the Place of Whispers",
    "the Tower of Fate",
    "the Tower of the Screaming Ghost",
    "the Tower of Trumpets",
    "the Museum of Man",
    "the Cognative Repository",
    "Temple",
    "Caseboard",
    "Museum of Man"
]

# Event names; correction_dict maps this list to the "EVENT" label.
events = [
    "the Black Sabbath",
    "the Dance of the Fourteen Silken Movements",
    "Dawn"
]

# Group names; correction_dict maps this list to the "NORP" label.
norps = [
    "the Signs of the Aumoklopelastianic Cabal",
    "Ghost-takers",
    "Norns",
    "Gaun",
    "The Green Legion of Valdaran the Just",
    "the Grays of Ampridatvir",
    "Saponids",
    "Saponid",
    "the Saponids of Saponce",
    "Ampridatvians",
    "Grays",
    "Raiders",
    "the Green Legion",
    "Green Legion",
    "the Forty Kades",
    "the Sherit Empire",
    "Merioneth",
    "the Gray Sorcerers"
]

# Creature names; correction_dict maps this list to the "CREATURE" label.
creatures = [
    "Deodand",
    "Vile Green Demon",
    "Thrang",
    "Deodands"
    
]

# Strings that correction_dict maps to the "OTHER" label. The filtering cell
# later lists 'OTHER' in unwanted_types, so entities given this label are dropped there.
other = [
    "Poh",
    "Mark",
    "Green",
    "Lethargy",
    "Golden",
    "Aye",
    "Pulchritude",
    "the Mechanismus sixty",
    "The Curator guards the Museum of Man",
    "Curator or Museum",
    "Gap",
    "Wayfarer"
]

# Maps each entity label to the list of names that should receive it.
# correct_entity_type walks this dict in insertion order and returns the first
# label whose list contains the name, so a name present in several lists
# resolves to the label listed first here (e.g. "Ulan Dhor" and "Melantine"
# are in both `characters` and `locations` and therefore resolve to "PERSON").
correction_dict = {
    "BOOK_SONG": books_songs,
    "ARTIFACT_OBJECT": artifacts_objects,
    "SPELL": spells,
    "PERSON": characters,
    "LOC": locations,
    "FAC": facilities,
    "EVENT": events,
    "NORP": norps,
    "CREATURE": creatures,
    "OTHER": other
}

In [None]:
# Functions

def open_book(filename):
    """Read a cleaned book file and return its contents as a single string.

    Args:
        filename: Base name (without extension) of a ``.txt`` file located
            under ``../../Resources/Cleaned/``.

    Returns:
        The whole file, decoded as UTF-8.
    """
    book_path = "../../Resources/Cleaned/" + filename + ".txt"
    with open(book_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def add_book_to_df(book, book_title):
    """Turn a book's text into a DataFrame with one row per non-blank line.

    Args:
        book: Full text of the book; lines are separated by ``'\\n'``.
        book_title: Title stored alongside every row.

    Returns:
        DataFrame with columns ``'Title'`` and ``'Text'``. Each ``'Text'``
        value is a line with surrounding whitespace removed and every run of
        whitespace collapsed to a single space; blank lines are skipped.
    """
    cleaned_paragraphs = []
    for raw_line in book.split('\n'):
        trimmed = raw_line.strip()
        if not trimmed:
            # Skip lines that are empty or whitespace-only
            continue
        cleaned_paragraphs.append(re.sub(r'\s+', ' ', trimmed))

    titles = [book_title for _ in cleaned_paragraphs]
    return pd.DataFrame({'Title': titles, 'Text': cleaned_paragraphs})

# Typographic quotes folded to their ASCII equivalents before comparison.
_QUOTE_FOLD_TABLE = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / typographic apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})


def _normalize_entity_text(value):
    """Return a comparison key: quotes folded, whitespace collapsed, lowercased, edge quotes/spaces stripped."""
    folded = value.translate(_QUOTE_FOLD_TABLE)
    return re.sub(r'\s+', ' ', folded).lower().strip(" '\"")


def correct_entity_type(entity_text, correction_dict):
    """Look up the curated label for an entity string.

    Both the entity text and the curated names are compared after
    normalisation: typographic quotes are folded to ASCII, runs of whitespace
    collapse to one space, case is ignored, and leading/trailing spaces and
    quote characters are removed. Folding the quotes lets text written with a
    typographic apostrophe (e.g. "T\u2019sais") match names stored with an
    ASCII apostrophe ("T'sais").

    Args:
        entity_text: Entity string as produced by the NER model.
        correction_dict: Mapping of label -> iterable of names for that label.

    Returns:
        The first label (in the dict's iteration order) whose names include
        the entity, or None when no list contains it.
    """
    target = _normalize_entity_text(entity_text)

    for category, names in correction_dict.items():
        # any() stops at the first matching name instead of normalising the whole list
        if any(_normalize_entity_text(name) == target for name in names):
            return category
    return None

def find_entities_in_paragraph(paragraph, entities):
    """Return the distinct entities whose text occurs in the paragraph.

    Matching is a plain substring test (``ent_text in paragraph``).

    Args:
        paragraph: Text to search.
        entities: Iterable of ``(entity_text, entity_type)`` pairs.

    Returns:
        List of unique ``(entity_text, entity_type)`` pairs, in the order they
        first appear in ``entities``. The order is deterministic, unlike the
        order obtained by converting a set to a list.
    """
    matches = [
        (ent_text, ent_type)
        for ent_text, ent_type in entities
        if ent_text in paragraph
    ]
    # dict.fromkeys removes repeats while preserving first-seen order
    return list(dict.fromkeys(matches))

def dialogue_to_df(text):
    """Collect every span enclosed in straight double quotes into a DataFrame.

    Args:
        text: Text to scan.

    Returns:
        DataFrame with a single ``'Dialogue'`` column holding the text found
        between each pair of ``"`` characters, in order of appearance.
    """
    quoted_spans = [match.group(1) for match in re.finditer(r'"([^"]*)"', text)]
    return pd.DataFrame(quoted_spans, columns=['Dialogue'])

def key_phrase_extractor(text, n=1, top_n=100):
    """Count the most frequent n-grams in a text after stop-word removal.

    The text is tokenised first and stop words are removed from the raw
    tokens; only then is punctuation stripped from what remains. Stripping
    punctuation before tokenising (as was done previously) meant the
    contraction stop words ("'s", "n't", ...) could never match, because the
    apostrophes were already gone.

    NOTE: relies on NLTK's stop-word corpus and tokenizer data being
    available; the visible cells of this notebook do not download them.

    Args:
        text: Text to analyse.
        n: Size of the n-grams (1 = single words).
        top_n: Maximum number of phrases to return (default 100).

    Returns:
        DataFrame with columns ``'phrase'`` and ``'count'``, most frequent first.
    """
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    kept_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Drop stop words while contraction fragments still carry their apostrophe
        if lowered in custom_stopwords:
            continue
        # Strip punctuation from the surviving token; pure-punctuation tokens become empty
        cleaned = re.sub(r'[^\w\s]', '', lowered)
        if cleaned and cleaned not in custom_stopwords:
            kept_words.append(cleaned)

    # Build the n-grams and count how often each occurs
    phrases = [' '.join(gram) for gram in ngrams(kept_words, n)]
    key_phrases = Counter(phrases).most_common(top_n)

    return pd.DataFrame(key_phrases, columns=['phrase', 'count'])

def is_character(entity):
    """Return True when the entity's label (its second element) is 'PERSON'."""
    label = entity[1]
    return label in {'PERSON'}

def is_location(entity):
    """Return True when the entity's label (its second element) is 'LOC'."""
    label = entity[1]
    return label in {'LOC'}


def df_to_csv(df, filename):
    """Write a DataFrame to ``../../Resources/Cleaned/<filename>.csv`` without the index column.

    Args:
        df: DataFrame to export.
        filename: Base name (without extension) of the CSV file to write.
    """
    csv_path = "../../Resources/Cleaned/" + filename + ".csv"
    df.to_csv(csv_path, index=False)

In [None]:
# Load the cleaned source text. Later cells read it under the name `book`
# (paragraph split, Stanza NER, key phrases, dialogue), while the spaCy cells
# read `text`; previously only `text` was assigned, so a fresh
# Restart & Run All failed with NameError on `book`. Bind both names.
book = open_book("de")
text = book

In [None]:
# Sanity check: show the Python type of the loaded text.
type(text)

In [None]:
# First pass with spaCy's small English model: annotate the whole text and
# tabulate each distinct (entity text, label) pair it reports.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

entities = [(span.text, span.label_) for span in doc.ents]

df = pd.DataFrame(entities, columns=["Entity", "Entity Type"]).drop_duplicates()
df

In [None]:
# Collect the distinct labels present in the "Entity Type" column.
unique_entity_types = df["Entity Type"].unique()

# Display the unique entity types
print(unique_entity_types)

In [None]:
# Keep only the rows whose label is PERSON.
# NOTE(review): the variable is called `language_entities` but the filter
# selects PERSON rows — confirm which of the two is intended.
language_entities = df[df["Entity Type"] == "PERSON"]

# Print the filtered entities
print(language_entities)

In [None]:
# Build the paragraph table: one row per non-blank line of the book, tagged with its title.
de_df = add_book_to_df(book, "The Dying Earth")

In [None]:
# Display the paragraph table.
de_df 

In [None]:
# Download Stanza's resources for English ('en'), needed by the pipelines built below.
stanza.download('en')

In [None]:
# Build a Stanza pipeline with the tokenize and ner processors and annotate the whole book.
# Note: this rebinds `nlp` and `doc`, which the spaCy cell above also assigns.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp(book)

In [None]:
# Gather the distinct entity strings from every sentence; using a set
# removes repeated mentions automatically.
entities = {entity.text for sentence in doc.sentences for entity in sentence.ents}

# Displaying the comprehensive list of entities
for entity in sorted(entities):
    print(entity)

In [None]:
# Re-label every entity Stanza found: use the label from correction_dict when
# the entity text is listed there, otherwise keep Stanza's own label.
corrected_entities = [
    (ent.text, correct_entity_type(ent.text, correction_dict) or ent.type)
    for ent in doc.entities
]

In [None]:
# Define a set of unwanted entities.
# Each item is an exact (entity text, entity type) pair; the filtering loop
# below drops an entity only when both its text and its (corrected) type match.
unwanted_entities = {
    ("TURJAN", "PERSON"),
    ("Aghast", "PERSON"),
    ("Ampridatvirian", "NORP"),
    ("Azvan the Astronomer", "PERSON"),
    ("Boots", "ARTIFACT_OBJECT"),
    ("Carchasel", "LOC"),
    ("Caseboard", "FAC"),
    ("Castellan", "NORP"),
    ("Castellan", "PERSON"),
    ("Castellan's", "PERSON"),
    ("Dandanflores", "NORP"),
    ("Dawn", "EVENT"),
    ("Deodands", "CREATURE"),
    ("Dhor", "PERSON"),
    ("Dusty", "PERSON"),
    ("East", "LOC"),
    ("Egg", "ARTIFACT_OBJECT"),
    ("Elai's", "PERSON"),
    ("Etarr the Masked", "PERSON"),
    ("Ethodea", "NORP"),
    ("Falling Wall", "LOC"),
    ("Felon", "PERSON"),
    ("Four Directions", "SPELL"),
    ("Gaun", "NORP"),
    ("Gauns", "PERSON"),
    ("Golickan Kodek the Conqueror", "PERSON"),
    ("Gray", "PERSON"),
    ("Guyal of Sfere", "PERSON"),
    ("Gyrator", "SPELL"),
    ("Hideous", "PERSON"),
    ("I am Chun the Unavoidable", "PERSON"),
    ("Kandive the Golden", "PERSON"),
    ("Kerlin the Curator", "PERSON"),
    ("Kerlin's", "PERSON"),
    ("Liane the Wayfarer", "PERSON"),
    ("MAGICIAN", "PERSON"),
    ("MAZIRIAN", "PERSON"),
    ("Mazirian the Magician", "PERSON"),
    ("Moon", "LOC"),
    ("this Temple of Pansiu", "FAC"),
    ("Prince Kandive the Golden", "PERSON"),
    ("Porrina", "PERSON"),
    ("Pubescentarium", "FAC"),
    ("Raider", "PERSON"),
    ("Regatta", "LOC"),
    ("Rogol Domedonfors'", "PERSON"),
    ("Sergeant-Reader of the Litany", "PERSON"),
    ("South", "LOC"),
    ("Temple", "FAC"),
    ("Magician", "PERSON"),
    ("Turjan of Miir", "PERSON"),
    ("T’sais", "PERSON"),
    ("ULAN", "PERSON"),
    ("Ulan Dhor", "PERSON"),
    ("Ulan Dhor's", "PERSON"),
    ("Uncle Ludowik's", "PERSON"),
    ("Uncle Ludowik", "PERSON"),
    ("earth", "LOC"),
    ("the Dance of the Fourteen Silken Movements", "EVENT"),
    ("the Tower  of Fate", "FAC"),
}

# Define a set of unwanted types: any entity carrying one of these labels is dropped.
unwanted_types = {'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'LAW', 'QUANTITY', 'BOOK_SONG', 'OTHER'}

# Exact (text, type) pairs that must be rewritten before filtering:
#   - shorten "Bay the Cape of Sad Remembrance" (type stays FAC)
#   - remove the stray leading quote from the Green Legion entry (type stays NORP)
#   - turn ("Olek", "PERSON") into ("Olek'hnit", "NORP")
entity_rewrites = {
    ("Bay the Cape of Sad Remembrance", "FAC"): ("Cape of Sad Remembrance", "FAC"),
    ('"The Green Legion of Valdaran the Just', 'NORP'): ("The Green Legion of Valdaran the Just", "NORP"),
    ("Olek", "PERSON"): ("Olek'hnit", "NORP"),
}

# Apply the rewrites, then keep only entities that are neither of an unwanted
# type nor on the unwanted list
updated_entities = []
for entity in corrected_entities:
    updated_entity = entity_rewrites.get(entity, entity)
    if updated_entity[1] in unwanted_types or updated_entity in unwanted_entities:
        continue
    updated_entities.append(updated_entity)

# The filtered list becomes the working `entities` collection
entities = updated_entities

In [None]:
# Collapse repeated (text, type) pairs; the resulting list has no defined order.
unique_entities = list(set(entities))

# Displaying the unique named entities (sorted by text, then type)
for ent_text, ent_type in sorted(unique_entities):
    print(f'Entity: ("{ent_text}", "{ent_type}")')

In [None]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Function to process a paragraph and return matching entities
def extract_matching_entities(paragraph):
    """Return the curated entities that Stanza detects in one paragraph.

    The labels in ``unique_entities`` were rewritten through ``correction_dict``
    (SPELL, CREATURE, ARTIFACT_OBJECT, corrected LOC/PERSON, ...). The same
    correction is therefore applied to the paragraph's entities before
    intersecting; comparing Stanza's raw labels against the corrected ones
    would silently drop every entity whose label had been corrected.

    Args:
        paragraph: Paragraph text to annotate.

    Returns:
        Set of ``(entity_text, entity_type)`` pairs present both in the
        paragraph (after label correction) and in ``unique_entities``.
    """
    doc = nlp(paragraph)
    paragraph_entities = set()
    for sent in doc.sentences:
        for ent in sent.ents:
            corrected_type = correct_entity_type(ent.text, correction_dict)
            # Fall back to Stanza's own label when the text is not in the curated lists
            paragraph_entities.add((ent.text, corrected_type or ent.type))
    # Return entities that are in both paragraph_entities and unique_entities
    return paragraph_entities.intersection(unique_entities)

# Apply the function to each paragraph and create a new column with the matched entities
de_df['Entities'] = de_df['Text'].apply(extract_matching_entities)

In [None]:
# Display the paragraph table, now including the 'Entities' column.
de_df

In [None]:
# Tabulate the curated entities and title-case their names so that case
# variants of the same name collapse into one row.
# NOTE(review): str.title() also capitalises the letter following an
# apostrophe (e.g. "T'sais" becomes "T'Sais") — confirm this is acceptable.
ent_df = pd.DataFrame(unique_entities, columns=["Entity_Name", "Entity_Type"]).assign(
    Entity_Name=lambda frame: frame["Entity_Name"].str.title()
)

# Remove duplicates
de_ent_df = ent_df.drop_duplicates().reset_index(drop=True)

In [None]:
# List the distinct entity types remaining in the cleaned entity table.
de_ent_df["Entity_Type"].unique().tolist()

In [None]:
# Display the cleaned entity table.
de_ent_df

In [None]:
# Most frequent single-word phrases in the book (key_phrase_extractor defaults to n=1).
de_key_phrase_df = key_phrase_extractor(book)

In [None]:
# Extract every double-quoted span of the book into a one-column DataFrame.
de_dialogue_df = dialogue_to_df(book)

In [None]:
# Display the extracted dialogue table.
de_dialogue_df

In [None]:
# Paragraph-only copy of the table (same rows, without the 'Entities' column).
de_df_para = de_df.drop(columns=['Entities'])

In [None]:
# Preview the first rows of the paragraph-only table.
de_df_para.head()

In [None]:
# Export every result table to ../../Resources/Cleaned/<name>.csv, in this order.
# NOTE(review): at this point de_df['Entities'] still holds Python sets (the
# conversion to lists happens in a later cell), so the CSV stores their set
# repr — confirm that is the intended on-disk format.
for frame, stem in (
    (de_df, "dying_earth1_paragraphs_entities"),
    (de_dialogue_df, "dying_earth1_dialogue"),
    (de_key_phrase_df, "dying_earth1_key_phrases"),
    (de_ent_df, "dying_earth1_entities"),
    (de_df_para, "dying_earth1_paragraphs"),
):
    df_to_csv(frame, stem)

In [None]:
# Convert each row's entity collection into a plain list.
de_df['Entities'] = de_df['Entities'].apply(list)


In [None]:
# Preview the first ten paragraphs with their entity lists.
de_df.head(10)

In [None]:
# Show the (rows, columns) size of the paragraph table.
de_df.shape

In [None]:
def filter_entities(entities_set):
    """Return the (name, label) pairs as a set, leaving out the pair ("Earth", "LOC").

    Args:
        entities_set: Iterable of ``(name, label)`` pairs.

    Returns:
        A new set containing every pair except ``("Earth", "LOC")``.
    """
    kept = set()
    for name, label in entities_set:
        if name == "Earth" and label == "LOC":
            continue
        kept.add((name, label))
    return kept


In [None]:
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
from itertools import combinations

# Assuming de_df is your DataFrame and already loaded with the 'Entities' column filled as per your function

# Define entity types and their corresponding colors
entity_types = ['PERSON', 'LOC', 'ARTIFACT_OBJECT', 'FAC', 'NORP', 'SPELL', 'CREATURE', 'EVENT']
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'orange', 'purple']
color_map = dict(zip(entity_types, colors))

# Create a network graph
G = nx.Graph()

# One pass per paragraph: register each entity as a node carrying its type,
# then connect every pair of entities that share the paragraph
# (("Earth", "LOC") is excluded by filter_entities).
for _, row in de_df.iterrows():
    # filter_entities returns a set; sorting makes node/edge insertion order
    # reproducible between kernel restarts.
    paragraph_entities = sorted(filter_entities(row['Entities']))
    for entity, entity_type in paragraph_entities:
        G.add_node(entity, type=entity_type)
    # Local name on purpose: do not overwrite the notebook-level `entities` list
    paragraph_names = [entity for entity, _ in paragraph_entities]
    for source, target in combinations(paragraph_names, 2):
        G.add_edge(source, target)

# Position the nodes with the Kamada-Kawai layout to bring outliers closer
pos = nx.kamada_kawai_layout(G)

# Edge trace: every edge contributes its two endpoints followed by a None
# separator in both coordinate lists
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line=dict(width=0.5, color='#888'),
)

# Node trace: coordinates, label and colour per node; types missing from
# color_map are drawn in grey
node_text = list(G.nodes())
node_x = [pos[name][0] for name in node_text]
node_y = [pos[name][1] for name in node_text]
node_color = [color_map.get(G.nodes[name]['type'], 'grey') for name in node_text]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(size=10, color=node_color, line_width=2),
)

# Create the figure. The title font size is set through title.font.size: the
# older `titlefont` layout attribute (used previously as `titlefont_size`) is
# deprecated and is rejected as an invalid property by newer Plotly releases.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(
            text='<br>Network graph of entities in "The Dying Earth"',
            font=dict(size=16),
        ),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        # Hide grid, zero line and tick labels: node positions carry no units
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1000,   # Set the width of the plot
        height=1000,  # Set the height of the plot
    ),
)

# Code to display the graph
fig.show()

In [None]:
# Explode the 'Entities' column into one row per (paragraph, entity) pair.
# explode() already yields NaN for rows whose collection is empty, so de_df is
# left untouched; overwriting de_df['Entities'] with NaN here would break
# re-running the earlier cells that iterate over that column.
exploded_df = de_df.explode('Entities')

# Drop rows with NaN in 'Entities' (paragraphs without any entity)
exploded_df = exploded_df.dropna(subset=['Entities'])

# Split the tuples in 'Entities' into two columns, 'Entity' and 'Type'.
# Naming the columns up front keeps this working when no rows are left.
entity_pairs = pd.DataFrame(
    exploded_df['Entities'].tolist(),
    columns=['Entity', 'Type'],
    index=exploded_df.index,
)
exploded_df[['Entity', 'Type']] = entity_pairs

# Drop the original 'Entities' column
exploded_df = exploded_df.drop('Entities', axis=1)

# Filter out rows where 'Entity' is 'Earth' and 'Type' is 'LOC'; copy so the
# .loc assignment below writes to an independent frame
is_earth_loc = (exploded_df['Entity'] == 'Earth') & (exploded_df['Type'] == 'LOC')
exploded_df = exploded_df[~is_earth_loc].copy()

# Force the type of "Land of the Falling Wall" to LOC
exploded_df.loc[exploded_df['Entity'] == "Land of the Falling Wall", 'Type'] = "LOC"



In [None]:
# Human-readable names for the entity type codes.
readable_format = {
    'PERSON': 'Person',
    'LOC': 'Location',
    'ARTIFACT_OBJECT': 'Artifact or Object',
    'FAC': 'Facility',
    'NORP': 'Nationality or Religious or Political group',
    'SPELL': 'Spell',
    'CREATURE': 'Creature',
    'EVENT': 'Event'
}

# Replace the codes with their readable names; codes missing from the mapping
# come back from map() as NaN and are restored to their original value by fillna().
exploded_df['Type'] = exploded_df['Type'].map(readable_format).fillna(exploded_df['Type'])


In [None]:
# Export the per-paragraph entity table to ../../Resources/Cleaned/entities_graph_clean.csv.
df_to_csv(exploded_df, "entities_graph_clean")

In [None]:
# Display the final per-paragraph entity table.
exploded_df
