In [79]:
# Imports
import spacy
import pandas as pd
import re
import stanza
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from collections import Counter
from textblob import TextBlob

tqdm.pandas()


In [80]:
# Global Variables

books_songs =  [
    "Attractive and Detractive Hyperordnets",
    "the Lost Book of Kells",
    "The Opal, the Pearl and the Peacock",
    "'The Opal, the Pearl and the Peacock",
    "Demonlands",
    "Killings and Mortefactions",
    "Attractive and Detractive Hyperordnets",
    "Procedural Suggestions in Time of Risk",
    "the Tomes of Kae",
    "the Word of Pansiu"
]

artifacts_objects = [
    "Live Boots",
    "the Live Boots",
    "Cyclopedia",
    "the Expansible Egg",
    "Chair of Knowledge",
    "Scintillant Dagger",
    "Boots",
    "Mechanismus",
    "Rune",
    "Egg",
    "Sphere"
]

spells = [
    "the Omnipotent Sphere",
    "the Call to the Violent Cloud",
    "the Excellent Prismatic Spray",
    "Mantle of Stealth",
    "the Spell of the Slow Hour",
    "Four Directions",
    "Second Hypnotic Spell",
    "The Charm of Untiring Nourishment",
    "Critique of the Chill",
    "Gyrator",
    "Lumen",
    "the Call to the Violent Cloud",
    "the Spell of the Omnipotent Sphere"
]

characters = [
    "Pansiu's",
    "Guyal",
    "Kandive",
    "Kandive the Golden",
    "Guyal of Sfere", 
    "Liane the Wayfarer", 
    "Mazirian", 
    "Turjan", 
    "T'sais", 
    "Ulan Dhor", 
    "Elai", 
    "Etarr", 
    "Prince Kandive", 
    "Pandelume", 
    "Rogol Domedonfors", 
    "Shierl", 
    "T'sain",
    "Cazdal",
    "Javanne",
    "Kerlin",
    "the Lake Lord",
    "the Arch-Necromancer Phandaal",
    "Pansiu",
    "Melantine",
    "Voyevode",
    "Kandive the Golden",
    "Blikdak",
    "Laccodel",
    "Mad King Shin",
    "Lycurgat",
    "Saponid"
]

locations = [
    "Ampridatvir",
    "Erze Damath",
    "Kaiin",
    "Sanctuary of the Pelerines",
    "Ascolais",
    "The Scaum Valley",
    "The Forest of Tantrevalles",
    "Ruins of Old Romarth",
    "The Cleft of the Earth",
    "Overworld",
    "Azenomei",
    "Ulan Dhor",
    "Almery",
    "Embelyon",
    "the Land of the Falling Wall",
    "Sfere",
    "Thamber",
    "Kaiin",
    "Miir",
    "Ascolais",
    "Efred",
    "Jeldred",
    "Saponce",
    "Maurenron Range",
    "Porphiron Scar",
    "Omona Gap",
    "East Almery",
    "Bautiku",
    "Tenebrosa",
    "Kalu",
    "Fauvune",
    "Cansapara",
    "South Almery",
    "Ariventa",
    "Sanreale",
    "Tanvilkat",
    "the Old Town",
    "Ampridatvir",
    "Mel-Palusas",
    "Fer Aquila",
    "Carchasel",
    "Derna",
    "Regatta",
    "Carchesel",
    "Scaum",
    "Liane",
    "Thorsingol",
    "Peilvemchal Torrent",
    "the Porphiron Scar",
    "the River Scaum",
    "the Ide of Kauchique",
    "the Cape of Sad Remembrance",
    "Thamber Meadow",
    "the Lake of Dreams",
    "G'Vasan",
    "Melantine"
]

facilities = [
    "Mansion of Chun the Unavoidable",
    "the Place of Whispers",
    "the Tower of Fate",
    "the Tower of the Screaming Ghost",
    "the Tower of Trumpets",
    "the Museum of Man",
    "the Cognative Repository",
    "Temple",
    "Caseboard",
    "Museum of Man"
]

events = [
    "the Black Sabbath",
    "the Dance of the Fourteen Silken Movements",
    "Dawn"
]

norps = [
    "the Signs of the Aumoklopelastianic Cabal",
    "Ghost-takers",
    "Norns",
    "Gaun",
    "The Green Legion of Valdaran the Just",
    "the Grays of Ampridatvir",
    "Saponids",
    "Saponid",
    "the Saponids of Saponce",
    "Ampridatvians",
    "Grays",
    "Raiders",
    "the Green Legion",
    "Green Legion",
    "the Forty Kades",
    "the Sherit Empire",
    "Merioneth",
    "the Gray Sorcerers"
]

creatures = [
    "Deodand",
    "Vile Green Demon",
    "Thrang",
    "Deodands"
    
]

other = [
    "Poh",
    "Mark",
    "Green",
    "Lethargy",
    "Golden",
    "Aye",
    "Pulchritude",
    "the Mechanismus sixty",
    "The Curator guards the Museum of Man",
    "Curator or Museum",
    "Gap",
    "Wayfarer"
]

correction_dict = {
    "BOOK_SONG": books_songs,
    "ARTIFACT_OBJECT": artifacts_objects,
    "SPELL": spells,
    "PERSON": characters,
    "LOC": locations,
    "FAC": facilities,
    "EVENT": events,
    "NORP": norps,
    "CREATURE": creatures,
    "OTHER": other
}

In [81]:
# Functions

def open_book(filename):
    with open("../../Resources/Cleaned/"+filename+".txt", 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def add_book_to_df(book, book_title):
    # Split the book text into paragraphs
    paragraphs = book.split('\n')
    
    # Clean each paragraph by removing extra whitespace and trimming
    paragraphs = [re.sub(r'\s+', ' ', para.strip()) for para in paragraphs if para.strip()]

    # Create a DataFrame with two columns: book title and the paragraph text
    df = pd.DataFrame({'Title': [book_title] * len(paragraphs), 'Text': paragraphs})
    return df

def correct_entity_type(entity_text, correction_dict):
    # Normalize the entity text (lowercase, remove extra spaces, handle special chars)
    entity_text_normalized = re.sub(r'\s+', ' ', entity_text).lower().strip(" '\"")

    for category, names in correction_dict.items():
        # Normalize and prepare the names in the dictionary
        normalized_names = [re.sub(r'\s+', ' ', name).lower().strip(" '\"") for name in names]
        
        if entity_text_normalized in normalized_names:
            return category
    return None

def find_entities_in_paragraph(paragraph, entities):
    entities_in_paragraph = set()
    for ent_text, ent_type in entities:
        if ent_text in paragraph:
            entities_in_paragraph.add((ent_text, ent_type))
    return list(entities_in_paragraph)

def dialogue_to_df(text):
    pattern = r'"([^"]*)"'
    dialogues = re.findall(pattern, text)
    df_dialogues = pd.DataFrame(dialogues, columns=['Dialogue'])
    return df_dialogues

def key_phrase_extractor(text, n=1):
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    # Tokenize the text into words, remove punctuation with regex
    words = word_tokenize(re.sub(r'[^\w\s]', '', text))

    # Remove stop words and convert to lowercase
    words_without_stopwords = [word.lower() for word in words if word.lower() not in custom_stopwords]

    # Generate n-grams
    n_grams = ngrams(words_without_stopwords, n)
    n_grams = [' '.join(grams) for grams in n_grams]

    # Count the frequency of each n-gram
    frequency = Counter(n_grams)

    # Get the top N key phrases
    N = 100
    key_phrases = frequency.most_common(N)

    # Create a DataFrame from the top key phrases
    df = pd.DataFrame(key_phrases, columns=['phrase', 'count'])

    return df

def is_character(entity):
    character_types = {'PERSON'}
    return entity[1] in character_types

def is_location(entity):
    location_types = {'LOC'}
    return entity[1] in location_types


def df_to_csv(df, filename):
    df.to_csv("../../Resources/Cleaned/"+filename+".csv", index=False)

In [82]:
book = open_book("dying_earth1_cleaned")

In [83]:
de_df = add_book_to_df(book, "The Dying Earth")

In [84]:
de_df 

Unnamed: 0,Title,Text
0,The Dying Earth,"TURJAN SAT in his workroom, legs sprawled out ..."
1,The Dying Earth,It was a thing to arouse pity—a great head on ...
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l..."
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl..."
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...
...,...,...
959,The Dying Earth,Guyal and Shierl climbed to the upper ways and...
960,The Dying Earth,Across the plain the yellow lights of Saponce ...
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the..."
962,The Dying Earth,"""Knowledge is ours, Shierl—all of knowing to o..."


In [85]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-13 06:15:09 INFO: Downloading default packages for language: en (English) ...
2023-11-13 06:15:10 INFO: File exists: C:\Users\dontb\stanza_resources\en\default.zip
2023-11-13 06:15:14 INFO: Finished downloading models and saved to C:\Users\dontb\stanza_resources.


In [86]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp(book)

2023-11-13 06:15:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-13 06:15:15 INFO: Loading these models for language: en (English):
| Processor | Package          |
--------------------------------
| tokenize  | combined         |
| ner       | ontonotes_charlm |

2023-11-13 06:15:15 INFO: Using device: cpu
2023-11-13 06:15:15 INFO: Loading: tokenize
2023-11-13 06:15:15 INFO: Loading: ner
2023-11-13 06:15:16 INFO: Done loading processors!


In [87]:
entities = set()
for sentence in doc.sentences:
    for entity in sentence.ents:
        # Add entity text to the set, this automatically removes duplicates
        entities.add(entity.text)

# Displaying the comprehensive list of entities
for entity in sorted(entities):
    print(entity)

"The Green Legion of Valdaran the Just
'The Opal, the Pearl and the Peacock
1
1,200 years
A bitter night
A few days later
A few minutes later
A hundred
A million
Aghast
Almery
Ameth
Ampridatvians
Ampridatvir
Ampridatvirian
An inch
Another hour
Ariventa
Ascolais
Attractive and Detractive Hyperordnets
Aye
Azvan
Azvan the Astronomer
Bautiku
Bay the Cape of Sad Remembrance
Blikdak
Boots
Cansapara
Carchasel
Carchesel
Caseboard
Castellan
Castellan's
Castle Miir
Cazdal
Cazdal's Temple
Chair of Knowledge
Chun
Clambs
Cobalt Mountain
Critique of the Chill
Curator
Curator or Museum
Cyclopedia
Daily
Dandanflores
Datul Omaet
Dawn
Delaphasians
Dellare
Demonlands
Deodand
Deodands
Derna
Dhor
Dusty
Earth
East
East Almery
Efred
Egg
Eight
Elai
Elai's
Embelyon
Embelyon night
Etarr
Etarr the Masked
Ethodea
Falling Wall
Fauvune
Felojun
Felon
Fer Aquila
Fifty
Fifty years
First
Five thousand years
Florejin
Floriel
Four
Four Directions
G'Vasan
Gap
Gaun
Gauns
Ghost-takers
Golden
Golickan Kodek
Golickan Kodek th

In [88]:
corrected_entities = []

for ent in doc.entities:
    corrected_type = correct_entity_type(ent.text, correction_dict)
    
    # Use the corrected type if available; otherwise, use the original type
    
    ent_type = corrected_type if corrected_type else ent.type
    corrected_entities.append((ent.text, ent_type))

In [112]:
# Define a set of unwanted entities
unwanted_entities = {
    ("TURJAN", "PERSON"),
    ("Aghast", "PERSON"),
    ("Ampridatvirian", "NORP"),
    ("Azvan the Astronomer", "PERSON"),
    ("Boots", "ARTIFACT_OBJECT"),
    ("Carchasel", "LOC"),
    ("Caseboard", "FAC"),
    ("Castellan", "NORP"),
    ("Castellan", "PERSON"),
    ("Castellan's", "PERSON"),
    ("Dandanflores", "NORP"),
    ("Dawn", "EVENT"),
    ("Deodands", "CREATURE"),
    ("Dhor", "PERSON"),
    ("Dusty", "PERSON"),
    ("East", "LOC"),
    ("Egg", "ARTIFACT_OBJECT"),
    ("Elai's", "PERSON"),
    ("Etarr the Masked", "PERSON"),
    ("Ethodea", "NORP"),
    ("Falling Wall", "LOC"),
    ("Felon", "PERSON"),
    ("Four Directions", "SPELL"),
    ("Gaun", "NORP"),
    ("Gauns", "PERSON"),
    ("Golickan Kodek the Conqueror", "PERSON"),
    ("Gray", "PERSON"),
    ("Guyal of Sfere", "PERSON"),
    ("Gyrator", "SPELL"),
    ("Hideous", "PERSON"),
    ("I am Chun the Unavoidable", "PERSON"),
    ("Kandive the Golden", "PERSON"),
    ("Kerlin the Curator", "PERSON"),
    ("Kerlin's", "PERSON"),
    ("Liane the Wayfarer", "PERSON"),
    ("MAGICIAN", "PERSON"),
    ("MAZIRIAN", "PERSON"),
    ("Mazirian the Magician", "PERSON"),
    ("Moon", "LOC"),
    ("this Temple of Pansiu", "FAC"),
    ("Prince Kandive the Golden", "PERSON"),
    ("Porrina", "PERSON"),
    ("Pubescentarium", "FAC"),
    ("Raider", "PERSON"),
    ("Regatta", "LOC"),
    ("Rogol Domedonfors'", "PERSON"),
    ("Sergeant-Reader of the Litany", "PERSON"),
    ("South", "LOC"),
    ("Temple", "FAC"),
    ("Magician", "PERSON"),
    ("Turjan of Miir", "PERSON"),
    ("T’sais", "PERSON"),
    ("ULAN", "PERSON"),
    ("Ulan Dhor", "PERSON"),
    ("Ulan Dhor's", "PERSON"),
    ("Uncle Ludowik's", "PERSON"),
    ("Uncle Ludowik", "PERSON"),
    ("earth", "LOC"),
    ("the Dance of the Fourteen Silken Movements", "EVENT"),
    ("the Tower  of Fate", "FAC"),
}

# Define a set of unwanted types
unwanted_types = {'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'LAW', 'QUANTITY', 'BOOK_SONG', 'OTHER'}

# Initialize the list for the updated entities
updated_entities = []

# Iterate over corrected_entities to rename, correct format, and filter
for entity in corrected_entities:
    entity_text, entity_type = entity

    # Check for the specific entity "Bay the Cape of Sad Remembrance" to rename
    if entity == ("Bay the Cape of Sad Remembrance", "FAC"):
        entity_text = "Cape of Sad Remembrance"

    # Correct the format for the entity '"The Green Legion of Valdaran the Just'
    if entity == ('"The Green Legion of Valdaran the Just', 'NORP'):
        entity_text = "The Green Legion of Valdaran the Just"

    # Change ("Olek", "PERSON") to ("Olek'hnit", "NORP")
    if entity == ("Olek", "PERSON"):
        entity_text = "Olek'hnit"
        entity_type = "NORP"

    # Create a new tuple with the possibly updated text and type
    updated_entity = (entity_text, entity_type)

    # Add the entity to the list if it's not unwanted
    if updated_entity[1] not in unwanted_types and updated_entity not in unwanted_entities:
        updated_entities.append(updated_entity)

# Now updated_entities contains your required entities
entities = updated_entities

In [113]:
unique_entities = list(set(entities))

# Displaying the unique named entities
for ent_text, ent_type in sorted(unique_entities):
    print(f'Entity: ("{ent_text}", "{ent_type}")')

Entity: ("Almery", "LOC")
Entity: ("Ameth", "PERSON")
Entity: ("Ampridatvians", "NORP")
Entity: ("Ampridatvir", "LOC")
Entity: ("Ariventa", "LOC")
Entity: ("Ascolais", "LOC")
Entity: ("Azvan", "PERSON")
Entity: ("Bautiku", "LOC")
Entity: ("Blikdak", "PERSON")
Entity: ("Cansapara", "LOC")
Entity: ("Cape of Sad Remembrance", "FAC")
Entity: ("Carchesel", "LOC")
Entity: ("Castle Miir", "LOC")
Entity: ("Cazdal", "PERSON")
Entity: ("Cazdal's Temple", "FAC")
Entity: ("Chair of Knowledge", "ARTIFACT_OBJECT")
Entity: ("Chun", "PERSON")
Entity: ("Clambs", "NORP")
Entity: ("Cobalt Mountain", "LOC")
Entity: ("Critique of the Chill", "SPELL")
Entity: ("Curator", "PERSON")
Entity: ("Cyclopedia", "ARTIFACT_OBJECT")
Entity: ("Datul Omaet", "PERSON")
Entity: ("Delaphasians", "NORP")
Entity: ("Dellare", "PERSON")
Entity: ("Deodand", "CREATURE")
Entity: ("Derna", "LOC")
Entity: ("Earth", "LOC")
Entity: ("East Almery", "LOC")
Entity: ("Efred", "LOC")
Entity: ("Elai", "PERSON")
Entity: ("Embelyon", "LOC")


In [114]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Function to process a paragraph and return matching entities
def extract_matching_entities(paragraph):
    doc = nlp(paragraph)
    paragraph_entities = set((ent.text, ent.type) for sent in doc.sentences for ent in sent.ents)
    # Return entities that are in both paragraph_entities and unique_entities
    return paragraph_entities.intersection(unique_entities)

# Apply the function to each paragraph and create a new column with the matched entities
de_df['Entities'] = de_df['Text'].apply(extract_matching_entities)

2023-11-13 06:29:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-13 06:29:14 INFO: Loading these models for language: en (English):
| Processor | Package          |
--------------------------------
| tokenize  | combined         |
| ner       | ontonotes_charlm |

2023-11-13 06:29:14 INFO: Using device: cpu
2023-11-13 06:29:14 INFO: Loading: tokenize
2023-11-13 06:29:14 INFO: Loading: ner
2023-11-13 06:29:15 INFO: Done loading processors!


In [115]:
de_df

Unnamed: 0,Title,Text,Entities
0,The Dying Earth,"TURJAN SAT in his workroom, legs sprawled out ...",{}
1,The Dying Earth,It was a thing to arouse pity—a great head on ...,{}
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l...","{(Turjan, PERSON)}"
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl...","{(Turjan, PERSON)}"
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...,"{(Turjan, PERSON)}"
...,...,...,...
959,The Dying Earth,Guyal and Shierl climbed to the upper ways and...,"{(Guyal, PERSON), (Shierl, PERSON)}"
960,The Dying Earth,Across the plain the yellow lights of Saponce ...,{}
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the...","{(Guyal, PERSON), (Clambs, NORP), (the Gray So..."
962,The Dying Earth,"""Knowledge is ours, Shierl—all of knowing to o...","{(Shierl, PERSON)}"


In [116]:
ent_df = pd.DataFrame(unique_entities, columns=["Entity_Name", "Entity_Type"])
ent_df['Entity_Name'] = ent_df['Entity_Name'].str.title()

# Remove duplicates
de_ent_df = ent_df.drop_duplicates().reset_index(drop=True)

In [117]:
de_ent_df["Entity_Type"].unique().tolist()

['PERSON',
 'LOC',
 'ARTIFACT_OBJECT',
 'FAC',
 'NORP',
 'SPELL',
 'CREATURE',
 'EVENT']

In [118]:
de_ent_df

Unnamed: 0,Entity_Name,Entity_Type
0,Mad King Shin,PERSON
1,Carchesel,LOC
2,The Cape Of Sad Remembrance,LOC
3,Peilvemchal Torrent,LOC
4,Thamber Meadow,LOC
...,...,...
179,Modavna Moor,LOC
180,The Spell Of The Slow Hour,SPELL
181,Mazirian,PERSON
182,The Green Legion Of Valdaran The Just,NORP


In [119]:
de_key_phrase_df = key_phrase_extractor(book)

In [120]:
de_dialogue_df = dialogue_to_df(book)

In [121]:
de_dialogue_df

Unnamed: 0,Dialogue
0,"In ages gone,"
1,a thousand spells were known to sorcery and th...
2,Where is this Pandelume?
3,"He dwells in the land of Embelyon,"
4,"but where this land lies, no one knows."
...,...
1295,How do I extract from the banks?
1296,"The key to the index is in my chambers, the in..."
1297,There is your home; there is Saponce. Do you w...
1298,"Guyal, leaning back on the weathered pillar, ..."


In [122]:
de_df_para = de_df.drop(columns=['Entities'])

In [123]:
de_df_para.head()

Unnamed: 0,Title,Text
0,The Dying Earth,"TURJAN SAT in his workroom, legs sprawled out ..."
1,The Dying Earth,It was a thing to arouse pity—a great head on ...
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l..."
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl..."
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...


In [124]:
df_to_csv(de_df, "dying_earth1_paragraphs_entities")
df_to_csv(de_dialogue_df, "dying_earth1_dialogue")
df_to_csv(de_key_phrase_df, "dying_earth1_key_phrases")
df_to_csv(de_ent_df, "dying_earth1_entities")
df_to_csv(de_df_para, "dying_earth1_paragraphs")

In [125]:
de_df['Entities'] = de_df['Entities'].apply(lambda x: list(x))


In [126]:
de_df.head(10)

Unnamed: 0,Title,Text,Entities
0,The Dying Earth,"TURJAN SAT in his workroom, legs sprawled out ...",[]
1,The Dying Earth,It was a thing to arouse pity—a great head on ...,[]
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l...","[(Turjan, PERSON)]"
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl...","[(Turjan, PERSON)]"
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...,"[(Turjan, PERSON)]"
5,The Dying Earth,In the west the sun hung close to old earth; r...,"[(Turjan, PERSON)]"
6,The Dying Earth,He considered its many precursors: the thing a...,"[(Turjan, PERSON)]"
7,The Dying Earth,"As he sat gazing across the darkening land, me...","[(Turjan, PERSON), (Sage, PERSON)]"
8,The Dying Earth,"""In ages gone,"" the Sage had said, his eyes fi...","[(Sage, PERSON), (Pandelume, PERSON), (Earth, ..."
9,The Dying Earth,"""Where is this Pandelume?"" Turjan had asked pr...","[(Turjan, PERSON), (Sage, PERSON)]"


In [127]:
de_df.shape

(964, 3)

In [128]:
def filter_entities(entities_set):
    return {(entity, type) for entity, type in entities_set if not (entity == "Earth" and type == "LOC")}


In [129]:
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
from itertools import combinations

# Assuming de_df is your DataFrame and already loaded with the 'Entities' column filled as per your function

# Define entity types and their corresponding colors
entity_types = ['PERSON', 'LOC', 'ARTIFACT_OBJECT', 'FAC', 'NORP', 'SPELL', 'CREATURE', 'EVENT']
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'orange', 'purple']
color_map = dict(zip(entity_types, colors))

# Create a network graph
G = nx.Graph()

# Add nodes with their type
for _, row in de_df.iterrows():
    filtered_entities = filter_entities(row['Entities'])
    for entity, entity_type in filtered_entities:
        G.add_node(entity, type=entity_type)

# Add edges (for simplicity, connecting all entities within the same text, except ("Earth", "LOC"))
for _, row in de_df.iterrows():
    filtered_entities = filter_entities(row['Entities'])
    entities = [entity for entity, _ in filtered_entities]
    for source, target in combinations(entities, 2):
        G.add_edge(source, target)

# Position the nodes using a layout to bring outliers closer
pos = nx.kamada_kawai_layout(G)

# Prepare plotly graph
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_color = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_color.append(color_map.get(G.nodes[node]['type'], 'grey'))
    node_text.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        size=10,
        color=node_color,
        line_width=2))

# Create layout for the graph
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Network graph of entities in "The Dying Earth"',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
fig.update_layout(
    width=1000,  # Set the width of the plot
    height=1000)

# Code to display the graph
fig.show()

In [130]:
# Replace empty lists in 'Entities' with NaN, so they are handled properly when exploding
de_df['Entities'] = de_df['Entities'].apply(lambda x: x if x else float('nan'))

# Explode the 'Entities' column
exploded_df = de_df.explode('Entities')

# Drop rows with NaN in 'Entities' (which were originally empty lists)
exploded_df = exploded_df.dropna(subset=['Entities'])

# Split the tuples in 'Entities' into two columns, 'Entity' and 'Type'
exploded_df[['Entity', 'Type']] = pd.DataFrame(exploded_df['Entities'].tolist(), index=exploded_df.index)

# Drop the original 'Entities' column
exploded_df = exploded_df.drop('Entities', axis=1)

# Filter out rows where 'Entity' is 'Earth' and 'Type' is 'LOC'
exploded_df = exploded_df[~((exploded_df['Entity'] == 'Earth') & (exploded_df['Type'] == 'LOC'))]

exploded_df.loc[exploded_df['Entity'] == "Land of the Falling Wall", 'Type'] = "LOC"



In [131]:
readable_format = {
    'PERSON': 'Person',
    'LOC': 'Location',
    'ARTIFACT_OBJECT': 'Artifact or Object',
    'FAC': 'Facility',
    'NORP': 'Nationality or Religious or Political group',
    'SPELL': 'Spell',
    'CREATURE': 'Creature',
    'EVENT': 'Event'
}

exploded_df['Type'] = exploded_df['Type'].map(readable_format).fillna(exploded_df['Type'])


In [132]:
df_to_csv(exploded_df, "entities_graph_clean")

In [133]:
exploded_df


Unnamed: 0,Title,Text,Entity,Type
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l...",Turjan,Person
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl...",Turjan,Person
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...,Turjan,Person
5,The Dying Earth,In the west the sun hung close to old earth; r...,Turjan,Person
6,The Dying Earth,He considered its many precursors: the thing a...,Turjan,Person
...,...,...,...,...
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the...",the Gray Sorcerers,Person
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the...",Shierl,Person
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the...",Golwan Andra,Person
961,The Dying Earth,"Guyal said to Shierl, ""There is your home; the...",Pharials,Nationality or Religious or Political group
