In [39]:
import spacy
import pandas as pd
import re
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS


In [40]:
# Load the spaCy pipeline named "en_core_web_sm"; later cells call it as `nlp(...)`.
nlp = spacy.load("en_core_web_sm")


In [None]:
def open_book(filename):
    """Return the full contents of a cleaned book file as one string.

    Args:
        filename: File stem (no directory, no extension). The file read is
            ``../../Resources/Cleaned/<filename>.txt``, resolved relative to
            the current working directory and decoded as UTF-8.

    Returns:
        The text of the file.
    """
    path = "../../Resources/Cleaned/" + filename + ".txt"
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()

def add_book_to_df(book, book_title):
    """Split a book into paragraphs and return them as a DataFrame.

    Paragraphs are the pieces obtained by splitting ``book`` on a blank line
    (``'\\n\\n'``). Pieces that are empty after stripping are dropped, and every
    run of whitespace inside a kept paragraph is collapsed to a single space.

    Args:
        book: Full text of the book.
        book_title: Value repeated in the ``Title`` column for every row.

    Returns:
        A DataFrame with columns ``Title`` and ``Text``, one row per paragraph.
    """
    cleaned = []
    for chunk in book.split('\n\n'):
        stripped = chunk.strip()
        if stripped:
            cleaned.append(re.sub(r'\s+', ' ', stripped))
    titles = [book_title] * len(cleaned)
    return pd.DataFrame({'Title': titles, 'Text': cleaned})

def correct_entity_type(entity_text, entity_type, correction_dict):
    """Return the corrected category for an entity, or its original type.

    Both the entity text and every name in ``correction_dict`` are compared in
    a canonical form: whitespace runs collapsed to one space, lower-cased, and
    leading/trailing spaces and quote characters removed.

    Args:
        entity_text: Entity text to look up.
        entity_type: Type returned when no category lists the entity.
        correction_dict: Mapping of category -> iterable of entity names.

    Returns:
        The first category (in dict order) whose names contain the entity,
        otherwise ``entity_type``.
    """
    def _canonical(raw):
        collapsed = re.sub(r'\s+', ' ', raw)
        return collapsed.lower().strip(" '\"")

    target = _canonical(entity_text)
    for category, names in correction_dict.items():
        candidates = list(map(_canonical, names))
        if target in candidates:
            return category
    return entity_type

def is_unwanted_entity(entity_text, entity_type):
    """Tell whether an entity should be discarded.

    An entity is unwanted when its exact ``(text, type)`` pair is in the
    module-level ``unwanted_entities`` collection, or when its type is in the
    module-level ``unwanted_types`` collection.

    NOTE(review): neither ``unwanted_entities`` nor ``unwanted_types`` is
    defined in the visible notebook cells -- confirm they exist before use.
    """
    if (entity_text, entity_type) in unwanted_entities:
        return True
    return entity_type in unwanted_types

# Read and Process Book
# "de" is a file stem: open_book reads ../../Resources/Cleaned/de.txt.
text = open_book("de")
# One row per paragraph, each tagged with the book title.
df = add_book_to_df(text, "The Dying Earth")

# NLP Processing
def extract_entities(paragraph):
    """Return the set of ``(entity text, entity type)`` pairs in a paragraph.

    The paragraph is run through the module-level ``nlp`` pipeline. Each
    recognized entity has its label passed through ``correct_entity_type``
    (using the module-level ``correction_dict``) and is skipped when
    ``is_unwanted_entity`` reports it as unwanted.

    NOTE(review): ``correction_dict`` is not defined in the visible notebook
    cells -- confirm it exists before this function is called.
    """
    kept = set()
    for span in nlp(paragraph).ents:
        label = correct_entity_type(span.text, span.label_, correction_dict)
        if is_unwanted_entity(span.text, label):
            continue
        kept.add((span.text, label))
    return kept

# Attach to each paragraph the set of (entity text, entity type) pairs
# returned by extract_entities; this adds an 'Entities' column to df in place.
df['Entities'] = df['Text'].apply(extract_entities)

In [27]:
# Functions

def open_book(filename):
    """Read ``../../Resources/Cleaned/<filename>.txt`` and return its text.

    The path is relative to the current working directory and the file is
    decoded as UTF-8. ``filename`` is the bare stem, without extension.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell ran last is the one in effect.
    """
    with open("../../Resources/Cleaned/"+filename+".txt", 'r', encoding='utf-8') as book_file:
        return book_file.read()

def add_book_to_df(book, book_title):
    """Build a paragraph-level DataFrame for one book.

    ``book`` is split on ``'\\n\\n'``; pieces that are blank after stripping
    are discarded and whitespace runs inside the remaining pieces are reduced
    to single spaces.

    Returns a DataFrame with a ``Title`` column (``book_title`` repeated) and
    a ``Text`` column (one cleaned paragraph per row).
    """
    # Strip first so the blank-paragraph check and the cleanup share one pass.
    stripped_parts = (part.strip() for part in book.split('\n\n'))
    paragraphs = [re.sub(r'\s+', ' ', part) for part in stripped_parts if part]

    return pd.DataFrame({'Title': [book_title] * len(paragraphs), 'Text': paragraphs})

def correct_entity_type(entity_text, correction_dict):
    """Look up the category that lists ``entity_text``, if any.

    Comparison is done on a canonical form of the text: whitespace runs
    collapsed to one space, lower-cased, and surrounding spaces and quote
    characters stripped. The same form is applied to the names in
    ``correction_dict`` (category -> iterable of names).

    Returns the first matching category in dict order, or ``None`` when no
    category lists the entity.

    NOTE(review): other cells in this notebook define a three-argument
    ``correct_entity_type`` that falls back to the passed-in type instead of
    ``None``; whichever cell ran last is the one in effect.
    """
    def _canonical(raw):
        return re.sub(r'\s+', ' ', raw).lower().strip(" '\"")

    wanted = _canonical(entity_text)
    for category, names in correction_dict.items():
        if wanted in [_canonical(name) for name in names]:
            return category
    return None

def find_entities_in_paragraph(paragraph, entities):
    """Return the entities whose text occurs in ``paragraph``.

    Args:
        paragraph: Text to search (plain substring match, case-sensitive).
        entities: Iterable of ``(entity text, entity type)`` pairs.

    Returns:
        A list of the distinct matching pairs. The list is built from a set,
        so its order is not meaningful.
    """
    matches = {
        (name, label)
        for name, label in entities
        if name in paragraph
    }
    return list(matches)

def dialogue_to_df(text):
    """Collect every double-quoted span in ``text`` into a DataFrame.

    A span is whatever sits between a pair of straight double quotes (``"``),
    quotes excluded; empty spans are kept.

    Returns:
        A DataFrame with a single ``Dialogue`` column, one row per span, in
        order of appearance.
    """
    quoted_spans = re.compile(r'"([^"]*)"').findall(text)
    return pd.DataFrame(quoted_spans, columns=['Dialogue'])

def key_phrase_extractor(text, n=1, top_n=100):
    """Return the most frequent n-grams of ``text`` as a DataFrame.

    Punctuation is removed, the text is tokenized, tokens are lower-cased and
    filtered against English stop words plus a few custom ones, and the
    remaining tokens are grouped into n-grams that are then counted.

    NOTE(review): ``stopwords``, ``word_tokenize`` and ``ngrams`` are not
    imported in the visible notebook cells. They look like NLTK helpers
    (``nltk.corpus.stopwords``, ``nltk.tokenize.word_tokenize``,
    ``nltk.util.ngrams``) -- confirm they are imported before calling this.

    Args:
        text: Text to analyze.
        n: Size of the n-grams (1 = single words).
        top_n: How many of the most frequent phrases to return. Defaults to
            100, the value that used to be hard-coded; ``None`` returns all.

    Returns:
        A DataFrame with columns ``phrase`` and ``count``, most frequent first.
    """
    # NOTE(review): punctuation (apostrophes included) is stripped before
    # tokenizing, so the apostrophe-bearing entries below probably never
    # match a token -- verify against the tokenizer's output.
    additional_stopwords = {'said', "'s", "n't", "'m", "'re", "'ve", "'ll", "'d"}
    custom_stopwords = set(stopwords.words('english')).union(additional_stopwords)

    # Tokenize the text into words, remove punctuation with regex
    words = word_tokenize(re.sub(r'[^\w\s]', '', text))

    # Lower-case once per token, then drop stop words
    lowered = (word.lower() for word in words)
    words_without_stopwords = [word for word in lowered if word not in custom_stopwords]

    # Generate n-grams and count how often each one occurs
    frequency = Counter(' '.join(grams) for grams in ngrams(words_without_stopwords, n))

    # Keep only the requested number of most frequent phrases
    key_phrases = frequency.most_common(top_n)

    return pd.DataFrame(key_phrases, columns=['phrase', 'count'])

def is_character(entity):
    """Return True when the entity's type (``entity[1]``) is ``'PERSON'``.

    ``entity`` is expected to be indexable, with the type at position 1,
    e.g. an ``(entity text, entity type)`` pair.
    """
    label = entity[1]
    return label in {'PERSON'}

def is_location(entity):
    """Return True when the entity's type (``entity[1]``) is ``'LOC'``.

    ``entity`` is expected to be indexable, with the type at position 1,
    e.g. an ``(entity text, entity type)`` pair.
    """
    label = entity[1]
    return label in {'LOC'}


def df_to_csv(df, filename):
    """Write ``df`` to ``../../Resources/Cleaned/<filename>.csv``.

    The path is relative to the current working directory; the index is not
    written. ``filename`` is the bare stem, without extension.
    """
    destination = "../../Resources/Cleaned/"+filename+".csv"
    df.to_csv(destination, index=False)

In [28]:
# Read the book text; "de" is the file stem, i.e. ../../Resources/Cleaned/de.txt.
text = open_book("de")

In [29]:
# One row per paragraph of the book, each tagged with the title.
df = add_book_to_df(text, "The Dying Earth")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Title,Text
0,The Dying Earth,"Turjan sat in his workroom, legs sprawled out ..."
1,The Dying Earth,It was a thing to arouse pity—a great head on ...
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l..."
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl..."
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...
...,...,...
844,The Dying Earth,"Kerlin fell back against the wall. ""I expire; ..."
845,The Dying Earth,Guyal and Shierl climbed to the upper ways and...
846,The Dying Earth,Across the plain the yellow lights of Saponce ...
847,The Dying Earth,"Guyal said to Shierl, ""There is your home; the..."


In [42]:
# Load the spaCy pipeline used for all NLP processing in this cell
nlp = spacy.load("en_core_web_sm")

# Open the book and convert it into a paragraph-per-row DataFrame
text = open_book("de")  # "de" is a file stem; open_book reads ../../Resources/Cleaned/de.txt
df = add_book_to_df(text, "The Dying Earth")



def clean_text(text):
    """Return ``text`` reduced to the lemmas of its content tokens.

    Whitespace is collapsed and trimmed, the result is run through the
    module-level ``nlp`` pipeline, and tokens that are punctuation or whose
    lower-cased text is a stop word are dropped. The lemmas of the remaining
    tokens are joined with single spaces.

    Side effect: ``"said"`` is added to ``nlp.Defaults.stop_words`` and stays
    there after the call.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()

    # Extend the pipeline's default stop words before processing the text.
    nlp.Defaults.stop_words.update({"said"})

    kept_lemmas = []
    for token in nlp(collapsed):
        if token.text.lower() in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)

    return ' '.join(kept_lemmas)


# Correcting Entity Types Function
def correct_entity_type(entity_text, entity_type, correction_dict):
    """Map an entity to a corrected category when one is configured.

    ``correction_dict`` maps category -> iterable of entity names. Names and
    ``entity_text`` are compared after collapsing whitespace runs to a single
    space, lower-casing, and stripping surrounding spaces and quote characters.

    Returns the first category (in dict order) listing the entity, or
    ``entity_type`` unchanged when none does.
    """
    def _canonical(raw):
        return re.sub(r'\s+', ' ', raw).lower().strip(" '\"")

    needle = _canonical(entity_text)
    for category, names in correction_dict.items():
        known = [_canonical(name) for name in names]
        if needle in known:
            return category
    return entity_type

# Renaming Entities Function
def rename_entity(entity_text, entity_type):
    """Return the replacement ``(text, type)`` pair for an entity.

    The pair is looked up in the module-level ``rename_dict``; when it is not
    a key there, the original pair is returned unchanged.

    NOTE(review): ``rename_dict`` is not defined in the visible notebook
    cells -- confirm it exists before use.
    """
    key = (entity_text, entity_type)
    return rename_dict.get(key, key)

# Filtering Unwanted Entities Function
def is_unwanted_entity(entity_text, entity_type):
    """Return True when the entity must be filtered out.

    That is the case when the ``(text, type)`` pair is in the module-level
    ``unwanted_entities`` or the type alone is in the module-level
    ``unwanted_types``.

    NOTE(review): neither collection is defined in the visible notebook
    cells -- confirm they exist before use.
    """
    pair = (entity_text, entity_type)
    if pair in unwanted_entities:
        return True
    return entity_type in unwanted_types

# Process the text with Spacy NLP
# NOTE(review): `text` is rebound to its cleaned form here, so re-running this
# cell without re-loading the book feeds already-cleaned text into clean_text.
text = clean_text(text)

# Entity recognition runs on the cleaned text (stop words and punctuation
# removed, tokens lemmatized), not on the original prose.
# NOTE(review): that may be why merged names such as "Saponids Saponce" show up
# in the entity table -- confirm whether NER should run on the raw text instead.
doc = nlp(text)

# Extract Entities and Apply Corrections
# Each entity goes through: type correction -> rename -> unwanted filter.
# NOTE(review): correction_dict is not defined in the visible cells -- confirm
# it exists before running this cell.
final_entities = set()
for ent in doc.ents:
    corrected_type = correct_entity_type(ent.text, ent.label_, correction_dict)
    renamed_text, renamed_type = rename_entity(ent.text, corrected_type)
    if not is_unwanted_entity(renamed_text, renamed_type):
        final_entities.add((renamed_text, renamed_type))

# Function to Extract Matching Entities in Paragraphs
def extract_matching_entities(paragraph):
    """Return the paragraph's entities that also appear in ``final_entities``.

    The paragraph is run through the module-level ``nlp`` pipeline; each
    recognized entity is type-corrected (``correct_entity_type`` with the
    module-level ``correction_dict``), renamed (``rename_entity``) and dropped
    if ``is_unwanted_entity`` flags it. The surviving ``(text, type)`` pairs
    are intersected with the module-level ``final_entities``.

    Returns:
        A set of ``(entity text, entity type)`` pairs.
    """
    found = set()
    for span in nlp(paragraph).ents:
        label = correct_entity_type(span.text, span.label_, correction_dict)
        name, label = rename_entity(span.text, label)
        if is_unwanted_entity(name, label):
            continue
        found.add((name, label))
    return found.intersection(final_entities)

# Apply the Function to Each Paragraph
# Adds an 'Entities' column to df in place: one set of (text, type) pairs per row.
df['Entities'] = df['Text'].apply(extract_matching_entities)

# Now, df contains your text with the processed entities

In [43]:
# Tabulate the (name, type) pairs collected in final_entities.
# final_entities is a set, so the row order of this table is not meaningful.
ent_df = pd.DataFrame(final_entities, columns=["Entity_Name", "Entity_Type"])
# Title-case the names; names that differed only by case become identical here.
ent_df['Entity_Name'] = ent_df['Entity_Name'].str.title()

# Remove duplicates (including those created by the title-casing above)
de_ent_df = ent_df.drop_duplicates().reset_index(drop=True)
# Bare expression so the notebook renders the table as the cell output.
de_ent_df

Unnamed: 0,Entity_Name,Entity_Type
0,Cazdal,PERSON
1,Millennia Epoch,GPE
2,Violent Cloud,PERSON
3,Saponids Saponce,ORG
4,Guans,NORP
...,...,...
449,Sankaferrin Liane,PERSON
450,Curator,PERSON
451,Deodand Mazirian,PERSON
452,Great Phandaal,ORG


In [44]:
def filter_entities(entities_set):
    """Return the entities with the pair ``("Earth", "LOC")`` removed.

    Args:
        entities_set: Iterable of ``(entity text, entity type)`` pairs.

    Returns:
        A new set holding every pair except ``("Earth", "LOC")``.
    """
    kept = set()
    for name, label in entities_set:
        if name == "Earth" and label == "LOC":
            continue
        kept.add((name, label))
    return kept

In [49]:

import plotly.graph_objects as go
import networkx as nx
from itertools import combinations

# Assumes `df` already has its 'Entities' column filled by the entity-extraction cell above.

# Define entity types and their corresponding colors
entity_types = ['PERSON', 'LOC', 'ARTIFACT_OBJECT', 'FAC', 'NORP', 'SPELL', 'CREATURE', 'EVENT']
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'orange', 'purple']
color_map = dict(zip(entity_types, colors))

# Create a network graph
G = nx.Graph()

# Add nodes (tagged with their entity type) and co-occurrence edges in a single
# pass over the paragraphs: entities found in the same paragraph are connected
# to each other. ("Earth", "LOC") is excluded by filter_entities.
for _, row in df.iterrows():
    filtered_entities = filter_entities(row['Entities'])
    for entity, entity_type in filtered_entities:
        G.add_node(entity, type=entity_type)
    # De-duplicate names while keeping their order: the same name can appear
    # with two different types in one paragraph, and pairing it with itself
    # would add a self-loop edge.
    entities = list(dict.fromkeys(entity for entity, _ in filtered_entities))
    for source, target in combinations(entities, 2):
        G.add_edge(source, target)

# Position the nodes using a layout to bring outliers closer
pos = nx.kamada_kawai_layout(G)

# Prepare plotly graph
# Every edge contributes (start, end, None) to each coordinate list; the None
# breaks the line so all edges can be drawn by a single Scatter trace.
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node coordinates, colors (by entity type; grey for types without a mapped
# color) and labels, collected in the graph's node order.
node_x = []
node_y = []
node_color = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_color.append(color_map.get(G.nodes[node]['type'], 'grey'))
    node_text.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        size=10,
        color=node_color,
        line_width=2))

# Create layout for the graph
# The title font is set through `title.font`; the older `titlefont` layout
# attribute is deprecated and rejected by newer Plotly releases.
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title=dict(
                    text='<br>Network graph of entities in "The Dying Earth"',
                    font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
fig.update_layout(
    width=1000,  # Set the width of the plot
    height=1000)  # Set the height of the plot

# Code to display the graph
fig.show()