# Character Social Network Analysis

In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import re

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!python3 -m spacy download en_core_web_sm

In [41]:
# Load the model
nlp = spacy.load("en_core_web_sm")

# Increase the maximum length
nlp.max_length = 2500000

## Load the character data frame

In [26]:
# open csv file
character_df = pd.read_csv("characters_merged.csv")

In [27]:
# Clean up data frame

# Delete automatically generated column
del character_df["Unnamed: 0"]

# Add first name into a separate column
character_df['character_firstname'] = character_df['character_name'].apply(lambda x: x.split(' ', 1)[0])

# Replace 'Unknown' with np.nan
character_df["alias"].replace('Unknown',np.nan,inplace=True)

character_df["alias"] = character_df["alias"].astype(str)
character_df["alias"]

# Replace 'The' with empty string
character_df["alias"] = character_df["alias"].map(lambda x: x.replace("The ", ""))

# Remove a part of the Ramsay Bolton's aliases
character_df["alias"] = np.where(character_df['character_name']=='Ramsay Bolton',character_df['alias'].replace(re.compile(',Reek,Red Helm,Monster'), '', regex=True),character_df['alias'])

# Separate string by ',' and create a list
character_df['alias'] = character_df['alias'].str.split(',')


In [28]:
# Delete character duplicates to reduce computing load when replacing aliases
character_df = character_df.drop("book", axis=1)
character_df = character_df.drop_duplicates(subset=["character_name"], keep='last')
character_df.head()

Unnamed: 0,character_name,alias,character_firstname
0,Addam Marbrand,[nan],Addam
1,Adrack Humble,[nan],Adrack
5,Aegon Frey,[nan],Aegon
10,Aegon I Targaryen,"[Aegon the Conqueror, Aegon the Dragonlord, Ae...",Aegon
11,Aemon (wildling),[nan],Aemon


## Load books

In [29]:
# Get book names
path = "books_txt"
current_dir = os.getcwd()
abs_path = os.path.abspath(os.path.join(current_dir, path))

# Get all files in directory except temporary files
books = [book for book in os.listdir(abs_path) if os.path.isfile(os.path.join(path, book)) and not book.startswith('.')]
books.sort()

# Print all files
print(books)

['1_A_Game_Of_Thrones.txt', '2_A_Clash_Of_Kings.txt', '3_A_Storm_Of_Swords.txt', '4_A_Feast_For_Crows.txt', '5_A_Dance_With_Dragons.txt']


In [30]:
# Import function
from utils.my_functions import replace_word_in_file

In [33]:
# check presense of aliases before applying the function
book_text = open(f"{abs_path}/{books[0]}").read()
book_text.count("Dany")

413

In [34]:
# Iterate through books to replace aliases with full names

for book in books:
    book_path = f"{abs_path}/{book}"

    # iterate through character aliases and replace in text
    for i, row in character_df.iterrows():
        for alias in row['alias']:
            replace_word_in_file(book_path, alias, row['character_name'])

In [35]:
# sanity check
book_text = open(f"{abs_path}/{books[0]}").read()
book_text.count("Dany")

0

## Named entity recognition

In [36]:
# Open modified text files and append to a list

modified_books = []

for book in books:
    book_text = open(f"{abs_path}/{book}").read()
    modified_books.append(book_text)

In [42]:
# Apply named entity recognition to books
book_docs = []
for index, book in enumerate(modified_books):
    print(f"started processing book {index+1}")
    book_doc = nlp(book)
    book_docs.append(book_doc)

started processing book 1
started processing book 2
started processing book 3
started processing book 4
started processing book 5


In [45]:
# Visualize identified entities
displacy.render(book_docs[0][0:1000], style="ent", jupyter=True)

## Get name entity list per sentence

In [None]:
sent_entity_df = []

# Loop through sentences, store named entity list for each sentence
for sent in book_docs[0].sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)


In [None]:
sent_entity_df.head()