# Character Social Network Analysis

In [84]:
import pandas as pd
import spacy
from spacy import displacy
import networkx as nx
import os
import re

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!python3 -m spacy download en_core_web_sm

In [22]:
# Load the model
nlp = spacy.load("en_core_web_sm")

# Increase the maximum length
nlp.max_length = 2000000

## Load the character data frame

In [93]:
# open csv file
character_df = pd.read_csv("characters_merged.csv")

In [94]:
# Clean up data frame

# Delete automatically generated column
del character_df["Unnamed: 0"]

# Add first name into a separate column
character_df['character_firstname'] = character_df['character_name'].apply(lambda x: x.split(' ', 1)[0])

# Replace 'Unknown' with np.nan
character_df["alias"].replace('Unknown',np.nan,inplace=True)

character_df["alias"] = character_df["alias"].astype(str)
character_df["alias"]

# Replace 'The' with empty string
character_df["alias"] = character_df["alias"].map(lambda x: x.replace("The ", ""))

# Remove a part of the Ramsay Bolton's aliases
character_df["alias"] = np.where(character_df['character_name']=='Ramsay Bolton',character_df['alias'].replace(re.compile(',Reek,Red Helm,Monster'), '', regex=True),character_df['alias'])

# Separate string by ',' and create a list
character_df['alias'] = character_df['alias'].str.split(',')


In [99]:
# sanity check
rows = character_df[character_df["character_name"].str.contains("Jon", case=False, na=False)]
rows

Unnamed: 0,book,character_name,alias,character_firstname
612,A Feast for Crows,Jon Bettley,[Beardless Jon Bettley],Jon
613,A Feast for Crows,Jon Myre,[Pinchface Jon Myre],Jon
614,A Game of Thrones,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
615,A Clash of Kings,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
616,A Storm of Sword,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
617,A Feast for Crows,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
618,A Dance with Dragons,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
619,The Winds of Winter,Jon Umber (Greatjon),"[Greatjon, Greatjon Umber]",Jon
620,A Feast for Crows,Jon Waters,[nan],Jon
621,A Game of Thrones,Jonos Bracken,[nan],Jonos


## Load books

In [96]:
# Get book names
path = "books_txt"
current_dir = os.getcwd()
abs_path = os.path.abspath(os.path.join(current_dir, path))

# Get all files in directory except temporary files
books = [book for book in os.listdir(abs_path) if os.path.isfile(os.path.join(path, book)) and not book.startswith('.')]
books.sort()

# Print all files
print(books)

['1_A_Game_Of_Thrones.txt', '2_A_Clash_Of_Kings.txt', '3_A_Storm_Of_Swords.txt', '4_A_Feast_For_Crows.txt', '5_A_Dance_With_Dragons.txt']


In [119]:
# Import function
from utils.my_functions import replace_word_in_string

In [None]:
# Read books into strings and store them in a list
contents = []
for book in books:
    book_text = open(f"{abs_path}/{book}").read()
    contents.append(book_text)

# Iterate through books to replace aliases with full names
# Create an empty list to store the modified text
modified_texts = []

for book in contents:
    # Create an empty string variable for the modified text
    modified_text = ""
    # iterate through character aliases and replace in text
    for i, row in character_df.iterrows():
        for alias in row['alias']:
            modified_text += replace_word_in_string(book, alias, row['character_name'])
    modified_texts.append(modified_text)

In [118]:
# sanity check
modified_text[0].count("Dany")

0

## Named entity recognition

In [121]:
book_docs = []

In [None]:
for book in modified_text:
    book_doc = nlp(book)
    book_docs.append(book_doc)

In [None]:
len(book_docs)

In [24]:
# Visualize identified entities
displacy.render(book_doc[0:1000], style="ent", jupyter=True)

In [None]:
# create dataframe with character full names and first names
character_df = pd.DataFrame({'first_name': ['Jon', 'Daenerys', 'Tyrion'], 'full_name': ['Jon Snow', 'Daenerys Targaryen', 'Tyrion Lannister']})

# create a set of all character names (both full names and first names)
character_names = set(character_df['first_name'].tolist() + character_df['full_name'].tolist())


character_names

## Get name entity list per sentence

In [32]:
sent_entity_df = []

# Loop through sentences, store named entity list for each sentence
for sent in book_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)


In [33]:
sent_entity_df.head()

Unnamed: 0,sentence,entities
0,"(, PROLOGUE, \n, We, should, start, back, ,, ...",[]
1,"(“, The, \n, wildlings, are, dead, ., ”, \n)",[]
2,"(“, Do, the, dead, frighten, you, ?, ”)",[]
3,"(Ser, Waymar, Royce, asked, with, just, the, h...",[Waymar Royce]
4,"(Gared, did, not, rise, to, the, bait, .)",[Gared]
