# Character Social Network Analysis

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import re

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Download the "en_core_web_sm" pipeline package that spacy.load() expects below.
# NOTE(review): python3 here is whatever the shell resolves, which may differ from the kernel's interpreter — confirm.
!python3 -m spacy download en_core_web_sm

In [3]:
# Instantiate the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Raise the maximum text length the pipeline accepts, so that a whole book
# can be passed to nlp() as one string
nlp.max_length = 2_500_000

## Load the character data frame

In [61]:
# Read the merged character table from the working directory into a DataFrame
character_df = pd.read_csv("characters_merged.csv")

In [62]:
# Clean up data frame

# Drop the automatically generated index column. errors="ignore" keeps the
# statement from raising if the column has already been removed.
character_df = character_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Add first name (everything before the first space) into a separate column
character_df['character_firstname'] = character_df['character_name'].str.split(' ', n=1).str[0]

# Replace 'Unknown' with np.nan. The result is assigned back to the column:
# calling replace(..., inplace=True) on the selected column is a chained
# in-place operation, which under pandas copy-on-write only modifies a
# temporary object and leaves character_df unchanged.
character_df["alias"] = character_df["alias"].replace('Unknown', np.nan)

# Cast to str so the string operations below apply to every row.
# Missing aliases become the literal string "nan" at this point; the later
# cell that resets lists containing "nan" to np.nan relies on this.
character_df["alias"] = character_df["alias"].astype(str)

# Remove every occurrence of 'The ' from the alias string
character_df["alias"] = character_df["alias"].str.replace("The ", "", regex=False)

# Remove a part of Ramsay Bolton's aliases
is_ramsay = character_df['character_name'] == 'Ramsay Bolton'
character_df.loc[is_ramsay, 'alias'] = (
    character_df.loc[is_ramsay, 'alias'].str.replace(',Reek,Red Helm,Monster', '', regex=False)
)

# Separate string by ',' and create a list
character_df['alias'] = character_df['alias'].str.split(',')


In [64]:
# Number of rows per character name, stored as the 'books_count' column
books_per_character = character_df.groupby('character_name')['book'].size()
books_count = books_per_character.reset_index(name='books_count')

# Attach the count to every row of the main dataframe
character_df = character_df.merge(books_count, on='character_name')

# Drop the per-row 'book' column and keep only the last row of each character,
# which reduces the work when replacing aliases
character_df = (
    character_df
    .drop(columns="book")
    .drop_duplicates(subset=["character_name"], keep='last')
)

In [105]:
# Adjust some names so they are recognised correctly.

# Any character whose name contains 'Drogo' is renamed to 'Khal Drogo'
character_df.loc[character_df['character_name'].str.contains('Drogo'), 'character_name'] = 'Khal Drogo'

# For these characters the whole multi-word name acts as the "first name"
for multiword_firstname in ('Grey Worm', 'High Sparrow', "Lord Commander's Raven", "Lord Sunglass"):
    name_matches = character_df['character_name'].str.contains(multiword_firstname)
    character_df.loc[name_matches, 'character_firstname'] = multiword_firstname

# Alias lists that contain the "nan" placeholder string are reset to a real NaN
character_df['alias'] = character_df['alias'].apply(
    lambda aliases: aliases if not (isinstance(aliases, list) and "nan" in aliases) else np.nan
)


In [106]:
# Show every row (no truncation) in DataFrame displays from here on
pd.set_option('display.max_rows', None)

# Characters whose first name is shared with at least one other character
shares_firstname = character_df['character_firstname'].duplicated(keep=False)
result = character_df[shares_firstname]
result

Unnamed: 0,character_name,alias,character_firstname,books_count
5,Aegon Frey,,Aegon,4
10,Aegon I Targaryen,"[Aegon the Conqueror, Aegon the Dragonlord, Ae...",Aegon,5
11,Aemon (wildling),,Aemon,1
13,Aemon Targaryen (Son of Maekar I),"[Aemon Targaryen, Uncle Maester]",Aemon,2
41,Alyn,,Alyn,1
44,Alyn Haigh,,Alyn,3
45,Alyn Orkwood,[Orkwood of Orkmont],Alyn,1
47,Alyn Stackspear,,Alyn,2
59,Andrey Charlton,,Andrey,1
60,Andrey Dalt,[Drey],Andrey,1


## Load books

In [51]:
# Locate the folder holding the book text files
path = "books_txt"
current_dir = os.getcwd()
abs_path = os.path.abspath(os.path.join(current_dir, path))

# Collect regular files only, skipping hidden/temporary files (names starting with '.'),
# in sorted order
books = sorted(
    book
    for book in os.listdir(abs_path)
    if not book.startswith('.') and os.path.isfile(os.path.join(path, book))
)

# Show the files that were found
print(books)

['1_A_Game_Of_Thrones.txt', '2_A_Clash_Of_Kings.txt', '3_A_Storm_Of_Swords.txt', '4_A_Feast_For_Crows.txt', '5_A_Dance_With_Dragons.txt']


In [52]:
# Import function
from utils.my_functions import replace_word_in_file

In [41]:
# Check presence of an alias in the first book before applying the function.
# The with-block closes the file handle once the text has been read.
with open(f"{abs_path}/{books[0]}") as book_file:
    book_text = book_file.read()
book_text.count("Dany")

413

In [53]:
# Iterate through books to replace aliases with full names.
# NOTE: the count of "Dany" in the first book drops to 0 after this cell, so
# replace_word_in_file changes the book files on disk.

for book in books:
    book_path = f"{abs_path}/{book}"

    # Walk over (full name, alias list) pairs and replace each alias in the text
    for character_name, aliases in zip(character_df['character_name'], character_df['alias']):
        # Characters without aliases hold NaN (a float) instead of a list;
        # iterating over it would raise TypeError, so skip them.
        if not isinstance(aliases, list):
            continue
        for alias in aliases:
            # Skip empty entries and the "nan" placeholder produced by astype(str)
            # on missing aliases — neither is a real alias.
            if not alias or alias == "nan":
                continue
            replace_word_in_file(book_path, alias, character_name)

In [57]:
# Sanity check: count the alias again after the replacement.
# The with-block closes the file handle once the text has been read.
with open(f"{abs_path}/{books[0]}") as book_file:
    book_text = book_file.read()
book_text.count("Dany")

0

## Named entity recognition

In [58]:
# Read the modified text files and collect their contents in a list

modified_books = []

for book in books:
    # The with-block closes each file handle as soon as its text has been read
    with open(f"{abs_path}/{book}") as book_file:
        book_text = book_file.read()
    modified_books.append(book_text)

In [59]:
# Run the spaCy pipeline over every book and keep the resulting docs
book_docs = []
for book_number, book in enumerate(modified_books, start=1):
    print(f"started processing book {book_number}")
    book_docs.append(nlp(book))

started processing book 1
started processing book 2
started processing book 3
started processing book 4
started processing book 5


In [66]:
# Visualize identified entities
# Only the slice [0:1000] of the first book's doc is rendered, not the whole book.
displacy.render(book_docs[0][0:1000], style="ent", jupyter=True)

## Get named entity list per sentence

In [67]:
# One record per sentence of the first book: the sentence itself plus the
# text of every named entity found in it
sentence_records = [
    {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
    for sent in book_docs[0].sents
]

sent_entity_df = pd.DataFrame(sentence_records)


In [68]:
# Preview the first rows of the sentence / entity table
sent_entity_df.head()

Unnamed: 0,sentence,entities
0,"(, PROLOGUE, \n, We, should, start, back, ,, ...",[]
1,"(“, The, \n, wildlings, are, dead, ., ”, \n)",[]
2,"(“, Do, the, dead, frighten, you, ?, ”)",[]
3,"(Ser, Waymar, Royce, asked, with, just, the, h...",[Waymar Royce]
4,"(Gared, did, not, rise, to, the, bait, .)",[Gared]


In [124]:
# Import function
from utils.my_functions import filter_entity

ImportError: cannot import name 'filter_entity' from 'utils.my_functions' (/Users/mariiamorskovatykh/code/mariamorskovatykh/GOT_project/utils/my_functions.py)

In [122]:
# Run filter_entity on each sentence's entity list, passing character_df.
# NOTE(review): presumably this keeps only entities that match known characters —
# confirm against utils.my_functions.
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

# Filter out sentences that don't have any character entities.
# .copy() makes the result an independent frame, so overwriting or adding
# columns on it later does not raise SettingWithCopyWarning.
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0].copy()
sent_entity_df_filtered.head(150)

Unnamed: 0,sentence,entities,character_entities
3,"(Ser, Waymar, Royce, asked, with, just, the, h...",[Waymar Royce],[Waymar Royce]
4,"(Gared, did, not, rise, to, the, bait, .)",[Gared],[Gared]
23,"(Are, you, unmanned, by, the, dark, ,, Gared, ...",[Gared],[Gared]
48,"(Ser, Waymar, had, been, a, Sworn, Brother, of...","[Waymar, the Night’s Watch, less than half]",[Waymar]
103,"(“, What, do, you, think, might, have, killed,...",[Gared],[Gared]
104,"(Ser, Waymar, asked, casually, .)",[Waymar],[Waymar]
117,"(Peaceful, ,, \n, like, ., ”, \n, “, Such, elo...","[Gared, Waymar]","[Gared, Waymar]"
125,"(“, You, ought, dress, more, warmly, ,, Gared,...","[Gared, Gared, Maester Aemon]","[Gared, Gared]"
128,"(“, If, Gared, said, it, was, the, cold, ., .,...",[Gared],[Gared]
130,"(“, Have, you, drawn, any, watches, this, past...","[this past week, Will]",[Will]


In [71]:
# Collapse each sentence's character list into one comma-separated string.
# .assign builds a new frame instead of writing into a filtered slice, which is
# what raised SettingWithCopyWarning. Values that are already strings are left
# untouched, so re-running the cell does not join the characters of a string.
sent_entity_df_filtered = sent_entity_df_filtered.assign(
    character_entities=sent_entity_df_filtered["character_entities"].apply(
        lambda x: x if isinstance(x, str) else ', '.join(x)
    )
)
un = sent_entity_df_filtered["character_entities"].unique()

# Print every distinct combination, separated by a rule
for u in un:
    print(u)
    print("__________")

Waymar Royce
__________
Gared
__________
Waymar
__________
Gared, Waymar
__________
Gared, Gared
__________
Will
__________
Gared, Will
__________
Robert
__________
Bran
__________
Jeyne Poole
__________
Jon, Bran
__________
Jory Cassel
__________
Jon Snow
__________
Jon, Balon IX Greyjoy
__________
Jon
__________
Balon IX Greyjoy
__________
Jory
__________
Jory, Bran
__________
Robb Stark
__________
Hullen
__________
Harwin
__________
Jon, Jon
__________
Rickon
__________
Desmond
__________
Bran, Robb Stark, Jon
__________
Ned
__________
Sansa, Rickon
__________
Brynden
__________
Benjen Stark
__________
Viserys
__________
Illyrio
__________
Illyrio, Viserys
__________
Khal Drogo
__________
Daenerys I Targaryen
__________
Viserys, Daenerys
__________
Rhaegar
__________
Daenerys
__________
Aegon
__________
Illyrio, Khal Drogo
__________
Drogo
__________
Khal Drogo, Illyrio
__________
Illyrio, Illyrio
__________
Illyrio Mopatis
__________
Rhogoro
__________
Daenerys I Targaryen, Illyrio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered["character_entities"] = sent_entity_df_filtered["character_entities"].apply(lambda x: ', '.join(x))


In [74]:
# Hand-picked mapping from a bare first name to the full name it should resolve to
first_to_full_name = {
    "Brynden": "Brynden Tully",
    "Balon": "Balon IX Greyjoy",
    "Jeyne": "Jeyne Poole",
    "Jon": "Jon Snow",
    "Lyanna": "Lyanna Stark",
    "Petyr": "Petyr Baelish",
    "Robert": "Robert I Baratheon",
    "Tommen": "Tommen I Baratheon",
    "Tyrion": "Tyrion Lannister",
    "Viserys": "Viserys III Targaryen",
    "Tywin": "Tywin Lannister",
}

# Two-column lookup table with a default 0..n-1 index
main_characters = pd.DataFrame(
    list(first_to_full_name.items()),
    columns=['first_name', 'full_name'],
)
main_characters

Unnamed: 0,first_name,full_name
0,Brynden,Brynden Tully
1,Balon,Balon IX Greyjoy
2,Jeyne,Jeyne Poole
3,Jon,Jon Snow
4,Lyanna,Lyanna Stark
5,Petyr,Petyr Baelish
6,Robert,Robert I Baratheon
7,Tommen,Tommen I Baratheon
8,Tyrion,Tyrion Lannister
9,Viserys,Viserys III Targaryen


In [119]:
def _has_alias(alias):
    """Return True when ``alias`` holds at least one alias.

    The alias column mixes lists with NaN placeholders. ``pd.notnull`` applied
    to a list returns an element-wise array whose truth value is ambiguous for
    two or more elements, so lists are checked by length instead.
    """
    if isinstance(alias, (list, tuple)):
        return len(alias) > 0
    return bool(pd.notnull(alias))


def replace_name(lst, main_characters, characters_df):
    """Replace single-word names in ``lst`` with full character names.

    Resolution order for every single-word string:
      1. ``main_characters`` (columns ``first_name`` / ``full_name``) wins.
      2. Otherwise rows of ``characters_df`` whose ``character_firstname``
         equals the word are considered. A single match is used directly.
         With several matches the row with the highest ``books_count`` is
         used; if the top two rows tie, the first one is used unless only the
         second one has aliases.
    Multi-word strings, non-strings and unknown words are left unchanged.

    Parameters
    ----------
    lst : list
        Names to resolve. The list is modified in place.
    main_characters : pandas.DataFrame
        Lookup table with ``first_name`` and ``full_name`` columns.
    characters_df : pandas.DataFrame
        Character table with ``character_name``, ``character_firstname``,
        ``alias`` and ``books_count`` columns.

    Returns
    -------
    list
        The same list object that was passed in, after replacement.
    """
    for i, item in enumerate(lst):
        # Only single-word strings are ambiguous; everything else stays as it is.
        if not isinstance(item, str) or len(item.split()) >= 2:
            continue

        main_match = main_characters.loc[main_characters['first_name'] == item, 'full_name']
        if not main_match.empty:
            lst[i] = main_match.values[0]
            continue

        # Use the frame passed by the caller rather than a notebook-level global.
        candidates = characters_df[characters_df['character_firstname'] == item]
        if candidates.empty:
            continue

        if len(candidates) > 1:
            # mergesort is stable, so rows with equal counts keep their original
            # order and the tie-break below is deterministic.
            candidates = candidates.sort_values(by='books_count', ascending=False, kind='mergesort')
            first, second = candidates.iloc[0], candidates.iloc[1]
            counts_tie = first['books_count'] == second['books_count']
            if counts_tie and not _has_alias(first['alias']) and _has_alias(second['alias']):
                lst[i] = second['character_name']
                continue

        lst[i] = candidates['character_name'].iloc[0]
    return lst

In [121]:
# Spot-check replace_name on first names listed in main_characters ("Jon", "Petyr"),
# a first name shared by two characters ("Andrey"), a full name and a bare first name ("Waymar")
replace_name(["Jon", "Petyr", "Andrey", "Waymar Royce", "Waymar"], main_characters, character_df)


['Jon Snow', 'Petyr Baelish', 'Andrey Dalt', 'Waymar Royce', 'Waymar Royce']

In [108]:
# Debugging the "Andrey" case step by step: first confirm that the first name
# occurs in character_df at all
item = "Andrey"

item in character_df['character_firstname'].values

True

In [109]:
# All rows of character_df whose first name equals `item`
character_rows = character_df[character_df['character_firstname'] == item]
character_rows

Unnamed: 0,character_name,alias,character_firstname,books_count
59,Andrey Charlton,,Andrey,1
60,Andrey Dalt,[Drey],Andrey,1


In [113]:
# Manual trace of the tie-break for the first two rows of character_rows:
# pick the second row only when both rows have the same books_count and
# only the second one has aliases; otherwise pick the first row.
first_row, second_row = character_rows.iloc[0], character_rows.iloc[1]

# Alias cells are either a list or NaN. pd.notnull on a list returns an array
# whose truth value is ambiguous for two or more elements, so lists are checked
# by length instead.
first_has_alias = (
    len(first_row['alias']) > 0 if isinstance(first_row['alias'], list)
    else bool(pd.notnull(first_row['alias']))
)
second_has_alias = (
    len(second_row['alias']) > 0 if isinstance(second_row['alias'], list)
    else bool(pd.notnull(second_row['alias']))
)

same_books_count = first_row['books_count'] == second_row['books_count']
if same_books_count and not first_has_alias and second_has_alias:
    item = second_row['character_name']
else:
    item = first_row['character_name']
item

'Andrey Dalt'

In [112]:
# Do both of the first two rows have a non-null alias?
# NOTE(review): pd.notnull on a list-valued alias returns an array, so this expression
# raises for lists with more than one element.
pd.notnull(character_rows['alias'].iloc[0]) and pd.notnull(character_rows['alias'].iloc[1])

False

In [None]:
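# TODO: Jeyne Westerling for other books (main_characters currently maps "Jeyne" to Jeyne Poole)
# TODO: Lyanna Mormont (main_characters currently maps "Lyanna" to Lyanna Stark)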