In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt


In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 991.0 kB/s eta 0:00:13
     --------------------------------------- 0.1/12.8 MB 991.0 kB/s eta 0:00:13
     --------------------------------------- 0.1/12.8 MB 525.1 kB/s eta 0:00:25
     --------------------------------------- 0.2/12.8 MB 706.2 kB/s eta 0:00:18
      -------------------------------------- 0.2/12.8 MB 901.1 kB/s eta 0:00:14
      -------------------------------------- 0.2/12.8 MB 901.1 kB/s eta 0:00:14
      -------------------------------------- 0.3/12.8 MB 895.6 kB/s eta 0:00:14
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     - --------------------------

In [3]:
#Load the space english language model into a variable
nlp = spacy.load("en_core_web_sm")

In [4]:
# get all the book files from the data folder

import os
path = '..\data\external'

all_books = [book for book  in os.scandir(path) if '.txt' in book.name]
all_books

[<DirEntry "Book 1 - The Philosopher's Stone.txt">,
 <DirEntry 'Book 2 - The Chamber of Secrets.txt'>,
 <DirEntry 'Book 3 - The Prisoner of Azkaban.txt'>,
 <DirEntry 'Book 4 - The Goblet of Fire.txt'>,
 <DirEntry 'Book 5 - The Order of the Phoenix.txt'>,
 <DirEntry 'Book 6 - The Half Blood Prince.txt'>,
 <DirEntry 'Book 7 - The Deathly Hallows.txt'>]

In [5]:
book1 = all_books[0]
book1_text = open(book1).read()
book1_doc = nlp(book1_text)

In [6]:
# visualise the identified entities in book 1

displacy.render(book1_doc[0:1000], style='ent', jupyter=True)

In [7]:
# read characters file

characters_path = r'..\data\interim\ characters.csv'
characters_df = pd.read_csv(characters_path)
characters_df



Unnamed: 0.1,Unnamed: 0,letter,character
0,0,A,Mrs. Abbott
1,1,A,Abbott family
2,2,A,Hannah Abbott
3,3,A,Abel Treetops
4,4,A,Euan Abercrombie
...,...,...,...
2015,2015,Z,Georgi Zdravko
2016,2016,Z,Zograf
2017,2017,Z,Zonko
2018,2018,Z,Valentina Vázquez


In [8]:
import re

#remove Mr and Mrs from the names
characters_df['character'] = characters_df['character'].str.replace(r'\b(Mr|Mrs)\.?\s', '', regex=True)

#remove all bracketed words from the characters df
characters_df['character'] = characters_df['character'].apply(lambda x: re.sub("[\(].*?[\)]", "", x))

# create a firstname column for the characters
characters_df['firstname'] = characters_df['character'].apply(lambda x: x.split(' ', 1)[0])

In [9]:
#drop unwanted columns
characters_df.drop(['Unnamed: 0', 'letter'], axis=1, inplace=True)
characters_df

Unnamed: 0,character,firstname
0,Abbott,Abbott
1,Abbott family,Abbott
2,Hannah Abbott,Hannah
3,Abel Treetops,Abel
4,Euan Abercrombie,Euan
...,...,...
2015,Georgi Zdravko,Georgi
2016,Zograf,Zograf
2017,Zonko,Zonko
2018,Valentina Vázquez,Valentina


In [10]:
# save processed data frame
path= r'..\data\processed\ '
characters_df.to_csv(path+'harry_potter_characters.csv')

# Get named entities in each sentence

In [11]:
sentence_entity_df=[]

#loop through sentences and store the named entity list for each sentence
for sentence in book1_doc.sents:
    entity_list = [entity.text for entity in sentence.ents]
    sentence_entity_df.append({'sentence':sentence, 'entities': entity_list})
    
sentence_entity_df=pd.DataFrame(sentence_entity_df)

In [12]:
sentence_entity_df

Unnamed: 0,sentence,entities
0,"(/, \n\n\n\n\n, THE, BOY, WHO, LIVED, \n\n, Mr...","[LIVED, Dursley, number four, Privet Drive]"
1,"(They, were, the, last, people, you, ’d, \n, e...",[]
2,"(Mr., Dursley, was, the, director, of, a, firm...","[Dursley, Grunnings]"
3,"(He, was, a, big, ,, beefy, \n, man, with, har...",[]
4,"(Mrs., Dursley, was, thin, and, \n, blonde, an...",[Dursley]
...,...,...
6337,"(“, Hope, you, have, —, er, —, a, good, holida...",[Uncle Vernon]
6338,"(Page, |, 347, Harry, Potter, and, the, Philos...","[347, Harry Potter, the Philosophers Stone - J..."
6339,"(“, They, do, n’t, \n, know, we, ’re, not, all...",[]
6340,"(I, ’m, \n, going, to, have, a, lot, of, fun, ...","[Dudley, this summer]"


In [13]:
#little function to filter entity list, taken from Thu Vu

def filter_entity(entity_list, characters_df):
    """_summary_
    this function takes a list of spacy generated entities and the character dataframe and returns only entities that are characters in the dataframe

    Args:
        entity_list (list): list of entities generated by spacy nlp
        characters_df (dataframe): dataframe of characters from scraped from the harry potter website
    """
    return [entity for entity in entity_list
            if entity in list(characters_df.character)
            or entity in list(characters_df.firstname)]

In [14]:
#test
filter_entity(['Harry','boy','waakye'], characters_df)

['Harry']

In [15]:
sentence_entity_df['character_entities'] = sentence_entity_df['entities'].apply(lambda x: filter_entity(x, characters_df))

#remove sentences without named entities
sentence_entity_df_filtered = sentence_entity_df[sentence_entity_df['character_entities'].map(len)>0]
sentence_entity_df_filtered.head()

Unnamed: 0,sentence,entities,character_entities
0,"(/, \n\n\n\n\n, THE, BOY, WHO, LIVED, \n\n, Mr...","[LIVED, Dursley, number four, Privet Drive]",[Dursley]
2,"(Mr., Dursley, was, the, director, of, a, firm...","[Dursley, Grunnings]",[Dursley]
4,"(Mrs., Dursley, was, thin, and, \n, blonde, an...",[Dursley],[Dursley]
5,"(The, Dursley, s, had, a, small, son, \n, call...","[Dursley, Dudley]","[Dursley, Dudley]"
8,"(Mrs., Potter, was, Mrs., Dursley, ’s, sister,...","[Potter, Dursley, n’t, Harry Potter, the Philo...","[Potter, Dursley, Harry Potter, Dursley]"


In [16]:
# Take only first name of characters
sentence_entity_df_filtered['character_entities'] = sentence_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0] 
                                                                                                               for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_entity_df_filtered['character_entities'] = sentence_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0]


In [17]:
#potter = harry
sentence_entity_df_filtered['character_entities'] = sentence_entity_df_filtered['character_entities'].apply(lambda x: [item.replace('Potter', 'Harry') for item in x])
sentence_entity_df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_entity_df_filtered['character_entities'] = sentence_entity_df_filtered['character_entities'].apply(lambda x: [item.replace('Potter', 'Harry') for item in x])


Unnamed: 0,sentence,entities,character_entities
0,"(/, \n\n\n\n\n, THE, BOY, WHO, LIVED, \n\n, Mr...","[LIVED, Dursley, number four, Privet Drive]",[Dursley]
2,"(Mr., Dursley, was, the, director, of, a, firm...","[Dursley, Grunnings]",[Dursley]
4,"(Mrs., Dursley, was, thin, and, \n, blonde, an...",[Dursley],[Dursley]
5,"(The, Dursley, s, had, a, small, son, \n, call...","[Dursley, Dudley]","[Dursley, Dudley]"
8,"(Mrs., Potter, was, Mrs., Dursley, ’s, sister,...","[Potter, Dursley, n’t, Harry Potter, the Philo...","[Harry, Dursley, Harry, Dursley]"
...,...,...,...
6331,"(“, You, must, be, Harry, ’s, family, !, ”, sa...","[Harry, Weasley]","[Harry, Weasley]"
6335,"(Harry, hung, back, for, a, last, word, with, ...","[Harry hung, Ron]",[Ron]
6338,"(Page, |, 347, Harry, Potter, and, the, Philos...","[347, Harry Potter, the Philosophers Stone - J...","[Harry, Harry]"
6340,"(I, ’m, \n, going, to, have, a, lot, of, fun, ...","[Dudley, this summer]",[Dudley]


## Create relationships between characters

In [18]:
window_size = 5
character_relationships =[]

for i in range(sentence_entity_df_filtered.index[-1]):
    end_i = min(i+5, sentence_entity_df_filtered.index[-1])
    char_list = sum((sentence_entity_df_filtered.loc[i:end_i].character_entities),[])
    
    #remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list))
                    if (i==0) or char_list[i] != char_list[1-i]]
    
    #create a source and target character for each interaction in the window
    if len(char_unique) > 1:
        for index, source in enumerate(char_unique[:-1]):
            target = char_unique[index+1]
            character_relationships.append({'source': source, 'target': target})

In [19]:
character_relationships_df = pd.DataFrame(character_relationships)
character_relationships_df

Unnamed: 0,source,target
0,Dursley,Dursley
1,Dursley,Dudley
2,Dursley,Dursley
3,Dursley,Dudley
4,Dursley,Dursley
...,...,...
11676,Dudley,Harry
11677,Harry,Dudley
11678,Dudley,Harry
11679,Dudley,Harry


In [20]:
#some duplicates still remain so remove rows where values for source and target are the same
drop_indexes = character_relationships_df[character_relationships_df['source'] == character_relationships_df['target']].index
character_relationships_df.drop(drop_indexes, inplace=True)

In [21]:
character_relationships_df

Unnamed: 0,source,target
1,Dursley,Dudley
3,Dursley,Dudley
5,Dursley,Dudley
6,Dursley,Dudley
7,Dudley,Dursley
...,...,...
11676,Dudley,Harry
11677,Harry,Dudley
11678,Dudley,Harry
11679,Dudley,Harry


In [22]:

#sort rows column wise to fix reverse interactions eg a->b = b->a

character_relationships_df = pd.DataFrame(np.sort(character_relationships_df.values, axis=1), columns = character_relationships_df.columns)
character_relationships_df

Unnamed: 0,source,target
0,Dudley,Dursley
1,Dudley,Dursley
2,Dudley,Dursley
3,Dudley,Dursley
4,Dudley,Dursley
...,...,...
10919,Dudley,Harry
10920,Dudley,Harry
10921,Dudley,Harry
10922,Dudley,Harry


In [23]:
#aggregate interactions for each character pair and create a weight column
character_relationships_df['value']=1
character_relationships_df = character_relationships_df.groupby(['source', 'target'], sort=False, as_index=False).sum()
character_relationships_df

Unnamed: 0,source,target,value
0,Dudley,Dursley,55
1,Dudley,Harry,496
2,Dursley,Harry,62
3,Dudley,second,22
4,Dursley,second,6
...,...,...,...
519,Fred,Hagrid,5
520,Fred,Neville,1
521,Ginny,Ron,6
522,Ginny,Weasley,5


In [24]:
path = r'..\data\processed\ '
character_relationships_df.to_csv(path+'character_relationships.csv')