In [8]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt


In [9]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
#Loading Spacy English language model
NER = spacy.load("en_core_web_sm") 

## Load Books

In [11]:
import os

#Get all book files in the data directory
all_books = [b for b in os.scandir('data') if '.txt' in b.name]

In [12]:
all_books

[<DirEntry '1 The Last Wish.txt'>,
 <DirEntry '2 The Sword of Destiny.txt'>,
 <DirEntry '3 Blood of Elves.txt'>,
 <DirEntry '4 Times of Contempt.txt'>,
 <DirEntry '5 Baptism of Fire.txt'>,
 <DirEntry '6 The Tower of the Swallow.txt'>,
 <DirEntry '7 The Lady of the Lake.txt'>,
 <DirEntry '8 something ends something begins.txt'>]

In [13]:
book = all_books[1]
book_text = open(book).read()
book_doc = NER(book_text)

In [14]:
#Visualize identified entities
displacy.render(book_doc[0:2000],style="ent",jupyter = True)

## Load Character names

In [25]:
# Read characters
character_df = pd.read_csv('characters.csv')
character_df = character_df.drop('Unnamed: 0',axis=1)

In [26]:
#Remove brackets and text within brackets
import re
character_df['character'] = character_df['character'].apply(lambda x:re.sub("[\(].*?[\)]","",x))
character_df['character_firstname'] = character_df['character'].apply(lambda x: x.split(' ',1)[0])


In [27]:
pd.set_option('display.max_rows',None)
character_df

Unnamed: 0,book,character,character_firstname
0,Category:Baptism of Fire characters,Adalia,Adalia
1,Category:Baptism of Fire characters,Adela,Adela
2,Category:Baptism of Fire characters,Aen Saevherne,Aen
3,Category:Baptism of Fire characters,Aevenien,Aevenien
4,Category:Baptism of Fire characters,Aglaïs,Aglaïs
5,Category:Baptism of Fire characters,Albrich,Albrich
6,Category:Baptism of Fire characters,Amavet,Amavet
7,Category:Baptism of Fire characters,Angus Bri Cri,Angus
8,Category:Baptism of Fire characters,Anna Kameny,Anna
9,Category:Baptism of Fire characters,Anzelm Aubry,Anzelm


## Get named entity list per sentence

In [28]:
sent_entity_df = []
# Loop through sentences, store named entity for each sentence
for sent in book_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence":sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)

In [29]:
sent_entity_df

Unnamed: 0,sentence,entities
0,"(ï»¿The, Limits, of, the, Possible, \n, I, \n,...",[ï»¿The Limits of the Possible]
1,"("", It, 's, been, an, hour, and, a, quarter, s...",[an hour and a quarter]
2,"(He, 's, done, for, .)",[]
3,"("", \n, The, townsfolk, ,, huddled, together, ...",[]
4,"(A, fat, man, dressed, in, a, yellow, smock, \...",[]
5,"("", We, have, to, wait, a, bit, longer, ,, "", ...",[]
6,"("", Why, wait, ?, "", snorted, pimply, ,, "", Th...",[]
7,"(Anyone, goes, down, there, ,, that, 's, the, ...",[]
8,"(Have, you, forgotten, how, many, \n, have, di...",[]
9,"(What, are, we, waiting, for, ?, "", \n, "", Thi...",[]
