## Named Entity Recognition

In [None]:
## Make sure to run !python -m spacy download en_core_web_trf which is the named entity recognition model we use
import spacy
from nltk import sent_tokenize
import os
import sys
import pathlib
import glob
from series_analysis.utils.main_utils import load_subtitles_dataset

In [2]:
def load_model():
    return spacy.load("en_core_web_trf")

### Loading the Dataset

In [63]:
dataset_path = "../../../data/subtitles"
df = load_subtitles_dataset(dataset_path)

In [64]:
df.head()

Unnamed: 0,script
0,We are Fighting Dreamers aiming high\n Fightin...
1,We are Fighting Dreamers aiming high\n Fightin...
2,Press down hard on the gas\n That’s right ther...
3,Rock away your existence\n Shouting that you a...
4,Rock away your existence\n Shouting that you a...


In [16]:
sample_script = df.iloc[0]['script']
print(sample_script)

We are Fighting Dreamers aiming high
 Fighting Dreamersdon't care what people think about them
 Fighting Dreamersfollow what they believe
 Oli Oli Oli Oh! Just go my way
 Right here right now (Bang)Hit it straight like a line drive!
 Right here right now (Burn)
 Down a difficult roadfilled with endless struggles
 Where do you think you are goingfollowing someone else's map?
 An insightful crow comes alongto tear up the map
 Now open your eyes andtake a look at the truth (Yeah!)
 There's nothing to loseso let's GO!!!
 We are Fighting Dreamers aiming high
 Fighting Dreamersdon't care what people think about them
 Fighting Dreamersfollow what they believe
 Oli Oli Oli Oh!Just go my way
 Right here right now (Bang)Hit it straight like a line drive!
 Right here right now (Burn)We're gonna do it and do our best!
 Right here right now (Bang)Hit it straight like a line drive!
 Right here right now (Burn)We're gonna do it and do our best! BANG!
 My body movementshave finally returned…
 Blood…
 

In [19]:
sentences = sent_tokenize(sample_script)
sentence = ".".join(sentences)
sentence

'We are Fighting Dreamers aiming high\n Fighting Dreamersdon\'t care what people think about them\n Fighting Dreamersfollow what they believe\n Oli Oli Oli Oh!.Just go my way\n Right here right now (Bang)Hit it straight like a line drive!.Right here right now (Burn)\n Down a difficult roadfilled with endless struggles\n Where do you think you are goingfollowing someone else\'s map?.An insightful crow comes alongto tear up the map\n Now open your eyes andtake a look at the truth (Yeah!).There\'s nothing to loseso let\'s GO!!!.We are Fighting Dreamers aiming high\n Fighting Dreamersdon\'t care what people think about them\n Fighting Dreamersfollow what they believe\n Oli Oli Oli Oh!Just go my way\n Right here right now (Bang)Hit it straight like a line drive!.Right here right now (Burn)We\'re gonna do it and do our best!.Right here right now (Bang)Hit it straight like a line drive!.Right here right now (Burn)We\'re gonna do it and do our best!.BANG!.My body movementshave finally returned

### Run the model

In [55]:
model = load_model()

  model.load_state_dict(torch.load(filelike, map_location=device))


In [26]:
doc = model(sentence) #requires a string to work
doc.ents

  with torch.cuda.amp.autocast(self._mixed_precision):


(Dreamers,
 Oli Oli Oli,
 Dreamers,
 Oli Oli Oli,
 Kabuto,
 Orochimaru,
 Orochimaru,
 Shizune,
 Orochimaru,
 Tsunadedo,
 Tsunade,
 Tsunade,
 four,
 Sannin,
 Third,
 four,
 Kakashi,
 the Shadow Clone Jutsu.lt’d,
 Jiraiya,
 Jiraiya,
 Tsunade,
 Jutsu,
 Jiraiya,
 Tsunade,
 the Chunin Exam,
 Akatsuki,
 Sannin,
 Third,
 Jiraiya,
 Naruto,
 Shinobi,
 Gamatatsu,
 Gamakichi,
 first,
 Jiraiya,
 Jiraiyaone,
 Sannin,
 one,
 Ninja Art!.Needle Jizo!.Dang,
 Ninja,
 Tsunade,
 Sannin,
 Orochimaruis,
 Uchiha,
 Uchiha,
 Sharingan,
 Sharingan,
 Ninja,
 Ninja,
 one,
 Shinobi,
 Jutsuone,
 one,
 Genin,
 Geninso,
 Jiraiya,
 three daysI’ll,
 one week,
 Jutsu,
 Tsunade,
 Naruto,
 tomorrow,
 Fifth)

In [27]:
for entity in doc.ents:
    print(entity, entity.label_)

Dreamers NORP
Oli Oli Oli PERSON
Dreamers NORP
Oli Oli Oli PERSON
Kabuto PERSON
Orochimaru PERSON
Orochimaru PERSON
Shizune PERSON
Orochimaru PERSON
Tsunadedo PERSON
Tsunade PERSON
Tsunade PERSON
four CARDINAL
Sannin NORP
Third ORDINAL
four CARDINAL
Kakashi PERSON
the Shadow Clone Jutsu.lt’d PRODUCT
Jiraiya PERSON
Jiraiya PERSON
Tsunade PERSON
Jutsu PERSON
Jiraiya PERSON
Tsunade PERSON
the Chunin Exam EVENT
Akatsuki NORP
Sannin NORP
Third ORDINAL
Jiraiya PERSON
Naruto PERSON
Shinobi NORP
Gamatatsu PERSON
Gamakichi PERSON
first ORDINAL
Jiraiya PERSON
Jiraiyaone PERSON
Sannin NORP
one CARDINAL
Ninja Art!.Needle Jizo!.Dang PERSON
Ninja PERSON
Tsunade PERSON
Sannin NORP
Orochimaruis PERSON
Uchiha NORP
Uchiha PERSON
Sharingan NORP
Sharingan NORP
Ninja PERSON
Ninja PERSON
one CARDINAL
Shinobi NORP
Jutsuone PERSON
one CARDINAL
Genin NORP
Geninso NORP
Jiraiya PERSON
three daysI’ll DATE
one week DATE
Jutsu PERSON
Tsunade PERSON
Naruto PERSON
tomorrow DATE
Fifth ORDINAL


In [28]:
def get_ner(script, model=model):
    script_sentences = sent_tokenize(script)
    ner_output = []
    for sent in script_sentences:
        doc = model(sent)
        sent_names = set()
        for entity in doc.ents:
            if entity.label_ == "PERSON":
                first_name = entity.text.split(" ")[0]
                first_name = first_name.strip()
                sent_names.add(first_name) #a name can multiple times in the same sentence, so adding to a set reduced errors
        ner_output.append(sent_names)
    
    return ner_output
        
            
            

In [65]:
df = df.head(40)

In [66]:
df.shape

(40, 1)

In [67]:
df['ner'] = df['script'].apply(get_ner)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [68]:
df.head()

Unnamed: 0,script,ner
0,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {}, {}, {},..."
1,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {}, {}, {},..."
2,Press down hard on the gas\n That’s right ther...,"[{}, {}, {}, {}, {}, {}, {Lee}, {}, {}, {}, {}..."
3,Rock away your existence\n Shouting that you a...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
4,Rock away your existence\n Shouting that you a...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Hinata},..."


### Character Network

In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [70]:
def generate_character(data=df):
    window = 10
    entity_relationship = []
    for row in data['ner']:
        previous_entity_in_window = []
        
        for sentence in row:
            previous_entity_in_window.append(list(sentence))
            # take the most recent 10 elements
            previous_entity_in_window = previous_entity_in_window[-window:]
            
            # Flatten 2D List into 1D List
            previous_entities_flattened = sum(previous_entity_in_window, [])  
            
            for entity in sentence:
                for subject in previous_entities_flattened:
                    if entity != subject:
                        entity_relationship.append(sorted([entity, subject]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending = False)
    return relationship_df
    

In [71]:
relationship_df = generate_character()

In [74]:
relationship_df.tail()

Unnamed: 0,source,target,value
530,Intense,Lee,1
533,Intense,roger,1
534,Intense,thatBushy,1
535,Iruka,Kabuto,1
605,Jiraiya,Toad,1


In [73]:
G = nx.from_pandas_edgelist(
    relationship_df,
    source="source",
    target="target",
    edge_attr="value",
    create_using=nx.Graph()
)
net = Network(notebook=True, width='1000px', height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, "size")
net.from_nx(G)
net.show("anime.html")



anime.html
