# Named entity recognition

In [4]:
import spacy
from nltk import sent_tokenize

In [None]:
!python -m spacy download en_core_web_trf

# Load Model

In [6]:
def load_model():
    """Load and return the spaCy ``en_core_web_trf`` pipeline."""
    return spacy.load("en_core_web_trf")

In [7]:
nlp_model = load_model()

# Load Dataset

In [8]:
import os
import sys
import pathlib
# Path() is the current working directory; its ``.parent`` is still ".",
# so resolve the directory itself and take the parent of the absolute path.
folder_path = pathlib.Path().resolve()
# Put the parent directory on the import path so modules that live one
# level above the working directory can be imported.
sys.path.append(str(folder_path.parent))
from utils_ import load_subtitles_dataset


In [9]:
dataset_path = "../data/subtitles/"
df = load_subtitles_dataset(dataset_path)

In [None]:
df.head()

In [None]:
sample_script = df.iloc[0]['script']
sample_script

In [23]:
sentences = sent_tokenize(sample_script)

In [24]:
sentences = sentences[60:90]

In [25]:
# sent_tokenize keeps each sentence's own closing punctuation, so rejoin with
# a space; joining with "." yields "..", "!." and no whitespace between sentences.
sentence = " ".join(sentences)

In [None]:
sentence

# Run Model

In [27]:
doc = nlp_model(sentence)

In [None]:
doc.ents

In [None]:
for entity in doc.ents:
    print(entity, entity.label_)

In [31]:
def get_ners_inference(script, nlp=None, sentence_splitter=None):
    """Collect the first names of PERSON entities, sentence by sentence.

    Args:
        script: Text to analyse.
        nlp: Callable that maps a sentence to an object exposing ``ents``,
            where every entity has ``label_`` and ``text``. Defaults to the
            notebook-level ``nlp_model``.
        sentence_splitter: Callable that splits ``script`` into sentences.
            Defaults to ``sent_tokenize``.

    Returns:
        A list with one set per sentence. Each set holds the first
        whitespace-separated token of every entity labelled ``PERSON``.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # (e.g. ``df['script'].apply(get_ners_inference)``) keep working.
    if nlp is None:
        nlp = nlp_model
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    ner_output = []

    for sentence in sentence_splitter(script):
        doc = nlp(sentence)
        first_names = set()
        for entity in doc.ents:
            if entity.label_ != "PERSON":
                continue
            # split() without arguments ignores leading whitespace and also
            # breaks on newlines/tabs, unlike split(" ").
            name_parts = entity.text.split()
            if name_parts:
                first_names.add(name_parts[0])
        ner_output.append(first_names)

    return ner_output

In [32]:
# Take an independent copy of the first 10 rows: a new column is assigned to
# this frame later, and writing to a bare head() slice can raise
# SettingWithCopyWarning.
df = df.head(10).copy()

In [None]:
df

In [34]:
df['ners'] = df['script'].apply(get_ners_inference)

In [None]:
df

# Character Network

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network


In [58]:
def generate_character_network(df, window_size=10):
    """Count how often pairs of entities appear close to each other.

    Args:
        df: DataFrame with a ``'ners'`` column. Each cell is a sequence of
            per-sentence entity collections (as produced by
            ``get_ners_inference``).
        window_size: Number of most recent sentences (including the current
            one) that an entity is paired against. Must be at least 1.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``value``. Each row
        is an alphabetically ordered pair of distinct entities and ``value``
        is how many times the pair was recorded. Rows are sorted by ``value``
        in descending order.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    pairs = []

    for script_sentences in df['ners']:
        # The window restarts for every row so entities from different
        # scripts are never paired with each other.
        recent_sentences = []

        for sentence_entities in script_sentences:
            recent_sentences.append(list(sentence_entities))
            recent_sentences = recent_sentences[-window_size:]

            # Flatten the window in linear time (sum(list_of_lists, []) is quadratic).
            nearby_entities = [
                name for names in recent_sentences for name in names
            ]

            for entity in sentence_entities:
                for other in nearby_entities:
                    if entity != other:
                        # Sort so (A, B) and (B, A) count as the same pair.
                        pairs.append(tuple(sorted((entity, other))))

    if not pairs:
        # No co-occurrences: return an empty frame with the usual columns.
        return pd.DataFrame(columns=['source', 'target', 'value'])

    relationship_df = (
        pd.DataFrame(pairs, columns=['source', 'target'])
        .groupby(['source', 'target'])
        .size()
        .reset_index(name='value')
    )
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df

In [59]:
relationship_df = generate_character_network(df)

In [60]:
relationship_df

Unnamed: 0,source,target,value
125,Naruto,Sasuke,117
152,Sakura,Sasuke,65
67,Iruka,Naruto,43
124,Naruto,Sakura,41
118,Mizuki,Naruto,28
...,...,...,...
98,Kakashi,Sharingan,1
91,Jonin,Zabuza,1
87,Jonin,Manji,1
75,Jerk,Sakura,1


In [61]:
relationship_df = relationship_df.sort_values('value', ascending=False)
relationship_df = relationship_df.head(200)

In [62]:
# Build an undirected graph; each edge keeps its pair count in 'value'.
G = nx.from_pandas_edgelist(
    relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph(),
)

# Store every node's degree under the 'size' attribute before handing the
# graph over to pyvis.
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')

net = Network(
    notebook=True,
    width="1000px",
    height="700px",
    bgcolor='#222222',
    font_color='white',
    cdn_resources="remote",
)
net.from_nx(G)
net.show("naruto.html")


naruto.html
