## Named entity recognition

In [1]:
import spacy
from nltk import sent_tokenize

## Load model

In [2]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

In [3]:
nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm


## Load dataset

In [4]:
import os
import sys
import pathlib
folder_path=pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [5]:
dataset_path = "../dataset/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [6]:
df.head()

Unnamed: 0,episode,scripts
0,3,",Go Food CEO We’re getting noticed—don’t lose ..."
1,1,",Ha-ri's Mom He better not make you cry.\n ,Sh..."
2,2,",Ha-ri's Mom A CEO with manners? I’m impressed..."
3,4,",Cha Sung-hoon He trusts me to fix this.\n ,Ha..."
4,5,",Shin Ha-ri When did my life turn into a rom-c..."


In [7]:
sample_scrpit = df.iloc[0]['scripts']
sample_scrpit

",Go Food CEO We’re getting noticed—don’t lose momentum.\n ,Kang Tae-moo No one dares talk back to me. But she does.\n ,Kang Tae-moo I didn’t expect honesty to be this refreshing.\n ,Kang Tae-moo I didn’t expect honesty to be this refreshing.\n ,Ha-ri's Dad Even when you were little, you were a fighter.\n ,Jin Young-seo I just wanted to skip the date, not start a scandal.\n ,Jin Young-seo Ha-ri is way braver than I could ever be.\n ,Young-seo's Dad Your actions reflect on our family name.\n ,Go Food CEO We’re getting noticed—don’t lose momentum.\n ,Go Food CEO This better not turn into another scandal.\n ,Young-seo's Dad He’s not in your league. Think wisely.\n ,Young-seo's Dad Even your mother is concerned now.\n ,Ha-ri's Mom I want you to live without regrets.\n ,Cha Sung-hoon Young-seo… you’re unlike anyone I’ve met.\n ,Cha Sung-hoon I’ll stand by your side, no matter what.\n ,Ha-ri's Mom Do I like him? I love him already!\n ,Cha Sung-hoon The board won't like this news...\n ,Young-

In [8]:
sentences = sent_tokenize(sample_scrpit)
sentences

[',Go Food CEO We’re getting noticed—don’t lose momentum.',
 ',Kang Tae-moo No one dares talk back to me.',
 'But she does.',
 ',Kang Tae-moo I didn’t expect honesty to be this refreshing.',
 ',Kang Tae-moo I didn’t expect honesty to be this refreshing.',
 ",Ha-ri's Dad Even when you were little, you were a fighter.",
 ',Jin Young-seo I just wanted to skip the date, not start a scandal.',
 ',Jin Young-seo Ha-ri is way braver than I could ever be.',
 ",Young-seo's Dad Your actions reflect on our family name.",
 ',Go Food CEO We’re getting noticed—don’t lose momentum.',
 ',Go Food CEO This better not turn into another scandal.',
 ",Young-seo's Dad He’s not in your league.",
 'Think wisely.',
 ",Young-seo's Dad Even your mother is concerned now.",
 ",Ha-ri's Mom I want you to live without regrets.",
 ',Cha Sung-hoon Young-seo… you’re unlike anyone I’ve met.',
 ',Cha Sung-hoon I’ll stand by your side, no matter what.',
 ",Ha-ri's Mom Do I like him?",
 'I love him already!',
 ",Cha Sung-hoo

In [9]:
sentence = ".".join(sentences)
sentence

",Go Food CEO We’re getting noticed—don’t lose momentum..,Kang Tae-moo No one dares talk back to me..But she does..,Kang Tae-moo I didn’t expect honesty to be this refreshing..,Kang Tae-moo I didn’t expect honesty to be this refreshing..,Ha-ri's Dad Even when you were little, you were a fighter..,Jin Young-seo I just wanted to skip the date, not start a scandal..,Jin Young-seo Ha-ri is way braver than I could ever be..,Young-seo's Dad Your actions reflect on our family name..,Go Food CEO We’re getting noticed—don’t lose momentum..,Go Food CEO This better not turn into another scandal..,Young-seo's Dad He’s not in your league..Think wisely..,Young-seo's Dad Even your mother is concerned now..,Ha-ri's Mom I want you to live without regrets..,Cha Sung-hoon Young-seo… you’re unlike anyone I’ve met..,Cha Sung-hoon I’ll stand by your side, no matter what..,Ha-ri's Mom Do I like him?.I love him already!.,Cha Sung-hoon The board won't like this news...\n ,Young-seo's Dad He’s not in your leagu

## Run model

In [10]:
doc = nlp_model(sentence)


In [11]:
doc.ents

(Tae-moo,
 Tae-moo,
 Tae-moo,
 -ri's,
 ,Jin Young-seo,
 Young-seo Ha-ri,
 seo,
 seo,
 -ri's,
 Sung-hoon,
 Young-seo,
 Sung-hoon,
 -ri's,
 Sung-hoon,
 Young-seo's,
 -ri's,
 Young-seo,
 -seo's,
 Ha-ri,
 Young-seo,
 -ri's,
 Tae-moo,
 -ri's,
 -ri's,
 seo,
 Sung-hoon,
 Jin Young-seo Ha-ri,
 -ri's,
 -ri's,
 -seo's,
 Tae-moo,
 Tae-moo,
 Ha-ri,
 -seo's,
 -ri's,
 ,Cha Sung-hoon,
 -ri's,
 Ha-ri,
 ,Kang Tae-moo,
 -ri's,
 ,Cha Sung-hoon,
 -ri's,
 -ri's,
 -seo's,
 Ha-ri,
 -seo's,
 this quarter,
 ,Cha Sung-hoon,
 Ha-ri,
 -ri's,
 ,Cha Sung-hoon,
 -ri's,
 -ri's,
 -ri's,
 Young-seo,
 -ri's,
 -ri's,
 -ri's,
 -ri's,
 -ri's,
 Young-seo,
 Ha-,
 Ha-,
 Kang Tae-moo,
 ,Cha Sung-hoon,
 ,Cha Sung-hoon,
 -ri's,
 Tae-moo,
 Shin Ha-ri?.,Shin,
 -ri's,
 -ri's,
 -ri's,
 Ha-ri,
 -ri's,
 -seo's,
 -ri's,
 Ha-,
 Sung-hoon,
 ,Jin Young-seo,
 -seo's,
 seo's,
 ,Cha Sung-hoon,
 ,Kang Tae-moo,
 seo's,
 -ri's,
 Ha-ri,
 ,Jin Young-seo,
 Ha-ri,
 -ri's,
 Sung-hoon,
 this quarter,
 -ri's,
 -seo's,
 seo,
 this quarter,
 -ri's,
 ,Ka

In [12]:
for entity in doc.ents:
    print(entity, entity.label_)

Tae-moo PERSON
Tae-moo PERSON
Tae-moo PERSON
-ri's PERSON
,Jin Young-seo PERSON
Young-seo Ha-ri PERSON
seo PERSON
seo PERSON
-ri's PERSON
Sung-hoon PERSON
Young-seo PERSON
Sung-hoon PERSON
-ri's PERSON
Sung-hoon PERSON
Young-seo's PERSON
-ri's PERSON
Young-seo PERSON
-seo's PERSON
Ha-ri PERSON
Young-seo PERSON
-ri's PERSON
Tae-moo PERSON
-ri's PERSON
-ri's PERSON
seo PERSON
Sung-hoon PERSON
Jin Young-seo Ha-ri PERSON
-ri's PERSON
-ri's PERSON
-seo's PERSON
Tae-moo PERSON
Tae-moo PERSON
Ha-ri PERSON
-seo's PERSON
-ri's PERSON
,Cha Sung-hoon PERSON
-ri's PERSON
Ha-ri PERSON
,Kang Tae-moo PERSON
-ri's PERSON
,Cha Sung-hoon PERSON
-ri's PERSON
-ri's PERSON
-seo's PERSON
Ha-ri PERSON
-seo's PERSON
this quarter DATE
,Cha Sung-hoon PERSON
Ha-ri PERSON
-ri's PERSON
,Cha Sung-hoon PERSON
-ri's PERSON
-ri's PERSON
-ri's PERSON
Young-seo PERSON
-ri's PERSON
-ri's PERSON
-ri's PERSON
-ri's PERSON
-ri's PERSON
Young-seo PERSON
Ha- PERSON
Ha- PERSON
Kang Tae-moo PERSON
,Cha Sung-hoon PERSON
,Cha Sun

In [13]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)

    ner_output = []

    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == "PERSON":
                full_name = entity.text
                first_name = entity.text.split(" ")[0]
                first_name = first_name.strip()
                ners.add(first_name)
            ner_output.append(ners)

    return ner_output

In [14]:
df['ners'] = df['scripts'].apply(get_ners_inference)

In [15]:
df

Unnamed: 0,episode,scripts,ners
0,3,",Go Food CEO We’re getting noticed—don’t lose ...","[{Kang}, {Kang}, {Kang}, {Ha-ri's}, {Jin}, {Ji..."
1,1,",Ha-ri's Mom He better not make you cry.\n ,Sh...","[{Shin}, {Ha-ri's}, {Young-seo's}, {Young-seo'..."
2,2,",Ha-ri's Mom A CEO with manners? I’m impressed...","[{,Shin}, {Ha-ri's}, {,Jin}, {Young-seo's}, {H..."
3,4,",Cha Sung-hoon He trusts me to fix this.\n ,Ha...","[{Cha}, {Ha-ri's}, {Cha}, {Ha-ri's}, {Young-se..."
4,5,",Shin Ha-ri When did my life turn into a rom-c...","[{,Shin}, {Young-seo's}, {Ha-ri's}, {Shin}, {K..."
5,6,",Go Food CEO Bring me results, not excuses.\n ...","[{}, {,Shin}, {Cha}, {}, {Ha-ri}, {}, {Ha-ri's..."
6,7,",Jin Young-seo Let’s get drinks and forget thi...","[{Jin}, {Ha-ri's}, {Ha-ri's}, {Young-seo's}, {..."
7,8,",Ha-ri's Mom Stop sneaking out—I know everythi...","[{ri}, {Shin}, {Cha}, {Jin}, {Ha-ri's}, {Kang,..."
8,9,",Go Food CEO Bring me results, not excuses.\n ...","[{}, {Kang}, {Shin}, {Young-seo's}, {Ha-ri's},..."
9,10,",Cha Sung-hoon Sometimes I wonder what life wo...","[{,Cha}, {Jin}, {Shin}, {Cha}, {Shin}, {Ha-ri'..."


## Character network

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx 
from pyvis.network import Network

In [17]:
def generate_character_network(df):
    windows = 10
    entity_relationship = []

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-windows:]

            #Flatten 2D list into 1D list
            previous_entities_flattened = sum(previous_entities_in_window, [])

            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    entity_relationship.append(sorted([entity, entity_in_window]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df

In [18]:
relationship_df = generate_character_network(df)
relationship_df

Unnamed: 0,source,target,value
57,Ha-ri's,Ha-ri's,2338
61,Ha-ri's,Young-seo's,1984
44,Cha,Ha-ri's,1970
60,Ha-ri's,Shin,1954
59,Ha-ri's,Kang,1715
...,...,...,...
5,",Cha",Ha-ri,46
2,",Cha",",Kang",40
1,",Cha",",Jin",34
11,",Cha",ri,33


In [19]:
relationship_df = relationship_df.sort_values('value', ascending=False)

In [20]:
G = nx.from_pandas_edgelist(
    relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, height="700px", width="100%", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("business_proposal.html")

business_proposal.html
