In [1]:
import re  
import pandas as pd  
from time import time  
from collections import defaultdict 

import spacy

In [2]:
# Load the transcript table from a path relative to the notebook's working
# directory and preview the first rows. Only the 'Line' column is used by the
# text-cleaning cells further down.
df = pd.read_csv('datasets/data3.csv')
df.head()

Unnamed: 0,Title,Character,Line
0,Cartman Gets an Anal Probe,Scene Description,At the bus stop.
1,Cartman Gets an Anal Probe,The Boys,"School days, school days, teacher's golden ru..."
2,Cartman Gets an Anal Probe,Kyle Broflovski,"Ah, damn it! My little brother's trying to fol..."
3,Cartman Gets an Anal Probe,Ike Broflovski,Eat banana.
4,Cartman Gets an Anal Probe,Kyle,"Ike, you can't come to school with me. [Ike Ch..."


In [3]:
# Count missing values per column to see how many rows lack usable text.
df.isnull().sum()

Title         0
Character     0
Line         12
dtype: int64

In [5]:
# Load the small English pipeline with both the NER and the dependency-parser
# components disabled for speed; cleaning() only reads token.lemma_ and
# token.is_stop.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a document and strip its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        A spaCy Doc in this notebook; every token must expose `lemma_`
        and `is_stop`.

    Returns
    -------
    str or None
        The lemmas of all non-stop-word tokens joined by single spaces, or
        None when two or fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its neighbouring words, so a
    # sentence of only one or two tokens adds almost nothing to training.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Coarse normalisation before spaCy: every run of characters other than ASCII
# letters and apostrophes becomes a single space, and the text is lower-cased.
# Rows whose 'Line' is missing are skipped explicitly; str(NaN) would otherwise
# feed the literal text "nan" through the pipeline, where it is only discarded
# by accident because of the length filter in cleaning().
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    for row in df['Line'].dropna()
)

# Stream the normalised lines through spaCy in batches and lemmatize each Doc.
# cleaning() returns None for lines that are too short; those entries are
# removed later with dropna().
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

In [None]:
# Collect the cleaned lines in a one-column frame, then discard the None
# entries produced for too-short lines and any repeated lines.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

In [None]:
# Tokenise every cleaned line on whitespace: `sent` is a list of token lists.
sent = []
for cleaned_line in df_clean['clean']:
    sent.append(cleaned_line.split())

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Collect phrase (bigram) statistics over the tokenised corpus.
# min_count=30 is the frequency cut-off passed to gensim; progress_per=10000
# presumably controls how often progress is logged — confirm in the gensim docs.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [None]:
# Wrap the collected phrase statistics in a Phraser, the object that is applied
# to the corpus in the next step.
bigram = Phraser(phrases)

In [None]:
# Apply the phrase model to the whole tokenised corpus; `sentences` is what the
# frequency count and the Word2Vec model consume from here on.
sentences = bigram[sent]

In [None]:
# Count how often each token occurs across the phrase-transformed corpus; the
# final expression shows the number of distinct tokens.
# The loop variables are deliberately NOT called `sent`: that name holds the
# tokenised corpus list, and rebinding it here would make a later re-run of the
# Phrases(sent, ...) or bigram[sent] cells operate on a single sentence.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

In [None]:
import multiprocessing

from gensim.models import Word2Vec


# Number of CPUs reported by the system; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [None]:
# Configure the Word2Vec model; no data is seen yet. The vocabulary is built
# and the model trained in the following cells.
# NOTE(review): `size` is the gensim 3.x name for the embedding dimension;
# gensim >= 4.0 calls it `vector_size` — confirm against the installed version.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one CPU free, but never request zero workers on a
                     # single-CPU machine.
                     workers=max(1, cores - 1))

In [None]:
# Build the model's vocabulary from the phrase-transformed corpus; this must
# happen before train() because train() reads w2v_model.corpus_count.
w2v_model.build_vocab(sentences, progress_per=10000)

In [None]:
# Train for 30 passes over the corpus. total_examples is taken from the count
# recorded on the model (corpus_count), so it matches the corpus that was used
# for build_vocab().
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# Inspect the 100 nearest neighbours of "cartman" in the trained vector space.
# The query is lower-case because the corpus was lower-cased during cleaning.
w2v_model.wv.most_similar("cartman", topn=100)

In [None]:
# Inspect the 100 nearest neighbours of "elect".
# NOTE(review): presumably raises KeyError if "elect" did not reach the
# min_count=20 vocabulary threshold — confirm on the trained model.
w2v_model.wv.most_similar("elect", topn=100)

In [None]:
def find_closest(w1, w2, w2v, topn=100):
    """Return the word that is jointly closest to both `w1` and `w2`.

    The `topn` nearest neighbours of each word are fetched with
    `w2v.wv.most_similar`. Every word present in both neighbour lists is
    scored by the Euclidean norm of its two similarity values, and the
    highest-scoring word is returned. Ties keep the word that ranks higher
    among `w1`'s neighbours.

    Parameters
    ----------
    w1, w2 : str
        Query words; both must be known to `w2v.wv`.
    w2v : object
        Trained model exposing `wv.most_similar(word, topn=...)`, which
        returns a list of `(word, similarity)` pairs.
    topn : int, optional
        Number of neighbours to fetch for each query word (default 100).

    Returns
    -------
    tuple or None
        `(word, combined_score)` for the best shared neighbour, or None when
        the two neighbour lists have no word in common.
    """
    neighbours_1 = w2v.wv.most_similar(w1, topn=topn)
    # Index the second list by word so each candidate is a constant-time
    # lookup instead of a nested scan.
    neighbours_2 = dict(w2v.wv.most_similar(w2, topn=topn))

    best = None
    for word, sim_1 in neighbours_1:
        sim_2 = neighbours_2.get(word)
        if sim_2 is None:
            continue
        score = (sim_1 ** 2 + sim_2 ** 2) ** 0.5
        # Strict '>' keeps the earlier (higher-ranked for w1) word on ties.
        if best is None or score > best[1]:
            best = (word, score)
    return best

In [None]:
# Example query: a word that appears among the nearest neighbours of both
# "president" and "obama". Prints None if the two neighbour lists do not overlap.
print(find_closest("president", "obama", w2v_model))