In [124]:
import pandas as pd
import re
from collections import defaultdict
from gensim.models.phrases import  Phrases, Phraser


In [125]:
# Read the Simpsons dialogue CSV into a DataFrame and preview its first five rows.
simpsons_data = pd.read_csv(
    "~/Downloads/simpsons_dataset.csv",
    sep=",",
)
simpsons_data.head(5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [126]:
# Count the missing values in each column.
simpsons_data.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [127]:
# Drop every row with a missing value and renumber the index from zero.
new_simpsons = (
    simpsons_data
    .dropna()
    .reset_index(drop=True)
)

In [128]:
# Keep only letters and apostrophes: every other run of characters becomes a
# single space, and the result is lower-cased.
# Built as a list rather than a generator expression: a generator is exhausted
# by its first consumer, so re-running the cell that wraps `clean_data` in a
# DataFrame would silently produce an empty frame.
non_letter_run = re.compile("[^A-Za-z']+")
clean_data = [
    non_letter_run.sub(' ', str(row)).lower()
    for row in new_simpsons['spoken_words']
]

In [129]:
# Put the cleaned lines in a single-column frame, then drop missing values and
# duplicate lines in one chain; the cell displays the resulting shape.
clean_simp = (
    pd.DataFrame({'clean': clean_data})
    .dropna()
    .drop_duplicates()
)
clean_simp.shape

(119516, 1)

Bigrams: we use the gensim `Phrases` package to automatically detect common phrases (bigrams) in a list of sentences. The bigram model lets us capture multi-word tokens such as "mr_burns" or "bart_simpson".

In [130]:
# Split every cleaned line on whitespace, giving one token list per line.
sentence = list(map(str.split, clean_simp['clean']))

Creates the relevant phrases from the list of sentences:

In [131]:
# Fit the phrase (bigram) detector on the tokenised sentences.
phrases = Phrases(
    sentence,
    min_count=30,
    progress_per=10000,
)

In [132]:
# Wrap the fitted Phrases model in a Phraser; `bigram` is the object that is
# later applied to the tokenised sentences.
bigram = Phraser(phrases)

Applying the bigram model to the tokenized sentences

In [133]:
# Run the tokenised sentences through the bigram model; `sentences_` is the
# corpus used for the frequency count and for Word2Vec training.
# NOTE(review): presumably detected pairs come back joined as single tokens
# (e.g. "mr_burns") - confirm against the gensim Phraser docs.
sentences_ = bigram[sentence]

In [134]:
# Count how often each token occurs across the bigram-transformed corpus; the
# cell displays the number of distinct tokens.
# The loop variables are deliberately NOT named `sentence`: that name holds the
# full list of tokenised sentences, and rebinding it here would make a re-run
# of the cells that read it operate on a single sentence's tokens instead.
word_frequency = defaultdict(int)
for tokens in sentences_:
    for token in tokens:
        word_frequency[token] += 1
len(word_frequency)

40991

Getting the most frequent words from the documents

In [135]:
# Rank (word, count) pairs by descending count and show the ten leading words.
ranked_pairs = sorted(word_frequency.items(), key=lambda pair: pair[1], reverse=True)
[word for word, _count in ranked_pairs[:10]]

['the', 'you', 'i', 'a', 'to', 'and', 'of', 'it', 'my', 'that']

Training the model using gensim's Word2Vec implementation

In [136]:
import multiprocessing
from gensim.models import Word2Vec

In [137]:
# Number of CPU cores reported for this machine; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
cores

4

In [138]:
# Configure (but do not yet train) the Word2Vec model. Ranges in parentheses are
# the typical tuning ranges noted by the original author.
w2v_model = Word2Vec(
    min_count=20,  # ignore all words with total absolute frequency lower than this - (2, 100)
    window=6,
    # Dimensionality of the feature vectors - (50, 300).
    # NOTE(review): gensim 4.0 renamed this keyword to `vector_size`; `size` is
    # kept because this notebook appears to run on gensim 3.x - confirm.
    size=300,
    sample=6e-5,  # threshold for randomly downsampling higher-frequency words; highly influential - (0, 1e-5)
    alpha=0.03,  # initial learning rate - (0.01, 0.05)
    min_alpha=0.0007,  # learning rate drops linearly to this value as training progresses; to set it: alpha - (min_alpha * epochs) ~ 0.00
    negative=20,  # if > 0, negative sampling is used and this is how many "noise words" are drawn; if 0, no negative sampling is used - (5, 20)
    workers=max(1, cores - 1),  # leave one core free, but never request zero workers on a single-core machine
)

In [139]:
# Build the model vocabulary from the bigram-transformed corpus.
w2v_model.build_vocab(
    sentences_,
    progress_per=10000,
)

In [140]:
# Train for 30 epochs over the corpus; the example count comes from the
# vocabulary scan recorded on the model.
w2v_model.train(
    sentences_,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

(12599243, 37627050)

(12599243, 37627050)

In [141]:
# Precompute normalised vectors for the similarity queries that follow.
# NOTE(review): per the gensim 3.x docs, replace=True keeps only the normalised
# vectors, so the model cannot be trained further afterwards - confirm for the
# installed version (the method is deprecated in gensim 4).
w2v_model.init_sims(replace=True)

In [142]:
# List the words whose vectors are closest to 'homer', with their similarity scores.
w2v_model.wv.most_similar(positive=['homer'])

[('marge', 0.5678250193595886),
 ('dad', 0.4641968011856079),
 ('homie', 0.4537961483001709),
 ('becky', 0.40316736698150635),
 ('bart', 0.39305901527404785),
 ('gee', 0.39249613881111145),
 ('abe', 0.38196370005607605),
 ('mom', 0.378150999546051),
 ('son', 0.3679531216621399),
 ('honey', 0.36209654808044434)]

[('marge', 0.567729115486145),
 ('dad', 0.4640953242778778),
 ('homie', 0.45330923795700073),
 ('becky', 0.4027969241142273),
 ('bart', 0.3929387331008911),
 ('gee', 0.392608106136322),
 ('abe', 0.38157254457473755),
 ('mom', 0.37801483273506165),
 ('son', 0.36788779497146606),
 ('honey', 0.3620525002479553)]

Checking the similarity between words

In [143]:
# Similarity between the vectors for "mom" and "dad".
# Queried through the model's KeyedVectors (`.wv`): calling `similarity` on the
# model object itself is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.similarity("mom", "dad")

  w2v_model.similarity("mom", "dad")


0.7336106524568102

  w2v_model.similarity("mom", "dad")


0.7337602817219319

Finding the word that does not belong in a list of words

In [144]:
# Pick the word that fits least well with the others in the list.
# Queried through the model's KeyedVectors (`.wv`): calling `doesnt_match` on
# the model object itself is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.doesnt_match(['good','smooth','cute'])

  w2v_model.doesnt_match(['good','smooth','cute'])
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'smooth'

  w2v_model.doesnt_match(['good','smooth','cute'])
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'smooth'