In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Set up logging so gensim's progress messages are visible
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

## Preprocessing

In [2]:
# Load the dialogue dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('simpsons_dataset.csv')
# Display (rows, columns) as a quick sanity check.
df.shape

(158314, 2)

In [3]:
# Preview the first rows: one speaker column and one spoken-line column.
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
# Count missing values per column.
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [5]:
# Drop every row that has a missing value in any column, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)
# Re-check: both columns should now report zero missing values.
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

## Cleaning

In [15]:
# Load the English pipeline with BOTH named-entity recognition and the parser disabled for speed;
# the cleaning step below only reads each token's lemma and stop-word flag.
# NOTE(review): 'en' is a shortcut model name — confirm it resolves in the target spaCy install.
nlp = spacy.load('en', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a document and strip its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Expected to be a spaCy ``Doc``; each token must expose ``is_stop``
        and ``lemma_``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas survive. Such short sentences give Word2Vec almost no
        context to learn from, so they are discarded.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to provide useful context windows for training.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
    
# Lazily normalise each spoken line: every run of characters other than ASCII letters
# and apostrophes becomes a single space, and the result is lowercased.
# NOTE: this is a one-shot generator. Once a cell has iterated over it, it is exhausted,
# so re-run this cell before re-running any cell that consumes it.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [16]:
t = time()

# Stream the pre-cleaned lines through spaCy in batches of 5000 and lemmatise each Doc.
# Lines that `cleaning` rejects come back as None and are dropped in a later cell.
# NOTE(review): `n_threads` may be ignored or rejected by newer spaCy releases — confirm against the installed version.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.53 mins


In [17]:
# Put the cleaned lines in a one-column frame, then discard the entries that
# `cleaning` rejected (None) as well as exact duplicate lines.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

(85960, 1)

## Bigrams

In [19]:
from gensim.models.phrases import Phrases, Phraser

INFO - 18:46:31: 'pattern' package not found; tag filters are not available for English


In [20]:
# Tokenise each cleaned line on whitespace, giving a list of token lists for Phrases.
sent = [row.split() for row in df_clean['clean']]

In [21]:
# Collect word and bigram counts over the corpus; progress is logged every 10000 sentences.
# min_count=30 is the minimum count threshold passed to Phrases for a word/bigram to be considered.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 18:47:10: collecting all words and their counts
INFO - 18:47:10: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:47:10: PROGRESS: at sentence #10000, processed 63561 words and 52716 word types
INFO - 18:47:10: PROGRESS: at sentence #20000, processed 130949 words and 99637 word types
INFO - 18:47:10: PROGRESS: at sentence #30000, processed 192972 words and 138212 word types
INFO - 18:47:10: PROGRESS: at sentence #40000, processed 249845 words and 172230 word types
INFO - 18:47:10: PROGRESS: at sentence #50000, processed 311277 words and 208051 word types
INFO - 18:47:10: PROGRESS: at sentence #60000, processed 373597 words and 243068 word types
INFO - 18:47:11: PROGRESS: at sentence #70000, processed 436446 words and 278001 word types
INFO - 18:47:11: PROGRESS: at sentence #80000, processed 497916 words and 311099 word types
INFO - 18:47:11: collected 329869 word types from a corpus of 537147 words (unigram + bigrams) and 85960 sentences
INFO - 18:47:11: us

In [23]:
# Build a Phraser from the trained Phrases model; it is the object used to transform the corpus.
bigram = Phraser(phrases)

INFO - 18:49:32: source_vocab length 329869
INFO - 18:49:37: Phraser built with 126 phrasegrams


In [24]:
# Apply the bigram model to the whole corpus; detected pairs appear as single
# underscore-joined tokens (e.g. 'homer_simpson', queried further down).
sentences = bigram[sent]

### Most frequent words

In [25]:
# Count how often each token (single word or detected bigram) occurs in the corpus.
word_freq = defaultdict(int)
# The loop variable is deliberately not named `sent`: that name holds the tokenised
# corpus used by `bigram[sent]`, and rebinding it here would leave it pointing at a
# single sentence, silently breaking that cell if it were re-run.
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
# Display the number of distinct tokens.
len(word_freq)

29643

In [26]:
# The ten most frequent tokens, in descending order of count.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

# Training the model

In [29]:
import multiprocessing

from gensim.models import Word2Vec

# Number of CPUs reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

print('cores: ', cores)

cores:  8


In [30]:
# Configure the model only: no corpus is passed here, so the vocabulary is built
# and the training is run in the cells that follow.
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context window size
                     size=300,           # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for very frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate that alpha decays to during training
                     negative=20,        # number of negative-sampling noise words
                     workers=cores-1)    # one fewer worker thread than reported CPU cores

## Building the vocabulary table

In [31]:
t = time()

# Scan the bigram-transformed corpus once to build the vocabulary;
# progress is logged every 10000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 18:51:40: collecting all words and their counts
INFO - 18:51:40: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:51:40: PROGRESS: at sentence #10000, processed 61706 words, keeping 9491 word types
INFO - 18:51:40: PROGRESS: at sentence #20000, processed 127342 words, keeping 14373 word types
INFO - 18:51:40: PROGRESS: at sentence #30000, processed 187807 words, keeping 17431 word types
INFO - 18:51:41: PROGRESS: at sentence #40000, processed 243316 words, keeping 20124 word types
INFO - 18:51:41: PROGRESS: at sentence #50000, processed 303167 words, keeping 22558 word types
INFO - 18:51:41: PROGRESS: at sentence #60000, processed 363915 words, keeping 24804 word types
INFO - 18:51:42: PROGRESS: at sentence #70000, processed 425375 words, keeping 26960 word types
INFO - 18:51:42: PROGRESS: at sentence #80000, processed 485514 words, keeping 28777 word types
INFO - 18:51:42: collected 29643 word types from a corpus of 523645 raw words and 85960 sentence

Time to build vocab: 0.05 mins


## Training of the model

In [32]:
t = time()

# Train for 30 passes over the corpus. total_examples reuses the sentence count
# recorded by build_vocab(); report_delay=1 logs progress roughly once per second.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 18:52:42: training model with 7 workers on 3315 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 18:52:43: EPOCH 1 - PROGRESS: at 33.38% examples, 67139 words/s, in_qsize 0, out_qsize 0
INFO - 18:52:44: EPOCH 1 - PROGRESS: at 65.22% examples, 62331 words/s, in_qsize 0, out_qsize 0
INFO - 18:52:45: worker thread finished; awaiting finish of 6 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 5 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 4 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 3 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 2 more threads
INFO - 18:52:45: EPOCH 1 - PROGRESS: at 98.13% examples, 63150 words/s, in_qsize 1, out_qsize 1
INFO - 18:52:45: worker thread finished; awaiting finish of 1 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 0 more threads
INFO - 18:52:45: EPOCH - 1 : training on 523645 raw w

INFO - 18:53:12: worker thread finished; awaiting finish of 6 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 5 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 4 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 3 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 2 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 1 more threads
INFO - 18:53:12: worker thread finished; awaiting finish of 0 more threads
INFO - 18:53:12: EPOCH - 10 : training on 523645 raw words (199127 effective words) took 3.5s, 56683 effective words/s
INFO - 18:53:14: EPOCH 11 - PROGRESS: at 29.43% examples, 57561 words/s, in_qsize 0, out_qsize 0
INFO - 18:53:15: EPOCH 11 - PROGRESS: at 59.45% examples, 57035 words/s, in_qsize 0, out_qsize 0
INFO - 18:53:16: EPOCH 11 - PROGRESS: at 84.22% examples, 54267 words/s, in_qsize 1, out_qsize 0
INFO - 18:53:16: worker thread finished; awaiting finish of 6 mor

INFO - 18:53:41: worker thread finished; awaiting finish of 2 more threads
INFO - 18:53:41: worker thread finished; awaiting finish of 1 more threads
INFO - 18:53:41: worker thread finished; awaiting finish of 0 more threads
INFO - 18:53:41: EPOCH - 19 : training on 523645 raw words (199656 effective words) took 2.9s, 68992 effective words/s
INFO - 18:53:42: EPOCH 20 - PROGRESS: at 33.38% examples, 67309 words/s, in_qsize 0, out_qsize 0
INFO - 18:53:43: EPOCH 20 - PROGRESS: at 70.97% examples, 65322 words/s, in_qsize 0, out_qsize 0
INFO - 18:53:44: worker thread finished; awaiting finish of 6 more threads
INFO - 18:53:44: worker thread finished; awaiting finish of 5 more threads
INFO - 18:53:44: worker thread finished; awaiting finish of 4 more threads
INFO - 18:53:44: worker thread finished; awaiting finish of 3 more threads
INFO - 18:53:44: worker thread finished; awaiting finish of 2 more threads
INFO - 18:53:44: worker thread finished; awaiting finish of 1 more threads
INFO - 18:53

INFO - 18:54:10: worker thread finished; awaiting finish of 6 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 5 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 4 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 3 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 2 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 1 more threads
INFO - 18:54:10: worker thread finished; awaiting finish of 0 more threads
INFO - 18:54:10: EPOCH - 29 : training on 523645 raw words (199377 effective words) took 2.9s, 68580 effective words/s
INFO - 18:54:11: EPOCH 30 - PROGRESS: at 35.35% examples, 69296 words/s, in_qsize 0, out_qsize 0
INFO - 18:54:12: EPOCH 30 - PROGRESS: at 70.97% examples, 68248 words/s, in_qsize 0, out_qsize 0
INFO - 18:54:13: worker thread finished; awaiting finish of 6 more threads
INFO - 18:54:13: worker thread finished; awaiting finish of 5 more threads
INFO - 18:54

Time to train the model: 1.52 mins


In [33]:
# Precompute L2-normalised word vectors for the similarity queries below.
# NOTE(review): replace=True presumably overwrites the raw vectors with the normalised
# ones, so the model should not be trained further afterwards — confirm in the gensim docs.
w2v_model.init_sims(replace=True)

INFO - 18:56:07: precomputing L2-norms of word weight vectors


## Exploring the model

In [34]:
# Ten words most similar to "homer"
w2v_model.wv.most_similar(positive=["homer"])

[('depressed', 0.8020632863044739),
 ('marge', 0.7842615246772766),
 ('snuggle', 0.770393967628479),
 ('sweetheart', 0.7703680992126465),
 ('terrific', 0.7653756141662598),
 ('crummy', 0.7538477182388306),
 ('feel_well', 0.7537402510643005),
 ('creepy', 0.7515972256660461),
 ('good_friend', 0.7505632042884827),
 ('nervous', 0.7461447715759277)]

In [35]:
# Same query for the bigram token "homer_simpson" (full name rather than first name).
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('select', 0.790610671043396),
 ('council', 0.7715891599655151),
 ('congratulation', 0.7705077528953552),
 ('robert', 0.7687914967536926),
 ('aboard', 0.7673720121383667),
 ('pleased', 0.7658112049102783),
 ('united_state', 0.7519148588180542),
 ('recent', 0.7456115484237671),
 ('threat', 0.7453498840332031),
 ('elect', 0.7438726425170898)]

In [36]:
# Words most similar to "marge".
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.7842615246772766),
 ('sorry', 0.7802915573120117),
 ('nervous', 0.7766187191009521),
 ('depressed', 0.7650347948074341),
 ('snuggle', 0.7640378475189209),
 ('surprised', 0.7563819289207458),
 ('rude', 0.7559762597084045),
 ('sweetheart', 0.7519235014915466),
 ('feel_well', 0.7512655258178711),
 ('hopeless', 0.7505140900611877)]

In [37]:
# Words most similar to "bart".
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8627593517303467),
 ('hearing', 0.8017841577529907),
 ('strangle', 0.7903481721878052),
 ('substitute', 0.7894383668899536),
 ('convince', 0.7877330780029297),
 ('badly', 0.7814056277275085),
 ('jealous', 0.7771100401878357),
 ('mom', 0.7760716080665588),
 ('muntz', 0.7737230658531189),
 ('mom_dad', 0.7711635828018188)]

In [38]:
# Words most similar to "willie".
w2v_model.wv.most_similar(positive=["willie"])

[('groundskeeper', 0.9361370205879211),
 ('aye', 0.603545606136322),
 ('puke', 0.6017469763755798),
 ("'_tis", 0.5868011713027954),
 ('seymour', 0.5557730197906494),
 ('mess', 0.5498030185699463),
 ('oi', 0.548999011516571),
 ('arrr', 0.542657732963562),
 ('grass', 0.5248633027076721),
 ('field', 0.5233394503593445)]

## Similarities

In [41]:
# Pairwise similarity score between two vocabulary words.
w2v_model.wv.similarity("moe", 'tavern')

0.90058094

In [42]:
# Similarity between "maggie" and "baby".
w2v_model.wv.similarity('maggie', 'baby')

0.7199063

In [43]:
# Similarity between "bart" and "nelson".
w2v_model.wv.similarity('bart', 'nelson')

0.686138

## Odd-One-Out

In [49]:
# Ask the model which word in the list fits least with the others.
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'nelson'])

'jimbo'

In [50]:
# Odd one out among "nelson", "bart" and "milhouse".
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [51]:
# Odd one out among "homer", "patty" and "selma".
w2v_model.wv.doesnt_match(['homer', 'patty', 'selma'])

'homer'

## Analogy difference

In [52]:
# Analogy query: top 3 words closest to ("woman" + "homer" - "marge").
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('see', 0.6590675115585327),
 ('admire', 0.6295326352119446),
 ('care', 0.6255589723587036)]

In [54]:
# Analogy query: top 3 words closest to ("woman" + "bart" - "man").
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7746933698654175),
 ('parent', 0.6939990520477295),
 ('surprised', 0.6744728088378906)]