In [37]:
import gensim
import pandas as pd

from wmdecompose.utils import *
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

## 1. Load Data and Pretrained Embeddings

In [38]:
PATH = "../data/"
data = pd.read_csv(f"{PATH}IMDB_Dataset.csv")
stopword_list=stopwords.words('english')

In [39]:
sentences = data.review.astype('str').tolist()
tokenizer = ToktokTokenizer()

In [62]:
%%time

sentences_clean=[remove_stopwords(r, stopword_list, tokenizer) for r in sentences]

CPU times: user 39.3 s, sys: 478 ms, total: 39.7 s
Wall time: 41.8 s


In [63]:
%%time

sentences_clean=pd.Series(sentences_clean).apply(denoise_text)
sentences_clean=sentences_clean.apply(remove_special_characters)
sentences_clean=sentences_clean.apply(simple_lemmatizer)

CPU times: user 27.8 s, sys: 347 ms, total: 28.1 s
Wall time: 28.8 s


In [64]:
%%time

sentences_clean=[remove_stopwords(r, stopword_list, tokenizer) for r in sentences_clean]

CPU times: user 21.8 s, sys: 210 ms, total: 22.1 s
Wall time: 22.6 s


In [65]:
sentences_clean[0]

'one reviewer mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience watching oz m

In [67]:
sentences_tokenized = [w.lower() for w in sentences_clean]
sentences_tokenized = [tokenizer.tokenize(i) for i in sentences_tokenized]

In [68]:
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 54 s, sys: 8.35 s, total: 1min 2s
Wall time: 1min 21s


## 2. Phrase Data

In [69]:
PHRASING = True
MIN = 10
THRESHOLD = 200

In [70]:
%%time 

if PHRASING:
    sentences_phrased = get_phrases(sentences_tokenized, 
                                    min_count = MIN, 
                                    threshold = THRESHOLD)
    sentences_training = sentences_phrased
    
else:
    sentences_training = sentences_tokenized

CPU times: user 1min 34s, sys: 11.7 s, total: 1min 45s
Wall time: 2min 4s


In [91]:
print(sentences_training[0])

['one', 'reviewer_mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly_happened', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scene', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint_hearted', 'timid', 'show', 'pull_punch', 'regard', 'drug', 'sex_violence', 'hardcore', 'classic', 'use_word', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum_security', 'state', 'penitentary', 'focus_mainly', 'emerald_city', 'experimental', 'section', 'prison_cell', 'glass', 'front', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'many', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffle', 'death', 'stare', 'dodgy', 'dealing', 'shady', 'agreement', 'never', 'far_away', 'would', 'say', 'main', 'appeal', 'show', 'due_fact', 'go', 'show', 'dare', 'forget', 'pretty', 'picture', 'painted', 'mainstream_audience', 'forget', 'charm', 'forget', 'romance', 'oz', 'mess', 'around', 'first', 'episode', 'eve

## 3. Finetune Embeddings

### 3.1. Initialize Loggers

In [72]:
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        print(f"Epoch {self.epoch} starting.")

    def on_epoch_end(self, model):
        print(f"Epoch {self.epoch} ended.")
        self.epoch += 1
        
class LossLogger(CallbackAny2Vec):
    '''Output loss at each epoch'''
    def __init__(self):
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch != 1:
            previous_loss = self.losses[self.epoch-2]
        else:
            previous_loss = 0
        self.losses.append(loss)
        difference = loss-previous_loss
        print(f'  Loss: {loss}  Difference: {difference}')
        self.epoch += 1

In [73]:
epoch_logger = EpochLogger()
loss_logger = LossLogger()

### 3.2. Initialize Model

In [74]:
SIZE = model.vector_size
WINDOW = 10
EPOCHS = 4
MIN_COUNT = 2
SG = 1
HS = 0
SEED = 42
LOSS = True
ALPHA = 0.01

In [75]:
%%time

model_ft = Word2Vec(vector_size= SIZE, 
                    window = WINDOW,
                    min_count= MIN_COUNT,
                    epochs=EPOCHS,
                    sg = SG,
                    hs = HS,
                    seed = SEED)
model_ft.build_vocab(sentences_training)
total_examples = model_ft.corpus_count
model_ft.build_vocab([list(model.key_to_index.keys())], update=True)

CPU times: user 6.12 s, sys: 381 ms, total: 6.5 s
Wall time: 6.72 s


### 3.3. Train Word2Vec

In [76]:
outfile = "../embeddings/imdb_w2v"

In [77]:
%%time

model_ft.train(sentences_training, 
               total_examples=total_examples,
               epochs=model_ft.epochs,
               callbacks=[loss_logger],
               compute_loss=LOSS,
               start_alpha = ALPHA)
model_ft.wv.save_word2vec_format(f"{outfile}.txt", binary=False)

Epoch: 1	  Loss: 34382252.0  Difference: 34382252.0
Epoch: 2	  Loss: 50387232.0  Difference: 16004980.0
Epoch: 3	  Loss: 66620860.0  Difference: 16233628.0
Epoch: 4	  Loss: 67109536.0  Difference: 488676.0
CPU times: user 8min, sys: 4.31 s, total: 8min 4s
Wall time: 2min 55s


### 3.4. Load Finetuned Vectors and Test

In [78]:
ft_vectors = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(f"{outfile}.txt")

In [86]:
ft_vectors.distance("citizen", "kane")

0.45544904470443726

In [87]:
model.distance("citizen", "kane")

0.9603805989027023

In [88]:
ft_vectors.distance("lord", "ring")

0.29060083627700806

In [89]:
model.distance("lord", "ring")

0.7185895442962646

In [90]:
ft_vectors.get_vector("citizen_kane")[:10]

array([ 0.03295106, -0.10795593, -0.10198571,  0.00111192,  0.00233896,
       -0.06005668,  0.10686377,  0.03378343, -0.08767647,  0.02884012],
      dtype=float32)