In [142]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.test.utils import get_tmpfile

import pandas as pd
from nltk.tokenize import RegexpTokenizer

## 1. Load Data and Pretrained Embeddings

In [143]:
# Directory holding the input data, given relative to the notebook's working directory.
PATH = "../data/"
# Load the IMDB dataset; later cells read its `review` column.
data = pd.read_csv(f"{PATH}IMDB_Dataset.csv")

In [144]:
# Raw review texts, coerced to `str` so every entry can be lowercased and tokenized.
sentences = data.review.astype('str').tolist()
# `\w+` keeps runs of word characters and discards punctuation/whitespace.
tokenizer = RegexpTokenizer(r'\w+')
# One token list per review: lowercase first, then tokenize, in a single pass.
sentences_tokenized = [tokenizer.tokenize(review.lower()) for review in sentences]

In [145]:
# Load the pretrained GoogleNews word2vec vectors (binary format) as KeyedVectors.
# `%time` is used as a line magic here, so it times exactly this one statement.
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 48.5 s, sys: 4.09 s, total: 52.6 s
Wall time: 54.7 s


## 2. Phrase Data

In [146]:
def get_phrases(sentences, min_count=5, threshold=100):
    """Merge frequent collocations in tokenized sentences into single tokens.

    Runs two phrase-detection passes: the first joins word pairs into bigrams,
    the second runs on the bigram-merged corpus so that an already-merged
    bigram can join with a neighbouring token.

    Args:
        sentences: Corpus as a list of token lists. It is iterated more than
            once, so it must be re-iterable (not a one-shot generator).
        min_count: Passed to the first-pass `Phrases` model as `min_count`.
        threshold: Passed to the first-pass `Phrases` model as `threshold`;
            a higher threshold yields fewer phrases.

    Returns:
        A list of token lists, one per input sentence, with detected phrases
        merged into single tokens.
    """
    # First pass: learn bigrams, then freeze the model. 'Phraser' is a wrapper
    # that makes 'Phrases' run faster, so it is the object used to transform.
    bigram = Phrases(sentences, min_count=min_count, threshold=threshold)
    bigram_phraser = Phraser(bigram)
    # Materialize once; this corpus is needed both to train and to feed pass two.
    phrased_bi = list(bigram_phraser[sentences])

    # Second pass over the bigram-merged corpus.
    # NOTE(review): this pass is built without min_count/threshold, so it uses
    # the library defaults rather than the caller's values. That is kept as-is
    # to leave the output unchanged - confirm whether it is intended.
    trigram = Phrases(phrased_bi)
    trigram_phraser = Phraser(trigram)
    phrased_tri = list(trigram_phraser[phrased_bi])

    return phrased_tri

In [165]:
# Toggle: when True, the training corpus is run through get_phrases first.
PHRASING = True
# Passed to get_phrases as `min_count`.
MIN = 10
# Passed to get_phrases as `threshold`.
THRESHOLD = 200

In [166]:
%time 

if PHRASING:
    sentences_phrased = get_phrases(sentences_tokenized, 
                                    min_count = MIN, 
                                    threshold = THRESHOLD)
    sentences_training = sentences_phrased
    
else:
    sentences_training = sentences_tokenized

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [170]:
# Spot-check the first training sentence (a token list) to see which phrases were merged.
print(sentences_training[0])

['one', 'of', 'the', 'other_reviewers', 'has', 'mentioned', 'that', 'after_watching', 'just', '1', 'oz', 'episode', 'you_ll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly_what', 'happened', 'with', 'me', 'br', 'br', 'the', 'first', 'thing', 'that', 'struck_me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust_me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint_hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no_punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'br', 'br', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum_security', 'state', 'penitentary', 'it', 'focuses_mainly', 'on', 'emerald_city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'the', 'cells', 'have'

## 3. Finetune Embeddings

### 3.1. Initialize Loggers

In [171]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that announces the start and end of each epoch."""

    def __init__(self):
        # 1-based number of the epoch currently being reported.
        self.epoch = 1

    def on_epoch_begin(self, model):
        """Print a line when an epoch starts; `model` is not used."""
        print(f"Epoch {self.epoch} starting.")

    def on_epoch_end(self, model):
        """Print a line when an epoch ends, then advance the epoch counter."""
        print(f"Epoch {self.epoch} ended.")
        self.epoch = self.epoch + 1
        
class LossLogger(CallbackAny2Vec):
    """Training callback that prints the reported loss after each epoch.

    Also prints the change from the previous epoch's reported value and keeps
    every reported value in `self.losses`.
    """

    def __init__(self):
        # 1-based number of the epoch currently being reported.
        self.epoch = 1
        # Value of get_latest_training_loss() recorded at the end of each epoch.
        self.losses = []

    def on_epoch_begin(self, model):
        """Print the epoch label, staying on the same line for the loss output."""
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        """Record and print the latest loss and its change since the last epoch."""
        loss = model.get_latest_training_loss()
        # The first epoch has no predecessor, so its difference is measured from 0.
        # NOTE(review): the recorded run's Loss grows every epoch, so the value is
        # presumably a running total and "Difference" the per-epoch loss - confirm.
        previous_loss = 0 if self.epoch == 1 else self.losses[self.epoch - 2]
        difference = loss - previous_loss
        self.losses.append(loss)
        print(f'  Loss: {loss}  Difference: {difference}')
        self.epoch += 1

In [172]:
# Instantiate the callbacks. Re-running this cell resets their epoch counters
# (and LossLogger's recorded loss history).
epoch_logger = EpochLogger()
loss_logger = LossLogger()

### 3.2. Initialize Model

In [173]:
# Match the dimensionality of the pretrained vectors held in `model`.
SIZE = model.vector_size
# Passed to Word2Vec as `window`.
WINDOW = 10
# Passed to Word2Vec as `epochs`; training later reads it back via `model_ft.epochs`.
EPOCHS = 10
# Passed to Word2Vec as `min_count`.
MIN_COUNT = 2
# Passed to Word2Vec as `sg` (per the gensim docs, 1 selects skip-gram and 0 CBOW).
SG = 1
# Passed to Word2Vec as `hs` (per the gensim docs, 0 disables hierarchical softmax).
HS = 0
# Passed to Word2Vec as `seed`.
SEED = 42
# Passed to `train` as `compute_loss`.
LOSS = True
# Passed to `train` as `start_alpha`.
ALPHA = 0.01

In [174]:
%time

model_ft = Word2Vec(vector_size= SIZE, 
                    window = WINDOW,
                    min_count= MIN_COUNT,
                    epochs=EPOCHS,
                    sg = SG,
                    hs = HS,
                    seed = SEED)
model_ft.build_vocab(sentences_training)
total_examples = model_ft.corpus_count
model_ft.build_vocab([list(model.key_to_index.keys())], update=True)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.01 µs


### 3.3. Train Word2Vec

In [175]:
# Path prefix (no extension) for the exported vectors; ".txt" is appended when saving and loading.
outfile = "../embeddings/imdb_w2v"

In [None]:
%time

model_ft.train(sentences_training, 
               total_examples=total_examples,
               epochs=model_ft.epochs,
               callbacks=[loss_logger],
               compute_loss=LOSS,
               start_alpha = ALPHA)
model_ft.wv.save_word2vec_format(f"{outfile}.txt", binary=False)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs
Epoch: 1	  Loss: 43901376.0  Difference: 43901376.0
Epoch: 2	  Loss: 67113664.0  Difference: 23212288.0
Epoch: 3	  Loss: 67149352.0  Difference: 35688.0
Epoch: 4	  Loss: 67188464.0  Difference: 39112.0
Epoch: 5	  Loss: 67230064.0  Difference: 41600.0
Epoch: 6	

### 3.4. Load Finetuned Vectors and Test

In [None]:
# Reload the exported text-format vectors as standalone KeyedVectors.
ft_vectors = KeyedVectors.load_word2vec_format(f"{outfile}.txt", binary=False)

In [None]:
# Distance between "citizen" and "kane" in the vectors trained on the IMDB reviews.
ft_vectors.distance("citizen", "kane")

In [None]:
# Same word pair in the pretrained GoogleNews vectors, for comparison.
model.distance("citizen", "kane")

In [None]:
# Look up the vector of a merged phrase token (it appears in the printed sample sentence).
ft_vectors.get_vector("faint_hearted")