In [2]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

import pandas as pd
from nltk.tokenize import RegexpTokenizer



In [4]:
PATH = "../data/"
data = pd.read_csv(f"{PATH}IMDB_Dataset.csv")

In [5]:
sentences = data.review.astype('str').tolist()
tokenizer = RegexpTokenizer(r'\w+')
sentences_tokenized = [w.lower() for w in sentences]
sentences_tokenized = [tokenizer.tokenize(i) for i in sentences_tokenized]

In [6]:
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 48 s, sys: 3.31 s, total: 51.4 s
Wall time: 53 s


In [104]:
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        print(f"Epoch {self.epoch} starting.")

    def on_epoch_end(self, model):
        print(f"Epoch {self.epoch} ended.")
        self.epoch += 1
        
class LossLogger(CallbackAny2Vec):
    '''Output loss at each epoch'''
    def __init__(self):
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch != 1:
            previous_loss = self.losses[self.epoch-2]-loss
        else:
            previous_loss = loss
        self.losses.append(loss)
        difference = loss-previous_loss
        print(f'  Loss: {loss}  Difference: {difference}')
        self.epoch += 1

In [105]:
SIZE = model.vector_size
WINDOW = 10
EPOCHS = 10
MIN_COUNT = 2
SG = 1
HS = 0
SEED = 42
LOSS = True
ALPHA = 0.01

In [106]:
epoch_logger = EpochLogger()
loss_logger = LossLogger()

In [107]:
model_ft = Word2Vec(vector_size= SIZE, 
                    window = WINDOW,
                    min_count= MIN_COUNT,
                    epochs=EPOCHS,
                    sg = SG,
                    hs = HS,
                    seed = SEED)
model_ft.build_vocab(sentences_tokenized)
total_examples = model_ft.corpus_count
model_ft.build_vocab([list(model.key_to_index.keys())], update=True)

In [108]:
outfile = "../embeddings/imdb_w2v"

In [None]:
model_ft.train(sentences_tokenized, 
               total_examples=total_examples,
               epochs=model_ft.epochs,
               callbacks=[loss_logger],
               compute_loss=LOSS,
               start_alpha = ALPHA)
model_ft.wv.save_word2vec_format(f"{outfile}.txt", binary=False)

Epoch: 1	  Loss: 46185348.0  Difference: 0.0
Epoch: 2	  Loss: 67128448.0  Difference: 88071548.0
Epoch: 3	  Loss: 67196512.0  Difference: 67264576.0
Epoch: 4	  Loss: 67264584.0  Difference: 67332656.0
Epoch: 5	  Loss: 67336088.0  Difference: 67407592.0
Epoch: 6	

In [73]:
model_ft.wv.save_word2vec_format(f"../{outfile}.txt", binary=False)

In [74]:
ft_vectors = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(f"../{outfile}.txt")

In [67]:
ft_vectors.distance("citizen", "kane")

0.2986515760421753

In [68]:
model.distance("citizen", "kane")

0.9603805989027023

In [79]:
len(ft_vectors.get_vector("citizen"))

300