In [2]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

import pandas as pd
from nltk.tokenize import RegexpTokenizer



In [4]:
# Directory that holds the input data, relative to the notebook's working dir.
PATH = "../data/"

# Read the IMDB CSV into a DataFrame; the tokenisation cell reads its
# `review` column.
data = pd.read_csv(filepath_or_buffer=f"{PATH}IMDB_Dataset.csv")

In [5]:
# Raw review texts as plain Python strings (kept un-lowercased in `sentences`).
sentences = data["review"].astype('str').tolist()

# Tokens are maximal runs of word characters; everything else is a separator.
tokenizer = RegexpTokenizer(r'\w+')

# Lower-case each review and split it into tokens in a single pass, giving one
# token list per review.
sentences_tokenized = [tokenizer.tokenize(review.lower()) for review in sentences]

In [6]:
def read_1w_corpus(name, sep="\t"):
    """Lazily read a delimited text file, yielding one list of fields per line.

    Args:
        name: Path of the text file to read.
        sep: Field separator used to split each line (tab by default).

    Yields:
        list[str]: The fields of the current line, without the line's
        trailing newline.
    """
    # The context manager closes the file once the generator is exhausted or
    # garbage-collected, instead of leaking the handle.
    with open(name) as corpus_file:
        for line in corpus_file:
            # Drop the line terminator so the last field is not "value\n".
            yield line.rstrip("\n").split(sep)

# Load the pre-trained GoogleNews vectors from a gzip-compressed file in the
# binary word2vec format (binary=True). The resulting KeyedVectors object is
# bound to `model`; its `key_to_index` is read later when the fine-tuning
# vocabulary is built.
print("Loading GoogleNews Vectors")
# `%time` is an IPython line magic that reports how long this one statement
# takes; the recorded run below shows a wall time of roughly 53 s.
%time model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 48 s, sys: 3.31 s, total: 51.4 s
Wall time: 53 s


In [7]:
class EpochLogger(CallbackAny2Vec):
    """Announce the start and the end of every training epoch on stdout.

    Attributes are documented inline; `epoch` is a zero-based counter that is
    advanced once per completed epoch and is never reset by this class.
    """

    def __init__(self):
        # Zero-based index of the epoch that is about to run / is running.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the 'starting' line for the current epoch (`model` is unused)."""
        current = self.epoch
        print(f"Epoch {current} starting.")

    def on_epoch_end(self, model):
        """Print the 'ended' line, then move the counter to the next epoch."""
        finished = self.epoch
        print(f"Epoch {finished} ended.")
        self.epoch = finished + 1
        
class LossLogger(CallbackAny2Vec):
    '''Output the training loss of each individual epoch.

    gensim's ``get_latest_training_loss()`` reports a running total that keeps
    growing across the epochs of one ``train()`` call, so the loss of a single
    epoch is the difference between two consecutive readings. That per-epoch
    value is what this callback prints and stores.

    NOTE(review): gensim only tracks the loss when training is started with
    ``compute_loss=True``; without it every reading is expected to be 0.
    '''
    def __init__(self):
        self.epoch = 1
        # Per-epoch loss values, one entry per completed epoch.
        self.losses = []
        # Raw running totals exactly as returned by gensim, kept for reference.
        self.cumulative_losses = []
        # Running total observed at the end of the previous epoch.
        self._previous_total = 0.0

    def on_train_begin(self, model):
        # A new train() call starts a new running total, so the baseline used
        # for the first epoch's difference must start from zero again.
        self._previous_total = 0.0

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        running_total = model.get_latest_training_loss()
        # Subtract what had accumulated before this epoch started.
        loss = running_total - self._previous_total
        self._previous_total = running_total
        self.cumulative_losses.append(running_total)
        self.losses.append(loss)
        print(f'  Loss: {loss}')
        self.epoch += 1

In [11]:
# --- Word2Vec hyper-parameters, consumed by the Word2Vec(...) constructor ---

# Reproducibility
SEED = 42       # forwarded as seed=

# Training algorithm switches (forwarded as sg= / hs=); per the gensim docs
# sg=1 selects skip-gram and hs=0 disables hierarchical softmax.
SG = 1
HS = 0

# Model shape and vocabulary
SIZE = 300      # forwarded as vector_size=; presumably chosen to match the 300-d GoogleNews vectors
WINDOW = 7      # forwarded as window=
MIN_COUNT = 2   # forwarded as min_count=

# Training length
EPOCHS = 5      # forwarded as epochs=

In [None]:
# Callback instances for the training cells. Each one keeps its own epoch
# counter, so create fresh instances before starting a new training run.
loss_logger = LossLogger()      # collects the loss reported after each epoch
epoch_logger = EpochLogger()    # prints a line when an epoch starts / ends

In [33]:
# The pre-trained vectors can only seed the new model if the dimensions agree.
if model.vector_size != SIZE:
    raise ValueError(
        f"Pre-trained vectors have {model.vector_size} dimensions but SIZE is {SIZE}"
    )

# No `callbacks=` here: gensim 4 ignores (and warns about) callbacks passed to
# the constructor when no corpus is given; they are supplied to train() instead.
model_ft = Word2Vec(
    vector_size=SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    epochs=EPOCHS,
    sg=SG,
    hs=HS,
    seed=SEED,
)

# First pass: vocabulary and sentence count come from the IMDB reviews.
model_ft.build_vocab(sentences_tokenized)
# Capture the count now, because the next build_vocab call scans a different,
# one-"sentence" corpus and this value is what train() needs.
total_examples = model_ft.corpus_count

# Second pass: offer every pre-trained key as one extra "sentence".
# NOTE(review): each key occurs exactly once here, so with MIN_COUNT >= 2
# gensim's min_count trimming presumably drops all of them and this call adds
# no words - confirm whether that is intended.
model_ft.build_vocab([list(model.key_to_index)], update=True)

# Seed the model with the pre-trained weights. Without this step the GoogleNews
# vectors are only used as a word list and `model_ft` would be trained from
# random initialisation rather than fine-tuned.
shared_words = [word for word in model_ft.wv.index_to_key if word in model.key_to_index]
if shared_words:
    ft_rows = [model_ft.wv.key_to_index[word] for word in shared_words]
    pretrained_rows = [model.key_to_index[word] for word in shared_words]
    model_ft.wv.vectors[ft_rows] = model.vectors[pretrained_rows]
print(f"Initialised {len(shared_words)} of {len(model_ft.wv)} vectors from the pre-trained model")

In [34]:
# Train on the tokenised reviews. `total_examples` is the sentence count that
# was captured right after the first build_vocab call; the epoch count is the
# one the model was configured with. The call is left as the cell's last
# expression so that the tuple returned by train() is displayed.
model_ft.train(
    corpus_iterable=sentences_tokenized,
    epochs=model_ft.epochs,
    total_examples=total_examples,
    callbacks=[epoch_logger],
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


(44540279, 59884475)