In [30]:
import os
import re
from tqdm import tqdm_notebook as tqdm
import string

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import make_sampling_table, skipgrams

from keras.layers import Dot, dot, Input, Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model, save_model

from preprocess import clean_text
from utils import load_data, create_tokenizer, save_tokenizer

In [31]:
data_dir = "data/youtube_comments"
train_dir = os.path.join(data_dir, "train")

train_texts, train_labels = load_data(train_dir)

# for i in range(200):
#     print(str(i) + " " + train_texts[i]) 

neg: 100%|██████████| 1881/1881 [00:00<00:00, 5205.79it/s]
pos: 100%|██████████| 2724/2724 [00:00<00:00, 5029.80it/s]


In [32]:
# Tokenizer params
vocab_size = 2000

# skip-grams params
window_size = 3
embed_size = 300
epochs = 10

In [40]:
tokenizer = create_tokenizer(train_texts, vocab_size)
save_tokenizer(tokenizer, "tokenizer.pickle")

Found 5269 unique tokens


In [35]:
sequences = tokenizer.texts_to_sequences(train_texts)
sampling_table = make_sampling_table(vocab_size)

In [36]:
# generate skip-grams
sampling_table = make_sampling_table(vocab_size, sampling_factor=1e-02)
skip_grams = [skipgrams(sequence, 
                        vocabulary_size=vocab_size, 
                        window_size=5, 
                        sampling_table=sampling_table) for sequence in sequences]

skip_grams = [elem for elem in skip_grams if elem[0] != []]

In [37]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, embed_size, input_length=1, name='embedding')

target = embedding(input_target)
target = Reshape((embed_size, 1))(target)
context = embedding(input_context)
context = Reshape((embed_size, 1))(context)

model = Sequential()
# now perform the dot product operation to get a similarity measure
dot_product = dot([target, context], axes=1, normalize=False)
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# create the primary training model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 300)       600000      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 300, 1)       0           embedding[0][0]                  
__________

In [38]:
for epoch in range(epochs):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)  

    print("Iteration {}, loss={}".format(epoch, loss))

Iteration 0, loss=2800.714749097824
Iteration 1, loss=2591.191027067136
Iteration 2, loss=2480.511455710046
Iteration 3, loss=2433.1590209156275
Iteration 4, loss=2437.0907619986683
Iteration 5, loss=2447.7856042739004
Iteration 6, loss=2463.1823025792837
Iteration 7, loss=2483.455336138606


KeyboardInterrupt: 

In [17]:
# Save embedding weights
model.layers[2].get_weights()[0].shape
model.save("word2vec_model.h5")