## Notebook on word2vec
The first step to create Skipgram is to preprocess the data such that it has the correct shape. We create two functions that preprocess the data of the Alice in Wonderland textbook, such that they can be used to train Skipgram. 


In [47]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from itertools import islice
from matplotlib import pylab
from __future__ import division

In [49]:
# DO NOT Modify the lines in this cell
path = 'alice.txt'
corpus = open(path).readlines()
# Removes lines with fewer than 3 words
corpus = [line for line in corpus if line.count(" ") >= 2]

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1

embedded_dim = 100
window_size = 2
window_size_corpus = 4

In [107]:
print corpus

[[305, 7, 38, 1, 92, 595], [11, 13, 253, 3, 106, 30, 470, 8, 342, 76, 16, 379, 20, 1], [828, 2, 8, 343, 136, 3, 54, 134, 57, 596, 6, 23, 829, 65, 1], [323, 16, 379, 13, 830, 24, 5, 23, 45, 683, 57, 1447, 12], [5, 2, 31, 36, 1, 212, 8, 4, 323, 59, 11, 170, 683, 57], [27, 6, 13, 831, 12, 16, 344, 324, 15, 70, 15, 6, 58, 25, 1], [471, 160, 154, 16, 415, 30, 597, 2, 529, 325, 1, 1049], [8, 416, 4, 1448, 1449, 49, 28, 684, 1, 530, 8, 188, 39, 2], [1050, 1, 1450, 56, 279, 4, 148, 92, 22, 1451, 155, 228], [280, 76, 16], [40, 13, 136, 27, 30, 1051, 12, 14, 832, 67, 11, 89, 5, 27], [30, 93, 35, 8, 1, 83, 3, 254, 1, 92, 96, 3, 255, 108, 156], [108, 156, 7, 173, 28, 531, 56, 6, 59, 5, 124, 1052, 5], [1053, 3, 16, 14, 6, 256, 3, 55, 1452, 18, 32, 24, 18, 1, 62], [5, 21, 164, 86, 685, 24, 56, 1, 92, 1453, 180, 4, 417], [35, 8, 78, 1054, 472, 2, 109, 18, 5, 2, 43, 345, 20], [11, 1055, 3, 16, 204, 25, 5, 1454, 598, 16, 324, 14, 6, 23], [103, 128, 238, 4, 92, 22, 346, 4, 1054, 472, 57, 4, 417], [3, 19

In [50]:
#generate data for Skipgram
def generate_data_skipgram(corpus, window_size, V):
    maxlen = window_size*2
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            p = index - window_size
            n = index + window_size + 1
                    
            in_words = []
            labels = []
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    # Add the input word
                    #in_words.append(word)
                    all_in.append(word)
                    # Add one of the context words
                    all_out.append(np_utils.to_categorical(words[i], V))
                    
                                   
    return (np.array(all_in),np.array(all_out))

In [109]:
np_utils.to_categorical(234, 1000)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [51]:
#get x and y's for data
x,y = generate_data_skipgram(corpus,window_size,V)

In [110]:
x[123]

323

In [53]:
skipgram = Sequential()
skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
skipgram.add(Reshape((dim, )))
skipgram.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))

In [54]:
skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [56]:
skipgram.fit(x, y, batch_size=128, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12bec1f50>

In [57]:
f = open('vectors_skipgram.txt' ,'w')
f.write(" ".join([str(V-1),str(dim)]))
f.write("\n")

In [58]:
vectors = skipgram.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write(word)
    f.write(" ")
    f.write(" ".join(map(str, list(vectors[i,:]))))
    f.write("\n")
f.close()

In [111]:
weights = skipgram.get_weights()

print "Weights for the embedding layer: ",  weights[0].shape
print "Weights for the dense layer: ",  weights[1].shape
print "Biases for the dense layer: ",  weights[2].shape

Weights for the embedding layer:  (2557, 100)
Weights for the dense layer:  (100, 2557)
Biases for the dense layer:  (2557,)


In [80]:
# Get the embedding matrix
embedding = weights[0]

In [91]:
def embed(word, embedding=embedding, tokenizer=tokenizer):
    # get the index of the word from the tokenizer
    int_word = tokenizer.texts_to_sequences([word])[0]
    # get the size of the dictionary from the embedding matrix
    dict_size = embedding.shape[0]
    # get the one-hot encoding of the word
    bin_word = np_utils.to_categorical(int_word, dict_size)
    return np.dot(bin_word, embedding)

In [100]:
queen = embed('king') - embed('man') + embed('woman')

dist = np.linalg.norm(queen - embed('queen'))

print dist

dist = np.linalg.norm(queen - embed('woman'))

print dist

1.1017426
1.856216
