## Notebook on word2vec
The first step in building a Skip-gram model is to preprocess the data into the correct shape. We create two functions that preprocess the text of *Alice in Wonderland* so that it can be used to train Skip-gram.


In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from itertools import islice
from matplotlib import pylab
from __future__ import division

Using TensorFlow backend.


In [35]:
# DO NOT Modify the lines in this cell
# Loads alice.txt through the Colab upload helper, tokenizes it into integer
# sequences, and defines the constants shared by the rest of the notebook.
#path = 'alice.txt'
from google.colab import files
uploaded = files.upload()

# `uploaded` is indexed by file name; the content is split into lines.
corpus = uploaded['alice.txt'].splitlines()
# Keeps only lines containing at least two spaces (used here as a proxy for
# "at least 3 words").
corpus = [line for line in corpus if line.count(" ") >= 2]

# The filter string lists the punctuation/whitespace characters passed to the
# tokenizer, with the apostrophe appended at the end.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
# From here on `corpus` is a list of integer sequences, one per kept line.
corpus = tokenizer.texts_to_sequences(corpus)
# Total number of tokens over all sequences.
nb_samples = sum(len(s) for s in corpus)
# One more than the number of distinct words; presumably index 0 is reserved
# by the tokenizer -- TODO confirm.
V = len(tokenizer.word_index) + 1

# Hyper-parameters used by later cells.
embedded_dim = 100
window_size = 2
window_size_corpus = 4

Saving alice.txt to alice (5).txt


In [0]:
#generate data for Skipgram
def generate_data_skipgram(corpus, window_size, V):
    """Build (center word, context word) training pairs for Skip-gram.

    For every position in every sentence, one pair is emitted for each
    neighbour that lies within ``window_size`` positions on either side and
    inside the sentence. Pairs are emitted in sentence order, then position
    order, then left-to-right neighbour order.

    Parameters
    ----------
    corpus : iterable of sequences of int
        Sentences encoded as word indices.
    window_size : int
        Number of neighbours considered on each side of the center word.
    V : int
        Vocabulary size, i.e. the width of the one-hot label rows. Every
        word index used as a context word must be smaller than ``V``.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        ``x`` has shape ``(n_pairs,)`` and holds the center word indices.
        ``y`` has shape ``(n_pairs, V)``, dtype float32, and holds the
        one-hot encoded context words. For an empty corpus the shapes are
        ``(0,)`` and ``(0, V)``.

    Raises
    ------
    IndexError
        If a context word index is not smaller than ``V``.
    """
    centers = []
    contexts = []
    for words in corpus:
        length = len(words)
        for index, word in enumerate(words):
            # Clamp the window to the sentence so no per-index range test is needed.
            first = max(0, index - window_size)
            last = min(length, index + window_size + 1)
            for i in range(first, last):
                if i != index:
                    centers.append(word)
                    contexts.append(words[i])

    centers = np.asarray(centers, dtype=int)
    contexts = np.asarray(contexts, dtype=int)

    # Allocate the label matrix once and set a single 1 per row, instead of
    # creating one V-length array per pair and copying them all afterwards.
    labels = np.zeros((len(contexts), V), dtype='float32')
    labels[np.arange(len(contexts)), contexts] = 1
    return (centers, labels)

In [0]:
# Build the training set: x holds center words, y the one-hot context words.
x, y = generate_data_skipgram(corpus, window_size, V)

In [0]:
# Skip-gram network: an embedding lookup for the single input word, flattened
# to a `dim`-length vector, followed by a softmax over the V vocabulary entries.
# The embedding width comes from the configuration cell instead of a second
# hard-coded 100, so the two values cannot drift apart.
dim = embedded_dim
skipgram = Sequential()
skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
# Embedding outputs (batch, 1, dim); drop the length-1 axis.
skipgram.add(Reshape((dim, )))
skipgram.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))

In [0]:
# Categorical cross-entropy matches the one-hot context-word labels.
skipgram.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
)

In [27]:
# Train on the (center word, context word) pairs.
skipgram.fit(
    x,
    y,
    batch_size=128,
    epochs=10,
    verbose=1,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f06ebf52f50>

In [0]:
# Export the learned word vectors as text: a header line "<word count> <dim>"
# followed by one "<word> <v1> <v2> ..." line per vocabulary word.
# NOTE(review): this looks like the word2vec text layout -- confirm with the loader used.
# The whole export lives in one cell inside a `with` block so the file is
# always closed, even if a write fails, and the cell can be re-run safely.
vectors = skipgram.get_weights()[0]
with open('vectors_skipgram.txt', 'w') as f:
    # V is len(word_index) + 1, so V-1 is the number of rows written below.
    f.write(" ".join([str(V-1), str(dim)]))
    f.write("\n")
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        # Row i of the embedding matrix is the vector of the word with index i.
        f.write(" ".join(map(str, list(vectors[i, :]))))
        f.write("\n")

In [28]:
# Fetch the trained parameters and report the shape of each array.
weights = skipgram.get_weights()

shape_captions = (
    "Weights for the embedding layer: ",
    "Weights for the dense layer: ",
    "Biases for the dense layer: ",
)
for position, caption in enumerate(shape_captions):
    # Joining with a space reproduces the separator the print statement inserted.
    print(" ".join([caption, str(weights[position].shape)]))

Weights for the embedding layer:  (2557, 100)
Weights for the dense layer:  (100, 2557)
Biases for the dense layer:  (2557,)


In [0]:
# Get the embedding matrix: the first weight array of the model, of shape
# (V, dim) as printed above; row i is the vector of the word with index i.
embedding = weights[0]

In [0]:
def embed(word, embedding=embedding, tokenizer=tokenizer):
    """Return the embedding vector(s) for ``word``.

    Parameters
    ----------
    word : str
        Text to look up. It is passed through ``tokenizer.texts_to_sequences``,
        so it may yield more than one token.
    embedding : numpy array, shape (vocabulary size, dim)
        Embedding matrix whose row ``i`` is the vector of word index ``i``.
        The default is bound when this cell is executed; re-run the cell
        after retraining to pick up new weights.
    tokenizer : object with ``texts_to_sequences``
        Tokenizer used to map ``word`` to word indices.

    Returns
    -------
    numpy array of shape ``(n_tokens, dim)``; ``(1, dim)`` for a single word.

    Raises
    ------
    KeyError
        If the tokenizer yields no index for ``word`` (e.g. the word is not
        in the vocabulary). Previously this produced an empty ``(0, dim)``
        array that silently broadcast through later arithmetic.
    """
    # get the index (or indices) of the word from the tokenizer
    int_word = tokenizer.texts_to_sequences([word])[0]
    if not int_word:
        raise KeyError("no in-vocabulary token found in %r" % (word,))
    # Multiplying a one-hot row by the embedding matrix just selects a row,
    # so index the rows directly instead of building the one-hot matrix.
    return embedding[np.asarray(int_word, dtype=int)]

In [34]:
# Analogy check: queen - woman + man should land close to king.
king = embed('queen') - embed('woman') + embed('man')

# Pairs whose Euclidean distance is reported, in print order. The second and
# third distances coincide by construction, since
# (queen - woman + man) - queen == man - woman.
vector_pairs = [
    (king, embed('king')),
    (king, embed('queen')),
    (embed('man'), embed('woman')),
    (embed('king'), embed('queen')),
]

for left, right in vector_pairs:
    dist = np.linalg.norm(left - right)
    print(dist)

0.8934418
0.5167291
0.5167291
0.8722515
