In [45]:
import nltk
import numpy as np
from nltk.corpus import gutenberg
from string import punctuation
import re
from keras.preprocessing import text
import pandas as pd

In [46]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower().strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [47]:
alice = gutenberg.sents('carroll-alice.txt') 
remove_terms = punctuation + '0123456789'
alice = [[word.lower() for word in sent if word not in remove_terms] for sent in alice]
alice = [' '.join(tok_sent) for tok_sent in alice]
alice = list(map(normalize_corpus,alice))
alice = [str(sent) for sent in alice if len(str(sent).split()) > 2]

In [48]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(alice)
word2id = tokenizer.word_index
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in alice]
vocab_size = len(word2id)
embed_size = 100
window_size = 2 

In [25]:
from keras.preprocessing.sequence import skipgrams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]
pairs, labels = skip_grams[0][0], skip_grams[0][1]

In [69]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

class skipgram(nn.Module):
    
  def __init__(self, vocab_size, embedding_dim=100):
    super(skipgram, self).__init__()
    
    self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)   
    self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True) 
    self.lin = nn.Linear(embedding_dim,1)
 
  def forward(self, u_pos, v_pos ):

    embed_u = self.u_embeddings(torch.Tensor([u_pos]).long())
    embed_v = self.v_embeddings(torch.Tensor([v_pos]).long())
    score  = torch.mul(embed_u, embed_v)
    score = self.lin(score)
    print(score)
    target = F.sigmoid(score).squeeze()
    print(target)
    return target

model = skipgram(vocab_size)
loss_function = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [70]:
for epoch in range(1, 100):
    tloss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        optimizer.zero_grad()
        for v,u,l in zip(pair_first_elem,pair_second_elem,labels):            
            p = model(v, u)
            loss = loss_function(p,torch.Tensor([l]))
            loss += loss.data
        loss.backward()
        tloss+=loss
        optimizer.step()
    print('Epoch:', epoch, '\tLoss:', tloss)

tensor([[0.8911]], grad_fn=<AddmmBackward>)
tensor(0.7091, grad_fn=<SqueezeBackward0>)
tensor([[-0.0023]], grad_fn=<AddmmBackward>)
tensor(0.4994, grad_fn=<SqueezeBackward0>)
tensor([[-1.3037]], grad_fn=<AddmmBackward>)
tensor(0.2135, grad_fn=<SqueezeBackward0>)
tensor([[-0.0292]], grad_fn=<AddmmBackward>)
tensor(0.4927, grad_fn=<SqueezeBackward0>)
tensor([[-0.4561]], grad_fn=<AddmmBackward>)
tensor(0.3879, grad_fn=<SqueezeBackward0>)
tensor([[-0.6752]], grad_fn=<AddmmBackward>)
tensor(0.3373, grad_fn=<SqueezeBackward0>)
tensor([[0.6819]], grad_fn=<AddmmBackward>)
tensor(0.6642, grad_fn=<SqueezeBackward0>)
tensor([[0.0179]], grad_fn=<AddmmBackward>)
tensor(0.5045, grad_fn=<SqueezeBackward0>)
tensor([[0.6263]], grad_fn=<AddmmBackward>)
tensor(0.6516, grad_fn=<SqueezeBackward0>)
tensor([[0.9062]], grad_fn=<AddmmBackward>)
tensor(0.7122, grad_fn=<SqueezeBackward0>)
tensor([[-0.1125]], grad_fn=<AddmmBackward>)
tensor(0.4719, grad_fn=<SqueezeBackward0>)
tensor([[-0.0032]], grad_fn=<AddmmBac

KeyboardInterrupt: 

In [53]:
weights = model.u_embeddings(torch.Tensor([list(range(0,vocab_size))]).long())
pd.DataFrame(weights.view(-1,100).tolist(), index=list(id2word.values())[0:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
said,0.179425,-0.185343,-1.183711,0.319763,0.042209,0.623948,-0.561809,-1.943918,-1.053007,0.553827,...,-0.175016,-1.045109,0.439342,-0.133949,0.057302,1.868297,-1.170673,-0.017717,-0.382825,-1.382537
alice,1.880033,0.255972,1.193151,-0.389575,0.123183,0.672558,-1.753153,-1.473195,-0.291848,0.986159,...,-2.370818,-1.635316,0.767438,-0.70073,-0.025119,0.102825,-0.139166,-1.624705,-0.008383,0.323928
little,0.132771,-1.839545,-0.230088,0.125555,0.727814,0.487313,-0.361747,-2.471758,-0.449868,0.372703,...,0.815969,-0.256952,-0.088617,-1.695642,-1.624261,0.43749,-1.392092,-0.698966,0.7908,-0.7819
one,0.297157,-0.756851,0.395955,-0.707532,-1.25133,0.053592,1.581376,0.502552,1.432852,-0.320592,...,0.832119,2.580865,0.488905,0.74585,0.02035,-0.451972,0.436855,-0.264077,-1.41999,0.907573
would,-1.655144,0.310861,0.046035,-1.526546,-0.17615,-1.221822,-1.449021,0.452992,-2.528788,-0.093987,...,-1.77447,0.795702,-0.244212,-1.088123,0.166646,-1.195401,-1.106316,0.192364,0.565457,0.744719


In [54]:
from sklearn.metrics.pairwise import euclidean_distances

weights = weights.view(-1,100)
distance_matrix = euclidean_distances(weights.detach().numpy())

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['little', 'small', 'would', 'child','girl']}

similar_words

{'little': ['tied', 'dish', 'burn', 'sixpence', 'oop'],
 'small': ['eh', 'shoulders', 'straightened', 'came', 'turtle'],
 'would': ['jaws', 'egg', 'stuff', 'send', 'tomorrow'],
 'child': ['hat', 'cattle', 'sulky', 'quietly', 'camomile'],
 'girl': ['catch', 'came', 'teacup', 'puss', 'smaller']}