In [2]:
import nltk
import numpy as np
from nltk.corpus import gutenberg
from string import punctuation
import re
from keras.preprocessing import text
import pandas as pd

Using TensorFlow backend.


In [3]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower().strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [4]:
alice = gutenberg.sents('carroll-alice.txt') 
remove_terms = punctuation + '0123456789'
alice = [[word.lower() for word in sent if word not in remove_terms] for sent in alice]
alice = [' '.join(tok_sent) for tok_sent in alice]
alice = list(map(normalize_corpus,alice))
alice = [str(sent) for sent in alice if len(str(sent).split()) > 2]

In [5]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(alice)
word2id = tokenizer.word_index
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in alice]
vocab_size = len(word2id)
embed_size = 100
window_size = 2 

In [61]:
from keras.preprocessing import sequence
from keras.utils import np_utils

def generate_context_word_pairs(corpus, window_size, vocab_size):
    X = []
    Y = []
    context_length = window_size*2
    for words in wids:
        sentence_length = len(words)
        for index, word in enumerate(words):           
            start = index - window_size
            end = index + window_size + 1
            context = [words[i] for i in range(start, end)if 0 <= i < sentence_length and i != index]
            x = sequence.pad_sequences([context], maxlen=context_length)
            X.append(x)
            Y.append(word)
    return X,Y

In [78]:
import torch
import torch.nn as nn
import numpy as np

class CBOW(torch.nn.Module):

    def __init__(self, inp_size , vocab_size, embedding_dim=100):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 100)
        self.activation_function1 = nn.ReLU()        
        self.linear2 = nn.Linear(100, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(torch.from_numpy(inputs).long())).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out
    
model = CBOW(window_size*2,vocab_size)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [115]:
for epoch in range(1, 100):
    loss = 0.
    i = 0
    X,Y = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for x, y in zip(X,Y):
        i += 1
        optimizer.zero_grad()
        log_probs = model(x[0])
        loss = loss_function(log_probs,torch.Tensor([y]).long())
        loss.backward()
        optimizer.step()
        loss += loss.data
    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 1 	Loss: tensor(1.0376, grad_fn=<AddBackward0>)
Epoch: 2 	Loss: tensor(0.9594, grad_fn=<AddBackward0>)
Epoch: 3 	Loss: tensor(0.8860, grad_fn=<AddBackward0>)
Epoch: 4 	Loss: tensor(0.8448, grad_fn=<AddBackward0>)
Epoch: 5 	Loss: tensor(0.7961, grad_fn=<AddBackward0>)
Epoch: 6 	Loss: tensor(0.7346, grad_fn=<AddBackward0>)
Epoch: 7 	Loss: tensor(0.6995, grad_fn=<AddBackward0>)
Epoch: 8 	Loss: tensor(0.6336, grad_fn=<AddBackward0>)
Epoch: 9 	Loss: tensor(0.6178, grad_fn=<AddBackward0>)
Epoch: 10 	Loss: tensor(0.5756, grad_fn=<AddBackward0>)
Epoch: 11 	Loss: tensor(0.5612, grad_fn=<AddBackward0>)
Epoch: 12 	Loss: tensor(0.5446, grad_fn=<AddBackward0>)
Epoch: 13 	Loss: tensor(0.5130, grad_fn=<AddBackward0>)
Epoch: 14 	Loss: tensor(0.4864, grad_fn=<AddBackward0>)
Epoch: 15 	Loss: tensor(0.4551, grad_fn=<AddBackward0>)
Epoch: 16 	Loss: tensor(0.4380, grad_fn=<AddBackward0>)
Epoch: 17 	Loss: tensor(0.4262, grad_fn=<AddBackward0>)
Epoch: 18 	Loss: tensor(0.4047, grad_fn=<AddBackward0>)
E

KeyboardInterrupt: 

In [116]:
weights = model.embeddings(torch.Tensor([list(range(0,vocab_size))]).long())
pd.DataFrame(weights.view(-1,100).tolist(), index=list(id2word.values())[0:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
said,-0.768659,-0.764412,-0.75166,-0.087763,0.488961,-0.163126,0.809996,-0.259608,-1.39441,-0.149467,...,0.091195,0.292485,-0.920918,-0.069922,-0.891817,0.103409,0.99043,0.617224,1.63124,-0.547462
alice,-0.969423,0.66706,0.463161,1.194068,-0.437506,0.280481,-0.690799,-0.509485,0.053489,1.955167,...,0.716134,0.669029,-1.596573,-0.522355,0.528245,0.727068,-0.724045,1.069976,-0.552863,2.143873
little,1.092711,-0.148174,0.208529,-1.195887,-0.461815,0.758844,-0.338614,-0.080432,0.093529,-0.788899,...,-1.243059,-0.081845,-1.045834,1.118203,0.643145,-1.559992,-0.857833,1.104035,-1.704522,-1.247187
one,2.062495,-1.02488,-0.702464,-0.464389,0.455664,0.703211,0.194177,-1.719035,-0.850512,-0.15409,...,1.818426,0.360028,0.058361,-1.510473,0.817229,0.461154,1.330694,-0.747295,-2.014755,-0.613017
would,0.166414,-0.027449,-0.113835,0.551269,0.900583,0.43614,-0.227811,1.848397,0.187505,-1.483287,...,-0.926194,-1.081074,-0.115204,-0.520761,-0.018871,-1.415264,-0.654339,0.67245,0.242389,0.732968


In [117]:
from sklearn.metrics.pairwise import euclidean_distances

weights = weights.view(-1,100)
distance_matrix = euclidean_distances(weights.detach().numpy())

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['little', 'small', 'would', 'child','girl']}

similar_words

{'little': ['farther', 'beginning', 'creep', 'gallons', 'eel'],
 'small': ['rock', 'beginning', 'balanced', 'birthday', 'clean'],
 'would': ['remarks', 'closed', 'fairy', 'trial', 'wild'],
 'child': ['permitted', 'throne', 'apples', 'sheep', 'vegetable'],
 'girl': ['busily', 'pressed', 'care', 'drop', 'eggs']}