In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk

In [12]:
def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list with one token list per sentence, in the original order.
        Punctuation is not separated from words (e.g. ``'Github.'`` stays
        a single token) because only ``str.split()`` is applied.
    """
    tokenized_sentences = []
    for sentence in corpus:
        tokenized_sentences.append(sentence.split())
    return tokenized_sentences
def prepare_vocab(tokenized_corpus):
    """Collect the distinct tokens of a tokenized corpus in first-seen order.

    Args:
        tokenized_corpus: iterable of sentences, each an iterable of
            hashable tokens (strings in this notebook).

    Returns:
        A list containing every distinct token exactly once, ordered by
        first appearance.
    """
    vocabulary = []
    # A set gives O(1) membership checks; the list alone would make the
    # de-duplication quadratic in the vocabulary size.
    seen = set()
    for sentence in tokenized_corpus:
        for token in sentence:
            if token not in seen:
                seen.add(token)
                vocabulary.append(token)
    return vocabulary

In [23]:
corpus = ["Microsoft acquired Github. And who knows what would happen to Github."]

# The vocabulary is built from every sentence, while the rest of the notebook
# works on the tokens of the first (and only) sentence.
tokenized_sentences = tokenize_corpus(corpus)
vocabulary = prepare_vocab(tokenized_sentences)
tokenized = tokenized_sentences[0]

print(tokenized)
print(vocabulary)

['Microsoft', 'acquired', 'Github.', 'And', 'who', 'knows', 'what', 'would', 'happen', 'to', 'Github.']
['Microsoft', 'acquired', 'Github.', 'And', 'who', 'knows', 'what', 'would', 'happen', 'to']


In [24]:
# Preview the plain 5-token sliding windows over the sentence; no padding is
# added here, so every window consists of real tokens only.
list(nltk.ngrams(tokenized,5))

[('Microsoft', 'acquired', 'Github.', 'And', 'who'),
 ('acquired', 'Github.', 'And', 'who', 'knows'),
 ('Github.', 'And', 'who', 'knows', 'what'),
 ('And', 'who', 'knows', 'what', 'would'),
 ('who', 'knows', 'what', 'would', 'happen'),
 ('knows', 'what', 'would', 'happen', 'to'),
 ('what', 'would', 'happen', 'to', 'Github.')]

In [25]:
# Map each vocabulary word to a dense integer id (0, 1, 2, ...) in vocabulary
# order; these ids are later used as row indices into the embedding tables.
word2index = {}
for voca in vocabulary:
    # Membership test instead of comparing .get() with None; the guard keeps
    # the ids contiguous even if a word were to appear twice.
    if voca not in word2index:
        word2index[voca] = len(word2index)
print(word2index)

{'And': 3, 'what': 6, 'would': 7, 'knows': 5, 'Github.': 2, 'acquired': 1, 'who': 4, 'to': 9, 'happen': 8, 'Microsoft': 0}


In [46]:
WINDOW_SIZE = 2
WINDOW_SIZE1 = 3


def _padded_windows(tokens, half_width):
    """Return every (2 * half_width + 1)-token window centred on a token.

    The token list is padded with ``half_width`` '<DUMMY>' markers on each
    side so that the first and last real tokens also get a full window.
    """
    padding = ['<DUMMY>'] * half_width
    return list(nltk.ngrams(padding + tokens + padding, half_width * 2 + 1))


windows = _padded_windows(tokenized, WINDOW_SIZE)
windows1 = _padded_windows(tokenized, WINDOW_SIZE1)
# print() call form: matches the other cells and also runs on Python 3.
print(windows)
windows1

[('<DUMMY>', '<DUMMY>', 'Microsoft', 'acquired', 'Github.'), ('<DUMMY>', 'Microsoft', 'acquired', 'Github.', 'And'), ('Microsoft', 'acquired', 'Github.', 'And', 'who'), ('acquired', 'Github.', 'And', 'who', 'knows'), ('Github.', 'And', 'who', 'knows', 'what'), ('And', 'who', 'knows', 'what', 'would'), ('who', 'knows', 'what', 'would', 'happen'), ('knows', 'what', 'would', 'happen', 'to'), ('what', 'would', 'happen', 'to', 'Github.'), ('would', 'happen', 'to', 'Github.', '<DUMMY>'), ('happen', 'to', 'Github.', '<DUMMY>', '<DUMMY>')]


[('<DUMMY>', '<DUMMY>', '<DUMMY>', 'Microsoft', 'acquired', 'Github.', 'And'),
 ('<DUMMY>', '<DUMMY>', 'Microsoft', 'acquired', 'Github.', 'And', 'who'),
 ('<DUMMY>', 'Microsoft', 'acquired', 'Github.', 'And', 'who', 'knows'),
 ('Microsoft', 'acquired', 'Github.', 'And', 'who', 'knows', 'what'),
 ('acquired', 'Github.', 'And', 'who', 'knows', 'what', 'would'),
 ('Github.', 'And', 'who', 'knows', 'what', 'would', 'happen'),
 ('And', 'who', 'knows', 'what', 'would', 'happen', 'to'),
 ('who', 'knows', 'what', 'would', 'happen', 'to', 'Github.'),
 ('knows', 'what', 'would', 'happen', 'to', 'Github.', '<DUMMY>'),
 ('what', 'would', 'happen', 'to', 'Github.', '<DUMMY>', '<DUMMY>'),
 ('would', 'happen', 'to', 'Github.', '<DUMMY>', '<DUMMY>', '<DUMMY>')]

In [63]:
# Skip-gram style pairs: for each window, pair the centre word with every other
# real word in that window. The centre position itself and '<DUMMY>' padding
# slots are left out.
train_data = [
    (window[WINDOW_SIZE], window[offset])
    for window in windows
    for offset in range(WINDOW_SIZE * 2 + 1)
    if offset != WINDOW_SIZE and window[offset] != '<DUMMY>'
]

print(train_data)

# NOTE(review): disabled experiment, kept as a bare string literal rather than
# as real comments. It is never executed; because the string is the last
# expression in the cell, the notebook echoes its text as the cell result.
# Consider deleting it once it is confirmed to be no longer needed.
'''new_trigrams = []

for window in windows1:
    c=2
    while c < (WINDOW_SIZE1 * 2 + 1) - 2:
        new_trigrams.append((window[c], window[c+1], window[c+2]))
        c += 2
print new_trigrams'''

[('Microsoft', 'acquired'), ('Microsoft', 'Github.'), ('acquired', 'Microsoft'), ('acquired', 'Github.'), ('acquired', 'And'), ('Github.', 'Microsoft'), ('Github.', 'acquired'), ('Github.', 'And'), ('Github.', 'who'), ('And', 'acquired'), ('And', 'Github.'), ('And', 'who'), ('And', 'knows'), ('who', 'Github.'), ('who', 'And'), ('who', 'knows'), ('who', 'what'), ('knows', 'And'), ('knows', 'who'), ('knows', 'what'), ('knows', 'would'), ('what', 'who'), ('what', 'knows'), ('what', 'would'), ('what', 'happen'), ('would', 'knows'), ('would', 'what'), ('would', 'happen'), ('would', 'to'), ('happen', 'what'), ('happen', 'would'), ('happen', 'to'), ('happen', 'Github.'), ('to', 'would'), ('to', 'happen'), ('to', 'Github.'), ('Github.', 'happen'), ('Github.', 'to')]


'new_trigrams = []\n\nfor window in windows1:\n    c=2\n    while c < (WINDOW_SIZE1 * 2 + 1) - 2:\n        new_trigrams.append((window[c], window[c+1], window[c+2]))\n        c += 2\nprint new_trigrams'

In [64]:
def prepare_word(word, word2index):
    """Look up ``word`` and wrap its integer id in a one-element LongTensor.

    Args:
        word: the token to encode.
        word2index: mapping from token to integer id.

    Returns:
        A ``Variable`` holding a LongTensor with the single id of ``word``.

    Raises:
        KeyError: if ``word`` is not a key of ``word2index``.
    """
    word_id = word2index[word]
    id_tensor = torch.LongTensor([word_id])
    return Variable(id_tensor)
X_p, y_p = [], []

# Encode every (center, context) word pair as a pair of 1x1 index tensors.
for (center, context) in train_data:
    X_p.append(prepare_word(center, word2index).view(1, -1))
    y_p.append(prepare_word(context, word2index).view(1, -1))

# NOTE: this rebinds train_data from word pairs to tensor pairs, so this cell
# cannot be re-run on its own; re-run the cell that builds the word pairs first.
train_data = list(zip(X_p, y_p))
# print() call form: matches the other cells and also runs on Python 3.
print(train_data[-1])


(Variable containing:
 2
[torch.LongTensor of size 1x1]
, Variable containing:
 9
[torch.LongTensor of size 1x1]
)


In [66]:
# Two independent lookup tables with one row per vocabulary word: one is used
# when a word is the centre word, the other when it appears as context.
vocab_size = len(word2index)
embedding_dim = 5
center_embed = nn.Embedding(vocab_size, embedding_dim)
context_embed = nn.Embedding(vocab_size, embedding_dim)

for embedding_table in (center_embed, context_embed):
    print(embedding_table.weight)

Parameter containing:
-1.3266  1.6527  1.2775 -1.0167 -1.0160
 0.1447  1.2489  1.0638 -2.0848 -0.7556
 0.4026  0.8455 -2.1360 -0.1086 -0.6601
 1.2523 -1.3154 -1.6642  0.9402  0.4511
 0.8422 -0.9307  1.7267 -1.7794  0.0670
-0.9816 -1.7519 -1.7928  0.0142 -1.3582
-2.1999  1.2694 -0.3655  0.5985 -0.0742
-0.6623  2.0116  0.6237 -0.5345  1.4169
 1.2124 -0.2995 -1.9434 -0.2992 -0.5486
-1.2226  0.9033 -1.0982  1.8611 -0.7820
[torch.FloatTensor of size 10x5]

Parameter containing:
 0.5526  0.9490  0.9828 -0.1001 -1.1507
 1.1599 -0.2314 -0.7671 -0.8115 -0.1359
-0.7755  2.0212  0.6821  0.7479  1.9533
-0.0649  0.7564 -0.8560 -0.1514  1.1544
-1.1941 -1.6410 -0.0651  0.1232  1.0290
 0.2994 -0.0119 -0.2766  0.3266  1.1709
 1.0905  0.4083  0.8012  2.2652  0.5704
-0.6686 -0.0272  0.7378 -1.0261 -0.3335
-0.7826 -0.1250  0.4643  0.6210  0.3014
 0.0656 -0.9330 -0.1016  0.9310 -0.0992
[torch.FloatTensor of size 10x5]



In [68]:
# Embed the first and the last training pair to inspect what the lookup returns.
center, context = train_data[0]
center1, context1 = train_data[-1]

center_vector, context_vector = center_embed(center), context_embed(context)
center_vector1, context_vector1 = center_embed(center1), context_embed(context1)

for embedded in (center_vector, context_vector, center_vector1, context_vector1):
    print(embedded)

Variable containing:
(0 ,.,.) = 
 -1.3266  1.6527  1.2775 -1.0167 -1.0160
[torch.FloatTensor of size 1x1x5]

Variable containing:
(0 ,.,.) = 
  1.1599 -0.2314 -0.7671 -0.8115 -0.1359
[torch.FloatTensor of size 1x1x5]

Variable containing:
(0 ,.,.) = 
  0.4026  0.8455 -2.1360 -0.1086 -0.6601
[torch.FloatTensor of size 1x1x5]

Variable containing:
(0 ,.,.) = 
  0.0656 -0.9330 -0.1016  0.9310 -0.0992
[torch.FloatTensor of size 1x1x5]



In [71]:
def _pair_score(center_vec, context_vec):
    """Return exp(context . center) for one embedded (center, context) pair.

    Both arguments must be 3-D tensors accepted by ``bmm``. The centre vector
    has its last two dimensions swapped so the batched matrix product yields
    the dot product of the two embeddings, and the trailing dimension is then
    squeezed away. The result is an unnormalised score, not a probability.
    """
    return torch.exp(context_vec.bmm(center_vec.transpose(1, 2))).squeeze(2)


score = _pair_score(center_vector, context_vector)
# print() call form: matches the other cells and also runs on Python 3.
print(score)
score1 = _pair_score(center_vector1, context_vector1)
print(score1)

Variable containing:
 0.1440
[torch.FloatTensor of size 1x1]

Variable containing:
 0.5593
[torch.FloatTensor of size 1x1]

