In [1]:
# Toy training corpus: nine short sentences, one string per sentence.
# Every sentence is lowercase and has no punctuation, so a plain
# whitespace split is enough to obtain word tokens.
corpus = [
    "today is your day",
    "you are off to great places",
    "you are off and away",
    "you have brains in your head",
    "you have feet in your shoes",
    "you can steer yourself any direction you choose",
    "you are on your own",
    "and you know what you know",
    "and you are the guy who will decide where to go"
]

In [9]:
import numpy as np

## Creating the training data

In [45]:
#breaks the corpus into one-word tokens

def tokenize(corpus):
    """Split every sentence of ``corpus`` into words.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to split; each one is split on runs of whitespace.

    Returns
    -------
    list of list of str
        One list of words per input sentence, in the input order.
    """
    return [sentence.split() for sentence in corpus]

In [3]:
# One list of words per corpus sentence.
tokens = tokenize(corpus)

In [5]:
# Two-way mapping between words and integer ids.
# Ids are consecutive integers starting at 0, handed out in the order in
# which each distinct word is first seen while scanning the sentences.

word2id = {}
for sentence in tokens:
    for word in sentence:
        # setdefault leaves already-known words untouched; for a new word
        # the current dictionary size is exactly the next unused id.
        word2id.setdefault(word, len(word2id))

# Reverse lookup: id -> word.
id2word = {idx: word for word, idx in word2id.items()}

In [7]:
# Build [center_id, context_id] pairs for every word and each of its
# neighbours that lie at most `window` positions away in the same sentence.

window = 2
context = []

for sentence in tokens:
    ids = [word2id[word] for word in sentence]
    for pos, id_cent in enumerate(ids):
        first = max(pos - window, 0)
        last = min(pos + window + 1, len(ids))
        # Walk the window left to right, skipping the center itself, so that
        # left-hand neighbours are emitted before right-hand ones.
        for other in range(first, last):
            if other != pos:
                context.append([id_cent, ids[other]])

In [10]:
# Turn the list of [center_id, context_id] pairs into a NumPy array
# (one pair per row).
context = np.array(context)

[[ 0  1]
 [ 0  2]
 [ 1  0]
 [ 1  2]
 [ 1  3]
 [ 2  0]
 [ 2  1]
 [ 2  3]
 [ 3  1]
 [ 3  2]
 [ 4  5]
 [ 4  6]
 [ 5  4]
 [ 5  6]
 [ 5  7]
 [ 6  4]
 [ 6  5]
 [ 6  7]
 [ 6  8]
 [ 7  5]
 [ 7  6]
 [ 7  8]
 [ 7  9]
 [ 8  6]
 [ 8  7]
 [ 8  9]
 [ 9  7]
 [ 9  8]
 [ 4  5]
 [ 4  6]
 [ 5  4]
 [ 5  6]
 [ 5 10]
 [ 6  4]
 [ 6  5]
 [ 6 10]
 [ 6 11]
 [10  5]
 [10  6]
 [10 11]
 [11  6]
 [11 10]
 [ 4 12]
 [ 4 13]
 [12  4]
 [12 13]
 [12 14]
 [13  4]
 [13 12]
 [13 14]
 [13  2]
 [14 12]
 [14 13]
 [14  2]
 [14 15]
 [ 2 13]
 [ 2 14]
 [ 2 15]
 [15 14]
 [15  2]
 [ 4 12]
 [ 4 16]
 [12  4]
 [12 16]
 [12 14]
 [16  4]
 [16 12]
 [16 14]
 [16  2]
 [14 12]
 [14 16]
 [14  2]
 [14 17]
 [ 2 16]
 [ 2 14]
 [ 2 17]
 [17 14]
 [17  2]
 [ 4 18]
 [ 4 19]
 [18  4]
 [18 19]
 [18 20]
 [19  4]
 [19 18]
 [19 20]
 [19 21]
 [20 18]
 [20 19]
 [20 21]
 [20 22]
 [21 19]
 [21 20]
 [21 22]
 [21  4]
 [22 20]
 [22 21]
 [22  4]
 [22 23]
 [ 4 21]
 [ 4 22]
 [ 4 23]
 [23 22]
 [23  4]
 [ 4  5]
 [ 4 24]
 [ 5  4]
 [ 5 24]
 [ 5  2]
 [24  4]
 [24  5]
 

In [11]:
# All word ids, 0 .. vocabulary size - 1. The size is taken from word2id
# instead of a hard-coded 35 so it stays correct if the corpus changes.
data = np.arange(len(word2id))

In [13]:
# For every word id in `data`, gather the ids of all context words paired
# with it in `context`, keeping the order in which the pairs occur.
target = [
    [pair[1] for pair in context if pair[0] == word_id]
    for word_id in data
]

In [14]:
# Number of distinct words in the corpus (35 for the corpus above).
# Derived from word2id so it cannot drift out of sync with the data.
vocab_size = len(word2id)

## training the model

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dset
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler, Sampler
import torch.functional as F
import torch.nn.functional as F

from torchvision import transforms

import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
#one-hot word representation
def get_input(word_id, size=None):
    """Return the one-hot vector for ``word_id``.

    Parameters
    ----------
    word_id : int
        Position that is set to 1.0; every other entry is 0.0.
    size : int, optional
        Length of the vector. When omitted, the notebook-level
        ``vocab_size`` is used, which matches the original behaviour.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size``.

    Raises
    ------
    IndexError
        If ``word_id`` is outside the vector.
    """
    if size is None:
        # Fall back to the global so existing one-argument calls keep working.
        size = vocab_size
    x = np.zeros(size)
    x[word_id] = 1.0
    return x

In [23]:
# Embedding (hidden layer) size: the training cells create w1 with shape
# (emb_dim, vocab_size) and w2 with shape (vocab_size, emb_dim).
emb_dim = 6

In [25]:
#finding the best lr and epoch number to get the minimal loss
#
# Grid search: for every epoch count in 101..119 and every learning rate in
# (0.001, 0.01, 0.1), train a fresh pair of weight matrices with per-pair SGD
# and print the setting whenever its final-epoch mean loss beats the best
# value seen so far.
# NOTE(review): the weights are re-drawn with torch.randn for every setting
# and no seed is set in this cell, so settings are compared from different
# random starting points - confirm whether that is intended.

min_loss = float("inf")  # any finite loss improves on this
for num_epochs in range(101, 120):
    # Explicit grid instead of repeatedly multiplying a float by 10.
    for lr in (0.001, 0.01, 0.1):
        w1 = torch.randn(emb_dim, vocab_size, requires_grad=True)
        w2 = torch.randn(vocab_size, emb_dim, requires_grad=True)
        glob_loss = 0
        for epoch in range(num_epochs):
            loss_val = 0
            # Loop names are chosen so that the `target` list built earlier
            # in the notebook is not overwritten by this loop.
            for center_id, context_id in context:
                x = torch.from_numpy(get_input(center_id)).float()
                targ = torch.from_numpy(np.array([context_id])).long()

                z1 = torch.matmul(w1, x)
                z2 = torch.matmul(w2, z1)
                log_sfm = F.log_softmax(z2, dim=0)

                loss = F.nll_loss(log_sfm.view(1, -1), targ)
                loss_val += loss.item()
                loss.backward()
                # Plain SGD step; done under no_grad so the update itself is
                # not recorded by autograd.
                with torch.no_grad():
                    w1 -= lr * w1.grad
                    w2 -= lr * w2.grad
                    w1.grad.zero_()
                    w2.grad.zero_()

            # Mean loss per training pair for this epoch; only the value of
            # the last epoch is compared below.
            glob_loss = loss_val / len(context)
        if glob_loss < min_loss:
            min_loss = glob_loss
            print("lr = ", lr, ", epoch = ", num_epochs)

lr =  0.001 , epoch =  101
lr =  0.01 , epoch =  101


In [27]:
#to see performance for the best lr-epoch pair
#
# Trains the two weight matrices with per-pair SGD and prints the mean loss
# per training pair every 10th epoch.

num_epochs = 101
lr = 0.01
w1 = torch.randn(emb_dim, vocab_size, requires_grad=True)
w2 = torch.randn(vocab_size, emb_dim, requires_grad=True)
for epoch in range(num_epochs):
    loss_val = 0
    # Loop names are chosen so that the `target` list built earlier in the
    # notebook is not overwritten by this loop.
    for center_id, context_id in context:
        x = torch.from_numpy(get_input(center_id)).float()
        targ = torch.from_numpy(np.array([context_id])).long()

        z1 = torch.matmul(w1, x)            # hidden vector for the center word
        z2 = torch.matmul(w2, z1)           # one score per vocabulary word
        log_sfm = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_sfm.view(1, -1), targ)
        loss_val += loss.item()
        loss.backward()
        # Plain SGD step; done under no_grad so the update itself is not
        # recorded by autograd.
        with torch.no_grad():
            w1 -= lr * w1.grad
            w2 -= lr * w2.grad
            w1.grad.zero_()
            w2.grad.zero_()
    if epoch % 10 == 0:
        print("loss = ", loss_val/len(context))

loss =  4.958078961262758
loss =  3.5261169891247803
loss =  3.1415076968313635
loss =  2.9157948137699874
loss =  2.740347830728553
loss =  2.5890819334435737
loss =  2.4573561268291253
loss =  2.3464382763566642
loss =  2.2564077089572776
loss =  2.1845423421640504
loss =  2.1278072253040885
