In [None]:
import torch
from torch.optim import SGD
from torch.autograd import Variable, profiler
import numpy as np
import torch.functional as F
import torch.nn.functional as F

import pandas as pd

from IPython.core.display import display, HTML, Image
# Inject a CSS rule that sets elements with class `container` to 98% width,
# so wide outputs (e.g. the large PCA figure) get more horizontal room.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
# Training corpus: a list holding one multi-line string. It is tokenised further
# down with str.split(), so case and punctuation are part of each token
# (e.g. "Me" and "me" are different vocabulary entries).
corpus = ["""Me and my uncle went ridin' down
To South Colorado, west Texas bound
We stopped over in Santa Fe,
That day on the pony, just about half way
And you know it was the hottest part of the day
I took the horses up to the stall
Went to the barroom, ordered drinks for all
Three days in the saddle, You know my body hurt
It being summer, I took off my shirt
And I tried to wash off some of that dusty dirt
West Texas cowboys, they was all around,
Wheat liquor and money, they loaded down,
So soon after payday, no one seemed ashamed,
You know my uncle, he starts playin' the game,
Hey! A hollow jack and the winner take the hand.
My uncle starts winning, the cowboys got sore,
One of them called him, and then two more,
Accused him of cheatin', oh no it couldn't be,
I know my uncle he's as honest as me,
And I'm as honest as a Denver man can be.
One of them cowboys he starsts to draw,
And I shot him down Lord, He never saw,
Shot me another, Right then he hit the floor,
In the confusion, my uncle grabbed the gold,
And we hightailed it down to Mexico.
Now I love thoe cowboys, I love their gold,
Love my uncle, God rest his soul,
Taught me good Lord, Taught me all I know,
Taught me so well, that I grabbed that gold, and
I left his dead ass there by the side of the road"""]

In [None]:
# Build the vocabulary: every distinct whitespace-separated token, in order of
# first appearance. Tokens are case- and punctuation-sensitive.
# dict.fromkeys de-duplicates in linear time while keeping insertion order,
# which avoids scanning the growing list for every token.
words = list(dict.fromkeys(
    token for sentence in corpus for token in sentence.split()
))

# Lookup tables in both directions between tokens and their integer ids.
word2idx = {w: idx for idx, w in enumerate(words)}
idx2word = dict(enumerate(words))

vocabulary_size = len(word2idx)

In [None]:
## example
#word2idx
#idx2word

In [None]:
def get_word_embedding(word):
    """Return the one-hot encoding of ``word`` over the vocabulary.

    The result is a float NumPy vector of length ``vocabulary_size`` that is
    zero everywhere except at ``word2idx[word]``, where it is 1.
    Raises ``KeyError`` if ``word`` is not in ``word2idx``.
    """
    one_hot = np.zeros(vocabulary_size)
    hot_position = word2idx[word]
    one_hot[hot_position] = 1
    return one_hot

In [None]:
## example
#get_word_embedding('me')

In [None]:
# Length of each learned word vector (first dimension of W1, second of W2).
embedding_dims = 10
# Maximum distance, in tokens, between a center word and a context word on either side.
window_size = 4

In [None]:
def train_generator():
    """Yield skip-gram training pairs ``(center_one_hot, context_index)``.

    Every token position in every corpus entry acts once as the center word.
    For each center, all positions at most ``window_size`` tokens away
    (excluding the center itself and anything outside the text) are emitted
    from left to right. The center is a fresh one-hot float vector of length
    ``vocabulary_size``; the context is its integer vocabulary index.
    """
    for text in corpus:
        token_ids = [word2idx[token] for token in text.split()]
        n_tokens = len(token_ids)
        for center_pos, center_id in enumerate(token_ids):
            # Clamp the window to the text instead of range-checking every offset.
            first = max(0, center_pos - window_size)
            last = min(n_tokens, center_pos + window_size + 1)
            for context_pos in range(first, last):
                if context_pos == center_pos:
                    continue
                center_vec_one_hot = np.zeros(vocabulary_size)
                center_vec_one_hot[center_id] = 1
                yield center_vec_one_hot, token_ids[context_pos]

In [None]:
# Network definition: a skip-gram model with no biases or non-linearities.
#   W1 (embedding_dims x vocabulary_size) maps a one-hot center word to its hidden vector.
#   W2 (vocabulary_size x embedding_dims) maps that vector to one score per vocabulary word.
# Seeded so that Restart & Run All reproduces the same weights and loss values.
torch.manual_seed(0)
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)

# Plain SGD (no momentum, no weight decay): each step applies w -= 0.01 * grad.
optimizer = SGD([W1, W2], lr=0.01)

for epo in range(501):
    avg_loss = 0
    samples = 0
    for data, target in train_generator():
        # The one-hot vector arrives as float64 NumPy data; the weights are float32.
        x = torch.from_numpy(data).float()
        y_true = torch.tensor([target], dtype=torch.long)
        samples += len(y_true)

        a1 = torch.matmul(W1, x)   # hidden vector of the center word
        a2 = torch.matmul(W2, a1)  # one unnormalised score per vocabulary word

        log_softmax = F.log_softmax(a2, dim=0)
        # nll_loss expects (batch, classes), hence the view to a single-row batch.
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        avg_loss += loss.item()

        # Clear gradients from the previous sample, back-propagate, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epo % 50 == 0:
        # Mean per-sample loss over this epoch.
        print(avg_loss / samples)

In [None]:
%matplotlib inline
from scikitplot.decomposition import plot_pca_2d_projection
from sklearn.decomposition import PCA

In [None]:
# Project the word vectors stored in W1 (one row per vocabulary word after the
# transpose) onto their first two principal components and label every point.
word_vectors = W1.data.numpy().T
pca = PCA(n_components=2)
pca.fit(word_vectors)
proj = pca.transform(word_vectors)
ax = plot_pca_2d_projection(
    pca,
    word_vectors,
    np.array(words),
    feature_labels=words,
    figsize=(18, 18),
    text_fontsize=12,
)
# ax.legend(None)
for label, point in zip(words, proj):
    ax.annotate(label, (point[0], point[1]), size=16)

In [None]:
def get_word_vector_v(word):
    """Return the column of ``W1`` belonging to ``word`` as a NumPy array.

    The column index is looked up in ``word2idx``; an unknown word raises
    ``KeyError``. The result is detached from autograd.
    """
    column = word2idx[word]
    return W1.detach()[:, column].numpy()

def get_word_vector_u(word):
    """Return the row of ``W2`` belonging to ``word`` as a NumPy array.

    The row index is looked up in ``word2idx``; an unknown word raises
    ``KeyError``. The result is detached from autograd.
    """
    row = word2idx[word]
    return W2.detach()[row, :].numpy()

In [None]:
# me to we is like uncle to ?

In [None]:
def _combined_vector(word):
    """Sum of the word's v (W1) and u (W2) vectors, each weighted by 1."""
    return 1 * get_word_vector_v(word) + 1 * get_word_vector_u(word)


me, uncle, we = (_combined_vector(token) for token in ('me', 'uncle', 'we'))

# Analogy query vector: we - me + uncle
yyy = we - me + uncle

In [None]:
from scipy.spatial.distance import cosine


def _distance_to_query(word):
    """Cosine distance between ``yyy`` and the word's summed u + v vectors."""
    candidate = 1 * get_word_vector_u(word) + 1 * get_word_vector_v(word)
    return cosine(yyy, candidate)


# One (word, distance) pair per vocabulary entry, in vocabulary order.
distances = [(word, _distance_to_query(word)) for word in words]

In [None]:
# Example analogy: Poland is to Warsaw as Germany is to Berlin.
# NOTE(review): none of these words occur in this notebook's corpus.

In [None]:
# Show every (word, cosine distance) pair, nearest to the query vector first,
# so the best analogy candidates are at the top instead of in vocabulary order.
sorted(distances, key=lambda pair: pair[1])

In [None]:
# In what context does "Paris" appear?
# NOTE(review): "Paris" is not in this corpus; the cell below queries the word 'as'.

In [None]:
# Feed the W1 vector of 'as' through W2 and softmax the scores: one probability
# per vocabulary word of it being predicted as a context word of 'as'.
context_to_predict = get_word_vector_v('as')
hidden = torch.from_numpy(context_to_predict).float()
with torch.no_grad():  # inference only, no autograd graph needed
    a = torch.matmul(W2, hidden)
    probs = F.softmax(a, dim=0).numpy()
for context, prob in zip(words, probs):
    print(f'{context}: {prob:.2f}')

In [None]:
# One row per vocabulary word: column 0 is the word, column 1 its probability.
# Built with a single constructor call rather than one Series per word plus concat.
prob_of_contex_word = pd.DataFrame(list(zip(words, probs)))
# Most probable context words first.
prob_of_contex_word.sort_values(1, ascending=False)

In [None]:
# In context of "France" and "is"