In [31]:
import time
from random import random
import random
from preprocess_data import PreprocessData
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
# import torchvision
# from matplotlib import pyplot as plt

if torch.cuda.is_available():
    print('CUDA is available!')
    # Get the index of the current GPU device
    print('Current GPU Device:', torch.cuda.current_device())
    # Get properties of the current GPU
    print('GPU Properties:', torch.cuda.get_device_properties(torch.cuda.current_device()))
else:
    print('CUDA is not available.')

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCH = 30
BATCH_SIZE = 128
LR = 0.001
CONTEXT_SIZE = 2
EMBEDDING_DIM = 80

CUDA is available!
Current GPU Device: 0
GPU Properties: _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15102MB, multi_processor_count=40)


In [32]:
## Preprocess Data ##
p = PreprocessData()
p.download_data(from_id=1513, limit=50)
words = p.tokenize(remove_stop_words=True)
print(f'Number of words: {len(words)}')

vocab = set(words)
print(f'Vocabulary size: {len(vocab)}')
word_to_idx = {word: i for i, word in enumerate(vocab)}
print(f'Example of word to index: {list(word_to_idx.items())[:5]}')
idx_to_word = {i: word for word, i in word_to_idx.items()}
print(f'Example of index to word: {list(idx_to_word.items())[:5]}')

## Context-Target pairs ##
X = []
Y = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
    context = (
            [word_to_idx[words[i - j]] for j in range(1,CONTEXT_SIZE+1)]
            + [word_to_idx[words[i + j]] for j in range(1,CONTEXT_SIZE+1)]
    )
    target = word_to_idx[words[i]]
    X.append(context)
    Y.append(target)
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)
    # data.append((context, target))
print(f'Number of context-target pairs: {len(X)}')
print(f'Example of context-target pair: {X[0]} - {Y}')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 50/50 [00:57<00:00,  1.14s/it]


Number of words: 945357
Vocabulary size: 41380
Example of word to index: [('peanut', 0), ('unreasonable', 1), ('vilest', 2), ('underscore', 3), ('waldorf', 4)]
Example of index to word: [(0, 'peanut'), (1, 'unreasonable'), (2, 'vilest'), (3, 'underscore'), (4, 'waldorf')]
Number of context-target pairs: 945353
Example of context-target pair: tensor([23075, 27234, 24931, 25993], device='cuda:0') - tensor([14785, 24931, 25993,  ..., 23259,   315, 16254], device='cuda:0')


In [33]:
## Model ##

class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim).to(device)
        self.linear = nn.Linear(embedding_dim, vocab_size).to(device)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds = torch.sum(embeds, dim=1, keepdim=False)
        out = self.linear(embeds)
        return F.log_softmax(out, dim=1)  # softmax compute log probability



model = Model(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)


class SimpleIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, X, Y):
        super(SimpleIterableDataset).__init__()
        self.data = []
        for i in range(len(X)):
            self.data.append( (Y[i], X[i]) )
        random.shuffle(self.data)

    def __iter__(self):
        return iter(self.data)


In [34]:
ds = SimpleIterableDataset(X, Y)
dl = torch.utils.data.DataLoader(ds, batch_size = BATCH_SIZE)

In [35]:
## Training ##

losses = []
model.to(device)
model.train()

for epoch in range(EPOCH):
    start = time.time()
    total_loss = 0
    for labels, features in dl:
        labels = labels.to(device)
        features = features.to(device)
        model.zero_grad()
        log_probs = model(features)
        loss = loss_function(log_probs, labels)
        loss /= len(labels)
        loss.backward()
        optimizer.step()
        total_loss += loss

    losses.append(total_loss)
    print(f'Epoch {epoch+1}/{EPOCH} | Loss: {total_loss:.2f} | Time: {time.time() - start:.2f}s')


Epoch 1/30 | Loss: 478.91 | Time: 34.39s
Epoch 2/30 | Loss: 423.02 | Time: 34.09s
Epoch 3/30 | Loss: 403.39 | Time: 34.23s
Epoch 4/30 | Loss: 390.14 | Time: 34.36s
Epoch 5/30 | Loss: 379.88 | Time: 34.45s
Epoch 6/30 | Loss: 371.36 | Time: 34.52s
Epoch 7/30 | Loss: 363.99 | Time: 34.55s
Epoch 8/30 | Loss: 357.44 | Time: 34.56s
Epoch 9/30 | Loss: 351.51 | Time: 34.60s
Epoch 10/30 | Loss: 346.09 | Time: 34.63s
Epoch 11/30 | Loss: 341.08 | Time: 34.65s
Epoch 12/30 | Loss: 336.41 | Time: 34.67s
Epoch 13/30 | Loss: 332.06 | Time: 34.66s
Epoch 14/30 | Loss: 327.97 | Time: 34.66s
Epoch 15/30 | Loss: 324.12 | Time: 34.66s
Epoch 16/30 | Loss: 320.49 | Time: 34.66s
Epoch 17/30 | Loss: 317.06 | Time: 34.67s
Epoch 18/30 | Loss: 313.81 | Time: 34.66s
Epoch 19/30 | Loss: 310.73 | Time: 34.66s
Epoch 20/30 | Loss: 307.80 | Time: 34.64s
Epoch 21/30 | Loss: 305.03 | Time: 34.62s
Epoch 22/30 | Loss: 302.39 | Time: 34.62s
Epoch 23/30 | Loss: 299.89 | Time: 34.61s
Epoch 24/30 | Loss: 297.50 | Time: 34.60s
E

In [83]:
    # COSINE SIMILARITY
    from sklearn.metrics.pairwise import cosine_similarity



In [82]:

arr = ['king', 'queen', 'man', 'woman', 'castle', 'lion', 'cat', 'pet']

dict_vector = dict()
for w in arr:
  dict_vector[w] = model.embeddings.weight[word_to_idx[w]].detach().cpu().numpy().reshape(1, -1)


res = cosine_similarity(dict_vector['lion'], dict_vector['lion'])
print(f'Similarity between lion and lion: {res[0][0]:.2f}')

res = cosine_similarity(dict_vector['lion'], dict_vector['cat'])
print(f'Similarity between lion and cat: {res[0][0]:.2f}')


res = cosine_similarity(dict_vector['lion'], dict_vector['pet'])
print(f'Similarity between lion and pet: {res[0][0]:.2f}')

res = cosine_similarity(dict_vector['king'], dict_vector['queen'])
print(f'Similarity between king and queen: {res[0][0]:.2f}')

king = dict_vector['king'].reshape(-1)
man = dict_vector['man'].reshape(-1)
woman = dict_vector['woman'].reshape(-1)
new_vector = king - man + woman
res = cosine_similarity(dict_vector['queen'], new_vector.reshape(1,-1))
print(f'Similarity between queen and new_vector: {res[0][0]:.2f}')









Similarity between lion and lion: 1.00
Similarity between lion and cat: 0.17
Similarity between lion and pet: 0.15
Similarity between king and queen: 0.25
Similarity between queen and new_vector: 0.20
