In [4]:
import time
from random import random
from preprocess_data import PreprocessData
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import torchvision
# from matplotlib import pyplot as plt

if torch.cuda.is_available():
    print('CUDA is available!')
    # Get the index of the current GPU device
    print('Current GPU Device:', torch.cuda.current_device())
    # Get properties of the current GPU
    print('GPU Properties:', torch.cuda.get_device_properties(torch.cuda.current_device()))
else:
    print('CUDA is not available.')

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCH = 100
BATCH_SIZE = 64
LR = 0.01
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

CUDA is not available.


In [9]:
## Preprocess Data ## 
p = PreprocessData()
p.download_data(from_id=1513, limit=5)
words = p.tokenize(remove_stop_words=True)
print(f'Number of words: {len(words)}')

## Context-Target pairs ##
data = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
    context = (
            [words[i - (j - 1)] for j in range(CONTEXT_SIZE)]
            + [words[i + (j + 1)] for j in range(CONTEXT_SIZE)]
    )
    target = words[i]
    data.append((context, target))
print(f'Number of context-target pairs: {len(data)}')
print(f'Example of context-target pair: {data[0]}')

vocab = set(words)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vadimmusatskov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vadimmusatskov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 5/5 [00:07<00:00,  1.50s/it]


Number of words: 68525
Number of context-target pairs: 68521
Example of context-target pair: (['ebook', 'gutenberg', 'ebook', 'tragedy'], 'gutenberg')


In [10]:
## Model ##

class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim).to(device)
        self.linear = nn.Linear(embedding_dim, vocab_size).to(device)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds = torch.sum(embeds, dim=0, keepdim=True)
        out = self.linear(embeds)
        return F.log_softmax(out, dim=1)  # softmax compute log probability


loss_function = nn.NLLLoss()
model = Model(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

optimizer = optim.Adam(model.parameters(), lr=LR)

    
def generate_batches(data_, batch_size):
    for i in range(0, len(data_), batch_size):
        yield data_[i: i + batch_size]


In [11]:
## Training ##
# TRAIN:
losses = []
for epoch in range(EPOCH):
    start = time.time()
    total_loss = 0
    for batch in generate_batches(data, BATCH_SIZE):
        batch_loss = 0
        # Accumulate gradients over the batch
        model.zero_grad()
        for context, target in batch:
            context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).to(device)
            log_probs = model(context_idxs)
            loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long).to(device))
            batch_loss += loss
        batch_loss /= len(batch)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    
    losses.append(total_loss)
    print(f'Epoch {epoch+1}/{EPOCH} | Loss: {total_loss:.2f} | Time: {time.time() - start:.2f}s')


Epoch 1/100 | Loss: 7851.97 | Time: 28.87s
Epoch 2/100 | Loss: 6665.59 | Time: 28.17s
Epoch 3/100 | Loss: 6164.05 | Time: 28.97s
Epoch 4/100 | Loss: 5819.56 | Time: 28.59s
Epoch 5/100 | Loss: 5566.97 | Time: 28.81s
Epoch 6/100 | Loss: 5371.24 | Time: 26.97s
Epoch 7/100 | Loss: 5217.74 | Time: 27.01s
Epoch 8/100 | Loss: 5088.81 | Time: 28.20s
Epoch 9/100 | Loss: 4984.50 | Time: 28.37s
Epoch 10/100 | Loss: 4894.37 | Time: 27.96s


KeyboardInterrupt: 