In [90]:
import time
from random import random
import random
from preprocess_data import PreprocessData
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
# import torchvision
# from matplotlib import pyplot as plt

if torch.cuda.is_available():
    print('CUDA is available!')
    # Get the index of the current GPU device
    print('Current GPU Device:', torch.cuda.current_device())
    # Get properties of the current GPU
    print('GPU Properties:', torch.cuda.get_device_properties(torch.cuda.current_device()))
else:
    print('CUDA is not available.')

torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCH = 50
BATCH_SIZE = 32
LR = 0.01
CONTEXT_SIZE = 3
EMBEDDING_DIM = 20

CUDA is not available.


In [91]:
## Preprocess Data ## 
p = PreprocessData()
p.download_data(from_id=1513, limit=10)
words = p.tokenize(remove_stop_words=True)
print(f'Number of words: {len(words)}')

vocab = set(words)
print(f'Vocabulary size: {len(vocab)}')
word_to_idx = {word: i for i, word in enumerate(vocab)}
print(f'Example of word to index: {list(word_to_idx.items())[:5]}')
idx_to_word = {i: word for word, i in word_to_idx.items()}
print(f'Example of index to word: {list(idx_to_word.items())[:5]}')

## Context-Target pairs ##
X = []
Y = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
    context = (
            [word_to_idx[words[i - j]] for j in range(1,CONTEXT_SIZE+1)]
            + [word_to_idx[words[i + j]] for j in range(1,CONTEXT_SIZE+1)]
    )
    target = word_to_idx[words[i]]
    X.append(context)
    Y.append(target)
X = torch.tensor(X)
Y = torch.tensor(Y)
    # data.append((context, target))
print(f'Number of context-target pairs: {len(X)}')
print(f'Example of context-target pair: {X[0]} - {Y}')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vadimmusatskov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vadimmusatskov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 10/10 [00:15<00:00,  1.50s/it]


Number of words: 139796
Vocabulary size: 12510
Example of word to index: [('cozeners', 0), ('leave', 1), ('slaught', 2), ('bleats', 3), ('beaver', 4)]
Example of index to word: [(0, 'cozeners'), (1, 'leave'), (2, 'slaught'), (3, 'bleats'), (4, 'beaver')]
Number of context-target pairs: 139790
Example of context-target pair: tensor([7448, 6732, 2736, 5248, 3531, 7938]) - tensor([7553, 5248, 3531,  ..., 1915, 6048, 1120])


In [92]:
## Model ##

class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim).to(device)
        self.linear = nn.Linear(embedding_dim, vocab_size).to(device)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds = torch.sum(embeds, dim=1, keepdim=False)
        out = self.linear(embeds)
        return F.log_softmax(out, dim=1)  # softmax compute log probability


loss_function = nn.NLLLoss()
model = Model(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

optimizer = optim.Adam(model.parameters(), lr=LR)

    
# def generate_batches(data_, batch_size):
#     for i in range(0, len(data_), batch_size):
#         yield data_[i: i + batch_size]
        
class SimpleIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, X, Y):
        super(SimpleIterableDataset).__init__()
        self.data = []
        for i in range(len(X)):
            self.data.append( (Y[i], X[i]) )
        random.shuffle(self.data)

    def __iter__(self):
        return iter(self.data)


In [93]:
ds = SimpleIterableDataset(X, Y)
dl = torch.utils.data.DataLoader(ds, batch_size = BATCH_SIZE)

In [94]:
## Training ##
# TRAIN:
losses = []
for epoch in range(EPOCH):
    start = time.time()
    total_loss = 0
    # for batch in generate_batches(data, BATCH_SIZE):
    for labels, features in dl:
        model.zero_grad()
        log_probs = model(features)
        loss = loss_function(log_probs, labels)
        loss /= len(labels)
        loss.backward()
        optimizer.step()
        total_loss += loss
    
    losses.append(total_loss)
    print(f'Epoch {epoch+1}/{EPOCH} | Loss: {total_loss:.2f} | Time: {time.time() - start:.2f}s')


Epoch 1/50 | Loss: 1069.96 | Time: 13.44s
Epoch 2/50 | Loss: 924.25 | Time: 13.53s
Epoch 3/50 | Loss: 864.46 | Time: 13.15s
Epoch 4/50 | Loss: 825.60 | Time: 13.42s
Epoch 5/50 | Loss: 796.64 | Time: 13.61s
Epoch 6/50 | Loss: 774.60 | Time: 14.36s
Epoch 7/50 | Loss: 757.06 | Time: 14.24s
Epoch 8/50 | Loss: 742.97 | Time: 13.74s
Epoch 9/50 | Loss: 731.48 | Time: 13.38s
Epoch 10/50 | Loss: 722.21 | Time: 13.46s
Epoch 11/50 | Loss: 714.30 | Time: 13.32s
Epoch 12/50 | Loss: 707.69 | Time: 13.81s
Epoch 13/50 | Loss: 702.31 | Time: 13.78s
Epoch 14/50 | Loss: 697.71 | Time: 14.16s
Epoch 15/50 | Loss: 693.35 | Time: 14.63s
Epoch 16/50 | Loss: 690.05 | Time: 14.54s
Epoch 17/50 | Loss: 686.86 | Time: 14.24s
Epoch 18/50 | Loss: 684.43 | Time: 14.97s
Epoch 19/50 | Loss: 681.68 | Time: 14.98s
Epoch 20/50 | Loss: 679.70 | Time: 15.16s
Epoch 21/50 | Loss: 677.29 | Time: 17.71s
Epoch 22/50 | Loss: 675.55 | Time: 16.34s
Epoch 23/50 | Loss: 673.61 | Time: 16.92s
Epoch 24/50 | Loss: 672.22 | Time: 18.00s
