DataPrepairing

In [5]:
import nltk
import torch
import torch.nn as nn
from nltk.tokenize import RegexpTokenizer
from torch.utils.data import Dataset, DataLoader
from WordDataset import WordDataset
from Translator import Translator
from tqdm import tqdm
import matplotlib.pyplot as plt

device = torch.device('cuda')

In [6]:
source = []
target = []

with open('rus.txt') as f:
    for line in f:
        t, s = line.split('\t')[:2]
        target.append(t)
        source.append(s)

tokenizer = RegexpTokenizer(r'\w+')

target_bag_of_words = []
source_bag_of_words = []

target_sentences = []
source_sentences = []

for i in range(len(target)):
    t_sent = target[i]
    s_sent = source[i]
    t_tokens = tokenizer.tokenize(t_sent)
    s_tokens = tokenizer.tokenize(s_sent)
    
    target_bag_of_words.extend(t_tokens)
    source_bag_of_words.extend(s_tokens)

    target_sentences.append(t_tokens)
    source_sentences.append(s_tokens)
    

special_symbols = ['<SOS>', '<EOS>', '<PAD>', '<UNK>']

target_bag_of_words.extend(special_symbols)
source_bag_of_words.extend(special_symbols)
target_bag_of_words = set(target_bag_of_words)
source_bag_of_words = set(source_bag_of_words)

source_word2ind = {word: ind for ind, word in enumerate(source_bag_of_words)}
target_word2ind = {word: ind for ind, word in enumerate(target_bag_of_words)}
source_ind2word = {ind: word for ind, word in enumerate(source_bag_of_words)}
target_ind2word = {ind: word for ind, word in enumerate(target_bag_of_words)}


In [7]:
del target_bag_of_words
del source_bag_of_words
del special_symbols
del tokenizer

In [8]:
max_len = max(max([len(sentence) for sentence in target_sentences]), max([len(sentence) for sentence in source_sentences]))

In [9]:
dataset = WordDataset(source_sentences, target_sentences, source_word2ind, target_word2ind, max_len = max_len)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)

In [10]:
def train_model(model, criterion, optimizer, dataloader, num_epoch):
    model.train()
    losses = []
    for epoch in range(1, num_epoch+1):
        print(f'epoch:{epoch}')
        for source, target in tqdm(dataloader):
            optimizer.zero_grad()
            
            target_input = target[:, :-1].to(device)
            target_output = target[:, 1:].to(device).flatten(start_dim = 0, end_dim = 1)

            outp = model(source.to(device), target_input).squeeze()
            outp = outp.flatten(start_dim = 0, end_dim = 1)

            
            loss = criterion(outp.to(device), target_output)
            loss.backward()
            optimizer.step()
            
            losses.append(loss.item())
    
    return losses
            

In [11]:
print(len(target_word2ind))

18061


In [12]:
model = Translator(len(source_word2ind), len(target_word2ind), 100).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [13]:
losses = train_model(model, criterion, optimizer, dataloader, 1)

epoch:1


  0%|          | 1/1420 [00:00<09:50,  2.40it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.76 GiB. GPU 0 has a total capacity of 7.74 GiB of which 97.88 MiB is free. Including non-PyTorch memory, this process has 7.45 GiB memory in use. Of the allocated memory 5.52 GiB is allocated by PyTorch, and 1.81 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
plt.plot(losses)
plt.show()