## Домашнее задание 4
* Сравнить LSTM, RNN и GRU на задаче предсказания части речи (качество предсказания, скорость обучения, время инференса модели)
* *к первой задаче добавить bidirectional

In [87]:
import time

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [88]:
class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):
	#open file
        with open(data_dir + train_lang + '.train', 'r') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]
	    #init vocabs of tokens for encoding { token:  id}
        self.target_vocab = {} # {p: 1, a: 2, r: 3, pu: 4}
        self.word_vocab = {} # {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        self.char_vocab = {} # {c: 1, a: 2, t: 3, ' ': 4, s: 5}
	    
        # Cat sat on mat. -> [1, 2, 3, 4, 5]
        # p    a  r  p pu -> [1, 2, 3, 1, 4]
        # chars  -> [1, 2, 3, 4, 5, 2, 3, 4]

	    #init encoded sequences lists (processed data)
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        # n=1 because first value is padding
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], #  (1)
        }

dataset = DatasetSeq('')        

In [89]:
def collate_fn(batch):
    data = []
    target = []
    for item in batch:
        data.append(torch.as_tensor(item['data']))
        target.append(torch.as_tensor(item['target']))
    data = pad_sequence(data, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)

    return {'data': data, 'target': target}
     

In [90]:

class RNNPredictorV2(nn.Module):
    # ну я тут добавил параметр, которым будет задаваться класс, модель с которым нам нужно создать
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, cl):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # соответственно здесь - инициируется как раз объект этого класса, то есть модель в каждом случае  будет разная
        self.rnn = cl(emb_dim, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
    
    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb) # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [91]:
#hyper params
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
#TODO try to use other model parameters
emb_dim = 128
hidden = 128
n_epochs = 10
batch_size = 100
device = 'cpu'

In [92]:
# здесь я формирую три объекта модели и соответсвующего оптимизатора
model_rnn = RNNPredictorV2(vocab_size, emb_dim, hidden, n_classes, nn.RNN).to(device)
model_rnn.train()
optim_rnn = torch.optim.Adam(model_rnn.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

model_lstm = RNNPredictorV2(vocab_size, emb_dim, hidden, n_classes, nn.LSTM).to(device)
model_lstm.train()
optim_lstm = torch.optim.Adam(model_lstm.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

model_gru = RNNPredictorV2(vocab_size, emb_dim, hidden, n_classes, nn.GRU).to(device)
model_gru.train()
optim_gru = torch.optim.Adam(model_gru.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [93]:
epoch_stats_df = pd.DataFrame(columns=['Epoch','RNN', 'LSTM', 'GRU'])

Unnamed: 0,Epoch,RNN,LSTM,GRU


In [94]:
# служебный класс для прогона "эпохи"
def process_epoch(model: nn.Module, optimizer: torch.optim.Optimizer):
    time_before = time.time()
    for i, batch in enumerate(dataloader):
        optimizer.zero_grad()
        predict = model(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1),
                         )
        loss.backward()
        optimizer.step()
    time_after = time.time()
    return time_after-time_before


# обучающий цикл. он будет один, за одну эпоху обучаем все три модели. продолжительность эпохи длякаджой модели записываем в датафрейм
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    ### rnn
    rnn_epoch_duration = process_epoch(model_rnn, optim_rnn)
    lstm_epoch_duration = process_epoch(model_lstm, optim_lstm)
    gru_epoch_duration = process_epoch(model_lstm, optim_gru)
    print(f'rnn epoch:{epoch}| rnn: {rnn_epoch_duration}, lstm epoch: {lstm_epoch_duration}, gru: {gru_epoch_duration}')
    epoch_stats_df = epoch_stats_df.append({'Epoch': epoch, 'RNN':rnn_epoch_duration, 'LSTM':lstm_epoch_duration, 'GRU':gru_epoch_duration},ignore_index=True)

print('here is dataframe with epoch epochs statistics')
epoch_stats_df

rnn epoch:0| rnn: 19.609256982803345, lstm epoch: 51.160086154937744, gru: 51.86091494560242


  epoch_stats_df = epoch_stats_df.append({'Epoch': epoch, 'RNN':rnn_epoch_duration, 'LSTM':lstm_epoch_duration, 'GRU':gru_epoch_duration},ignore_index=True)


rnn epoch:1| rnn: 30.140256881713867, lstm epoch: 65.5700318813324, gru: 54.27470898628235


  epoch_stats_df = epoch_stats_df.append({'Epoch': epoch, 'RNN':rnn_epoch_duration, 'LSTM':lstm_epoch_duration, 'GRU':gru_epoch_duration},ignore_index=True)


rnn epoch:2| rnn: 20.537453174591064, lstm epoch: 64.59994006156921, gru: 55.21620178222656


  epoch_stats_df = epoch_stats_df.append({'Epoch': epoch, 'RNN':rnn_epoch_duration, 'LSTM':lstm_epoch_duration, 'GRU':gru_epoch_duration},ignore_index=True)


KeyboardInterrupt: 

In [None]:
epoch_stats_df

In [None]:
epoch_stats_df_with_index = epoch_stats_df.set_index('Epoch')
epoch_stats_df_with_index.plot(title = 'Статистика времени обучения', kind='bar' )
