In [25]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import datetime
import torch.nn as nn
from torch.utils.data import DataLoader

class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):

        with open(data_dir + train_lang + '.train', 'r') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]

        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], #  (1)
        }
# seq1 = [1, 2, 3] -> [1, 2, 3, 0]
# seq2 = [7, 5, 4, 2]

def collate_fn(batch):
    data = []
    target = []
    for item in batch:
        data.append(torch.as_tensor(item['data']))
        target.append(torch.as_tensor(item['target']))
    data = pad_sequence(data, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)

    return {'data': data, 'target': target}


def collate_fn_char(input_data):
    data = []
    chars = []
    targets = []
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(len(input_data))] for _ in range(max_len)]
    for j in range(len(input_data)):
        for i in range(max_len):
            if len(chars[j]) > i:
                chars_seq[i][j] = torch.as_tensor(chars[j][i])
    for j in range(max_len):
        chars_seq[j] = pad_sequence(chars_seq[j], batch_first=True, padding_value=0)
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}



In [34]:
data_dir = '/Users/Vampire/Downloads/Reload_NN3-master-3/lesson4/'
train_lang = 'en'

dataset = DatasetSeq(data_dir)

#hyper params
vocab_len = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
cuda_device = -1
batch_size = 100
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

ds_item = dataset[156]
#model
decode_words = [k for k in dataset.word_vocab]
print([decode_words[i] for i in ds_item['data']])


#using GRUCell
class POS_predictor(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru_cell = nn.GRUCell(emb_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)
        self.hidden_dim = hidden_dim

    def forward(self, x):  # B x T
        b, t = x.size()
        emb_x = self.emb(x) # B x T x V
        hidden = torch.zeros((b, self.hidden_dim))
        gru_out = []
        for i in range(t):
            hidden = self.gru_cell(emb_x[:, i, :], hidden) # B x Hid
            gru_out.append(hidden.unsqueeze(1)) # B x 1 x Hid
        gru_out = torch.cat(gru_out, dim=1) # B x T x Hid
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

#usng GRU

# emb = [
#     [1.2,3.4,1.2],
#     [7.2,6.4,4.7],
#     [2.8,3.4,9.2],
# ]
# seq = [2, 1, 2]
# [[2.8,3.4,9.2], [7.2,6.4,4.7], [2.8,3.4,9.2]]

class POS_predictorV2(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim//2, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        gru_out, _ = self.gru(emb_x)
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred



['Shiites', 'the', 'preacher', 'hate', 'overheard', 'likewise', 'Ani', 'Zaman', 'operative', 'Jihad', 'like', 'Rest', 'it', 'Kurds', 'preacher', 'problem', '[']


In [56]:
%%time
model = POS_predictorV2(vocab_len, 200, 512, n_classes)
model.train()
model = model.to(device)

#optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)
#lr scheduler


#loss
loss_func = nn.CrossEntropyLoss()
#dataloder
for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        pred = model(data)
        loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1).to(device))
        loss.backward()
        # if step % 5:
        optim.step()

        if step % 1000:
            print(loss)
    print(epoch)
    torch.save({'model': model.state_dict()}, '/Users/Vampire/Downloads/Reload_NN3-master-3/lesson4/epoch_%d.pth.tar' % epoch)



tensor(1.5635, grad_fn=<NllLossBackward0>)
tensor(0.8540, grad_fn=<NllLossBackward0>)
tensor(0.8442, grad_fn=<NllLossBackward0>)
tensor(0.5750, grad_fn=<NllLossBackward0>)
tensor(0.8117, grad_fn=<NllLossBackward0>)
tensor(0.7138, grad_fn=<NllLossBackward0>)
tensor(0.7261, grad_fn=<NllLossBackward0>)
tensor(1.0477, grad_fn=<NllLossBackward0>)
tensor(0.9220, grad_fn=<NllLossBackward0>)
tensor(0.6499, grad_fn=<NllLossBackward0>)
tensor(0.6765, grad_fn=<NllLossBackward0>)
tensor(0.8581, grad_fn=<NllLossBackward0>)
tensor(0.4653, grad_fn=<NllLossBackward0>)
tensor(0.6993, grad_fn=<NllLossBackward0>)
tensor(0.7284, grad_fn=<NllLossBackward0>)
tensor(0.5743, grad_fn=<NllLossBackward0>)
tensor(0.7224, grad_fn=<NllLossBackward0>)
tensor(0.7238, grad_fn=<NllLossBackward0>)
tensor(0.5118, grad_fn=<NllLossBackward0>)
tensor(0.5417, grad_fn=<NllLossBackward0>)
tensor(0.5779, grad_fn=<NllLossBackward0>)
tensor(0.7185, grad_fn=<NllLossBackward0>)
tensor(0.5657, grad_fn=<NllLossBackward0>)
tensor(0.57

tensor(0.1602, grad_fn=<NllLossBackward0>)
tensor(0.1197, grad_fn=<NllLossBackward0>)
tensor(0.1835, grad_fn=<NllLossBackward0>)
tensor(0.1753, grad_fn=<NllLossBackward0>)
tensor(0.1395, grad_fn=<NllLossBackward0>)
tensor(0.1530, grad_fn=<NllLossBackward0>)
tensor(0.1739, grad_fn=<NllLossBackward0>)
tensor(0.1827, grad_fn=<NllLossBackward0>)
tensor(0.1348, grad_fn=<NllLossBackward0>)
tensor(0.1061, grad_fn=<NllLossBackward0>)
tensor(0.1886, grad_fn=<NllLossBackward0>)
tensor(0.1782, grad_fn=<NllLossBackward0>)
tensor(0.1515, grad_fn=<NllLossBackward0>)
tensor(0.1992, grad_fn=<NllLossBackward0>)
tensor(0.2231, grad_fn=<NllLossBackward0>)
tensor(0.1838, grad_fn=<NllLossBackward0>)
tensor(0.1538, grad_fn=<NllLossBackward0>)
tensor(0.1473, grad_fn=<NllLossBackward0>)
tensor(0.1146, grad_fn=<NllLossBackward0>)
tensor(0.1182, grad_fn=<NllLossBackward0>)
0
tensor(0.1584, grad_fn=<NllLossBackward0>)
tensor(0.1528, grad_fn=<NllLossBackward0>)
tensor(0.1815, grad_fn=<NllLossBackward0>)
tensor(0.

tensor(0.1309, grad_fn=<NllLossBackward0>)
tensor(0.1601, grad_fn=<NllLossBackward0>)
tensor(0.0907, grad_fn=<NllLossBackward0>)
tensor(0.1139, grad_fn=<NllLossBackward0>)
tensor(0.1286, grad_fn=<NllLossBackward0>)
tensor(0.0961, grad_fn=<NllLossBackward0>)
tensor(0.1051, grad_fn=<NllLossBackward0>)
tensor(0.1307, grad_fn=<NllLossBackward0>)
tensor(0.0739, grad_fn=<NllLossBackward0>)
tensor(0.1278, grad_fn=<NllLossBackward0>)
tensor(0.0933, grad_fn=<NllLossBackward0>)
tensor(0.1134, grad_fn=<NllLossBackward0>)
tensor(0.0885, grad_fn=<NllLossBackward0>)
tensor(0.0959, grad_fn=<NllLossBackward0>)
tensor(0.0613, grad_fn=<NllLossBackward0>)
tensor(0.1085, grad_fn=<NllLossBackward0>)
tensor(0.0698, grad_fn=<NllLossBackward0>)
tensor(0.1302, grad_fn=<NllLossBackward0>)
tensor(0.0831, grad_fn=<NllLossBackward0>)
tensor(0.1344, grad_fn=<NllLossBackward0>)
tensor(0.1206, grad_fn=<NllLossBackward0>)
tensor(0.0875, grad_fn=<NllLossBackward0>)
tensor(0.1245, grad_fn=<NllLossBackward0>)
tensor(0.09

tensor(0.0809, grad_fn=<NllLossBackward0>)
tensor(0.0988, grad_fn=<NllLossBackward0>)
tensor(0.1103, grad_fn=<NllLossBackward0>)
tensor(0.0738, grad_fn=<NllLossBackward0>)
tensor(0.1004, grad_fn=<NllLossBackward0>)
tensor(0.0893, grad_fn=<NllLossBackward0>)
tensor(0.0958, grad_fn=<NllLossBackward0>)
tensor(0.1223, grad_fn=<NllLossBackward0>)
tensor(0.0743, grad_fn=<NllLossBackward0>)
tensor(0.0833, grad_fn=<NllLossBackward0>)
tensor(0.0827, grad_fn=<NllLossBackward0>)
tensor(0.0531, grad_fn=<NllLossBackward0>)
tensor(0.0951, grad_fn=<NllLossBackward0>)
tensor(0.0673, grad_fn=<NllLossBackward0>)
tensor(0.0860, grad_fn=<NllLossBackward0>)
tensor(0.0881, grad_fn=<NllLossBackward0>)
tensor(0.0903, grad_fn=<NllLossBackward0>)
tensor(0.0610, grad_fn=<NllLossBackward0>)
tensor(0.0712, grad_fn=<NllLossBackward0>)
tensor(0.0819, grad_fn=<NllLossBackward0>)
tensor(0.0833, grad_fn=<NllLossBackward0>)
tensor(0.0949, grad_fn=<NllLossBackward0>)
tensor(0.0863, grad_fn=<NllLossBackward0>)
tensor(0.08

tensor(0.0652, grad_fn=<NllLossBackward0>)
tensor(0.0621, grad_fn=<NllLossBackward0>)
tensor(0.0623, grad_fn=<NllLossBackward0>)
tensor(0.0670, grad_fn=<NllLossBackward0>)
tensor(0.0697, grad_fn=<NllLossBackward0>)
tensor(0.0431, grad_fn=<NllLossBackward0>)
tensor(0.0600, grad_fn=<NllLossBackward0>)
tensor(0.0792, grad_fn=<NllLossBackward0>)
tensor(0.0423, grad_fn=<NllLossBackward0>)
tensor(0.0779, grad_fn=<NllLossBackward0>)
tensor(0.0527, grad_fn=<NllLossBackward0>)
tensor(0.0431, grad_fn=<NllLossBackward0>)
tensor(0.0576, grad_fn=<NllLossBackward0>)
tensor(0.0508, grad_fn=<NllLossBackward0>)
tensor(0.0915, grad_fn=<NllLossBackward0>)
tensor(0.0441, grad_fn=<NllLossBackward0>)
tensor(0.0942, grad_fn=<NllLossBackward0>)
tensor(0.0597, grad_fn=<NllLossBackward0>)
tensor(0.0579, grad_fn=<NllLossBackward0>)
tensor(0.0696, grad_fn=<NllLossBackward0>)
tensor(0.0460, grad_fn=<NllLossBackward0>)
tensor(0.0427, grad_fn=<NllLossBackward0>)
tensor(0.0774, grad_fn=<NllLossBackward0>)
tensor(0.03

tensor(0.0328, grad_fn=<NllLossBackward0>)
tensor(0.0393, grad_fn=<NllLossBackward0>)
tensor(0.0436, grad_fn=<NllLossBackward0>)
tensor(0.0398, grad_fn=<NllLossBackward0>)
tensor(0.0308, grad_fn=<NllLossBackward0>)
tensor(0.0226, grad_fn=<NllLossBackward0>)
tensor(0.0430, grad_fn=<NllLossBackward0>)
tensor(0.0475, grad_fn=<NllLossBackward0>)
tensor(0.0613, grad_fn=<NllLossBackward0>)
tensor(0.0440, grad_fn=<NllLossBackward0>)
tensor(0.0440, grad_fn=<NllLossBackward0>)
tensor(0.0622, grad_fn=<NllLossBackward0>)
tensor(0.0359, grad_fn=<NllLossBackward0>)
tensor(0.0347, grad_fn=<NllLossBackward0>)
tensor(0.0474, grad_fn=<NllLossBackward0>)
tensor(0.0637, grad_fn=<NllLossBackward0>)
tensor(0.0529, grad_fn=<NllLossBackward0>)
tensor(0.0347, grad_fn=<NllLossBackward0>)
tensor(0.0354, grad_fn=<NllLossBackward0>)
tensor(0.0379, grad_fn=<NllLossBackward0>)
tensor(0.0620, grad_fn=<NllLossBackward0>)
tensor(0.0505, grad_fn=<NllLossBackward0>)
tensor(0.0510, grad_fn=<NllLossBackward0>)
tensor(0.05

tensor(0.0330, grad_fn=<NllLossBackward0>)
tensor(0.0309, grad_fn=<NllLossBackward0>)
tensor(0.0269, grad_fn=<NllLossBackward0>)
tensor(0.0250, grad_fn=<NllLossBackward0>)
tensor(0.0355, grad_fn=<NllLossBackward0>)
tensor(0.0289, grad_fn=<NllLossBackward0>)
tensor(0.0577, grad_fn=<NllLossBackward0>)
tensor(0.0299, grad_fn=<NllLossBackward0>)
tensor(0.0441, grad_fn=<NllLossBackward0>)
tensor(0.0431, grad_fn=<NllLossBackward0>)
tensor(0.0443, grad_fn=<NllLossBackward0>)
tensor(0.0288, grad_fn=<NllLossBackward0>)
tensor(0.0401, grad_fn=<NllLossBackward0>)
tensor(0.0318, grad_fn=<NllLossBackward0>)
tensor(0.0177, grad_fn=<NllLossBackward0>)
tensor(0.0345, grad_fn=<NllLossBackward0>)
tensor(0.0441, grad_fn=<NllLossBackward0>)
tensor(0.0393, grad_fn=<NllLossBackward0>)
tensor(0.0454, grad_fn=<NllLossBackward0>)
tensor(0.0343, grad_fn=<NllLossBackward0>)
tensor(0.0432, grad_fn=<NllLossBackward0>)
tensor(0.0253, grad_fn=<NllLossBackward0>)
tensor(0.0471, grad_fn=<NllLossBackward0>)
tensor(0.02

tensor(0.0247, grad_fn=<NllLossBackward0>)
tensor(0.0282, grad_fn=<NllLossBackward0>)
tensor(0.0251, grad_fn=<NllLossBackward0>)
tensor(0.0245, grad_fn=<NllLossBackward0>)
tensor(0.0206, grad_fn=<NllLossBackward0>)
tensor(0.0227, grad_fn=<NllLossBackward0>)
tensor(0.0229, grad_fn=<NllLossBackward0>)
tensor(0.0318, grad_fn=<NllLossBackward0>)
tensor(0.0309, grad_fn=<NllLossBackward0>)
tensor(0.0261, grad_fn=<NllLossBackward0>)
tensor(0.0243, grad_fn=<NllLossBackward0>)
tensor(0.0270, grad_fn=<NllLossBackward0>)
tensor(0.0181, grad_fn=<NllLossBackward0>)
tensor(0.0314, grad_fn=<NllLossBackward0>)
tensor(0.0322, grad_fn=<NllLossBackward0>)
tensor(0.0256, grad_fn=<NllLossBackward0>)
tensor(0.0219, grad_fn=<NllLossBackward0>)
tensor(0.0345, grad_fn=<NllLossBackward0>)
tensor(0.0229, grad_fn=<NllLossBackward0>)
tensor(0.0266, grad_fn=<NllLossBackward0>)
tensor(0.0362, grad_fn=<NllLossBackward0>)
tensor(0.0227, grad_fn=<NllLossBackward0>)
tensor(0.0217, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0270, grad_fn=<NllLossBackward0>)
tensor(0.0219, grad_fn=<NllLossBackward0>)
tensor(0.0256, grad_fn=<NllLossBackward0>)
tensor(0.0168, grad_fn=<NllLossBackward0>)
tensor(0.0210, grad_fn=<NllLossBackward0>)
tensor(0.0211, grad_fn=<NllLossBackward0>)
tensor(0.0254, grad_fn=<NllLossBackward0>)
tensor(0.0108, grad_fn=<NllLossBackward0>)
tensor(0.0215, grad_fn=<NllLossBackward0>)
tensor(0.0216, grad_fn=<NllLossBackward0>)
tensor(0.0250, grad_fn=<NllLossBackward0>)
tensor(0.0270, grad_fn=<NllLossBackward0>)
tensor(0.0203, grad_fn=<NllLossBackward0>)
tensor(0.0258, grad_fn=<NllLossBackward0>)
tensor(0.0203, grad_fn=<NllLossBackward0>)
tensor(0.0208, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0206, grad_fn=<NllLossBackward0>)
tensor(0.0182, grad_fn=<NllLossBackward0>)
tensor(0.0214, grad_fn=<NllLossBackward0>)
tensor(0.0282, grad_fn=<NllLossBackward0>)
tensor(0.0184, grad_fn=<NllLossBackward0>)
tensor(0.0155, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0165, grad_fn=<NllLossBackward0>)
tensor(0.0170, grad_fn=<NllLossBackward0>)
tensor(0.0107, grad_fn=<NllLossBackward0>)
tensor(0.0172, grad_fn=<NllLossBackward0>)
tensor(0.0082, grad_fn=<NllLossBackward0>)
tensor(0.0127, grad_fn=<NllLossBackward0>)
tensor(0.0191, grad_fn=<NllLossBackward0>)
tensor(0.0140, grad_fn=<NllLossBackward0>)
tensor(0.0129, grad_fn=<NllLossBackward0>)
tensor(0.0211, grad_fn=<NllLossBackward0>)
tensor(0.0128, grad_fn=<NllLossBackward0>)
tensor(0.0110, grad_fn=<NllLossBackward0>)
tensor(0.0161, grad_fn=<NllLossBackward0>)
tensor(0.0101, grad_fn=<NllLossBackward0>)
tensor(0.0091, grad_fn=<NllLossBackward0>)
tensor(0.0139, grad_fn=<NllLossBackward0>)
tensor(0.0056, grad_fn=<NllLossBackward0>)
tensor(0.0135, grad_fn=<NllLossBackward0>)
tensor(0.0113, grad_fn=<NllLossBackward0>)
tensor(0.0145, grad_fn=<NllLossBackward0>)
tensor(0.0169, grad_fn=<NllLossBackward0>)
tensor(0.0134, grad_fn=<NllLossBackward0>)
tensor(0.0217, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0118, grad_fn=<NllLossBackward0>)
tensor(0.0115, grad_fn=<NllLossBackward0>)
tensor(0.0103, grad_fn=<NllLossBackward0>)
tensor(0.0110, grad_fn=<NllLossBackward0>)
tensor(0.0089, grad_fn=<NllLossBackward0>)
tensor(0.0108, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0142, grad_fn=<NllLossBackward0>)
tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0076, grad_fn=<NllLossBackward0>)
tensor(0.0115, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0128, grad_fn=<NllLossBackward0>)
tensor(0.0095, grad_fn=<NllLossBackward0>)
tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0103, grad_fn=<NllLossBackward0>)
tensor(0.0115, grad_fn=<NllLossBackward0>)
tensor(0.0120, grad_fn=<NllLossBackward0>)
tensor(0.0139, grad_fn=<NllLossBackward0>)
tensor(0.0116, grad_fn=<NllLossBackward0>)
tensor(0.0106, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0086, grad_fn=<NllLossBackward0>)
tensor(0.0136, grad_fn=<NllLossBackward0>)
tensor(0.0093, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
9
tensor(0.0075, grad_fn=<NllLossBackward0>)
tensor(0.0066, grad_fn=<NllLossBackward0>)
tensor(0.0091, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0089, grad_fn=<NllLossBackward0>)
tensor(0.0088, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0072, grad_fn=<NllLossBackward0>)
tensor(0.0089, grad_fn=<NllLossBackward0>)
tensor(0.0067, grad_fn=<NllLossBackward0>)
tensor(0.0073, grad_fn=<NllLossBackward0>)
tensor(0.0065, grad_fn=<NllLossBackward0>)
tensor(0.0069, grad_fn=<NllLossBackward0>)
tensor(0.0082, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0068, grad_fn=<NllLossBackward0>)
tensor(0.0074, grad_fn=<NllLossBackward0>)
tensor(0.

tensor(0.0061, grad_fn=<NllLossBackward0>)
tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0068, grad_fn=<NllLossBackward0>)
tensor(0.0082, grad_fn=<NllLossBackward0>)
tensor(0.0090, grad_fn=<NllLossBackward0>)
tensor(0.0068, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0062, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0100, grad_fn=<NllLossBackward0>)
tensor(0.0073, grad_fn=<NllLossBackward0>)
tensor(0.0075, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
tensor(0.0065, grad_fn=<NllLossBackward0>)
tensor(0.0075, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0089, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0124, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
10
tensor(0

tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0046, grad_fn=<NllLossBackward0>)
tensor(0.0037, grad_fn=<NllLossBackward0>)
tensor(0.0045, grad_fn=<NllLossBackward0>)
tensor(0.0056, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0063, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0057, grad_fn=<NllLossBackward0>)
tensor(0.0066, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0066, grad_fn=<NllLossBackward0>)
tensor(0.0053, grad_fn=<NllLossBackward0>)
tensor(0.0039, grad_fn=<NllLossBackward0>)
tensor(0.0069, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0045, grad_fn=<NllLossBackward0>)
tensor(0.0039, grad_fn=<NllLossBackward0>)
tensor(0.0058, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0063, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0059, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0037, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0046, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0032, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
19
CPU times: user 17min 34s, sys: 5min 44s, total: 23min 19s
Wall time: 13min 30s


In [None]:
#Без bidirectional
##CPU times: user 11min 4s, sys: 3min 3s, total: 14min 7s
##Wall time: 7min 5s

##bidirectional
#CPU times: user 17min 34s, sys: 5min 44s, total: 23min 19s
#Wall time: 13min 30s

In [36]:
%%time
# inference
sequence = [2,36,2,14,4,24]
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(sequence).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = pred.argmax(-1)




CPU times: user 3.16 ms, sys: 3.11 ms, total: 6.27 ms
Wall time: 4.45 ms


In [37]:
%%time
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
CPU times: user 2.07 ms, sys: 1.57 ms, total: 3.64 ms
Wall time: 2.2 ms


In [40]:
#usng GRU

# emb = [
#     [1.2,3.4,1.2],
#     [7.2,6.4,4.7],
#     [2.8,3.4,9.2],
# ]
# seq = [2, 1, 2]
# [[2.8,3.4,9.2], [7.2,6.4,4.7], [2.8,3.4,9.2]]

class POS_predictorV2_RNN(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.RNN(emb_dim, hidden_dim//2, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        gru_out, _ = self.gru(emb_x)
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred




In [55]:
%%time
model = POS_predictorV2_RNN(vocab_len, 200, 512, n_classes)
model.train()
model = model.to(device)

#optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)
#lr scheduler


#loss
loss_func = nn.CrossEntropyLoss()
#dataloder
for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        pred = model(data)
        loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1).to(device))
        loss.backward()
        # if step % 5:
        optim.step()

        if step % 1000:
            print(loss)
    print(epoch)
    torch.save({'model': model.state_dict()}, '/Users/Vampire/Downloads/Reload_NN3-master-3/lesson4/epoch_%d.pth.tar' % epoch)




tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(0.9316, grad_fn=<NllLossBackward0>)
tensor(0.7340, grad_fn=<NllLossBackward0>)
tensor(0.8500, grad_fn=<NllLossBackward0>)
tensor(0.8252, grad_fn=<NllLossBackward0>)
tensor(0.8043, grad_fn=<NllLossBackward0>)
tensor(0.5825, grad_fn=<NllLossBackward0>)
tensor(0.5469, grad_fn=<NllLossBackward0>)
tensor(0.7349, grad_fn=<NllLossBackward0>)
tensor(0.7124, grad_fn=<NllLossBackward0>)
tensor(0.8227, grad_fn=<NllLossBackward0>)
tensor(0.6860, grad_fn=<NllLossBackward0>)
tensor(0.6238, grad_fn=<NllLossBackward0>)
tensor(0.4711, grad_fn=<NllLossBackward0>)
tensor(0.6571, grad_fn=<NllLossBackward0>)
tensor(0.4552, grad_fn=<NllLossBackward0>)
tensor(0.4288, grad_fn=<NllLossBackward0>)
tensor(0.6145, grad_fn=<NllLossBackward0>)
tensor(0.3066, grad_fn=<NllLossBackward0>)
tensor(0.5236, grad_fn=<NllLossBackward0>)
tensor(0.4276, grad_fn=<NllLossBackward0>)
tensor(0.4640, grad_fn=<NllLossBackward0>)
tensor(0.5440, grad_fn=<NllLossBackward0>)
tensor(0.59

tensor(0.1276, grad_fn=<NllLossBackward0>)
tensor(0.2145, grad_fn=<NllLossBackward0>)
tensor(0.1872, grad_fn=<NllLossBackward0>)
tensor(0.1251, grad_fn=<NllLossBackward0>)
tensor(0.1956, grad_fn=<NllLossBackward0>)
tensor(0.1902, grad_fn=<NllLossBackward0>)
tensor(0.1907, grad_fn=<NllLossBackward0>)
tensor(0.1387, grad_fn=<NllLossBackward0>)
tensor(0.2081, grad_fn=<NllLossBackward0>)
tensor(0.1980, grad_fn=<NllLossBackward0>)
tensor(0.1794, grad_fn=<NllLossBackward0>)
tensor(0.2061, grad_fn=<NllLossBackward0>)
tensor(0.1054, grad_fn=<NllLossBackward0>)
tensor(0.1702, grad_fn=<NllLossBackward0>)
tensor(0.2128, grad_fn=<NllLossBackward0>)
tensor(0.1751, grad_fn=<NllLossBackward0>)
tensor(0.1809, grad_fn=<NllLossBackward0>)
tensor(0.1503, grad_fn=<NllLossBackward0>)
0
tensor(0.1502, grad_fn=<NllLossBackward0>)
tensor(0.1898, grad_fn=<NllLossBackward0>)
tensor(0.1567, grad_fn=<NllLossBackward0>)
tensor(0.1592, grad_fn=<NllLossBackward0>)
tensor(0.1294, grad_fn=<NllLossBackward0>)
tensor(0.

tensor(0.1229, grad_fn=<NllLossBackward0>)
tensor(0.2141, grad_fn=<NllLossBackward0>)
tensor(0.1190, grad_fn=<NllLossBackward0>)
tensor(0.1200, grad_fn=<NllLossBackward0>)
tensor(0.0767, grad_fn=<NllLossBackward0>)
tensor(0.1064, grad_fn=<NllLossBackward0>)
tensor(0.1229, grad_fn=<NllLossBackward0>)
tensor(0.1524, grad_fn=<NllLossBackward0>)
tensor(0.1122, grad_fn=<NllLossBackward0>)
tensor(0.1250, grad_fn=<NllLossBackward0>)
tensor(0.1018, grad_fn=<NllLossBackward0>)
tensor(0.1093, grad_fn=<NllLossBackward0>)
tensor(0.1261, grad_fn=<NllLossBackward0>)
tensor(0.0883, grad_fn=<NllLossBackward0>)
tensor(0.1370, grad_fn=<NllLossBackward0>)
tensor(0.1342, grad_fn=<NllLossBackward0>)
tensor(0.1310, grad_fn=<NllLossBackward0>)
tensor(0.1097, grad_fn=<NllLossBackward0>)
tensor(0.1160, grad_fn=<NllLossBackward0>)
tensor(0.1391, grad_fn=<NllLossBackward0>)
tensor(0.0736, grad_fn=<NllLossBackward0>)
tensor(0.1250, grad_fn=<NllLossBackward0>)
tensor(0.1330, grad_fn=<NllLossBackward0>)
tensor(0.10

tensor(0.0669, grad_fn=<NllLossBackward0>)
tensor(0.1008, grad_fn=<NllLossBackward0>)
tensor(0.1143, grad_fn=<NllLossBackward0>)
tensor(0.0764, grad_fn=<NllLossBackward0>)
tensor(0.1099, grad_fn=<NllLossBackward0>)
tensor(0.0624, grad_fn=<NllLossBackward0>)
tensor(0.0781, grad_fn=<NllLossBackward0>)
tensor(0.1037, grad_fn=<NllLossBackward0>)
tensor(0.0839, grad_fn=<NllLossBackward0>)
tensor(0.0780, grad_fn=<NllLossBackward0>)
tensor(0.1011, grad_fn=<NllLossBackward0>)
tensor(0.1056, grad_fn=<NllLossBackward0>)
tensor(0.0936, grad_fn=<NllLossBackward0>)
tensor(0.1113, grad_fn=<NllLossBackward0>)
tensor(0.0767, grad_fn=<NllLossBackward0>)
tensor(0.1070, grad_fn=<NllLossBackward0>)
tensor(0.0999, grad_fn=<NllLossBackward0>)
tensor(0.1357, grad_fn=<NllLossBackward0>)
tensor(0.0712, grad_fn=<NllLossBackward0>)
tensor(0.1057, grad_fn=<NllLossBackward0>)
tensor(0.0903, grad_fn=<NllLossBackward0>)
tensor(0.0955, grad_fn=<NllLossBackward0>)
tensor(0.0408, grad_fn=<NllLossBackward0>)
tensor(0.09

tensor(0.1001, grad_fn=<NllLossBackward0>)
tensor(0.0771, grad_fn=<NllLossBackward0>)
tensor(0.0539, grad_fn=<NllLossBackward0>)
tensor(0.0758, grad_fn=<NllLossBackward0>)
tensor(0.0612, grad_fn=<NllLossBackward0>)
tensor(0.0702, grad_fn=<NllLossBackward0>)
tensor(0.0751, grad_fn=<NllLossBackward0>)
tensor(0.0807, grad_fn=<NllLossBackward0>)
tensor(0.0446, grad_fn=<NllLossBackward0>)
tensor(0.0569, grad_fn=<NllLossBackward0>)
tensor(0.0536, grad_fn=<NllLossBackward0>)
tensor(0.0912, grad_fn=<NllLossBackward0>)
tensor(0.0733, grad_fn=<NllLossBackward0>)
tensor(0.0994, grad_fn=<NllLossBackward0>)
tensor(0.0869, grad_fn=<NllLossBackward0>)
tensor(0.0413, grad_fn=<NllLossBackward0>)
tensor(0.0833, grad_fn=<NllLossBackward0>)
tensor(0.1000, grad_fn=<NllLossBackward0>)
tensor(0.0897, grad_fn=<NllLossBackward0>)
tensor(0.1051, grad_fn=<NllLossBackward0>)
tensor(0.0332, grad_fn=<NllLossBackward0>)
tensor(0.0724, grad_fn=<NllLossBackward0>)
tensor(0.0831, grad_fn=<NllLossBackward0>)
tensor(0.06

tensor(0.0665, grad_fn=<NllLossBackward0>)
tensor(0.0565, grad_fn=<NllLossBackward0>)
tensor(0.0742, grad_fn=<NllLossBackward0>)
tensor(0.0655, grad_fn=<NllLossBackward0>)
tensor(0.0521, grad_fn=<NllLossBackward0>)
tensor(0.0394, grad_fn=<NllLossBackward0>)
tensor(0.0689, grad_fn=<NllLossBackward0>)
tensor(0.0686, grad_fn=<NllLossBackward0>)
tensor(0.0482, grad_fn=<NllLossBackward0>)
tensor(0.0740, grad_fn=<NllLossBackward0>)
tensor(0.0569, grad_fn=<NllLossBackward0>)
tensor(0.0953, grad_fn=<NllLossBackward0>)
tensor(0.0547, grad_fn=<NllLossBackward0>)
tensor(0.0492, grad_fn=<NllLossBackward0>)
tensor(0.0552, grad_fn=<NllLossBackward0>)
tensor(0.0588, grad_fn=<NllLossBackward0>)
tensor(0.0615, grad_fn=<NllLossBackward0>)
tensor(0.0534, grad_fn=<NllLossBackward0>)
tensor(0.0555, grad_fn=<NllLossBackward0>)
tensor(0.0841, grad_fn=<NllLossBackward0>)
tensor(0.0683, grad_fn=<NllLossBackward0>)
tensor(0.0670, grad_fn=<NllLossBackward0>)
tensor(0.0533, grad_fn=<NllLossBackward0>)
tensor(0.05

tensor(0.0697, grad_fn=<NllLossBackward0>)
tensor(0.0442, grad_fn=<NllLossBackward0>)
tensor(0.0400, grad_fn=<NllLossBackward0>)
tensor(0.0870, grad_fn=<NllLossBackward0>)
tensor(0.0545, grad_fn=<NllLossBackward0>)
tensor(0.0642, grad_fn=<NllLossBackward0>)
tensor(0.0529, grad_fn=<NllLossBackward0>)
tensor(0.0521, grad_fn=<NllLossBackward0>)
tensor(0.0558, grad_fn=<NllLossBackward0>)
tensor(0.0372, grad_fn=<NllLossBackward0>)
tensor(0.0440, grad_fn=<NllLossBackward0>)
tensor(0.0653, grad_fn=<NllLossBackward0>)
tensor(0.0323, grad_fn=<NllLossBackward0>)
tensor(0.0438, grad_fn=<NllLossBackward0>)
tensor(0.0382, grad_fn=<NllLossBackward0>)
tensor(0.0734, grad_fn=<NllLossBackward0>)
tensor(0.0367, grad_fn=<NllLossBackward0>)
tensor(0.0586, grad_fn=<NllLossBackward0>)
tensor(0.0570, grad_fn=<NllLossBackward0>)
tensor(0.0653, grad_fn=<NllLossBackward0>)
tensor(0.0630, grad_fn=<NllLossBackward0>)
tensor(0.0584, grad_fn=<NllLossBackward0>)
tensor(0.0651, grad_fn=<NllLossBackward0>)
tensor(0.06

tensor(0.0463, grad_fn=<NllLossBackward0>)
tensor(0.0320, grad_fn=<NllLossBackward0>)
tensor(0.0600, grad_fn=<NllLossBackward0>)
tensor(0.0410, grad_fn=<NllLossBackward0>)
tensor(0.0473, grad_fn=<NllLossBackward0>)
tensor(0.0453, grad_fn=<NllLossBackward0>)
tensor(0.0482, grad_fn=<NllLossBackward0>)
tensor(0.0416, grad_fn=<NllLossBackward0>)
tensor(0.0448, grad_fn=<NllLossBackward0>)
tensor(0.0482, grad_fn=<NllLossBackward0>)
tensor(0.0151, grad_fn=<NllLossBackward0>)
tensor(0.0499, grad_fn=<NllLossBackward0>)
tensor(0.0429, grad_fn=<NllLossBackward0>)
tensor(0.0317, grad_fn=<NllLossBackward0>)
tensor(0.0415, grad_fn=<NllLossBackward0>)
tensor(0.0540, grad_fn=<NllLossBackward0>)
tensor(0.0302, grad_fn=<NllLossBackward0>)
tensor(0.0445, grad_fn=<NllLossBackward0>)
tensor(0.0387, grad_fn=<NllLossBackward0>)
tensor(0.0589, grad_fn=<NllLossBackward0>)
tensor(0.0577, grad_fn=<NllLossBackward0>)
tensor(0.0476, grad_fn=<NllLossBackward0>)
tensor(0.0343, grad_fn=<NllLossBackward0>)
tensor(0.04

tensor(0.0292, grad_fn=<NllLossBackward0>)
tensor(0.0318, grad_fn=<NllLossBackward0>)
tensor(0.0436, grad_fn=<NllLossBackward0>)
tensor(0.0519, grad_fn=<NllLossBackward0>)
tensor(0.0340, grad_fn=<NllLossBackward0>)
tensor(0.0247, grad_fn=<NllLossBackward0>)
tensor(0.0309, grad_fn=<NllLossBackward0>)
tensor(0.0368, grad_fn=<NllLossBackward0>)
tensor(0.0362, grad_fn=<NllLossBackward0>)
tensor(0.0503, grad_fn=<NllLossBackward0>)
tensor(0.0372, grad_fn=<NllLossBackward0>)
tensor(0.0387, grad_fn=<NllLossBackward0>)
tensor(0.0403, grad_fn=<NllLossBackward0>)
tensor(0.0478, grad_fn=<NllLossBackward0>)
tensor(0.0386, grad_fn=<NllLossBackward0>)
tensor(0.0297, grad_fn=<NllLossBackward0>)
tensor(0.0388, grad_fn=<NllLossBackward0>)
tensor(0.0276, grad_fn=<NllLossBackward0>)
tensor(0.0498, grad_fn=<NllLossBackward0>)
tensor(0.0416, grad_fn=<NllLossBackward0>)
tensor(0.0452, grad_fn=<NllLossBackward0>)
tensor(0.0326, grad_fn=<NllLossBackward0>)
tensor(0.0428, grad_fn=<NllLossBackward0>)
tensor(0.02

tensor(0.0297, grad_fn=<NllLossBackward0>)
tensor(0.0223, grad_fn=<NllLossBackward0>)
tensor(0.0286, grad_fn=<NllLossBackward0>)
tensor(0.0429, grad_fn=<NllLossBackward0>)
tensor(0.0326, grad_fn=<NllLossBackward0>)
tensor(0.0283, grad_fn=<NllLossBackward0>)
tensor(0.0257, grad_fn=<NllLossBackward0>)
tensor(0.0246, grad_fn=<NllLossBackward0>)
tensor(0.0360, grad_fn=<NllLossBackward0>)
tensor(0.0321, grad_fn=<NllLossBackward0>)
tensor(0.0311, grad_fn=<NllLossBackward0>)
tensor(0.0326, grad_fn=<NllLossBackward0>)
tensor(0.0214, grad_fn=<NllLossBackward0>)
tensor(0.0183, grad_fn=<NllLossBackward0>)
tensor(0.0420, grad_fn=<NllLossBackward0>)
tensor(0.0266, grad_fn=<NllLossBackward0>)
tensor(0.0196, grad_fn=<NllLossBackward0>)
tensor(0.0284, grad_fn=<NllLossBackward0>)
tensor(0.0327, grad_fn=<NllLossBackward0>)
tensor(0.0343, grad_fn=<NllLossBackward0>)
tensor(0.0297, grad_fn=<NllLossBackward0>)
tensor(0.0280, grad_fn=<NllLossBackward0>)
tensor(0.0217, grad_fn=<NllLossBackward0>)
tensor(0.03

tensor(0.0185, grad_fn=<NllLossBackward0>)
tensor(0.0232, grad_fn=<NllLossBackward0>)
tensor(0.0308, grad_fn=<NllLossBackward0>)
tensor(0.0250, grad_fn=<NllLossBackward0>)
tensor(0.0116, grad_fn=<NllLossBackward0>)
tensor(0.0211, grad_fn=<NllLossBackward0>)
tensor(0.0263, grad_fn=<NllLossBackward0>)
tensor(0.0195, grad_fn=<NllLossBackward0>)
tensor(0.0268, grad_fn=<NllLossBackward0>)
tensor(0.0270, grad_fn=<NllLossBackward0>)
tensor(0.0237, grad_fn=<NllLossBackward0>)
tensor(0.0207, grad_fn=<NllLossBackward0>)
tensor(0.0206, grad_fn=<NllLossBackward0>)
tensor(0.0194, grad_fn=<NllLossBackward0>)
tensor(0.0325, grad_fn=<NllLossBackward0>)
tensor(0.0264, grad_fn=<NllLossBackward0>)
tensor(0.0293, grad_fn=<NllLossBackward0>)
tensor(0.0228, grad_fn=<NllLossBackward0>)
tensor(0.0246, grad_fn=<NllLossBackward0>)
tensor(0.0201, grad_fn=<NllLossBackward0>)
tensor(0.0103, grad_fn=<NllLossBackward0>)
tensor(0.0314, grad_fn=<NllLossBackward0>)
tensor(0.0201, grad_fn=<NllLossBackward0>)
tensor(0.02

tensor(0.0197, grad_fn=<NllLossBackward0>)
tensor(0.0179, grad_fn=<NllLossBackward0>)
tensor(0.0167, grad_fn=<NllLossBackward0>)
tensor(0.0135, grad_fn=<NllLossBackward0>)
tensor(0.0221, grad_fn=<NllLossBackward0>)
tensor(0.0130, grad_fn=<NllLossBackward0>)
tensor(0.0206, grad_fn=<NllLossBackward0>)
tensor(0.0162, grad_fn=<NllLossBackward0>)
tensor(0.0105, grad_fn=<NllLossBackward0>)
tensor(0.0158, grad_fn=<NllLossBackward0>)
tensor(0.0113, grad_fn=<NllLossBackward0>)
tensor(0.0303, grad_fn=<NllLossBackward0>)
tensor(0.0180, grad_fn=<NllLossBackward0>)
tensor(0.0157, grad_fn=<NllLossBackward0>)
tensor(0.0188, grad_fn=<NllLossBackward0>)
tensor(0.0145, grad_fn=<NllLossBackward0>)
tensor(0.0178, grad_fn=<NllLossBackward0>)
tensor(0.0146, grad_fn=<NllLossBackward0>)
tensor(0.0179, grad_fn=<NllLossBackward0>)
tensor(0.0141, grad_fn=<NllLossBackward0>)
tensor(0.0151, grad_fn=<NllLossBackward0>)
tensor(0.0200, grad_fn=<NllLossBackward0>)
tensor(0.0121, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0152, grad_fn=<NllLossBackward0>)
tensor(0.0131, grad_fn=<NllLossBackward0>)
tensor(0.0219, grad_fn=<NllLossBackward0>)
tensor(0.0148, grad_fn=<NllLossBackward0>)
tensor(0.0179, grad_fn=<NllLossBackward0>)
tensor(0.0197, grad_fn=<NllLossBackward0>)
tensor(0.0209, grad_fn=<NllLossBackward0>)
tensor(0.0168, grad_fn=<NllLossBackward0>)
tensor(0.0265, grad_fn=<NllLossBackward0>)
tensor(0.0233, grad_fn=<NllLossBackward0>)
tensor(0.0288, grad_fn=<NllLossBackward0>)
tensor(0.0237, grad_fn=<NllLossBackward0>)
tensor(0.0191, grad_fn=<NllLossBackward0>)
tensor(0.0207, grad_fn=<NllLossBackward0>)
10
tensor(0.0203, grad_fn=<NllLossBackward0>)
tensor(0.0092, grad_fn=<NllLossBackward0>)
tensor(0.0158, grad_fn=<NllLossBackward0>)
tensor(0.0105, grad_fn=<NllLossBackward0>)
tensor(0.0104, grad_fn=<NllLossBackward0>)
tensor(0.0211, grad_fn=<NllLossBackward0>)
tensor(0.0183, grad_fn=<NllLossBackward0>)
tensor(0.0159, grad_fn=<NllLossBackward0>)
tensor(0.0126, grad_fn=<NllLossBackward0>)
tensor(0

tensor(0.0158, grad_fn=<NllLossBackward0>)
tensor(0.0167, grad_fn=<NllLossBackward0>)
tensor(0.0128, grad_fn=<NllLossBackward0>)
tensor(0.0136, grad_fn=<NllLossBackward0>)
tensor(0.0229, grad_fn=<NllLossBackward0>)
tensor(0.0151, grad_fn=<NllLossBackward0>)
tensor(0.0274, grad_fn=<NllLossBackward0>)
tensor(0.0157, grad_fn=<NllLossBackward0>)
tensor(0.0121, grad_fn=<NllLossBackward0>)
tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0096, grad_fn=<NllLossBackward0>)
tensor(0.0136, grad_fn=<NllLossBackward0>)
tensor(0.0186, grad_fn=<NllLossBackward0>)
tensor(0.0105, grad_fn=<NllLossBackward0>)
tensor(0.0216, grad_fn=<NllLossBackward0>)
tensor(0.0206, grad_fn=<NllLossBackward0>)
tensor(0.0136, grad_fn=<NllLossBackward0>)
tensor(0.0166, grad_fn=<NllLossBackward0>)
tensor(0.0241, grad_fn=<NllLossBackward0>)
tensor(0.0230, grad_fn=<NllLossBackward0>)
tensor(0.0175, grad_fn=<NllLossBackward0>)
tensor(0.0181, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0170, grad_fn=<NllLossBackward0>)
tensor(0.0102, grad_fn=<NllLossBackward0>)
tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor(0.0118, grad_fn=<NllLossBackward0>)
tensor(0.0119, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0110, grad_fn=<NllLossBackward0>)
tensor(0.0100, grad_fn=<NllLossBackward0>)
tensor(0.0128, grad_fn=<NllLossBackward0>)
tensor(0.0167, grad_fn=<NllLossBackward0>)
tensor(0.0110, grad_fn=<NllLossBackward0>)
tensor(0.0109, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0116, grad_fn=<NllLossBackward0>)
tensor(0.0129, grad_fn=<NllLossBackward0>)
tensor(0.0137, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0129, grad_fn=<NllLossBackward0>)
tensor(0.0140, grad_fn=<NllLossBackward0>)
tensor(0.0149, grad_fn=<NllLossBackward0>)
tensor(0.0107, grad_fn=<NllLossBackward0>)
tensor(0.0129, grad_fn=<NllLossBackward0>)
tensor(0.0089, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0102, grad_fn=<NllLossBackward0>)
tensor(0.0153, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0088, grad_fn=<NllLossBackward0>)
tensor(0.0053, grad_fn=<NllLossBackward0>)
tensor(0.0104, grad_fn=<NllLossBackward0>)
tensor(0.0138, grad_fn=<NllLossBackward0>)
tensor(0.0091, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
tensor(0.0164, grad_fn=<NllLossBackward0>)
tensor(0.0121, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0101, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0074, grad_fn=<NllLossBackward0>)
tensor(0.0085, grad_fn=<NllLossBackward0>)
tensor(0.0051, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0087, grad_fn=<NllLossBackward0>)
tensor(0.0102, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0068, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0107, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0072, grad_fn=<NllLossBackward0>)
tensor(0.0076, grad_fn=<NllLossBackward0>)
tensor(0.0100, grad_fn=<NllLossBackward0>)
tensor(0.0065, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0073, grad_fn=<NllLossBackward0>)
tensor(0.0095, grad_fn=<NllLossBackward0>)
tensor(0.0092, grad_fn=<NllLossBackward0>)
tensor(0.0058, grad_fn=<NllLossBackward0>)
tensor(0.0067, grad_fn=<NllLossBackward0>)
tensor(0.0087, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0100, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0060, grad_fn=<NllLossBackward0>)
tensor(0.0060, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0037, grad_fn=<NllLossBackward0>)
tensor(0.0063, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0060, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0048, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0058, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0084, grad_fn=<NllLossBackward0>)
tensor(0.0063, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0057, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0045, grad_fn=<NllLossBackward0>)
tensor(0.0082, grad_fn=<NllLossBackward0>)
tensor(0.0031, grad_fn=<NllLossBackward0>)
tensor(0.0069, grad_fn=<NllLossBackward0>)
tensor(0.0037, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0031, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0035, grad_fn=<NllLossBackward0>)
tensor(0.0031, grad_fn=<NllLossBackward0>)
tensor(0.0036, grad_fn=<NllLossBackward0>)
tensor(0.0054, grad_fn=<NllLossBackward0>)
tensor(0.0039, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0036, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0031, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0045, grad_fn=<NllLossBackward0>)
tensor(0.0034, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.00

In [None]:
#Без bidirectional
#CPU times: user 7min 39s, sys: 1min 47s, total: 9min 27s
#Wall time: 3min 29s

#Bidirectional
#CPU times: user 10min 56s, sys: 2min 50s, total: 13min 46s
#Wall time: 5min 54s

In [16]:
%%time
# inference
sequence = [2,36,2,14,4,24]
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(sequence).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = pred.argmax(-1)

CPU times: user 2.22 ms, sys: 1.22 ms, total: 3.44 ms
Wall time: 2.29 ms


In [17]:
%%time
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'CCONJ', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
CPU times: user 1.57 ms, sys: 1.54 ms, total: 3.1 ms
Wall time: 1.85 ms


In [52]:
class POS_predictorV2_LSTM(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.LSTM(emb_dim, hidden_dim//2, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        gru_out, _ = self.gru(emb_x)
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

In [54]:
%%time
model = POS_predictorV2_LSTM(vocab_len, 200, 512, n_classes)
model.train()
model = model.to(device)

#optimizer
optim = torch.optim.Adam(model.parameters(), lr=0.001)
#lr scheduler


#loss
loss_func = nn.CrossEntropyLoss()
#dataloder
for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data'].to(device)  # B x T
        pred = model(data)
        loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1).to(device))
        loss.backward()
        # if step % 5:
        optim.step()

        if step % 1000:
            print(loss)
    print(epoch)
    torch.save({'model': model.state_dict()}, '/Users/Vampire/Downloads/Reload_NN3-master-3/lesson4/epoch_%d.pth.tar' % epoch)



tensor(1.9418, grad_fn=<NllLossBackward0>)
tensor(1.3856, grad_fn=<NllLossBackward0>)
tensor(0.9912, grad_fn=<NllLossBackward0>)
tensor(0.5485, grad_fn=<NllLossBackward0>)
tensor(0.8328, grad_fn=<NllLossBackward0>)
tensor(0.7883, grad_fn=<NllLossBackward0>)
tensor(0.7540, grad_fn=<NllLossBackward0>)
tensor(0.8598, grad_fn=<NllLossBackward0>)
tensor(0.6143, grad_fn=<NllLossBackward0>)
tensor(0.6767, grad_fn=<NllLossBackward0>)
tensor(0.4563, grad_fn=<NllLossBackward0>)
tensor(0.7479, grad_fn=<NllLossBackward0>)
tensor(0.7206, grad_fn=<NllLossBackward0>)
tensor(0.7246, grad_fn=<NllLossBackward0>)
tensor(0.7546, grad_fn=<NllLossBackward0>)
tensor(0.4647, grad_fn=<NllLossBackward0>)
tensor(0.5295, grad_fn=<NllLossBackward0>)
tensor(0.4484, grad_fn=<NllLossBackward0>)
tensor(0.6404, grad_fn=<NllLossBackward0>)
tensor(0.7860, grad_fn=<NllLossBackward0>)
tensor(0.6323, grad_fn=<NllLossBackward0>)
tensor(0.7270, grad_fn=<NllLossBackward0>)
tensor(0.7169, grad_fn=<NllLossBackward0>)
tensor(0.66

tensor(0.1287, grad_fn=<NllLossBackward0>)
tensor(0.2173, grad_fn=<NllLossBackward0>)
tensor(0.1795, grad_fn=<NllLossBackward0>)
tensor(0.1701, grad_fn=<NllLossBackward0>)
tensor(0.1277, grad_fn=<NllLossBackward0>)
tensor(0.1464, grad_fn=<NllLossBackward0>)
tensor(0.1636, grad_fn=<NllLossBackward0>)
tensor(0.1878, grad_fn=<NllLossBackward0>)
tensor(0.1351, grad_fn=<NllLossBackward0>)
tensor(0.1907, grad_fn=<NllLossBackward0>)
tensor(0.1245, grad_fn=<NllLossBackward0>)
tensor(0.1535, grad_fn=<NllLossBackward0>)
tensor(0.2090, grad_fn=<NllLossBackward0>)
tensor(0.2044, grad_fn=<NllLossBackward0>)
tensor(0.1580, grad_fn=<NllLossBackward0>)
tensor(0.1238, grad_fn=<NllLossBackward0>)
tensor(0.2118, grad_fn=<NllLossBackward0>)
tensor(0.1818, grad_fn=<NllLossBackward0>)
tensor(0.1547, grad_fn=<NllLossBackward0>)
tensor(0.1659, grad_fn=<NllLossBackward0>)
0
tensor(0.1954, grad_fn=<NllLossBackward0>)
tensor(0.1366, grad_fn=<NllLossBackward0>)
tensor(0.1664, grad_fn=<NllLossBackward0>)
tensor(0.

tensor(0.0639, grad_fn=<NllLossBackward0>)
tensor(0.1202, grad_fn=<NllLossBackward0>)
tensor(0.0974, grad_fn=<NllLossBackward0>)
tensor(0.1038, grad_fn=<NllLossBackward0>)
tensor(0.0864, grad_fn=<NllLossBackward0>)
tensor(0.0787, grad_fn=<NllLossBackward0>)
tensor(0.1364, grad_fn=<NllLossBackward0>)
tensor(0.0813, grad_fn=<NllLossBackward0>)
tensor(0.1235, grad_fn=<NllLossBackward0>)
tensor(0.1197, grad_fn=<NllLossBackward0>)
tensor(0.1144, grad_fn=<NllLossBackward0>)
tensor(0.1108, grad_fn=<NllLossBackward0>)
tensor(0.1091, grad_fn=<NllLossBackward0>)
tensor(0.1690, grad_fn=<NllLossBackward0>)
tensor(0.0977, grad_fn=<NllLossBackward0>)
tensor(0.0675, grad_fn=<NllLossBackward0>)
tensor(0.0934, grad_fn=<NllLossBackward0>)
tensor(0.1063, grad_fn=<NllLossBackward0>)
tensor(0.1102, grad_fn=<NllLossBackward0>)
tensor(0.1564, grad_fn=<NllLossBackward0>)
tensor(0.0654, grad_fn=<NllLossBackward0>)
tensor(0.1578, grad_fn=<NllLossBackward0>)
tensor(0.1384, grad_fn=<NllLossBackward0>)
tensor(0.10

tensor(0.0710, grad_fn=<NllLossBackward0>)
tensor(0.0854, grad_fn=<NllLossBackward0>)
tensor(0.0946, grad_fn=<NllLossBackward0>)
tensor(0.0743, grad_fn=<NllLossBackward0>)
tensor(0.1209, grad_fn=<NllLossBackward0>)
tensor(0.0840, grad_fn=<NllLossBackward0>)
tensor(0.0938, grad_fn=<NllLossBackward0>)
tensor(0.0674, grad_fn=<NllLossBackward0>)
tensor(0.0584, grad_fn=<NllLossBackward0>)
tensor(0.0707, grad_fn=<NllLossBackward0>)
tensor(0.0743, grad_fn=<NllLossBackward0>)
tensor(0.0703, grad_fn=<NllLossBackward0>)
tensor(0.0741, grad_fn=<NllLossBackward0>)
tensor(0.1101, grad_fn=<NllLossBackward0>)
tensor(0.0660, grad_fn=<NllLossBackward0>)
tensor(0.0799, grad_fn=<NllLossBackward0>)
tensor(0.0913, grad_fn=<NllLossBackward0>)
tensor(0.0878, grad_fn=<NllLossBackward0>)
tensor(0.0793, grad_fn=<NllLossBackward0>)
tensor(0.0750, grad_fn=<NllLossBackward0>)
tensor(0.0897, grad_fn=<NllLossBackward0>)
tensor(0.0951, grad_fn=<NllLossBackward0>)
tensor(0.0890, grad_fn=<NllLossBackward0>)
tensor(0.03

tensor(0.0465, grad_fn=<NllLossBackward0>)
tensor(0.0679, grad_fn=<NllLossBackward0>)
tensor(0.0837, grad_fn=<NllLossBackward0>)
tensor(0.0757, grad_fn=<NllLossBackward0>)
tensor(0.0500, grad_fn=<NllLossBackward0>)
tensor(0.0556, grad_fn=<NllLossBackward0>)
tensor(0.0841, grad_fn=<NllLossBackward0>)
tensor(0.0791, grad_fn=<NllLossBackward0>)
tensor(0.0661, grad_fn=<NllLossBackward0>)
tensor(0.0695, grad_fn=<NllLossBackward0>)
tensor(0.0836, grad_fn=<NllLossBackward0>)
tensor(0.0704, grad_fn=<NllLossBackward0>)
tensor(0.0533, grad_fn=<NllLossBackward0>)
tensor(0.0709, grad_fn=<NllLossBackward0>)
tensor(0.0444, grad_fn=<NllLossBackward0>)
tensor(0.0449, grad_fn=<NllLossBackward0>)
tensor(0.0279, grad_fn=<NllLossBackward0>)
tensor(0.0592, grad_fn=<NllLossBackward0>)
tensor(0.0553, grad_fn=<NllLossBackward0>)
tensor(0.0889, grad_fn=<NllLossBackward0>)
tensor(0.0694, grad_fn=<NllLossBackward0>)
tensor(0.0699, grad_fn=<NllLossBackward0>)
tensor(0.0771, grad_fn=<NllLossBackward0>)
tensor(0.05

tensor(0.0501, grad_fn=<NllLossBackward0>)
tensor(0.0486, grad_fn=<NllLossBackward0>)
tensor(0.0512, grad_fn=<NllLossBackward0>)
tensor(0.0647, grad_fn=<NllLossBackward0>)
tensor(0.0250, grad_fn=<NllLossBackward0>)
tensor(0.0371, grad_fn=<NllLossBackward0>)
tensor(0.0336, grad_fn=<NllLossBackward0>)
tensor(0.0620, grad_fn=<NllLossBackward0>)
tensor(0.0556, grad_fn=<NllLossBackward0>)
tensor(0.0586, grad_fn=<NllLossBackward0>)
tensor(0.0405, grad_fn=<NllLossBackward0>)
tensor(0.0347, grad_fn=<NllLossBackward0>)
tensor(0.0464, grad_fn=<NllLossBackward0>)
tensor(0.0401, grad_fn=<NllLossBackward0>)
tensor(0.0455, grad_fn=<NllLossBackward0>)
tensor(0.0735, grad_fn=<NllLossBackward0>)
tensor(0.0456, grad_fn=<NllLossBackward0>)
tensor(0.0587, grad_fn=<NllLossBackward0>)
tensor(0.0424, grad_fn=<NllLossBackward0>)
tensor(0.0500, grad_fn=<NllLossBackward0>)
tensor(0.0469, grad_fn=<NllLossBackward0>)
tensor(0.0629, grad_fn=<NllLossBackward0>)
tensor(0.0472, grad_fn=<NllLossBackward0>)
tensor(0.04

tensor(0.0235, grad_fn=<NllLossBackward0>)
tensor(0.0292, grad_fn=<NllLossBackward0>)
tensor(0.0438, grad_fn=<NllLossBackward0>)
tensor(0.0281, grad_fn=<NllLossBackward0>)
tensor(0.0231, grad_fn=<NllLossBackward0>)
tensor(0.0380, grad_fn=<NllLossBackward0>)
tensor(0.0316, grad_fn=<NllLossBackward0>)
tensor(0.0256, grad_fn=<NllLossBackward0>)
tensor(0.0366, grad_fn=<NllLossBackward0>)
tensor(0.0277, grad_fn=<NllLossBackward0>)
tensor(0.0353, grad_fn=<NllLossBackward0>)
tensor(0.0239, grad_fn=<NllLossBackward0>)
tensor(0.0287, grad_fn=<NllLossBackward0>)
tensor(0.0420, grad_fn=<NllLossBackward0>)
tensor(0.0176, grad_fn=<NllLossBackward0>)
tensor(0.0332, grad_fn=<NllLossBackward0>)
tensor(0.0539, grad_fn=<NllLossBackward0>)
tensor(0.0370, grad_fn=<NllLossBackward0>)
tensor(0.0338, grad_fn=<NllLossBackward0>)
tensor(0.0359, grad_fn=<NllLossBackward0>)
tensor(0.0327, grad_fn=<NllLossBackward0>)
tensor(0.0425, grad_fn=<NllLossBackward0>)
tensor(0.0441, grad_fn=<NllLossBackward0>)
tensor(0.02

tensor(0.0274, grad_fn=<NllLossBackward0>)
tensor(0.0257, grad_fn=<NllLossBackward0>)
tensor(0.0286, grad_fn=<NllLossBackward0>)
tensor(0.0227, grad_fn=<NllLossBackward0>)
tensor(0.0220, grad_fn=<NllLossBackward0>)
tensor(0.0172, grad_fn=<NllLossBackward0>)
tensor(0.0325, grad_fn=<NllLossBackward0>)
tensor(0.0318, grad_fn=<NllLossBackward0>)
tensor(0.0157, grad_fn=<NllLossBackward0>)
tensor(0.0212, grad_fn=<NllLossBackward0>)
tensor(0.0224, grad_fn=<NllLossBackward0>)
tensor(0.0521, grad_fn=<NllLossBackward0>)
tensor(0.0292, grad_fn=<NllLossBackward0>)
tensor(0.0195, grad_fn=<NllLossBackward0>)
tensor(0.0226, grad_fn=<NllLossBackward0>)
tensor(0.0322, grad_fn=<NllLossBackward0>)
tensor(0.0251, grad_fn=<NllLossBackward0>)
tensor(0.0210, grad_fn=<NllLossBackward0>)
tensor(0.0195, grad_fn=<NllLossBackward0>)
tensor(0.0313, grad_fn=<NllLossBackward0>)
tensor(0.0288, grad_fn=<NllLossBackward0>)
tensor(0.0315, grad_fn=<NllLossBackward0>)
tensor(0.0325, grad_fn=<NllLossBackward0>)
tensor(0.05

tensor(0.0202, grad_fn=<NllLossBackward0>)
tensor(0.0219, grad_fn=<NllLossBackward0>)
tensor(0.0187, grad_fn=<NllLossBackward0>)
tensor(0.0168, grad_fn=<NllLossBackward0>)
tensor(0.0177, grad_fn=<NllLossBackward0>)
tensor(0.0248, grad_fn=<NllLossBackward0>)
tensor(0.0188, grad_fn=<NllLossBackward0>)
tensor(0.0185, grad_fn=<NllLossBackward0>)
tensor(0.0216, grad_fn=<NllLossBackward0>)
tensor(0.0186, grad_fn=<NllLossBackward0>)
tensor(0.0218, grad_fn=<NllLossBackward0>)
tensor(0.0238, grad_fn=<NllLossBackward0>)
tensor(0.0171, grad_fn=<NllLossBackward0>)
tensor(0.0276, grad_fn=<NllLossBackward0>)
tensor(0.0233, grad_fn=<NllLossBackward0>)
tensor(0.0142, grad_fn=<NllLossBackward0>)
tensor(0.0183, grad_fn=<NllLossBackward0>)
tensor(0.0292, grad_fn=<NllLossBackward0>)
tensor(0.0160, grad_fn=<NllLossBackward0>)
tensor(0.0241, grad_fn=<NllLossBackward0>)
tensor(0.0165, grad_fn=<NllLossBackward0>)
tensor(0.0167, grad_fn=<NllLossBackward0>)
tensor(0.0220, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0114, grad_fn=<NllLossBackward0>)
tensor(0.0172, grad_fn=<NllLossBackward0>)
tensor(0.0175, grad_fn=<NllLossBackward0>)
tensor(0.0194, grad_fn=<NllLossBackward0>)
tensor(0.0157, grad_fn=<NllLossBackward0>)
tensor(0.0138, grad_fn=<NllLossBackward0>)
tensor(0.0171, grad_fn=<NllLossBackward0>)
tensor(0.0213, grad_fn=<NllLossBackward0>)
tensor(0.0150, grad_fn=<NllLossBackward0>)
tensor(0.0141, grad_fn=<NllLossBackward0>)
tensor(0.0182, grad_fn=<NllLossBackward0>)
tensor(0.0127, grad_fn=<NllLossBackward0>)
tensor(0.0146, grad_fn=<NllLossBackward0>)
tensor(0.0244, grad_fn=<NllLossBackward0>)
tensor(0.0119, grad_fn=<NllLossBackward0>)
tensor(0.0159, grad_fn=<NllLossBackward0>)
tensor(0.0140, grad_fn=<NllLossBackward0>)
tensor(0.0126, grad_fn=<NllLossBackward0>)
tensor(0.0264, grad_fn=<NllLossBackward0>)
tensor(0.0087, grad_fn=<NllLossBackward0>)
tensor(0.0154, grad_fn=<NllLossBackward0>)
tensor(0.0170, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0091, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0093, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0115, grad_fn=<NllLossBackward0>)
tensor(0.0120, grad_fn=<NllLossBackward0>)
tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor(0.0132, grad_fn=<NllLossBackward0>)
tensor(0.0068, grad_fn=<NllLossBackward0>)
tensor(0.0097, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0113, grad_fn=<NllLossBackward0>)
tensor(0.0067, grad_fn=<NllLossBackward0>)
tensor(0.0057, grad_fn=<NllLossBackward0>)
tensor(0.0076, grad_fn=<NllLossBackward0>)
tensor(0.0133, grad_fn=<NllLossBackward0>)
tensor(0.0104, grad_fn=<NllLossBackward0>)
tensor(0.0148, grad_fn=<NllLossBackward0>)
tensor(0.0120, grad_fn=<NllLossBackward0>)
tensor(0.0088, grad_fn=<NllLossBackward0>)
tensor(0.0073, grad_fn=<NllLossBackward0>)
tensor(0.0128, grad_fn=<NllLossBackward0>)
tensor(0.0109, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0116, grad_fn=<NllLossBackward0>)
tensor(0.0138, grad_fn=<NllLossBackward0>)
tensor(0.0141, grad_fn=<NllLossBackward0>)
tensor(0.0112, grad_fn=<NllLossBackward0>)
tensor(0.0113, grad_fn=<NllLossBackward0>)
9
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor(0.0053, grad_fn=<NllLossBackward0>)
tensor(0.0072, grad_fn=<NllLossBackward0>)
tensor(0.0086, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0115, grad_fn=<NllLossBackward0>)
tensor(0.0058, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0077, grad_fn=<NllLossBackward0>)
tensor(0.0048, grad_fn=<NllLossBackward0>)
tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor(0.0109, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0060, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
tensor(0.

tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0094, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
tensor(0.0073, grad_fn=<NllLossBackward0>)
tensor(0.0056, grad_fn=<NllLossBackward0>)
tensor(0.0085, grad_fn=<NllLossBackward0>)
tensor(0.0093, grad_fn=<NllLossBackward0>)
tensor(0.0078, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor(0.0139, grad_fn=<NllLossBackward0>)
tensor(0.0119, grad_fn=<NllLossBackward0>)
tensor(0.0098, grad_fn=<NllLossBackward0>)
tensor(0.0063, grad_fn=<NllLossBackward0>)
tensor(0.0083, grad_fn=<NllLossBackward0>)
tensor(0.0093, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0111, grad_fn=<NllLossBackward0>)
tensor(0.0082, grad_fn=<NllLossBackward0>)
tensor(0.0096, grad_fn=<NllLossBackward0>)
tensor(0.0067, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.01

tensor(0.0039, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0076, grad_fn=<NllLossBackward0>)
tensor(0.0051, grad_fn=<NllLossBackward0>)
tensor(0.0040, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0071, grad_fn=<NllLossBackward0>)
tensor(0.0080, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0060, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0070, grad_fn=<NllLossBackward0>)
tensor(0.0066, grad_fn=<NllLossBackward0>)
tensor(0.0055, grad_fn=<NllLossBackward0>)
tensor(0.0114, grad_fn=<NllLossBackward0>)
tensor(0.0058, grad_fn=<NllLossBackward0>)
tensor(0.0081, grad_fn=<NllLossBackward0>)
tensor(0.0064, grad_fn=<NllLossBackward0>)
tensor(0.0049, grad_fn=<NllLossBackward0>)
tensor(0.0047, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0036, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0043, grad_fn=<NllLossBackward0>)
tensor(0.0030, grad_fn=<NllLossBackward0>)
tensor(0.0031, grad_fn=<NllLossBackward0>)
tensor(0.0046, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0044, grad_fn=<NllLossBackward0>)
tensor(0.0036, grad_fn=<NllLossBackward0>)
tensor(0.0037, grad_fn=<NllLossBackward0>)
tensor(0.0038, grad_fn=<NllLossBackward0>)
tensor(0.0032, grad_fn=<NllLossBackward0>)
tensor(0.0042, grad_fn=<NllLossBackward0>)
tensor(0.0035, grad_fn=<NllLossBackward0>)
tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0032, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0027, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0036, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0028, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0024, grad_fn=<NllLossBackward0>)
tensor(0.0041, grad_fn=<NllLossBackward0>)
tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0025, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0026, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor(0.0029, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0014, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0017, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0011, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0020, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0019, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0033, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0012, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0015, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0016, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0009, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0010, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0013, grad_fn=<NllLossBackward0>)
tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.00

tensor(0.0003, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor(0.0018, grad_fn=<NllLossBackward0>)
tensor(0.0007, grad_fn=<NllLossBackward0>)
tensor(0.0021, grad_fn=<NllLossBackward0>)
tensor(0.0004, grad_fn=<NllLossBackward0>)
tensor(0.0005, grad_fn=<NllLossBackward0>)
19
CPU times: user 21min 14s, sys: 6min 12s, total: 27min 27s
Wall time: 15min 57s


In [20]:
#Без Bidirectional
#CPU times: user 12min 45s, sys: 3min 29s, total: 16min 14s
#Wall time: 8min 36s

#Bidirectional
#CPU times: user 21min 14s, sys: 6min 12s, total: 27min 27s
#Wall time: 15min 57s

In [21]:
%%time
# inference
sequence = [2,36,2,14,4,24]
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(sequence).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = pred.argmax(-1)

CPU times: user 1.68 ms, sys: 1.3 ms, total: 2.99 ms
Wall time: 1.98 ms


In [22]:
%%time
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'NUM', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
CPU times: user 1.79 ms, sys: 1.29 ms, total: 3.08 ms
Wall time: 1.67 ms
