In [2]:
import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torch.nn.utils.rnn import pad_sequence

In [3]:
class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):

        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]

        self.target_vocab = {}
        self.word_vocab = {}
        self.char_vocab = {}

        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], #  (1)
        }
# seq1 = [1, 2, 3] -> [1, 2, 3, 0]
# seq2 = [7, 5, 4, 2]

def collate_fn(batch):
    data = []
    target = []
    for item in batch:
        data.append(torch.as_tensor(item['data']))
        target.append(torch.as_tensor(item['target']))
    data = pad_sequence(data, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)

    return {'data': data, 'target': target}

In [5]:
data_dir = '/content/sample_data/'
train_lang = 'en'
dataset = DatasetSeq(data_dir)

In [6]:
vocab_len = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
cuda_device = 10
batch_size = 200
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

GRU

In [69]:
class POS_predictor_GRU(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        gru_out, _ = self.gru(emb_x)
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

model = POS_predictor_GRU(vocab_len, 200, 256, n_classes)
model.train()

POS_predictor_GRU(
  (emb): Embedding(29588, 200)
  (gru): GRU(200, 256, batch_first=True)
  (classifier): Linear(in_features=256, out_features=18, bias=True)
)

In [70]:
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()
#dataloder
for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim.zero_grad()
        data = batch['data']  # B x T
        pred = model(data)
        loss = loss_func(pred.view(-1, n_classes), batch['target'].view(-1))
        loss.backward()
        # if step % 5:
        optim.step()

        if step % 100 == 0 :
            print(loss)
    print(epoch)

tensor(2.7545, grad_fn=<NllLossBackward0>)
tensor(0.2952, grad_fn=<NllLossBackward0>)
0
tensor(0.2921, grad_fn=<NllLossBackward0>)
tensor(0.1388, grad_fn=<NllLossBackward0>)
1
tensor(0.1644, grad_fn=<NllLossBackward0>)
tensor(0.1657, grad_fn=<NllLossBackward0>)
2
tensor(0.0773, grad_fn=<NllLossBackward0>)
tensor(0.1318, grad_fn=<NllLossBackward0>)
3
tensor(0.0855, grad_fn=<NllLossBackward0>)
tensor(0.0887, grad_fn=<NllLossBackward0>)
4
tensor(0.0617, grad_fn=<NllLossBackward0>)
tensor(0.0783, grad_fn=<NllLossBackward0>)
5
tensor(0.0503, grad_fn=<NllLossBackward0>)
tensor(0.0913, grad_fn=<NllLossBackward0>)
6
tensor(0.0658, grad_fn=<NllLossBackward0>)
tensor(0.0595, grad_fn=<NllLossBackward0>)
7
tensor(0.0603, grad_fn=<NllLossBackward0>)
tensor(0.0552, grad_fn=<NllLossBackward0>)
8
tensor(0.0396, grad_fn=<NllLossBackward0>)
tensor(0.0458, grad_fn=<NllLossBackward0>)
9
tensor(0.0341, grad_fn=<NllLossBackward0>)
tensor(0.0290, grad_fn=<NllLossBackward0>)
10
tensor(0.0352, grad_fn=<NllLoss

In [71]:
# inference
sequence = [2,36,2,14,4,24]
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(sequence).unsqueeze(0)) # 1 x T x N_classes
    labels = predict.argmax(-1)

In [72]:
labels

tensor([[2, 7, 2, 4, 2, 2]])

In [73]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']


RNN

In [74]:
class POS_predictor_RNN(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        rnn_out, _ = self.rnn(emb_x)
        pred = self.classifier(torch.dropout(rnn_out, 0.1, self.training))

        return pred

model_RNN = POS_predictor_RNN(vocab_len, 250, 300, n_classes)
model_RNN.train()



POS_predictor_RNN(
  (emb): Embedding(29588, 250)
  (rnn): RNN(250, 300, batch_first=True)
  (classifier): Linear(in_features=300, out_features=18, bias=True)
)

In [75]:
optim_RNN = torch.optim.Adam(model_RNN.parameters(), lr=0.001)
loss_func_RNN = nn.CrossEntropyLoss()

for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim_RNN.zero_grad()
        data = batch['data']  # B x T
        pred = model_RNN(data)
        loss_RNN = loss_func_RNN(pred.view(-1, n_classes), batch['target'].view(-1))
        loss_RNN.backward()
        # if step % 5:
        optim_RNN.step()

        if step % 100 == 0 :
            print(loss_RNN)
    print(epoch)

tensor(2.8381, grad_fn=<NllLossBackward0>)
tensor(0.1891, grad_fn=<NllLossBackward0>)
0
tensor(0.1382, grad_fn=<NllLossBackward0>)
tensor(0.1672, grad_fn=<NllLossBackward0>)
1
tensor(0.1341, grad_fn=<NllLossBackward0>)
tensor(0.1784, grad_fn=<NllLossBackward0>)
2
tensor(0.0883, grad_fn=<NllLossBackward0>)
tensor(0.1264, grad_fn=<NllLossBackward0>)
3
tensor(0.0969, grad_fn=<NllLossBackward0>)
tensor(0.0904, grad_fn=<NllLossBackward0>)
4
tensor(0.0875, grad_fn=<NllLossBackward0>)
tensor(0.1147, grad_fn=<NllLossBackward0>)
5
tensor(0.0720, grad_fn=<NllLossBackward0>)
tensor(0.0575, grad_fn=<NllLossBackward0>)
6
tensor(0.0635, grad_fn=<NllLossBackward0>)
tensor(0.0728, grad_fn=<NllLossBackward0>)
7
tensor(0.0561, grad_fn=<NllLossBackward0>)
tensor(0.0491, grad_fn=<NllLossBackward0>)
8
tensor(0.0499, grad_fn=<NllLossBackward0>)
tensor(0.0584, grad_fn=<NllLossBackward0>)
9
tensor(0.0387, grad_fn=<NllLossBackward0>)
tensor(0.0395, grad_fn=<NllLossBackward0>)
10
tensor(0.0326, grad_fn=<NllLoss

In [76]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model_RNN.eval()
    predict = model_RNN(torch.tensor(tokens).unsqueeze(0)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'NUM', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']


RNN

In [77]:
class POS_predictor_LSTM(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        lstm_out, _ = self.lstm(emb_x)
        pred = self.classifier(torch.dropout(lstm_out, 0.1, self.training))

        return pred

model_LSTM = POS_predictor_LSTM(vocab_len, 250, 300, n_classes)
model_LSTM.train()


POS_predictor_LSTM(
  (emb): Embedding(29588, 250)
  (lstm): LSTM(250, 300, batch_first=True)
  (classifier): Linear(in_features=300, out_features=18, bias=True)
)

In [78]:
optim_LSTM = torch.optim.Adam(model_LSTM.parameters(), lr=0.001)
loss_func_LSTM = nn.CrossEntropyLoss()

for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        optim_LSTM.zero_grad()
        data = batch['data']  # B x T
        pred = model_LSTM(data)
        loss_LSTM = loss_func_LSTM(pred.view(-1, n_classes), batch['target'].view(-1))
        loss_LSTM.backward()
        # if step % 5:
        optim_LSTM.step()

        if step % 100 == 0 :
            print(loss_LSTM)
    print(epoch)

tensor(2.9761, grad_fn=<NllLossBackward0>)
tensor(0.3434, grad_fn=<NllLossBackward0>)
0
tensor(0.3014, grad_fn=<NllLossBackward0>)
tensor(0.2122, grad_fn=<NllLossBackward0>)
1
tensor(0.1915, grad_fn=<NllLossBackward0>)
tensor(0.1530, grad_fn=<NllLossBackward0>)
2
tensor(0.1157, grad_fn=<NllLossBackward0>)
tensor(0.1053, grad_fn=<NllLossBackward0>)
3
tensor(0.1291, grad_fn=<NllLossBackward0>)
tensor(0.0658, grad_fn=<NllLossBackward0>)
4
tensor(0.0480, grad_fn=<NllLossBackward0>)
tensor(0.0557, grad_fn=<NllLossBackward0>)
5
tensor(0.0707, grad_fn=<NllLossBackward0>)
tensor(0.0643, grad_fn=<NllLossBackward0>)
6
tensor(0.0413, grad_fn=<NllLossBackward0>)
tensor(0.0681, grad_fn=<NllLossBackward0>)
7
tensor(0.0208, grad_fn=<NllLossBackward0>)
tensor(0.0427, grad_fn=<NllLossBackward0>)
8
tensor(0.0384, grad_fn=<NllLossBackward0>)
tensor(0.0342, grad_fn=<NllLossBackward0>)
9
tensor(0.0361, grad_fn=<NllLossBackward0>)
tensor(0.0631, grad_fn=<NllLossBackward0>)
10
tensor(0.0329, grad_fn=<NllLoss

In [79]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model_LSTM.eval()
    predict = model_LSTM(torch.tensor(tokens).unsqueeze(0)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'CCONJ', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']


RNN+char

In [7]:
def collate_fn_char(input_data):
    data = []
    chars = []
    targets = []
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(len(input_data))] for _ in range(max_len)]
    for j in range(len(input_data)): #для каждого элемента (словосочетания) в датасете
        for i in range(max_len):
            if len(chars[j]) > i: #пока длина посл-ти слов 1 элемента (словосочетания) вписывается в макс.длину
                chars_seq[i][j] = torch.as_tensor(chars[j][i]) #добавляем слово в словарь
    for j in range(max_len):
        chars_seq[j] = pad_sequence(chars_seq[j], batch_first=True, padding_value=0) #дозаполняем нулями короткие ССЧ
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}

In [8]:
class CharModel(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x): # B x T
        emb_x = self.emb(x)  # B x T x V
        _, out = self.rnn(emb_x) # 1 x B x Hid

        return out

In [9]:
class POS_predictorV2Chars(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_classes: int,
                 n_chars: int,
                 char_emb_dim: int,
                 char_hidden_dim: int,
                 ):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim + char_hidden_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.classifier = nn.Linear(hidden_dim, n_classes, bias=True)
        self.char_rnn = CharModel(n_chars, char_emb_dim, char_hidden_dim)

    def forward(self, x, x_chars):  # B x T
        emb_x = self.emb(x)  # B x T x V
        chars = [self.char_rnn(word.to(emb_x.device)).squeeze().unsqueeze(1) for word in x_chars]
        chars = torch.cat(chars, dim=1)
        gru_out, _ = self.gru(torch.cat((emb_x, chars), dim=-1))
        pred = self.classifier(torch.dropout(gru_out, 0.1, self.training))

        return pred

In [10]:
model_char = POS_predictorV2Chars(vocab_len, 200, 256, n_classes, n_chars, 32, 64)
model_char.train()

optim_char = torch.optim.Adam(model_char.parameters(), lr=0.001)

loss_func_char = nn.CrossEntropyLoss()

for epoch in range(20):
    dataloader = DataLoader(
        dataset=dataset,
        collate_fn=collate_fn_char,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    for step, batch in enumerate(dataloader):
        #
        optim_char.zero_grad()
        data = batch['data']  # B x T
        pred = model_char(data, batch['chars'])
        loss_char = loss_func_char(pred.view(-1, n_classes), batch['target'].view(-1))
        loss_char.backward()
        # if step % 5:
        optim_char.step()
        #
        if step % 100 == 0:
            print(loss_char)
    print(epoch)

tensor(3.0350, grad_fn=<NllLossBackward0>)
tensor(0.1693, grad_fn=<NllLossBackward0>)
0
tensor(0.2545, grad_fn=<NllLossBackward0>)
tensor(0.1286, grad_fn=<NllLossBackward0>)
1
tensor(0.1667, grad_fn=<NllLossBackward0>)
tensor(0.0944, grad_fn=<NllLossBackward0>)
2
tensor(0.0787, grad_fn=<NllLossBackward0>)
tensor(0.1035, grad_fn=<NllLossBackward0>)
3
tensor(0.0560, grad_fn=<NllLossBackward0>)
tensor(0.0320, grad_fn=<NllLossBackward0>)
4
tensor(0.0615, grad_fn=<NllLossBackward0>)
tensor(0.0760, grad_fn=<NllLossBackward0>)
5
tensor(0.0525, grad_fn=<NllLossBackward0>)
tensor(0.0476, grad_fn=<NllLossBackward0>)
6
tensor(0.0522, grad_fn=<NllLossBackward0>)
tensor(0.0646, grad_fn=<NllLossBackward0>)
7
tensor(0.0439, grad_fn=<NllLossBackward0>)
tensor(0.0393, grad_fn=<NllLossBackward0>)
8
tensor(0.0320, grad_fn=<NllLossBackward0>)
tensor(0.0376, grad_fn=<NllLossBackward0>)
9
tensor(0.0353, grad_fn=<NllLossBackward0>)
tensor(0.0516, grad_fn=<NllLossBackward0>)
10
tensor(0.0395, grad_fn=<NllLoss

In [11]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens1 = [dataset.word_vocab[w] for w in words]
tokens2 = []
for w in words:
  tokens2.append([dataset.char_vocab[w[i]] for i in range(len(w))])

In [12]:
tokens3=[]
for j in range(len(tokens2)):
  tokens3.append(tokens2[j]+[0 for i in range(7-len(tokens2[j]))])

In [13]:
start = datetime.datetime.now()
with torch.no_grad():
    model_char.eval()
    predict = model_char(torch.tensor(tokens1).unsqueeze(1), torch.tensor(tokens3).unsqueeze(0)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    end = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
