![image](./task2.png)

- [src](https://habr.com/ru/companies/wunderfund/articles/331310/)

# LSTM testing

In [12]:
import numpy as np
import torch

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm


In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


'cuda'

In [1]:
import warnings
warnings.filterwarnings("ignore")


## Tokenizer and POS tagging

In [15]:
import nerus as ns
from transformers import AutoTokenizer
from transformers import AutoModel

MAX_SENTENCE_LENGTH = 50

class NerusTokenizer:
    words_to_id: dict
    ids_to_word: dict
    word_next_id: int = 0
    tag_to_id: dict
    lstm_tokens: list[str] = ["[PAD]", "[CLS]", "[SEP]", "[UNK]"]
    def __init__(self):
        self.words_to_id = {}
        self.tag_to_id = {}
        self.ids_to_word = {}
        for sys_id, sys_tag in enumerate(self.lstm_tokens):
            self.ids_to_word[sys_id] = sys_tag
            self.words_to_id[sys_tag] = sys_id
            self.tag_to_id[sys_tag] = sys_id
        self.word_next_id = self.lstm_tokens.__len__()

    def expand_vocabulary(self, sents: list[ns.NerusSent]):
        for sent in sents:
            for token in sent.tokens:
                if token.text in self.words_to_id.keys():
                    continue
                self.ids_to_word[self.word_next_id] = token.text
                self.words_to_id[token.text] = self.word_next_id
                self.word_next_id += 1
                                                                      # tokenized sentences   # tags strings
    def tokenize_sent_pos_tagging(self, sents: list[ns.NerusSent]) -> tuple[list[list[int]], list[list[str]]]:
        sentences_ids = []
        pos_tags_txts = []
        unknown_id = self.words_to_id["[UNK]"]

        for sent in sents:
            sentence_id = [self.words_to_id["[CLS]"]]
            pos_tag_txt = ["[CLS]"]
            for token in sent.tokens:
                word_text = token.text
                pos_tag   = token.pos

                word_id = self.words_to_id.get(word_text, None)
                sentence_id.append(word_id) if word_id is not None else sentence_id.append(unknown_id)
                pos_tag_txt.append(pos_tag) if word_id is not None else pos_tag_txt.append("[UNK]")

            assert sentence_id.__len__() == pos_tag_txt.__len__()

            # separating and padding
            pad_id = self.words_to_id["[PAD]"]
            sep_id = self.words_to_id["[SEP]"]

            if len(sentence_id) < MAX_SENTENCE_LENGTH:
                sentence_id.append(sep_id)
                sentence_id.extend([pad_id]  * (MAX_SENTENCE_LENGTH - len(sentence_id)))
                pos_tag_txt.append("[SEP]")
                pos_tag_txt.extend(["[PAD]"] * (MAX_SENTENCE_LENGTH - len(pos_tag_txt)))
            else:
                sentence_id = sentence_id[:MAX_SENTENCE_LENGTH]
                sentence_id[-1] = sep_id
                pos_tag_txt = pos_tag_txt[:MAX_SENTENCE_LENGTH]
                pos_tag_txt[-1] = "[SEP]"

            assert sentence_id.__len__() == pos_tag_txt.__len__()

            sentences_ids.append(sentence_id)
            pos_tags_txts.append(pos_tag_txt)

        return sentences_ids, pos_tags_txts

    def expand_pos_tagging_vocab(self, sents: ns.NerusSent):
        new_tags = set()
        for sent in sents:
            for token in sent.tokens:
                new_tags.add(token.pos)
        new_tags = [tag for tag in new_tags if tag not in self.tag_to_id.keys()]
        for tag in new_tags:
            self.tag_to_id[tag] = self.tag_to_id.__len__()

    def tokenize_pos_tags(self, text_tags: list[list[str]]) -> list[list[int]]:
        tags_id = []
        for sent_tags in text_tags:
            sent_tags_id = []
            for tag in sent_tags:
                if tag not in self.tag_to_id.keys():
                    msg = f"Bad tag when put ids on tags; TAG: {tag}"
                    raise RuntimeError(msg)
                sent_tags_id.append(self.tag_to_id[tag])
            tags_id.append(sent_tags_id)

        return tags_id

    def decode_sentence(self, sent: list[int]) -> str:
        return " ".join([self.ids_to_word[id] for id in sent])

    def __len__(self) -> int:
        return self.words_to_id.__len__()


## Handy code to train model

In [5]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()

    output = model(x_batch.to(device))

    # need to adjust output sizes to evaluate loss_function on batch
    output  = output.reshape(-1, output.shape[2])
    y_batch = y_batch.reshape(-1)

    loss = loss_function(output, y_batch.to(model.device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()


In [6]:

def train_epoch(generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for batch_of_x, batch_of_y in generator:

        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)

        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)

    return epoch_loss/total


In [7]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)

    iterations = tqdm(range(count_of_epoch), desc='epoch', leave=False)
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset,
                                        batch_size=batch_size,
                                        shuffle=True, pin_memory=True),
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size>0))
        epoch_loss = train_epoch (
                    generator=batch_generator,
                    model=model,
                    loss_function=loss_function,
                    optimizer=optima,
                    callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})


In [8]:

def evaluate_model_score(model, dataset, batch_size: int, loss_function, device = 'cpu'):
    batch_generator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)

    pred = []
    real = []
    test_loss = 0
    total_samples = 0

    for x_batch, y_batch in batch_generator:
        output = model(x_batch.to(device))

        output_loss  = output.reshape(-1, output.shape[2])
        y_batch = y_batch.reshape(-1)

        #evaluate loss on batch
        batch_loss = loss_function(output_loss, y_batch.to(model.device))
        test_loss += batch_loss.cpu().item() * len(x_batch)
        total_samples += len(x_batch)

        # add prediction results for classification report
        output_report = torch.argmax(output, -1)
        pred.extend(torch.reshape(output_report, (-1,)).cpu().numpy())
        real.extend(y_batch.cpu().numpy())

    pred = np.array(pred)
    real = np.array(real)

    # need to mask and remove from real tags PAD, CLS, SEP, they are needed only to support LSTM architecture
    mask = np.isin(real, [0, 1, 2], invert=True)
    return classification_report(real[mask], pred[mask]), test_loss / total_samples


In [21]:
from itertools import islice as head
from nerus import load_nerus

MAX_SAMPLES = 1000

dataset = head(load_nerus("dataset/nerus_lenta.conllu.gz"), MAX_SAMPLES)
nerus_tokenizer = NerusTokenizer()

for sample in dataset:
    nerus_tokenizer.expand_vocabulary(sample.sents)
    nerus_tokenizer.expand_pos_tagging_vocab(sample.sents)

print(nerus_tokenizer.tag_to_id)

dataset_words_id = []
dataset_tags_id = []
dataset = head(load_nerus("dataset/nerus_lenta.conllu.gz"), MAX_SAMPLES)
for i, sample in enumerate(dataset):
    words_id, tags = nerus_tokenizer.tokenize_sent_pos_tagging(sample.sents)
    tags_id = nerus_tokenizer.tokenize_pos_tags(tags)

    dataset_words_id.extend(words_id)
    dataset_tags_id.extend(tags_id)

print(f"Tokenized: {dataset_words_id[0]}")
print(f"Decoded: {nerus_tokenizer.decode_sentence(dataset_words_id[0])}")
print(f"Vocabulary total size {len(nerus_tokenizer)}")


{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3, 'NUM': 4, 'PUNCT': 5, 'SCONJ': 6, 'ADP': 7, 'VERB': 8, 'PROPN': 9, 'CCONJ': 10, 'ADJ': 11, 'ADV': 12, 'PRON': 13, 'NOUN': 14, 'DET': 15, 'PART': 16, 'AUX': 17, 'X': 18, 'SYM': 19, 'INTJ': 20}
Tokenized: [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 11, 22, 23, 24, 25, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded: [CLS] Вице-премьер по социальным вопросам Татьяна Голикова рассказала , в каких регионах России зафиксирована наиболее высокая смертность от рака , сообщает РИА Новости . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Vocabulary total size 38665


## Devide data train/test

In [16]:
from sklearn.model_selection import train_test_split

tags_tensor  = torch.tensor(dataset_tags_id)
sents_tensor = torch.tensor(dataset_words_id)

words_train, words_test = train_test_split(sents_tensor, test_size=0.2, random_state=52)
tags_train, tags_test = train_test_split(tags_tensor, test_size=0.2, random_state=52)

train_dataset_pt = torch.utils.data.TensorDataset(words_train, tags_train.long())
test_dataset_pt = torch.utils.data.TensorDataset(words_test, tags_test.long())

print(tags_tensor.shape)
print(sents_tensor.shape)


torch.Size([11645, 50])
torch.Size([11645, 50])


## Model description and initialization

In [None]:
class RNNclassifier(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, output_dim, emb_dim=10, hidden_dim=10,
                 num_layers=3, bidirectional=False, p=0.7, batchnorm=False):
        super(RNNclassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.biderect = bidirectional
        self.dropout = p
        self.batchnorm = batchnorm
        self.vocab_size = len(nerus_tokenizer)

        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers,
                                     bidirectional=bidirectional,
                                     batch_first=True, dropout=p)

        self.embending = torch.nn.Embedding(self.vocab_size, emb_dim, padding_idx=0)

        self.norm = batchnorm
        if batchnorm:
            self.normalization = torch.nn.BatchNorm1d(MAX_SENTENCE_LENGTH)

        self.linear = torch.nn.Linear(int(bidirectional + 1)*hidden_dim, output_dim)

    def forward(self, input):
        encoded = self.embending(input)

        lstm_out, _ = self.lstm(encoded.float())
        if self.norm:
            lstm_out = self.normalization(lstm_out)
        return self.linear(lstm_out)

    def __str__(self):
        return \
        f"layer_size={self.hidden_dim}" +     \
        f"_layers_num={self.num_layers}" +    \
        f"_bidirect={int(self.biderect)}" +   \
        f"_dropout={self.dropout}" +          \
        f"_batchnorm={int(self.batchnorm)}" + \
        f"_vocab_size={self.vocab_size}"


In [23]:
class TensorboardCallback:
    def __init__(self, writer, test_dataset, loss_function, delimeter=100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.test_dataset = test_dataset

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar(f'LOSS/train/{model}', loss, self.step)

        if self.step % self.delimeter == 0:
            self.writer.add_graph(model, torch.zeros(1, MAX_SENTENCE_LENGTH, dtype=torch.long).to(model.device))

            model_metrics, model_loss = evaluate_model_score(model, self.test_dataset, self.batch_size, self.loss_function, model.device)

            self.writer.add_scalar(f'LOSS/test/{model}', model_loss, self.step)
            self.writer.add_text(f'METRICS/{model}', str(model_metrics), self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)


# Train model with report

### Default parameters

In [24]:
config = dict()
config['output_dim'] = nerus_tokenizer.tag_to_id.__len__()
config['emb_dim'] = 128
config['hidden_dim'] = 256
config['num_layers'] = 2
config['bidirectional'] = True
config['batchnorm'] = False
config['p'] = 0.2


## 1. Model learning dependency on layer size

In [None]:
models_to_train = []
hidden_dims = [32, 64, 128, 256]
dims_config = config.copy()


for dim in hidden_dims:
    dims_config['hidden_dim'] = dim
    model = RNNclassifier(**dims_config)
    model.to(device)
    models_to_train.append(model)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

for num_layers, model in zip(hidden_dims, models_to_train):
    writer = SummaryWriter(log_dir=f'tensorboard/layer_width/{num_layers}')
    nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

    trainer(count_of_epoch=10,
        batch_size=64,
        dataset=train_dataset_pt,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=nerus_callback)


## 2. Model learning dependency on number of layers

In [None]:
models_to_train = []
num_layers = [2, 4, 8, 16]
lay_config = config.copy()

for num in num_layers:
    lay_config['num_layers'] = num
    model = RNNclassifier(**lay_config)
    model.to(device)
    models_to_train.append(model)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

for num_layers, model in zip(num_layers, models_to_train):
    writer = SummaryWriter(log_dir=f'tensorboard/num_layers/{num_layers}')
    nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

    trainer(count_of_epoch=10,
        batch_size=64,
        dataset=train_dataset_pt,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=nerus_callback)


## 3. Model dependency on dropout

In [None]:
models_to_train = []
dropouts = [0.2, 0.5, 0.7, 0.9]
p_config = config.copy()

for d in dropouts:
    p_config['p'] = d
    model = RNNclassifier(**p_config)
    model.to(device)
    models_to_train.append(model)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

for d, model in zip(dropouts, models_to_train):
    writer = SummaryWriter(log_dir=f'tensorboard/dropout/{int(d * 10)}')
    nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

    trainer(count_of_epoch=10,
        batch_size=64,
        dataset=train_dataset_pt,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=nerus_callback)


## 4. Model dependency on batchnorm

In [None]:
models_to_train = []
batch_norms = [False, True]
batch_config = config.copy()

for norm in batch_norms:
    batch_config['batchnorm'] = norm
    model = RNNclassifier(**batch_config)
    model.to(device)
    models_to_train.append(model)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

for norm, model in zip(batch_norms, models_to_train):
    writer = SummaryWriter(log_dir=f'tensorboard/batchnorm/{int(norm)}')
    nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

    trainer(count_of_epoch=10,
        batch_size=64,
        dataset=train_dataset_pt,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=nerus_callback)


## 5. Model learning dependency on embending dim

In [None]:
models_to_train = []
emb_dims = [10, 50, 100, 150, 256]
emb_config = config.copy()

for dim in emb_dims:
    emb_config['emb_dim'] = dim
    model = RNNclassifier(**emb_config)
    model.to(device)
    models_to_train.append(model)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

for dim, model in zip(emb_dims, models_to_train):
    writer = SummaryWriter(log_dir=f'tensorboard/embending_dim/{dim}')
    nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

    trainer(count_of_epoch=10,
        batch_size=64,
        dataset=train_dataset_pt,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=nerus_callback)


## Load report and investigate dependencies

In [None]:
%reload_ext tensorboard
%tensorboard --logdir tensorboard/


ModuleNotFoundError: No module named 'tensorboardS'