# Моделирование языка с использованием многослойного LSTM

In [1]:
import requests
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Split
from tokenizers.decoders import Replace

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

## Описание

Модель будет работать на уровне символов (в противоположность моделям на уровне слов). Это означает, что, получив на вход последовательность из одного или нескольких символов, модель должна будет предсказать следующий символ.

Символьная языковая модель имеет следующие преимущества:
- Меньше пространство предсказаний. В языке есть только ограниченный набор символов, в отличие от слов, которых десятки тысяч.
- Устойчивее к ситуациям, когда слово не встречалось в тренировочном наборе (out-of-vocabulary) и лучше понимает базовые механики языка (в том числе пунктуацию).

С другой стороны, модели на уровне символов должны учиться распознавать целые последовательности букв, чтобы понимать отдельные слова (например, соединять «c», «a» и «t» в «cat»). Это делает обучение менее эффективным и может снижать качество работы модели.

## Сырой текст

**Art of War by Sun Tzu**

In [2]:
!gdown -c "https://drive.google.com/uc?id=1IvmDXJQEAtTm1qZEC0ckcyq-SGXxfPDq"

Skipping already downloaded file art_of_war.txt


In [3]:
# The text contains non-ASCII characters (e.g. "ŭ"), so decode it explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open("art_of_war.txt", "r", encoding="utf-8") as f:
    art_of_war = f.read()

In [4]:
print(art_of_war[:300], "\n")
print(len(art_of_war))

1. Sun Tzŭ said: The art of war is of vital importance to the State.

2. It is a matter of life and death, a road either to safety or to
ruin. Hence it is a subject of inquiry which can on no account be
neglected.

3. The art of war, then, is governed by five constant factors, to be
taken into accou 

61054


## Токенизация

Простой посимвольный токенизатор.

In [5]:
# Character-level tokenizer assembled from word-level components: every piece
# produced by the pre-tokenizer is looked up in the vocabulary as one "word",
# and pieces missing from the vocabulary map to "[UNK]".
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
# Lowercase the input so upper- and lower-case letters share one vocabulary entry.
tokenizer.normalizer = Lowercase()
# Splitting on the empty pattern isolates every character as its own piece.
tokenizer.pre_tokenizer = Split(r"", behavior="isolated")
# NOTE(review): presumably makes decode() join tokens without a separator — confirm.
tokenizer.decoder = Replace("", "")

# Register "[UNK]" as a special token when the vocabulary is built.
tokenizer_trainer = WordLevelTrainer(special_tokens=["[UNK]"])

In [6]:
# Build the vocabulary from the full text, passed as a single training sample.
tokenizer.train_from_iterator([art_of_war], tokenizer_trainer)

In [7]:
print("Размер словаря: ", tokenizer.get_vocab_size())

Размер словаря:  57


Преобразуем весь текст в последовательность индексов.

In [8]:
# Encode the whole text in one call; the result exposes `.ids` and `.tokens`.
seq = tokenizer.encode(art_of_war)

In [9]:
type(seq)

tokenizers.Encoding

In [10]:
print("Длина векторизованного текста:", len(seq.ids))

Длина векторизованного текста: 61054


In [11]:
seq.tokens[:10]

['1', '.', ' ', 's', 'u', 'n', ' ', 't', 'z', 'ŭ']

## Создание датасета

Наши данные — это одна длинная последовательность. Её нужно разделить на обучающие образцы.

На вход модель получает последовательность из `seq_len` символов из среза `[idx : idx + seq_len]`. Таргетами будут символы из среза `[idx + 1 : idx + seq_len + 1]`, то есть та же последовательность, сдвинутая на один символ вправо.

Например:
- Вход: "she swam in the lak_";
- Таргет/метка: "_he swam in the lake".

Также будем использовать `one-hot` представления входных символов, так как словарь маленький и векторная близость не имеет смысла.

In [12]:
class SequencesDataset(data.Dataset):
    """Sliding-window next-token dataset over one long token sequence.

    Sample ``i`` pairs the one-hot encoded window ``seq[i : i + seq_len]``
    with the targets ``seq[i + 1 : i + seq_len + 1]`` (the same window
    shifted one position to the right).
    """

    def __init__(self, seq: list[int], seq_len: int, vocab_size: int):
        """Prepare the token tensor and the one-hot lookup table.

        Args:
            seq: Token ids of the whole text.
            seq_len: Number of tokens in every input window.
            vocab_size: Width of the one-hot vectors; every id in ``seq``
                must be smaller than this value.

        Raises:
            ValueError: If ``seq_len`` is not positive.
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be positive, got {seq_len}")
        self.seq = torch.tensor(seq, dtype=torch.long)
        self.seq_len = seq_len
        # Row ``i`` of the identity matrix is the one-hot vector of token ``i``.
        self.ohe = torch.eye(vocab_size, dtype=torch.float)
        # Every sample needs ``seq_len`` inputs plus one extra token for the
        # last target. Clamp at zero so a too-short text yields an empty
        # dataset instead of an invalid negative length.
        self.size = max(len(self.seq) - self.seq_len, 0)

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        return self.size

    def __getitem__(self, idx):
        """Return ``(one_hot_inputs, target_ids)`` for window ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is out of range. This also lets plain
                iteration over the dataset terminate.
        """
        if idx < 0:
            idx += self.size
        if not 0 <= idx < self.size:
            raise IndexError(
                f"index {idx} is out of range for {self.size} samples"
            )
        x = self.seq[idx : idx + self.seq_len]
        y = self.seq[idx + 1 : idx + self.seq_len + 1]
        return self.ohe[x], y

## Модель

In [13]:
class LanguageModel(nn.Module):
    """Multi-layer LSTM that predicts the next token at every time step."""

    def __init__(
        self,
        vocab_size,
        hidden_size: int = 128,
        num_layers: int = 2,
        dropout: float = 0.2,
    ):
        """Build the LSTM stack and the output projection.

        Args:
            vocab_size: Size of the one-hot input vectors and of the output
                logits.
            hidden_size: Number of features in the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout applied between LSTM layers.
        """
        super().__init__()

        self.lstm = nn.LSTM(
            vocab_size,
            hidden_size,
            num_layers=num_layers,
            # Inter-layer dropout has no effect with a single layer and makes
            # PyTorch emit a warning, so switch it off in that case.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )

        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map one-hot inputs to next-token logits.

        Args:
            x: Tensor of shape (batch_size, seq_len, vocab_size).

        Returns:
            Logits of shape (batch_size, seq_len, vocab_size).
        """
        # hidden: (batch_size, seq_len, hidden_size); the final LSTM states
        # are not needed here.
        hidden, _ = self.lstm(x)
        return self.out(hidden)

## Обучение модели

In [14]:
class ModelTrainer:
    def __init__(
        self,
        model: nn.Module,
        criterion: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: data.DataLoader,
        val_dataloader: data.DataLoader,
        epochs: int,
        patience: int,
        device: torch.device,
    ):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.epochs = epochs
        self.patience = patience
        self.device = device

    def compute_metrics(self, y_pred, y_true, loss):
        accuracy = accuracy_score(y_true, y_pred)
        perplexity = math.exp(loss) if loss < 20 else float("inf")
        return {"accuracy": accuracy, "perplexity": perplexity}

    def _run_epoch(self, dataloader, train: bool = True):
        if train:
            self.model.train()
        else:
            self.model.eval()

        epoch_loss = 0.0
        epoch_predicts = []
        epoch_labels = []

        ctx = torch.enable_grad() if train else torch.no_grad()
        with ctx:
            for X, y in tqdm(dataloader, desc="Train" if train else "Val"):
                X, y = X.to(self.device), y.to(self.device)

                outputs = self.model(X)  # (batch, seq_len, vocab_size)
                loss = self.criterion(outputs.permute(0, 2, 1), y)

                if train:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                epoch_loss += loss.item()

                predicts = (
                    outputs.detach().cpu().argmax(dim=-1).numpy().ravel()
                )
                labels = y.detach().cpu().numpy().ravel()
                epoch_predicts.extend(predicts)
                epoch_labels.extend(labels)

        epoch_loss /= len(dataloader)
        metrics = self.compute_metrics(
            epoch_predicts, epoch_labels, epoch_loss
        )
        return epoch_loss, metrics

    def _train_epoch(self):
        return self._run_epoch(self.train_dataloader, train=True)

    def _validate(self, dataloader=None):
        if dataloader is None:
            dataloader = self.val_dataloader
        return self._run_epoch(dataloader, train=False)

    def test(
        self, test_dataloader: data.DataLoader
    ) -> tuple[float, dict[str, float | int]]:
        return self._validate(test_dataloader)

    def __append_to_history(self, **kwargs):
        for k in self.history:
            self.history[k].append(kwargs[k])

    def train(self):
        self.history = dict(train_loss=[], train_accuracy=[])
        best_score = float("inf")
        no_improve = 0
        best_model = {
            k: v.cpu().clone() for k, v in self.model.state_dict().items()
        }
        for epoch in range(self.epochs):
            train_loss, train_metrics = self._train_epoch()
            self.__append_to_history(
                train_loss=train_loss,
                train_accuracy=train_metrics["accuracy"],
                train_perplexity=train_metrics["perplexity"],
            )

            print(f"Epoch {epoch + 1}/{self.epochs}")
            print(
                f"Train Loss:       {train_loss:.3f}\n"
                f"Train Perplexity: {train_metrics['perplexity']:.3f}\n"
                f"Train Accuracy:   {train_metrics['accuracy']:.3f}"
            )
            print()
            if train_metrics["perplexity"] < best_score:
                best_score = train_metrics["perplexity"]
                best_model = {
                    k: v.cpu().clone()
                    for k, v in self.model.state_dict().items()
                }
                no_improve = 0
            else:
                no_improve += 1

            if no_improve >= self.patience:
                print(f"Early stopping after {epoch + 1} epoch")
                self.model.load_state_dict(best_model)
                break

        print(f"Best Perplexity: {best_score:.4f}")
        return self.history

In [15]:
# Training hyperparameters.
SEQ_LEN = 100
batch_size = 16
epochs = 30
patience = 3

model = LanguageModel(vocab_size=tokenizer.get_vocab_size())

# Every sample is a window of SEQ_LEN one-hot encoded characters.
train_dataset = SequencesDataset(
    seq=seq.ids,
    seq_len=SEQ_LEN,
    vocab_size=tokenizer.get_vocab_size(),
)
train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# No validation dataloader is passed: the whole text is used for training.
trainer = ModelTrainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=None,
    epochs=epochs,
    patience=patience,
    device=device,
)

Device: cuda


In [16]:
history = trainer.train()

Train: 100%|██████████| 3810/3810 [00:10<00:00, 372.18it/s]


Epoch 1/30
Train Loss:       1.871
Train Perplexity: 6.494
Train Accuracy:   0.451



Train: 100%|██████████| 3810/3810 [00:10<00:00, 348.33it/s]


Epoch 2/30
Train Loss:       1.119
Train Perplexity: 3.060
Train Accuracy:   0.658



Train: 100%|██████████| 3810/3810 [00:09<00:00, 417.04it/s]


Epoch 3/30
Train Loss:       0.814
Train Perplexity: 2.257
Train Accuracy:   0.747



Train: 100%|██████████| 3810/3810 [00:09<00:00, 394.38it/s]


Epoch 4/30
Train Loss:       0.666
Train Perplexity: 1.947
Train Accuracy:   0.792



Train: 100%|██████████| 3810/3810 [00:10<00:00, 347.05it/s]


Epoch 5/30
Train Loss:       0.581
Train Perplexity: 1.788
Train Accuracy:   0.819



Train: 100%|██████████| 3810/3810 [00:09<00:00, 417.34it/s]


Epoch 6/30
Train Loss:       0.526
Train Perplexity: 1.692
Train Accuracy:   0.837



Train: 100%|██████████| 3810/3810 [00:09<00:00, 397.68it/s]


Epoch 7/30
Train Loss:       0.487
Train Perplexity: 1.627
Train Accuracy:   0.849



Train: 100%|██████████| 3810/3810 [00:10<00:00, 356.74it/s]


Epoch 8/30
Train Loss:       0.458
Train Perplexity: 1.581
Train Accuracy:   0.859



Train: 100%|██████████| 3810/3810 [00:09<00:00, 410.96it/s]


Epoch 9/30
Train Loss:       0.435
Train Perplexity: 1.546
Train Accuracy:   0.866



Train: 100%|██████████| 3810/3810 [00:09<00:00, 388.94it/s]


Epoch 10/30
Train Loss:       0.417
Train Perplexity: 1.517
Train Accuracy:   0.872



Train: 100%|██████████| 3810/3810 [00:10<00:00, 352.83it/s]


Epoch 11/30
Train Loss:       0.402
Train Perplexity: 1.495
Train Accuracy:   0.877



Train: 100%|██████████| 3810/3810 [00:09<00:00, 405.08it/s]


Epoch 12/30
Train Loss:       0.389
Train Perplexity: 1.476
Train Accuracy:   0.881



Train: 100%|██████████| 3810/3810 [00:09<00:00, 391.51it/s]


Epoch 13/30
Train Loss:       0.378
Train Perplexity: 1.460
Train Accuracy:   0.885



Train: 100%|██████████| 3810/3810 [00:10<00:00, 358.49it/s]


Epoch 14/30
Train Loss:       0.369
Train Perplexity: 1.446
Train Accuracy:   0.888



Train: 100%|██████████| 3810/3810 [00:09<00:00, 386.12it/s]


Epoch 15/30
Train Loss:       0.361
Train Perplexity: 1.435
Train Accuracy:   0.890



Train: 100%|██████████| 3810/3810 [00:09<00:00, 386.73it/s]


Epoch 16/30
Train Loss:       0.353
Train Perplexity: 1.424
Train Accuracy:   0.893



Train: 100%|██████████| 3810/3810 [00:09<00:00, 399.78it/s]


Epoch 17/30
Train Loss:       0.347
Train Perplexity: 1.415
Train Accuracy:   0.895



Train: 100%|██████████| 3810/3810 [00:09<00:00, 422.93it/s]


Epoch 18/30
Train Loss:       0.342
Train Perplexity: 1.408
Train Accuracy:   0.896



Train: 100%|██████████| 3810/3810 [00:10<00:00, 375.96it/s]


Epoch 19/30
Train Loss:       0.336
Train Perplexity: 1.399
Train Accuracy:   0.898



Train: 100%|██████████| 3810/3810 [00:10<00:00, 364.60it/s]


Epoch 20/30
Train Loss:       0.331
Train Perplexity: 1.392
Train Accuracy:   0.900



Train: 100%|██████████| 3810/3810 [00:09<00:00, 393.69it/s]


Epoch 21/30
Train Loss:       0.327
Train Perplexity: 1.387
Train Accuracy:   0.901



Train: 100%|██████████| 3810/3810 [00:09<00:00, 398.69it/s]


Epoch 22/30
Train Loss:       0.323
Train Perplexity: 1.381
Train Accuracy:   0.902



Train: 100%|██████████| 3810/3810 [00:10<00:00, 361.62it/s]


Epoch 23/30
Train Loss:       0.319
Train Perplexity: 1.376
Train Accuracy:   0.904



Train: 100%|██████████| 3810/3810 [00:09<00:00, 395.50it/s]


Epoch 24/30
Train Loss:       0.316
Train Perplexity: 1.372
Train Accuracy:   0.904



Train: 100%|██████████| 3810/3810 [00:09<00:00, 404.73it/s]


Epoch 25/30
Train Loss:       0.313
Train Perplexity: 1.367
Train Accuracy:   0.906



Train: 100%|██████████| 3810/3810 [00:09<00:00, 397.36it/s]


Epoch 26/30
Train Loss:       0.309
Train Perplexity: 1.363
Train Accuracy:   0.906



Train: 100%|██████████| 3810/3810 [00:09<00:00, 411.52it/s]


Epoch 27/30
Train Loss:       0.307
Train Perplexity: 1.359
Train Accuracy:   0.907



Train: 100%|██████████| 3810/3810 [00:09<00:00, 395.39it/s]


Epoch 28/30
Train Loss:       0.304
Train Perplexity: 1.355
Train Accuracy:   0.908



Train: 100%|██████████| 3810/3810 [00:09<00:00, 401.88it/s]


Epoch 29/30
Train Loss:       0.301
Train Perplexity: 1.352
Train Accuracy:   0.909



Train: 100%|██████████| 3810/3810 [00:09<00:00, 404.13it/s]


Epoch 30/30
Train Loss:       0.299
Train Perplexity: 1.349
Train Accuracy:   0.910

Best Perplexity: 1.3486


## Генерация текста

Функция ниже будет принимать текст и генерировать несколько последующих символов.

Будем использовать также параметр **температуры**. Следующий символ будет взят случайно из вероятностного распределения, предсказанного моделью. Температурой мы можем влиять на степень этой случайности.

Если температура < 1, распределение вероятностей становится «более чётким», и модель будет более точно воспроизводить текст.

Если повысить температуру, то распределение выравнивается, и повышается шанс того, что модель выберет что-то неожиданное.

На практике достаточно высокая температура приводит к бессмыслице.

In [17]:
def generate_text(
    model: nn.Module,
    tokenizer: Tokenizer,
    seed_text: str,
    num_chars: int = 200,
    temperature: float = 1.0,
    seq_len: int | None = None,
    device: torch.device | None = None,
):
    """Extend ``seed_text`` by sampling ``num_chars`` tokens from ``model``.

    Args:
        model: Network mapping one-hot inputs of shape
            (1, seq_len, vocab_size) to logits of the same shape.
        tokenizer: Tokenizer used to encode the seed and decode sampled ids.
        seed_text: Text that starts the generation; must not be empty.
        num_chars: Number of tokens to sample.
        temperature: Divisor applied to the logits before the softmax.
            Values below 1 sharpen the distribution, values above 1 flatten
            it, and 0 selects the most likely token deterministically.
        seq_len: Maximum number of trailing tokens fed to the model.
            Defaults to the module-level ``SEQ_LEN``.
        device: Device for the forward pass. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        ``seed_text`` followed by the generated tokens.

    Raises:
        ValueError: If ``temperature`` is negative, ``seq_len`` is not
            positive, or the seed encodes to no tokens.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")

    context_len = SEQ_LEN if seq_len is None else seq_len
    if context_len < 1:
        raise ValueError(f"seq_len must be positive, got {context_len}")

    if device is None:
        first_param = next(model.parameters(), None)
        device = (
            first_param.device
            if first_param is not None
            else torch.device("cpu")
        )

    # Encode the seed once and keep a running window of token ids instead of
    # re-tokenizing the growing text on every step.
    context_ids = tokenizer.encode(seed_text[-context_len:]).ids
    context_ids = context_ids[-context_len:]
    if not context_ids:
        raise ValueError("seed_text must contain at least one character")

    # Loop invariants: the one-hot lookup table and evaluation mode.
    one_hot = torch.eye(tokenizer.get_vocab_size(), device=device)
    model.eval()

    generated = []
    with torch.no_grad():
        for _ in range(num_chars):
            # (seq_len, vocab_size) -> (1, seq_len, vocab_size)
            inputs = one_hot[context_ids].unsqueeze(0)
            # Only the logits of the last time step predict the next token.
            logits = model(inputs)[0, -1, :].cpu()  # (vocab_size,)

            if temperature == 0:
                # Zero temperature is the greedy limit; dividing by it would
                # produce NaN probabilities.
                next_id = int(logits.argmax())
            else:
                probas = torch.softmax(logits / temperature, dim=-1)
                next_id = torch.multinomial(probas, num_samples=1).item()

            generated.append(tokenizer.id_to_token(next_id))
            context_ids = (context_ids + [next_id])[-context_len:]

    return seed_text + "".join(generated)

In [18]:
print(
    generate_text(
        model,
        tokenizer,
        "Banana peels on the battlefield can",
        num_chars=300,
        temperature=0.2,
    )
)

Banana peels on the battlefield can snow attacked by heard of this way, we may
succeed in accomplishing the essential part of our schemes if these five points.

18. hence the saying: if you know the enemy and know yourself, your
victory will not stand in doubt; if you know heaven and know earth, you
may make your victory complete.

1


In [19]:
print(
    generate_text(
        model,
        tokenizer,
        "It's time to release the Kraken when",
        num_chars=300,
        temperature=0.5,
    )
)

It's time to release the Kraken when the wile consequences that must ensue.

5. thus, though we have heard of stupid haste in war, cleverness has
never been seen associated with long delays.

6. there is no instance of a country having benefited from prolonged
warfare.

7. it is only one who is thoroughly acquainted with the evils of 


In [20]:
print(
    generate_text(
        model,
        tokenizer,
        "Crush your enemies, see them driven before you, and",
        num_chars=300,
        temperature=1,
    )
)

Crush your enemies, see them driven before you, and you will be attacked by its head,
and in making tactical dispositions, the highest pitch you can attain
is to conceal them; conceal your dispositions, and you will be strongly fate in our hands.

10. you may advantages to account unless
we make use of locoly.

28. nor rein to rese are officers and 


In [21]:
print(
    generate_text(
        model, tokenizer, "What is best in life?", num_chars=300, temperature=2
    )
)

What is best in life?
on a pass, of compars, and cheakers offin wain, (1)
desivalent proigs; and
distantage, the army advantage.

1. sun tzŭ said: there
arm ig of infanctions of great, fore, we
camail starth into areiving to time and general’s fall-your it; to be utpentains filia-tervance. sins if the tend-bawn
and prik
