In [None]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import time
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import Counter
from IPython.display import clear_output

from scripts import BpeTokenizer, Model, Trainer, Collator, MyDataset, generate

# Загружаем данные

In [None]:
# Load the corpus and hold out the last EVAL_SIZE rows for evaluation.
# EVAL_SIZE is defined once so the train and eval slices cannot drift apart.
EVAL_SIZE = 1024

df = pd.read_csv('data/dataset.csv')

# With EVAL_SIZE rows or fewer the training slice below would be silently empty,
# so fail here with an explicit message instead.
assert len(df) > EVAL_SIZE, (
    f"dataset has {len(df)} rows; need more than {EVAL_SIZE} to leave a training split"
)

# .iloc keeps the split explicitly positional, whatever the index labels are.
train_texts = df['text'].iloc[:-EVAL_SIZE].tolist()
eval_texts = df['text'].iloc[-EVAL_SIZE:].tolist()

# Инициализируем и обучаем токенизатор

In [None]:
# Instantiate the project's BpeTokenizer with its default arguments; it is
# fitted by the tokenizer.train() call that follows.
tokenizer = BpeTokenizer()

In [None]:
# Fit the tokenizer on the first 2048 training texts only, not the full corpus.
# NOTE(review): max_vocab=2048 presumably caps the learned vocabulary size —
# confirm against scripts.BpeTokenizer.
tokenizer.train(train_texts[:2048], max_vocab=2048)

# Создаем датасеты и Collator

In [None]:
# Wrap both text splits with the same tokenizer and max_length=128 (train first,
# then eval), then build the collator from the tokenizer's pad token id.
train_dataset, eval_dataset = (
    MyDataset(split_texts, tokenizer, max_length=128)
    for split_texts in (train_texts, eval_texts)
)
collator = Collator(tokenizer.pad_token_id)

# Создаем модель

In [None]:
# Seed the PyTorch, NumPy and Python RNGs from a single constant so the three
# libraries cannot end up on different seeds and changing the seed is a
# one-line edit.
SEED = 42

torch.manual_seed(SEED)  # PyTorch default generator
np.random.seed(SEED)     # NumPy legacy global RNG
random.seed(SEED)        # Python stdlib RNG (returns None, so the cell shows no output)

In [None]:
# Build the network: the vocabulary size comes from the tokenizer, the
# remaining hyperparameters are fixed here.
model = Model(
    tokenizer.get_vocab_size(),
    emb_size=128,
    hidden_size=256,
    num_layers=2,
    dropout=0.1,
)

# Создаем Trainer и запускаем обучение

In [None]:
# Wire the model, both datasets and the batch collator into the Trainer.
trainer = Trainer(
    # what to train and on which data
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    collator=collator,
    # batching
    train_batch_size=32,
    eval_batch_size=32,
    # schedule
    n_epochs=8,
    eval_steps=64,
    # optimisation; the pad token id is passed as ignore_index
    # NOTE(review): presumably so padded positions are excluded from the loss —
    # confirm against scripts.Trainer.
    lr=1e-2,
    ignore_index=tokenizer.pad_token_id,
)

In [None]:
# Start training with the settings passed to Trainer.
# NOTE(review): presumably the long-running cell of this notebook — confirm.
trainer.train()

# Оцениваем качество и проверяем жадную и случайную генерацию

In [None]:
# Run the trainer's evaluation; as the cell's last expression, its return value
# (if any) is displayed as the cell output.
# NOTE(review): presumably scored on eval_dataset — confirm against scripts.Trainer.
trainer.evaluate()

In [None]:
# Greedy generation check (per the section heading).
# NOTE(review): temperature=0 is presumably handled by generate() as argmax
# decoding rather than a division by zero — confirm against scripts.generate.
generate(model, tokenizer, temperature=0)

In [None]:
# Random (sampled) generation check with temperature=0.5.
# NOTE(review): top_k=20 presumably restricts sampling to the 20 most likely
# tokens at each step — confirm against scripts.generate.
generate(model, tokenizer, temperature=0.5, top_k=20)