# Контекст

- корпус текста - 'Эмма', Джейн Остин
- модель на основе GPT2

In [38]:
import warnings

from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2TokenizerFast,\
                         GPT2Config,\
                         GPT2LMHeadModel

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
warnings.filterwarnings("ignore")

# Создание, настройка, обучение и сохранение токенизатора

In [17]:
# настройки токенизатора
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([Lowercase()])
tokenizer.pre_tokenizer = ByteLevel() # байты в качестве входных данных
tokenizer.decoder = ByteLevelDecoder()

# обучение токенизатора
trainer = BpeTrainer(
    vocab_size = 50000,
    inital_alphabet=ByteLevel.alphabet(),
    special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"]
    )

tokenizer.train(["austen-emma.txt"], trainer)


# сохранение токенизатора в папке tokenizer_gpt
tokenizer.save('tokenizer_gpt/tokenizer.json')

# Добавление спец токенов к токенизатору

In [18]:
# скачивание токенизатора под GPT2 архитектуру
tokenizer_gpt = GPT2TokenizerFast.from_pretrained('tokenizer_gpt')
len(tokenizer_gpt.vocab)

11751

In [19]:
# добавление специальных токенов
tokenizer_gpt.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

print('Токен начала последовательности: ', tokenizer_gpt.bos_token_id)
print('Токен конца последовательности: ', tokenizer_gpt.eos_token_id)
tokenizer_gpt.encode('<s> hi, what is your name </s>')

Токен начала последовательности:  0
Токен конца последовательности:  2


[0, 95, 37, 11, 264, 157, 312, 1143, 56, 2]

# Конфигурация модели

In [20]:
config = GPT2Config(
    vocab_size = tokenizer_gpt.vocab_size,
    bos_token_id = tokenizer_gpt.bos_token_id,
    eos_token_id = tokenizer_gpt.eos_token_id
)

model = GPT2LMHeadModel(config)

In [21]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 11750
}

# Подготовка корпуса к предварительному обучению

In [22]:
with open('austen-emma.txt', 'r', encoding = 'utf-8') as file:
    content = file.readlines()

In [23]:
# удаление всех символов \n и оставляем только строки длиной больше 10 для обучения модели на длинных строках,
# чтобы она могла генерировать более длинные выходные последовательности

content_new = []
for stroka in content:
    stroka = stroka.replace('\n', '')
    if len(stroka) > 10:
        content_new.append(stroka.strip())

content_new = ' '.join(content_new) + tokenizer_gpt.eos_token

In [24]:
# сделаем одну длинную последовательность идентификаторов токенов
tokenized_content = tokenizer.encode(content_new)
tokenized_content

Encoding(num_tokens=195130, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

# Возьмем последовательности токенов длиной 100 (нарастающим эффектом)

- 0:100, 1:101, 2:102 и т.д. (100 выборок длиной 100 для ограничения времени обучения модели)
- также разобъем каждый образец текста на x и target (x = 'Идет бычок', target = 'бычок качается')

In [25]:
examples = []
for i in range(0, 100):
    examples.append(tokenized_content.ids[i:i+100])

In [26]:
train_data = []
labels = []

for example in examples:
    train_data.append(example[:-1])
    labels.append(example[1:])

# Dataset

In [27]:
class GPT_2Dataset(Dataset):
    def __init__(self, x, target):
        self.x = x
        self.target = target

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return torch.tensor(self.x[idx]), torch.tensor(self.target[idx])
    

GPT_2Dataset(train_data, labels)[0]

(tensor([ 6270,  1684,   236,   390, 11379, 10870,    26,   190,   371,    11,
          1426,    11,  1739,    11,    83,  1707,    11,   148,    59,  1561,
           619,    83,   656,  1911,    11,   596,    74,  5526,   392,    84,
            71,   829,  4100,    84,  3840,    23,    83,   141,  1845,  2559,
          2572,    12,   329,  1137,    97,    71,   866,   148,   167,   337,
            74,  1771,   254,  2718,   105,    13,   120,   116,    71,  4873,
            84,    71,   515,  3746,    84,    59,   429,  3296,    11,  7087,
           476,    23,    83,   141,    11,    97,  1824,    84,   105,  1327,
           176,  1559,    11,   204,  2805,    84,   166,   737,   242,    59,
           167,  1397,  2182,    13,    56,   105,   939,   141,  4723]),
 tensor([ 1684,   236,   390, 11379, 10870,    26,   190,   371,    11,  1426,
            11,  1739,    11,    83,  1707,    11,   148,    59,  1561,   619,
            83,   656,  1911,    11,   596,    74,  5526,

# Dataloader

In [28]:
torch.manual_seed(42)

train_dataloader = DataLoader(
    dataset = GPT_2Dataset(train_data, labels),\
    batch_size = 16,
    shuffle = True,
    drop_last = True
)


train_dataloader.dataset[0]

(tensor([ 6270,  1684,   236,   390, 11379, 10870,    26,   190,   371,    11,
          1426,    11,  1739,    11,    83,  1707,    11,   148,    59,  1561,
           619,    83,   656,  1911,    11,   596,    74,  5526,   392,    84,
            71,   829,  4100,    84,  3840,    23,    83,   141,  1845,  2559,
          2572,    12,   329,  1137,    97,    71,   866,   148,   167,   337,
            74,  1771,   254,  2718,   105,    13,   120,   116,    71,  4873,
            84,    71,   515,  3746,    84,    59,   429,  3296,    11,  7087,
           476,    23,    83,   141,    11,    97,  1824,    84,   105,  1327,
           176,  1559,    11,   204,  2805,    84,   166,   737,   242,    59,
           167,  1397,  2182,    13,    56,   105,   939,   141,  4723]),
 tensor([ 1684,   236,   390, 11379, 10870,    26,   190,   371,    11,  1426,
            11,  1739,    11,    83,  1707,    11,   148,    59,  1561,   619,
            83,   656,  1911,    11,   596,    74,  5526,

# Training loop

In [30]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

model = model.to(device)


for epoch in range(300):

    total = 0.0
    correct_predict = 0.0
    total_loss = 0.0
    batches_in_dataloader = 0

    for batch in train_dataloader:

        model.train()
        
        input_ids = batch[0].to(device)
        targets = batch[1].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids)
        logits = outputs[0]

        batch_size, len_sequence, vocab_size = logits.size()

        loss = loss_fn(
            logits.view(-1, logits.size(2)),
            targets.view(-1))
        
        
        predict_tokens = torch.argmax(logits, dim = -1)

        correct_predict += (predict_tokens == targets).sum().item()
        total += targets.numel()
        total_loss += loss.item()
        batches_in_dataloader += 1
    
    accuracy = correct_predict / total
    epoch_loss = total_loss / batches_in_dataloader
    if (epoch + 1 == 1) or ((epoch+1) % 20 == 0):
        print("Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}".\
            format(epoch+1, epoch_loss, accuracy))

    loss.backward()
    optimizer.step()

        

Epoch: 1, Loss: 9.4835, Accuracy: 0.0000
Epoch: 20, Loss: 2.1110, Accuracy: 0.6805
Epoch: 40, Loss: 0.8857, Accuracy: 0.8621
Epoch: 60, Loss: 0.3823, Accuracy: 0.9477
Epoch: 80, Loss: 0.1917, Accuracy: 0.9721
Epoch: 100, Loss: 0.1084, Accuracy: 0.9874
Epoch: 120, Loss: 0.0699, Accuracy: 0.9921
Epoch: 140, Loss: 0.0586, Accuracy: 0.9913
Epoch: 160, Loss: 0.0396, Accuracy: 0.9958
Epoch: 180, Loss: 0.0302, Accuracy: 0.9961
Epoch: 200, Loss: 0.0265, Accuracy: 0.9960
Epoch: 220, Loss: 0.0239, Accuracy: 0.9966
Epoch: 240, Loss: 0.0208, Accuracy: 0.9965
Epoch: 260, Loss: 0.0202, Accuracy: 0.9968
Epoch: 280, Loss: 0.0179, Accuracy: 0.9968
Epoch: 300, Loss: 0.0171, Accuracy: 0.9968


# Генерация текста

In [39]:
def generate(
        start,
        model,
        max_length
        ):

    model = model.cpu()
    input_token_ids = tokenizer_gpt.encode(start, return_tensors='pt')

    
    output = model.generate(
                        input_token_ids,
                        max_length = max_length,
                        num_beams = 5,
                        temperature = 0.7,
                        no_repeat_ngram_size=2,
                        num_return_sequences=1
                        )
    return tokenizer_gpt.decode(output[0])


generate(' ', model, 500)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"  her mother had died too long ago for her to have more than an indistinct remembrance of her caresses; and her place had been supplied by an excellent woman as governess, who had fallen little short of a mother in affection. sixteen years had miss taylor been in mr. woodhouse's family, less as a governess than a friend, very fond of both daughters, but particularly of emma.  between _them_ it was more the intimacy of sisters. even before ceased to hold the nominal office of governess had lived nearly twenty-one years in the world with very little to distress or vex her sister's marriage, with a comfortable home and happy disposition, indulgent father; clever, been mistress of his house from a most affectionate,  even a very early period. she was the and had ceased governess to unite some of the youngest of nominal a the two daughters consequence of. had had, her governess rich, nominal little of existence; 1816] her the closing been had home had nearly of before miss in consequence h

In [40]:
generate('"wetson was very good', model, 30)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'"wetson was very good little to distress or vex her. she was the youngest of the two daughters of a most affectionate, indulgent father;'

# Сохранение - загрузка модели

In [41]:
# сохранение
model.save_pretrained('my_gpt2/')

# загрузка
# model_reloaded = TFGPT2LMHeadModel.from_pretrained("my_gpt-2/")

In [42]:
tokenizer_gpt.save_pretrained('tokenizer_gpt/')

('tokenizer_gpt/tokenizer_config.json',
 'tokenizer_gpt/special_tokens_map.json',
 'tokenizer_gpt/vocab.json',
 'tokenizer_gpt/merges.txt',
 'tokenizer_gpt/added_tokens.json',
 'tokenizer_gpt/tokenizer.json')