# Обучение dialogpt по Гарри Поттеру с помощью pytorch lightning

In [1]:
import json
import logging
import os
import random
import re
import sys
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

from transformers import (
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so...


  warn(msg)
  warn(msg)


### Загрузка датасета

In [2]:
with open("harry_potter_dataset.json", 'r') as inp:
    dataset = json.load(inp)

### Разбиение датасета на тренировочный и тестовый

In [3]:
train_data, test_data = train_test_split(dataset, test_size=0.1)

In [4]:
if not os.path.isdir('pt-l-checkpoints'):
    os.mkdir('pt-l-checkpoints')

### Аргументы обучения

In [5]:
class Args():
    def __init__(self):
        self.output_dir = 'pt-l-checkpoints'
        self.pretrained_trf = 'microsoft/DialoGPT-small'
        self.device = "cuda"
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.01
        self.adam_epsilon = 1e-8
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 500
        self.save_steps = 1000

In [6]:
args = Args()

### Пользоветельский класс dataset'а

In [7]:
class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, data: List[Tuple[List[str], str]], max_length: int = 512):
        self.examples = data
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        dialogue = self.examples[item]
        flatten = lambda l: [item for sublist in l for item in sublist]
        conv = list([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in dialogue])
        conv = flatten(conv)
        conv = conv[-self.max_length:]
        inputs = torch.tensor(conv, dtype=torch.long)
        return {"input_ids": inputs, "labels": inputs}

### Функция для формирования батча

In [8]:
def collate(examples: List[torch.Tensor]):
    ids = [example['input_ids'] for example in examples]

    if tokenizer._pad_token is None:
        padded = pad_sequence(ids, batch_first=True)
    padded = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    return {"input_ids": padded, "labels": padded}


### Выбор девайса и вычисление размера батча

In [9]:
num_devices = 1

if torch.cuda.is_available():
    args.device = torch.device("cuda")
    num_devices = torch.cuda.device_count()
else:
    args.device = torch.device("cpu")

args.train_batch_size = args.per_gpu_train_batch_size * num_devices
args.eval_batch_size = args.per_gpu_eval_batch_size * num_devices

In [10]:
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_trf)
tokenizer.pad_token_id = 0

### Загрузчики данных для train, eval и test data

In [11]:
class ConversarionDataModule(pl.LightningDataModule):

    def __init__(self, train_data, test_data, tokenizer, max_token_len=512):
        super().__init__()
        self.train_data = train_data
        self.test_data = test_data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = ConversationDataset(
            self.tokenizer,
            self.train_data,
            self.max_token_len
        )

        self.test_dataset = ConversationDataset(
            self.tokenizer,
            self.test_data,
            self.max_token_len
        )

    def train_dataloader(self):
        train_sampler = RandomSampler(self.train_dataset)
        return DataLoader(
            self.train_dataset,
            sampler=train_sampler, 
            batch_size=args.train_batch_size, 
            collate_fn=collate, 
            drop_last = True,
            num_workers=40
        )

    def val_dataloader(self):
        eval_sampler = SequentialSampler(self.test_dataset)
        return DataLoader(
            self.test_dataset,
            sampler=eval_sampler, 
            batch_size=args.eval_batch_size, 
            collate_fn=collate, 
            drop_last = True,
            num_workers=40
        )

    def test_dataloader(self):
        eval_sampler = SequentialSampler(self.test_dataset)
        return DataLoader(
            self.test_dataset,
            sampler=eval_sampler, 
            batch_size=args.eval_batch_size, 
            collate_fn=collate, 
            drop_last = True,
            num_workers=40
        )

In [12]:
data_module = ConversarionDataModule(
    train_data,
    test_data,
    tokenizer,
)

### Класс с моделью и методами, используемыми при обучении

In [13]:
class ConversationModel(pl.LightningModule):

    def __init__(self, model_name, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.model = AutoModelWithLMHead.from_pretrained(model_name)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def forward(self, input_ids, labels):
        output = self.model(input_ids,  return_dict=True, labels=labels)
        return output['loss'], output['logits']

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.training_step_outputs.append(loss)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, labels)
        self.validation_step_outputs.append(loss)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, labels)
        self.test_step_outputs.append(loss)
        return loss

    def on_save_checkpoint(self, checkpoint):
        model_to_save = (
            self.model.module if hasattr(self.model, "module") else self.model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)

    def on_train_epoch_end(self):
        epoch_average = torch.stack(self.training_step_outputs).mean()
        print("----- training_epoch_average:", epoch_average.item(), '-----')
        self.log("training_epoch_average", epoch_average.item())
        self.training_step_outputs.clear()  # free memory
                
    def on_validation_epoch_end(self):
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        print('===== val_loss_avg', epoch_average.item(), '='*5)
        self.log("val_loss_avg", epoch_average.item())
        self.validation_step_outputs.clear()  # free memory
        
    def on_test_epoch_end(self):
        test_loss_avg = torch.stack(self.test_step_outputs).mean()
        print('test_loss_avg', test_loss_avg.item())
        self.test_step_outputs.clear()  # free memory
        return test_loss_avg

    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, 
                          lr=args.learning_rate, 
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=args.warmup_steps, 
            num_training_steps=self.n_training_steps
        )

        return dict(
          optimizer=optimizer,
          lr_scheduler=dict(
            scheduler=scheduler,
            interval='step'
          )
        )

### Инициализация модели

In [14]:
model = ConversationModel(
    args.pretrained_trf,
    n_warmup_steps=args.warmup_steps,
    n_training_steps=len(train_data) // args.train_batch_size * args.num_train_epochs
)



### Настройки сохранения чекпоинтов и процесса обучения

In [15]:
checkpoint_callback = ModelCheckpoint(
  dirpath=args.output_dir,
  save_top_k=2,
  monitor="val_loss_avg",
  mode="min",
  every_n_train_steps=args.save_steps,
)


In [16]:
logger = TensorBoardLogger("pt_l_logs", name="conversation")

In [17]:
trainer = pl.Trainer(
    default_root_dir=args.output_dir,
    logger=logger,
    callbacks=[checkpoint_callback],
    enable_checkpointing=True,
    max_epochs=args.num_train_epochs,
    max_steps=args.max_steps,
    devices=torch.cuda.device_count(),
    log_every_n_steps=args.logging_steps,
    val_check_interval=args.logging_steps,
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Обучение

In [18]:
trainer.fit(model, data_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

===== val_loss_avg 7.254636764526367 =====


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [19]:
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]


Testing: 0it [00:00, ?it/s]

test_loss_avg 3.1944661140441895


[{}]

### Загрузка и запуск обученной модели

In [None]:
trained_model = ConversationModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path,
    model_name = 'microsoft/DialoGPT-small'
)


In [20]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('pt-l-checkpoints') # Let's chat for 3 lines

for step in range(3):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids# generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("Harry Potter Bot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))



>> User:Harry, where is the Chamber of Secrets?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: I don't know.
>> User:Who knows?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: We'll have to wait and see.
>> User:May be Hermione knows?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: !!!!!!!!!!!!Harry!!
