# Обучение DialoGPT по Гарри Поттеру с помощью PyTorch

In [1]:
import json
import logging
import os
import random
import re
import sys
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

# Module-level logger. basicConfig routes records of level INFO and above to
# stdout, each line prefixed with the record's timestamp.
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s %(message)s")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so...


  warn(msg)
  warn(msg)


### Загрузка датасета

In [2]:
# Load the dialogue dataset from disk. The encoding is passed explicitly so the
# text is decoded as UTF-8 (the encoding JSON files are expected to use) instead
# of whatever the platform's locale default happens to be.
with open("harry_potter_dataset.json", "r", encoding="utf-8") as inp:
    dataset = json.load(inp)

### Разбиение датасета на тренировочный и тестовый

In [3]:
# Hold out 10% of the examples for evaluation. No random_state is passed and no
# seed is set anywhere above, so the split is not pinned to a particular seed.
train_data, test_data = train_test_split(dataset, test_size=0.1)

In [4]:
# Make sure the checkpoint directory exists. exist_ok=True turns this into a
# no-op when the directory is already there and avoids the check-then-create
# race of a separate isdir() test followed by mkdir().
os.makedirs('pt-checkpoints', exist_ok=True)

### Аргументы обучения

In [5]:
class Args:
    """Bag of settings for the fine-tuning run.

    Every setting ends up as a plain instance attribute, so it can be read as
    ``args.learning_rate`` and new attributes can be attached later.
    """

    def __init__(self):
        settings = {
            # Directory that receives checkpoints and the final model.
            "output_dir": 'pt-checkpoints',
            "model_type": 'gpt2',
            # Name of the pretrained checkpoint used for tokenizer and model.
            "pretrained_trf": 'microsoft/DialoGPT-small',
            "cache_dir": 'cached',
            "do_train": True,
            "do_eval": True,
            # Initial value only; a later cell overwrites it with a torch.device.
            "device": "cuda",
            # Batch sizes per device; the effective sizes are computed later.
            "per_gpu_train_batch_size": 4,
            "per_gpu_eval_batch_size": 4,
            # Number of micro-batches accumulated before one optimizer update.
            "gradient_accumulation_steps": 1,
            # Optimizer hyper-parameters.
            "learning_rate": 2e-5,
            "weight_decay": 0.01,
            "adam_epsilon": 1e-5,
            # Gradient-norm clipping threshold.
            "max_grad_norm": 1.0,
            "num_train_epochs": 3,
            # Values <= 0 disable the cap on the number of optimizer updates.
            "max_steps": -1,
            "warmup_steps": 0,
            # Evaluate every `logging_steps` updates, checkpoint every `save_steps`.
            "logging_steps": 1000,
            "save_steps": 1500,
        }
        for name, value in settings.items():
            setattr(self, name, value)

### Пользовательский класс набора данных

In [6]:
class ConversationDataset(Dataset):
    """Dataset that turns each conversation into one 1-D tensor of token ids.

    Every utterance of a conversation is encoded with the tokenizer and followed
    by the tokenizer's EOS id; the pieces are concatenated and only the last
    ``max_length`` ids are kept.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", data: List[Tuple[List[str], str]], max_length: int = 512):
        """Store the tokenizer, the raw conversations and the length limit.

        Args:
            tokenizer: Object providing ``encode(text)`` and ``eos_token_id``.
            data: Indexable collection of conversations. ``__getitem__`` iterates
                each conversation as a sequence of utterance strings.
                NOTE(review): the annotation suggests (context, reply) tuples;
                confirm against the dataset file.
            max_length: Maximum number of token ids kept per example (the most
                recent ones are kept).
        """
        # Keep the tokenizer on the instance so item access does not depend on
        # a module-level ``tokenizer`` variable existing at call time.
        self.tokenizer = tokenizer
        self.examples = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a ``torch.long`` tensor of token ids."""
        eos_id = self.tokenizer.eos_token_id
        token_ids = []
        for utterance in self.examples[item]:
            token_ids.extend(self.tokenizer.encode(utterance))
            token_ids.append(eos_id)
        # Truncate from the left: the end of the conversation is what is kept.
        return torch.tensor(token_ids[-self.max_length:], dtype=torch.long)

### Функция для формирования батча

In [7]:
def collate(examples: List[torch.Tensor]):
    """Pad a list of token-id tensors to a common length and stack them.

    Reads the module-level ``tokenizer`` to pick the padding value: its
    ``pad_token_id`` when one is set, otherwise ``pad_sequence``'s default.

    Args:
        examples: Tensors to batch (expected to be 1-D id sequences as produced
            by the dataset — TODO confirm for other callers).

    Returns:
        The padded batch, with the batch dimension first.
    """
    # Use the public ``pad_token_id`` accessor rather than the private
    # ``_pad_token`` attribute, which is an implementation detail of the
    # tokenizer class.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=pad_id)

In [8]:
# Single settings object shared by the cells below; later cells also attach
# derived values (device, effective batch sizes) to it.
args = Args()

### Выбор девайса и вычисление размера батча

In [9]:
# Pick CUDA when it is available (counting every visible device), otherwise
# fall back to the CPU and treat it as a single device.
cuda_ready = torch.cuda.is_available()
args.device = torch.device("cuda" if cuda_ready else "cpu")
num_devices = torch.cuda.device_count() if cuda_ready else 1

# Effective batch sizes are the per-device sizes multiplied by the device count.
args.train_batch_size = num_devices * args.per_gpu_train_batch_size
args.eval_batch_size = num_devices * args.per_gpu_eval_batch_size

In [10]:
# Load the tokenizer of the pretrained checkpoint named in args.pretrained_trf.
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_trf)

In [11]:
# Wrap the raw train/test splits; items are tokenized when they are accessed.
train_dataset = ConversationDataset(tokenizer, train_data)
test_dataset = ConversationDataset(tokenizer, test_data)

### Загрузчики данных для train и eval data

In [12]:
# Training batches are drawn in random order; an incomplete final batch is
# dropped so that every training batch has the same number of examples.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
    collate_fn=collate,
    drop_last=True,
)

In [13]:
# Evaluation batches are read in order and no example is dropped: with
# drop_last=True up to batch_size - 1 held-out examples would be ignored, and a
# test split smaller than one batch would produce no batches at all.
eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
    test_dataset,
    sampler=eval_sampler,
    batch_size=args.eval_batch_size,
    collate_fn=collate,
    drop_last=False,
)

### Вычисление общего количества training steps 

In [14]:
# Number of optimizer updates the scheduler should plan for: one update per
# `gradient_accumulation_steps` batches, repeated for every epoch.
updates_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
t_total = updates_per_epoch * args.num_train_epochs
if args.max_steps > 0:
    # A positive max_steps cuts training short, so the learning-rate schedule
    # must decay over that shorter horizon rather than over all epochs.
    t_total = min(t_total, args.max_steps)
t_total

1656

### Инициализация модели

In [15]:
# Load the pretrained causal language model and move it to the selected device.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead entry point.
model = AutoModelForCausalLM.from_pretrained(args.pretrained_trf)
model.to(args.device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Подготовка optimizer и scheduler

In [16]:
# Name fragments of parameters that must not receive weight decay: biases and
# layer-norm weights. The GPT-2 architecture names its layer norms ln_1 / ln_2 /
# ln_f, so "LayerNorm.weight" alone would never match them.
no_decay = ["bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"]

# Split the parameters into the two groups in a single pass.
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(fragment in param_name for fragment in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": args.weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]

# torch.optim.AdamW is the maintained replacement for the deprecated
# transformers.AdamW implementation.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

# Linear warm-up for `warmup_steps` updates, then linear decay over `t_total`.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)



In [17]:
# With more than one device available, wrap the model in DataParallel; the
# original model then stays reachable as `model.module`.
if num_devices > 1:
    model = torch.nn.DataParallel(model)

### Функция оценки модели

In [18]:
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, eval_dataloader: DataLoader) -> Dict:
    """Compute the language-model perplexity over ``eval_dataloader``.

    The model is switched to eval mode and is left in that mode on return. The
    loss of every batch is averaged with equal weight per batch (not per token)
    and exponentiated.

    Args:
        args: Settings object; only ``args.device`` is read.
        model: Model called as ``model(inputs, labels=labels)`` whose first
            output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        eval_dataloader: Yields batches of token-id tensors. Each batch serves
            as both input and labels.

    Returns:
        ``{"perplexity": tensor}``. The value is NaN when the dataloader yields
        no batches.
    """
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0

    # Gradients are never needed here, so the whole loop runs under no_grad.
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs = batch.to(args.device)
            # The same tensor is used as input and as labels.
            outputs = model(inputs, labels=inputs)
            # mean() collapses a per-device loss vector to a scalar.
            eval_loss += outputs[0].mean().item()
            nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Nothing was evaluated; report NaN instead of dividing by zero.
        return {"perplexity": torch.tensor(float("nan"))}

    perplexity = torch.exp(torch.tensor(eval_loss / nb_eval_steps))
    return {"perplexity": perplexity}

### Функция со всеми этапами обучения

In [19]:
def _unwrap_model(model):
    """Return ``model.module`` when the model is wrapped, else the model itself."""
    return model.module if hasattr(model, "module") else model


def _save_checkpoint(args, model, tokenizer, optimizer, scheduler, global_step: int) -> None:
    """Write model, tokenizer, optimizer and scheduler state to ``<output_dir>/checkpoint-<step>``."""
    output_dir = os.path.join(args.output_dir, "{}-{}".format("checkpoint", global_step))
    os.makedirs(output_dir, exist_ok=True)
    _unwrap_model(model).save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info("Saving model checkpoint to %s", output_dir)

    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
    logger.info("Saving optimizer and scheduler states to %s", output_dir)


def train_and_eval(args, train_dataloader: DataLoader, eval_dataloader: DataLoader, model: PreTrainedModel,
                   tokenizer: PreTrainedTokenizer, optimizer, scheduler) -> None:
    """Fine-tune ``model`` and periodically evaluate and checkpoint it.

    Runs ``args.num_train_epochs`` passes over ``train_dataloader``, using each
    batch as both input and labels. Gradients are accumulated over
    ``args.gradient_accumulation_steps`` batches, clipped to
    ``args.max_grad_norm``, and applied; the scheduler advances once per update.
    Every ``args.logging_steps`` updates the model is evaluated and the result
    logged; every ``args.save_steps`` updates a checkpoint is written. Training
    stops after exactly ``args.max_steps`` updates when that value is positive.
    At the end the model is saved to ``args.output_dir``.

    Args:
        args: Settings object (device, accumulation, clipping, logging/saving
            intervals, epoch and step limits, output directory).
        train_dataloader: Yields training batches of token-id tensors.
        eval_dataloader: Passed through to ``evaluate``.
        model: Model to train; may be wrapped (the wrapped module is saved).
        tokenizer: Saved next to each intermediate checkpoint.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler stepped after every optimizer update.
    """
    global_step = 0
    tr_loss = 0.0  # running sum of the (accumulation-scaled) batch losses
    stop_training = False
    model.zero_grad()

    for _ in range(args.num_train_epochs):
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            inputs = batch.to(args.device)
            # evaluate() leaves the model in eval mode, so training mode is
            # re-enabled before every forward pass.
            model.train()
            outputs = model(inputs, labels=inputs)

            # mean() averages the per-device losses under multi-GPU training
            # and is a no-op on an already scalar loss.
            loss = outputs[0].mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    results = evaluate(args, model, tokenizer, eval_dataloader)
                    logger.info(f"step {step} tr_loss {tr_loss} results {results}")

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    _save_checkpoint(args, model, tokenizer, optimizer, scheduler, global_step)

            # ">=" stops after exactly max_steps updates; ">" would allow one
            # update more than requested.
            if args.max_steps > 0 and global_step >= args.max_steps:
                stop_training = True
                break
        if stop_training:
            break

    # Persist the final weights (of the unwrapped model) to the output directory.
    _unwrap_model(model).save_pretrained(args.output_dir)

### Обучение

In [20]:
# Run the fine-tuning loop; the final model is written to args.output_dir.
train_and_eval(args, train_dataloader, eval_dataloader, model, tokenizer, optimizer, scheduler)

Iteration:   0%|          | 0/552 [00:00<?, ?it/s]



KeyboardInterrupt: 

### Загрузка и запуск обученной модели

In [21]:
# The tokenizer comes from the base checkpoint; the fine-tuned weights are read
# from the training output directory. AutoModelForCausalLM replaces the
# deprecated AutoModelWithLMHead entry point.
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('pt-checkpoints')

# Let's chat for 3 lines
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input to the chat history (there is none on the first turn)
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generate a response; max_length caps history + response at 200 tokens in
    # total, so a long history leaves little or no room for the reply
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8,
    )

    # pretty print only the newly generated tokens (everything after the prompt)
    print("Harry Potter Bot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Harry, where is the Chamber of Secrets?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: It's a little bit of a secret.
>> User:What is in the Chamber of Secrets?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: What do you mean?
>> User:Do you know what is inside the Chamber of Secrets?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: !!!?!!
