# Обучение DialoGPT по Гарри Поттеру с помощью transformers

In [1]:
import json
import logging
import os
import random
import re
import sys
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
)

# Send INFO-and-above log records to stdout, each prefixed with a timestamp.
logging.basicConfig(
    format="%(asctime)s %(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /cephfs/local/cuda-11.3/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /archive/evseev/envllm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so...


  warn(msg)
  warn(msg)


### Загрузка датасета

In [2]:
# Load the dialogue corpus. The encoding is pinned to UTF-8 so that parsing
# does not depend on the platform's default locale encoding.
with open("harry_potter_dataset.json", "r", encoding="utf-8") as inp:
    dataset = json.load(inp)

### Разбиение датасета на тренировочный и тестовый

In [3]:
# Fixed seed for the split so the same examples land in train/test on every
# run; without it the evaluation numbers are not comparable between runs.
SPLIT_SEED = 42
# Hold out 10% of the dialogues for evaluation.
train_data, test_data = train_test_split(dataset, test_size=0.1, random_state=SPLIT_SEED)

In [4]:
# Make sure the checkpoint directory exists. exist_ok=True keeps the cell
# idempotent without a separate check-then-create step.
os.makedirs("tr-checkpoints", exist_ok=True)

### Пользовательский класс dataset'а

In [5]:
class ConversationDataset(Dataset):
    """Map-style dataset that turns each dialogue into one token sequence.

    Each element of a dialogue is tokenized and followed by the tokenizer's
    EOS id; the pieces are concatenated in order and only the last
    ``max_length`` token ids are kept. Tokenization happens lazily in
    ``__getitem__``.

    NOTE(review): the ``data`` annotation says ``List[Tuple[List[str], str]]``,
    but ``__getitem__`` passes every element of an example straight to
    ``tokenizer.encode`` -- confirm the real schema of the dataset.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", data: List[Tuple[List[str], str]], max_length: int = 512):
        """Store the tokenizer, the raw dialogues and the length limit.

        Args:
            tokenizer: object providing ``encode(text)`` and ``eos_token_id``.
            data: sequence of dialogues (see the class-level note on schema).
            max_length: maximum number of token ids kept per example.
        """
        # Keep our own reference so __getitem__ does not depend on a
        # notebook-global variable named `tokenizer`.
        self.tokenizer = tokenizer
        self.examples = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of dialogues."""
        return len(self.examples)

    def __getitem__(self, item):
        """Tokenize dialogue ``item`` and return ``input_ids`` and ``labels``.

        Returns:
            dict with two ``torch.long`` tensors holding the same token ids.
        """
        tok = self.tokenizer
        # Flatten "utterance tokens + EOS" for every utterance into one list.
        token_ids = [
            token_id
            for utterance in self.examples[item]
            for token_id in tok.encode(utterance) + [tok.eos_token_id]
        ]
        # Truncate from the left: the most recent context is kept.
        token_ids = token_ids[-self.max_length:]
        input_ids = torch.tensor(token_ids, dtype=torch.long)
        # labels is a separate copy so that in-place edits to one tensor
        # cannot silently change the other.
        return {"input_ids": input_ids, "labels": input_ids.clone()}

### Инициализация токенизатора и модели

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# A pad id is assigned so that batches can be padded.
# NOTE(review): id 0 is an ordinary vocabulary id, not a dedicated pad token --
# confirm that padded positions are always masked out during training.
tokenizer.pad_token_id = 0
# AutoModelForCausalLM is the supported replacement for the deprecated
# AutoModelWithLMHead.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
# Wrap both splits in ConversationDataset, using the default max_length.
train_dataset, test_dataset = (
    ConversationDataset(tokenizer, split) for split in (train_data, test_data)
)

In [8]:
# Batch collator: padding is enabled, lengths are rounded up to a multiple
# of 8, and PyTorch tensors are returned.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

### Аргументы обучения

In [9]:
training_args = TrainingArguments(
    # Where checkpoints are written, and how often (once per epoch).
    output_dir="./tr-checkpoints",
    save_strategy='epoch',
    # Optimisation hyperparameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Log and evaluate every 100 steps.
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
)

### Инициализация тренера

In [10]:
# Bundle the model, the hyperparameters, both dataset splits, the collator
# and the tokenizer into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

### Запуск тренировки и сохранение весов

In [11]:
# Run fine-tuning with the Trainer built above. The recorded output of this
# cell ends in KeyboardInterrupt, i.e. the saved run was stopped by hand.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

### Сохранение обученной модели

In [None]:
# Save the fine-tuned weights together with the tokenizer files, so the
# checkpoint directory is self-contained and can be loaded on its own.
model.save_pretrained("./tr-checkpoints")
tokenizer.save_pretrained("./tr-checkpoints")

### Загрузка и запуск обученной модели

In [12]:
# Number of user turns in the demo conversation.
NUM_TURNS = 3

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
# AutoModelForCausalLM is the supported replacement for the deprecated
# AutoModelWithLMHead.
model = AutoModelForCausalLM.from_pretrained('tr-checkpoints')

# Token ids of the whole conversation so far; None until the bot has replied once.
chat_history_ids = None

for step in range(NUM_TURNS):
    # Encode the new user input with a trailing EOS token, as a PyTorch tensor.
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # Append the new user input to the chat history (the first turn has none yet).
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Generate a reply. max_length=200 caps the total sequence (history plus reply).
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8,
    )

    # Print only the tokens generated after the prompt, without special tokens.
    print("Harry Potter Bot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Harry, where is the Chamber of Secrets?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: There is no such thing as a secret Chamber of secrets.
>> User:Where is Hermione?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: She's in the Chamber.
>> User:What is she doing there?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Harry Potter Bot: That's not true, I'm just telling you.
