In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
! pip install transformers==2.11.0

Collecting transformers==2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |▌                               | 10kB 18.4MB/s eta 0:00:01[K     |█                               | 20kB 16.4MB/s eta 0:00:01[K     |█▌                              | 30kB 13.7MB/s eta 0:00:01[K     |██                              | 40kB 12.3MB/s eta 0:00:01[K     |██▍                             | 51kB 8.4MB/s eta 0:00:01[K     |███                             | 61kB 8.6MB/s eta 0:00:01[K     |███▍                            | 71kB 9.0MB/s eta 0:00:01[K     |███▉                            | 81kB 10.0MB/s eta 0:00:01[K     |████▍                           | 92kB 8.9MB/s eta 0:00:01[K     |████▉                           | 102kB 8.1MB/s eta 0:00:01[K     |█████▍                          | 112kB 8.1MB/s eta 0:00:01[K     |█████▉                         

# **Загрузка библиотек**

In [3]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange
from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


# **Модели GPT без тонкой настройки**

In [4]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=641.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=351265583.0, style=ProgressStyle(descri…




# **Чат на GPT без тонкой настройки**

In [5]:
# пообщаемся например на 5 вопросов
for step in range(5):
    # encode ввода нового пользователя, добавляем eos_token и возвращаем tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> Black:") + tokenizer.eos_token, return_tensors='pt')

    # добавляем новые токены введенные пользователем в историю чата
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # генерируем ответ, ограничев в истории до 1000 tokens    
    chat_history_ids = model.generate(
    bot_input_ids, max_length=1000,
    pad_token_id=tokenizer.eos_token_id
    )

    
    print("Rick: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> Black:hi
Rick: Hi
>> Black:what is you name?
Rick: I'm in
>> Black:I'm Nickolas
Rick: I'm Nick
>> Black:what can you do?
Rick: I'm Nick
>> Black:what can you do?
Rick: I'm Nick


# **Модель GPT с тонкой настройкой**

# **Проведем тонкую настройку для новых данных**

In [6]:
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

## настройки модели

In [7]:
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

# **Загрузим данные**

In [8]:
from google.colab import files
file = files.upload()

Saving RickAndMortyScripts.csv to RickAndMortyScripts.csv


## **посмотрим на данные**

In [9]:
all_rick = pd.read_csv('RickAndMortyScripts.csv')
all_rick.head(10)

Unnamed: 0,index,season no.,episode no.,episode name,name,line
0,0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."
5,5,1,1,Pilot,Morty,Ow! Ow! You're tugging me too hard!
6,6,1,1,Pilot,Rick,"We gotta go, gotta get outta here, come on. Go..."
7,7,1,1,Pilot,Rick,"What do you think of this... flying vehicle, M..."
8,8,1,1,Pilot,Morty,"Yeah, Rick... I-it's great. Is this the surprise?"
9,9,1,1,Pilot,Rick,Morty. I had to... I had to do it. I had— I ha...


## **подготавливаем данные**

преобразуем этот набор данных таким образом, чтобы каждая строка ответа содержала n предыдущих ответов в качестве контекста, используем семь предыдущих ответов.

In [10]:
contexted = []

n = 7

for i in range(n, len(all_rick['line'])):
  row = []
  prev = i - 1 - n # вычитаем 1, поэтому строка будет содержать текущий ответ и 7 предыдущих ответов 
  for j in range(i, prev, -1):
    row.append(all_rick['line'][j])
  contexted.append(row)  

In [11]:
len(contexted)

1898

In [12]:
columns = ['response', 'context'] 
columns = columns + ['context/'+str(i) for i in range(n-1)]
columns

['response',
 'context',
 'context/0',
 'context/1',
 'context/2',
 'context/3',
 'context/4',
 'context/5']

## **посмотрим что получилось**

In [13]:
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head(5)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,"What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?",Morty! You gotta come on. Jus'... you gotta co...
1,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?"
2,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty."
3,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...
4,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h..."


## **Разделите наш набор данных на обучающую и тестовую часть.**

In [14]:
trn_df, val_df = train_test_split(df, test_size = 0.1)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
381,"Wow, you know what? I mean, it looks like we c...",Worst-case scenario we're back to running.,"Hey, you know what? You got a really good poin...","Yeah, well, since when are we taking this guy'...",But that's the opposite of what-,"Hold on, Morty. Y-you know what? He keeps sayi...","You can run, but you can't hide, bitch!","Man, he sure says ""bitch"" a lot!"
280,"No, you didn't.",Make it bounce.,"All right, Morty, time to make our move.","Oh, I think you've had enough, sir.",I'll take two.,Wheat thins. Wheat thins.,"It's about to get a whole lot weirder, Morty.","Wow, Rick, I can't believe we're sitting aroun..."
1572,He's right. This is far from over.,"Rick, whoever did this is an even bigger threa...","All right. Short mission, good mission. Rememb...","Million Ants, ladies and gentleman! The ant co...",I sense his life force is fading.,It's Worldender! What happened to him?,What the FUCK?!,"Ooh, real scared. Real fucking on alert, high ..."
446,"I'm gonna miss you, snowball.",Taking over the human's world will lead to not...,We are not them! We are not them.,"To hell with my kingdom, bean counter. I would...",Anything. Anything for my precious Morty.,"It's necessary for the plan, Morty. Don't even...",What?!,Close. It's gonna make your kidneys shut down.
430,What?,"No, no, no, I was just playing dead. Good news...","Mmm. Thank you, Fido. Rick! I thought you were...",Begin phase two.,"Th-thanks, snuffles.",Bring the boy to me. You were always kind to m...,"Ooh, great plan, Jerry.",Bad person. Bad.


# **Подготовим данные для модели**

объединяем ответы в одну строку для каждой строки (дополнительно добавим специальный токен конца строки между ответами, чтобы модель понимала конец каждого ответа в строке).

In [15]:
def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

# **Контрольные точки для модели**

In [16]:
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)

In [17]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

In [18]:
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted

## **Проверяем нужно ли удалять старые контрольные точки**

In [19]:
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return


    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

# **Модуль оценки модели**

In [20]:
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:

    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # DistributedSampler выбираем случайным образом

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    print('Итоговая LOSS {:.4f}'.format(eval_loss))
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

# **Модуль тренировки**

In [21]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Позаботьтесь о распределенном / параллельном обучении
    model.resize_token_embeddings(len(tokenizer))


    # Подготовить оптимизатор и расписание (линейный прогрев и спад)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Проверьте, существуют ли сохраненные состояния оптимизатора или планировщика
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Загрузить в состояниях оптимизатора и планировщика
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # обучение с несколькими графическими процессорами (должно быть после инициализации apex fp16)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Распределенное обучение (должно быть после инициализации apex fp16)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )


    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Проверьте, продолжается ли обучение с контрольной точки
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # установите global_step на gobal_step последней сохраненной контрольной точки из пути к модели
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Пропустите любые уже обученные шаги, если продолжите тренировку
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # выходы модели всегда являются кортежами в трансформаторах (см. документ)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean () к среднему при параллельном обучении с несколькими графическими процессорами
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            print("Итерация: {};  LOSS: {:.4f}".format(step, loss.item()))

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Обновить расписание скорости обучения
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Оценивайте только при использовании одного графического процессора, иначе показатели могут не усредниться
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss
                    

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Позаботьтесь о распределенном / параллельном обучении
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step


# **Модуль обучения**

In [22]:
def main(df_trn, df_val):
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Настройка CUDA, GPU и распределенного обучения
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device


    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Рекомендации по сохранению: если вы используете save_pretrained для модели и токенизатора, вы можете перезагрузить их с помощью from_pretrained ()
    if args.do_train:
        # При необходимости создайте выходной каталог
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Сохраните обученную модель, конфигурацию и токенизатор с помощью save_pretrained ().
        # Затем их можно перезагрузить с помощью from_pretrained ().
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Позаботьтесь о распределенном / параллельном обучении
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Хорошая практика: сохраните свои обучающие аргументы вместе с обученной моделью
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Загрузите обученную модель и словарь, который вы точно настроили
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

# **Обучаем модель**

In [23]:
print("Начало обучения!")
GPT = main(trn_df, val_df)

Начало обучения!


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=641.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=351265583.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=427.0, style=ProgressStyle(description_wi…

Итерация: 0;  LOSS: 8.5412
Итерация: 1;  LOSS: 7.7641
Итерация: 2;  LOSS: 5.7720
Итерация: 3;  LOSS: 6.7122
Итерация: 4;  LOSS: 5.7744
Итерация: 5;  LOSS: 5.1474
Итерация: 6;  LOSS: 4.7060
Итерация: 7;  LOSS: 4.4662
Итерация: 8;  LOSS: 4.1914
Итерация: 9;  LOSS: 4.9712
Итерация: 10;  LOSS: 4.6570
Итерация: 11;  LOSS: 4.7368
Итерация: 12;  LOSS: 4.4883
Итерация: 13;  LOSS: 3.1086
Итерация: 14;  LOSS: 4.6998
Итерация: 15;  LOSS: 4.6735
Итерация: 16;  LOSS: 5.0073
Итерация: 17;  LOSS: 4.2379
Итерация: 18;  LOSS: 3.8829
Итерация: 19;  LOSS: 2.9926
Итерация: 20;  LOSS: 5.0642
Итерация: 21;  LOSS: 3.8646
Итерация: 22;  LOSS: 2.8460
Итерация: 23;  LOSS: 4.2199
Итерация: 24;  LOSS: 2.7651
Итерация: 25;  LOSS: 3.9326
Итерация: 26;  LOSS: 3.5037
Итерация: 27;  LOSS: 3.9854
Итерация: 28;  LOSS: 4.9467
Итерация: 29;  LOSS: 3.8337
Итерация: 30;  LOSS: 4.4723
Итерация: 31;  LOSS: 2.9218
Итерация: 32;  LOSS: 3.9368
Итерация: 33;  LOSS: 2.9206
Итерация: 34;  LOSS: 3.5266
Итерация: 35;  LOSS: 3.3138
Ит

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=427.0, style=ProgressStyle(description_wi…

Итерация: 0;  LOSS: 2.6421
Итерация: 1;  LOSS: 2.3480
Итерация: 2;  LOSS: 1.7970
Итерация: 3;  LOSS: 2.1824
Итерация: 4;  LOSS: 1.8316
Итерация: 5;  LOSS: 1.3809
Итерация: 6;  LOSS: 1.7112
Итерация: 7;  LOSS: 1.8339
Итерация: 8;  LOSS: 2.1378
Итерация: 9;  LOSS: 1.9557
Итерация: 10;  LOSS: 1.8171
Итерация: 11;  LOSS: 1.8149
Итерация: 12;  LOSS: 1.8207
Итерация: 13;  LOSS: 2.0957
Итерация: 14;  LOSS: 2.0191
Итерация: 15;  LOSS: 1.9571
Итерация: 16;  LOSS: 1.8421
Итерация: 17;  LOSS: 1.8930
Итерация: 18;  LOSS: 1.8746
Итерация: 19;  LOSS: 1.9920
Итерация: 20;  LOSS: 1.7064
Итерация: 21;  LOSS: 1.5738
Итерация: 22;  LOSS: 1.6290
Итерация: 23;  LOSS: 1.9026
Итерация: 24;  LOSS: 1.7408
Итерация: 25;  LOSS: 1.6927
Итерация: 26;  LOSS: 2.3679
Итерация: 27;  LOSS: 1.7442
Итерация: 28;  LOSS: 2.4879
Итерация: 29;  LOSS: 1.7138
Итерация: 30;  LOSS: 1.9416
Итерация: 31;  LOSS: 2.0416
Итерация: 32;  LOSS: 2.1063
Итерация: 33;  LOSS: 2.3019
Итерация: 34;  LOSS: 1.5269
Итерация: 35;  LOSS: 1.8558
Ит

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=427.0, style=ProgressStyle(description_wi…

Итерация: 0;  LOSS: 1.2996
Итерация: 1;  LOSS: 1.2503
Итерация: 2;  LOSS: 1.1983
Итерация: 3;  LOSS: 1.3301
Итерация: 4;  LOSS: 1.4731
Итерация: 5;  LOSS: 1.9288
Итерация: 6;  LOSS: 1.6944
Итерация: 7;  LOSS: 1.7751
Итерация: 8;  LOSS: 1.7985
Итерация: 9;  LOSS: 1.5963
Итерация: 10;  LOSS: 1.3438
Итерация: 11;  LOSS: 1.3966
Итерация: 12;  LOSS: 1.2053
Итерация: 13;  LOSS: 1.6372
Итерация: 14;  LOSS: 1.3027
Итерация: 15;  LOSS: 1.8532
Итерация: 16;  LOSS: 1.8615
Итерация: 17;  LOSS: 1.1307
Итерация: 18;  LOSS: 1.4493
Итерация: 19;  LOSS: 1.3909
Итерация: 20;  LOSS: 1.6117
Итерация: 21;  LOSS: 1.9047
Итерация: 22;  LOSS: 1.4804
Итерация: 23;  LOSS: 1.7433
Итерация: 24;  LOSS: 1.6691
Итерация: 25;  LOSS: 1.6527
Итерация: 26;  LOSS: 1.3011
Итерация: 27;  LOSS: 1.5200
Итерация: 28;  LOSS: 1.6655
Итерация: 29;  LOSS: 1.4789
Итерация: 30;  LOSS: 1.3915
Итерация: 31;  LOSS: 1.4273
Итерация: 32;  LOSS: 1.8304
Итерация: 33;  LOSS: 1.5205
Итерация: 34;  LOSS: 1.7005
Итерация: 35;  LOSS: 1.6584
Ит



Итерация: 146;  LOSS: 1.6925
Итерация: 147;  LOSS: 1.8220
Итерация: 148;  LOSS: 1.6651
Итерация: 149;  LOSS: 1.9250
Итерация: 150;  LOSS: 1.1811
Итерация: 151;  LOSS: 1.7270
Итерация: 152;  LOSS: 1.3425
Итерация: 153;  LOSS: 1.7786
Итерация: 154;  LOSS: 1.5104
Итерация: 155;  LOSS: 1.3815
Итерация: 156;  LOSS: 1.5752
Итерация: 157;  LOSS: 1.3675
Итерация: 158;  LOSS: 1.4745
Итерация: 159;  LOSS: 1.3878
Итерация: 160;  LOSS: 1.8305
Итерация: 161;  LOSS: 1.5167
Итерация: 162;  LOSS: 1.2514
Итерация: 163;  LOSS: 1.7842
Итерация: 164;  LOSS: 1.8063
Итерация: 165;  LOSS: 1.7658
Итерация: 166;  LOSS: 1.7016
Итерация: 167;  LOSS: 1.3247
Итерация: 168;  LOSS: 1.2636
Итерация: 169;  LOSS: 1.2840
Итерация: 170;  LOSS: 1.3912
Итерация: 171;  LOSS: 1.6594
Итерация: 172;  LOSS: 1.8380
Итерация: 173;  LOSS: 1.2050
Итерация: 174;  LOSS: 1.4357
Итерация: 175;  LOSS: 1.4023
Итерация: 176;  LOSS: 1.3433
Итерация: 177;  LOSS: 1.4221
Итерация: 178;  LOSS: 1.5208
Итерация: 179;  LOSS: 1.7362
Итерация: 180;

HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=47.0, style=ProgressStyle(description_wi…


Итоговая LOSS 1.3279


# **Чат GPT с тонкой настройкой**

In [24]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('output-small')

In [25]:
for step in range(5):
    new_user_input_ids = tokenizer.encode(input(">> Black:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)


    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("RickBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> Black:What do you think about Elon Musk?
RickBot: I don't think about him. He's just a character.
>> Black:But what exactly do you think?
RickBot: You think I could put a bomb in there and blow it up all at once?
>> Black:I think he is charismatic
RickBot: You're right, I'm wrong.
>> Black:What else you can say about him?
RickBot: He's not charismatic.
>> Black:ok, thank you
RickBot: !!!
