# Data 🐱‍🏍

Данные о решении арифметических задач

In [2]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split

> Загружаем данные и оставляем только диалоги, в которых больше `N_CONTEXT` реплик (в коде ниже `N_CONTEXT = 3`)

> Делаем их пригодными для DialoGPT

In [3]:
# Number of leading dialogue turns used as context; the turn that follows them
# becomes the target response, so only dialogues with more than N_CONTEXT turns are kept.
N_CONTEXT = 3

In [4]:
# Read the raw dialogues; the context manager guarantees the file handle is
# closed (the previous bare open() call leaked it).
with open('../data/qa_arith.json', 'r', encoding='utf8') as source_file:
    data = json.load(source_file)

# Positions of the dialogues that have more than N_CONTEXT turns, i.e. enough
# for N_CONTEXT context turns plus one response.
idxs = [idx for idx, row in enumerate(data) if len(row['conversation']) > N_CONTEXT]
print(f"Данных было: {len(data)}, валидных (CONTEXT > {N_CONTEXT}): {len(idxs)}")

# For every valid dialogue keep only its first N_CONTEXT + 1 turns.
contexted = [data[sample]['conversation'][:N_CONTEXT + 1] for sample in idxs]

columns = ['context/' + str(i) for i in range(N_CONTEXT)] + ['response']

df = pd.DataFrame.from_records(contexted, columns=columns)
# Put the response first and the oldest context turn last
# (response, context/2, context/1, context/0).
df = df[df.columns[::-1]]
df.sample(5)

Данных было: 104131, валидных (CONTEXT > 3): 59807


Unnamed: 0,response,context/2,context/1,context/0
56516,1692 минус 825 равняется 867,"Что получится, если из этого числа вычесть 825?",(20+16)×47 равно 1692,"Что получится, если к 20 прибавить 16, а потом..."
31894,Умножаем 0 на 55 и получаем 0,Помножь на 55 и напиши результат.,Это произведение равно 0,"Найди произведение чисел 9, 0 и 8"
6478,9669,"Какое число получится, если справа дописать к ...","если к 81 прибавить 15, получится 96","Чему равно значение выражения m+n, когда m=81,..."
20180,32,"Какое число получится, если в этом числе замен...",72,Что надо подставить вместо u в уравнении u-4=68?
25163,Помножить его на 18,Как превратить это число в 882 с помощью арифм...,84-35=49,"Чему равняется X-Y, если X=84, а Y=35?"


In [5]:
# Hold out 10% of the dialogues for validation. The seed is fixed so that the
# split (and therefore every downstream number) is identical on each run.
trn_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
trn_df.head()

Unnamed: 0,response,context/2,context/1,context/0
18854,Их там две,Сколько в этом числе цифр?,50,"Прежде всего, 10*5 чему равняется?"
23118,114*95=10830,"Что получится, если это число умножить на 95?",114,"Что надо подставить вместо e, чтобы выражение ..."
14611,Результат равен 906,"Прибавь к нему 906, какой результат?",0,"Владислав, скажи, а чему равно 4-4?"
22726,1168 умножить на 92 равно 107456,Чему равно произведение этого числа на 92?,Результат этого выражения равен 1168,Посчитай 3087024 / ( 25938402 / ( 5807 - 5637 ...
44101,"508, 607, 142, 321, 116, 58",Зачеркни в этом списке у каждого числа первую ...,"9508, 5607, 3142, 2321, 2116, 458","Отсортируй по убыванию числа в списке 3142, 56..."


# Подготовка сопутствующих штук: датасеты 🐱‍🐉

In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

In [7]:
def construct_conv(row, tokenizer, eos = True):
    """Turn one dialogue row into a single flat list of token ids.

    Every item of ``row`` is encoded with ``tokenizer.encode`` and terminated
    with ``tokenizer.eos_token_id``; the encoded turns are then concatenated in
    the reverse of their order in ``row``.

    Args:
        row: iterable of strings (one dialogue turn each).
        tokenizer: object exposing ``encode(text)`` and ``eos_token_id``.
        eos: accepted for interface compatibility; currently ignored, the
            end-of-sequence id is always appended.

    Returns:
        list of token ids for the whole conversation.
    """
    encoded_turns = [tokenizer.encode(text) + [tokenizer.eos_token_id] for text in row]

    conv = []
    # Walk the turns back to front so the last item of the row comes first.
    for turn_ids in encoded_turns[::-1]:
        conv.extend(turn_ids)
    return conv

In [8]:
class ConversationDataset(Dataset):
    """Dataset of tokenised dialogues: one flat token-id sequence per DataFrame row.

    On construction the examples are either loaded from a pickle cache inside
    ``args.cache_dir`` or built from ``df`` with ``construct_conv`` and then
    written to that cache.

    Args:
        tokenizer: tokenizer used to encode every row.
        args: settings object; ``cache_dir``, ``model_type`` and
            ``overwrite_cache`` are read.
        df: DataFrame whose rows are dialogues (one turn per column).
        block_size: nominal maximum sequence length; here it only influences
            the cache file name, sequences are not truncated.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        # Presumably subtracts the number of special tokens the tokenizer adds
        # to a single sentence -- TODO confirm against the tokenizer docs.
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # NOTE(review): the file name does not depend on `df`, so the training
        # and validation sets share one cache file. With overwrite_cache=False
        # whichever set was cached first is returned for both.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Грущим фичи с кэша %s", cached_features_file)
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # cache files produced by this notebook.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Создаем фичи в директории %s", directory)

            self.examples = [construct_conv(row, tokenizer) for _, row in df.iterrows()]

            # The cache directory may not exist yet when the dataset is built
            # before anything else has created it; open(..., "wb") would fail.
            if directory:
                os.makedirs(directory, exist_ok=True)

            logger.info("Сохраняем фичи в файл %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of cached conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [9]:
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ``ConversationDataset`` from the training or the validation frame.

    Args:
        args: settings object forwarded to ``ConversationDataset``.
        tokenizer: tokenizer forwarded to ``ConversationDataset``.
        df_trn: training DataFrame, used when ``evaluate`` is false.
        df_val: validation DataFrame, used when ``evaluate`` is true.
        evaluate: selects which of the two frames is wrapped.

    Returns:
        the constructed ``ConversationDataset``.
    """
    if evaluate:
        frame = df_val
    else:
        frame = df_trn
    return ConversationDataset(tokenizer, args, frame)


def set_seed(args):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        args: settings object; ``args.seed`` is the seed and ``args.n_gpu``
            decides whether the CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only touched when at least one GPU was counted.
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False):
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False):
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info(f"Удаляем старый чекпоинт [{checkpoint}], ротируем, так сказать :)")
        shutil.rmtree(checkpoint)

# Модель! 🤖

In [10]:
import torch
import glob
import logging
import os
import pickle
import random
import re
import shutil
from torch.utils.tensorboard import SummaryWriter

In [11]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer

# Pretrained DialoGPT-small tokenizer and weights. AutoModelForCausalLM replaces
# the deprecated AutoModelWithLMHead class.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")



In [11]:
# Module-level logger used by the dataset, training and evaluation helpers.
logger = logging.getLogger(__name__)

# Config classes registered in MODEL_WITH_LM_HEAD_MAPPING and the
# `model_type` attribute of each of them.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(config_class.model_type for config_class in MODEL_CONFIG_CLASSES)

> Колдунство с аргументами

In [12]:
class Args():
    """Plain container with every setting of the fine-tuning run.

    It stands in for the command-line arguments of the original training
    script. ``n_gpu``, ``device``, ``train_batch_size`` and ``eval_batch_size``
    are not set here; ``main``/``train``/``evaluate`` attach them at run time.
    """

    def __init__(self):
        # Paths and model identifiers.
        self.output_dir = '../our_gpt/content/output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = '../our_gpt/cached'
        self.block_size = 512
        # Which stages to run.
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        # Optimisation hyper-parameters.
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 20
        self.max_steps = -1
        self.warmup_steps = 0
        # Logging and checkpointing cadence (in optimizer steps).
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        # Environment and bookkeeping flags.
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

    def __repr__(self):
        """Show every attribute, so logging the object prints the actual settings
        instead of the default ``<Args object at 0x...>``."""
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"{type(self).__name__}({fields})"

args = Args()

> Портянка нагло взята в официальном туториале

In [13]:
def train(args, train_dataset, model, tokenizer, df_trn=None, df_val=None):
    """Fine-tune ``model`` on ``train_dataset`` and write periodic checkpoints.

    Args:
        args: settings object (see ``Args``); ``args.n_gpu`` and ``args.device``
            must already be set. ``args.train_batch_size`` is set here, and
            ``args.num_train_epochs`` is overwritten when ``args.max_steps > 0``.
        train_dataset: dataset yielding 1-D tensors of token ids.
        model: language model; called as ``model(inputs, labels=labels)``.
        tokenizer: tokenizer matching ``model``; saved next to every checkpoint.
        df_trn: optional training DataFrame, forwarded to ``evaluate``.
        df_val: optional validation DataFrame; required for in-training
            evaluation (``args.evaluate_during_training``).

    Returns:
        tuple ``(global_step, average_loss)`` where ``average_loss`` is the
        accumulated loss divided by ``global_step`` (0.0 if no step was taken).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    # In-training evaluation needs the validation frame. Previously the call
    # referenced names that do not exist in this scope and raised NameError.
    can_evaluate = args.evaluate_during_training and df_val is not None
    if args.evaluate_during_training and not can_evaluate:
        logger.warning(
            "evaluate_during_training is set but no df_val was passed to train(); "
            "in-training evaluation is skipped."
        )

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples):
        """Pad the sequences of a batch to equal length (pad id, or 0 without a pad token)."""
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Количество примеров = %d", len(train_dataset))
    logger.info("  Эпоха = %d", args.num_train_epochs)
    logger.info("  Батч на GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Общий батч (с кластером) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Шаги аккамуляции градиента = %d", args.gradient_accumulation_steps)
    logger.info("  Всего шагов обучения = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # Set global_step to the global_step of the last saved checkpoint,
            # parsed from the "...-<step>" suffix of the model path.
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Продолжаю обучение с эпохи %d", epochs_trained)
            logger.info("  Глобальный шаг %d", global_step)
            logger.info("  Пропускаем %d шагов на первой эпохи", steps_trained_in_current_epoch)
        except ValueError:
            # The path has no numeric suffix: treat it as a fresh fine-tuning run.
            logger.info("  Начинаем тюнить.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Эпоха", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Итерация", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # NOTE(review): labels are the padded batch itself, so padded
            # positions appear to contribute to the loss -- confirm whether
            # they should be masked out.
            inputs, labels = (batch, batch)
            # Batches longer than 1024 tokens are skipped entirely.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and can_evaluate
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, df_trn, df_val)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() is the supported way to read the current
                    # rate; get_lr() is meant for the scheduler's internal use.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Сохраняю чекпоинт %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Сохраняю оптимизер и шелдулер %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against ZeroDivisionError when no optimizer step was taken
    # (e.g. the dataset is smaller than one batch).
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss

# Evaluation of some model

def evaluate(args, model, tokenizer, df_trn, df_val, prefix=""):
    """Compute the perplexity of ``model`` on the validation frame.

    The mean language-modelling loss over all evaluation batches is
    exponentiated, logged and written to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: settings object; ``args.n_gpu`` and ``args.device`` must be set.
            ``args.eval_batch_size`` is set here.
        model: language model; called as ``model(inputs, labels=labels)``.
        tokenizer: tokenizer used to build the evaluation dataset.
        df_trn: training DataFrame (only forwarded to ``load_and_cache_examples``).
        df_val: validation DataFrame that is actually evaluated.
        prefix: sub-directory of the output directory for the results file;
            also shown in the log headers.

    Returns:
        dict ``{"perplexity": tensor}``; the value is NaN when the dataloader
        produced no batch.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples):
        """Pad the sequences of a batch to equal length (pad id, or 0 without a pad token)."""
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # drop_last=True: a trailing incomplete batch is not evaluated.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate; do not wrap a model that train() already wrapped.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Старт валидации {} *****".format(prefix))
    logger.info("  Количество образцов = %d", len(eval_dataset))
    logger.info("  Размер батча = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps:
        eval_loss = eval_loss / nb_eval_steps
    else:
        # Fewer examples than one batch: report NaN instead of dividing by zero.
        logger.warning("No complete evaluation batch was produced; perplexity is undefined.")
        eval_loss = float("nan")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # With a non-empty prefix the target sub-directory may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Результаты валидации {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

> Утащено оттуда же

In [14]:
def main(df_trn, df_val):
    """Run the whole pipeline: set-up, optional training, optional evaluation.

    A fresh ``Args`` object is created, the device and logging are configured,
    and the config, tokenizer and model are loaded. Then, depending on
    ``args.do_train`` / ``args.do_eval``, the model is fine-tuned on ``df_trn``,
    saved to ``args.output_dir`` and evaluated on ``df_val``.

    Args:
        df_trn: training DataFrame of dialogues.
        df_val: validation DataFrame of dialogues.

    Returns:
        dict of evaluation results keyed ``"<metric>_<global_step>"``
        (empty when evaluation is disabled).

    Raises:
        ValueError: if ``should_continue`` is set but no checkpoint exists, or
            if the output directory is non-empty and may not be overwritten.
    """
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            # Resume from the most recent checkpoint.
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Fall back to the CPU when CUDA is unavailable or disabled via no_cuda
    # (the device used to be hard-coded to "cuda").
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    # AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            # os.path.basename copes with the platform's path separator, while
            # split("/") kept the parent directory on Windows-style paths.
            prefix = os.path.basename(os.path.normpath(checkpoint)) if "checkpoint" in checkpoint else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [15]:
# Fine-tune on the training split and evaluate on the validation split;
# main() returns the dict of evaluation results.
main(trn_df, val_df)



Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/351M [00:00<?, ?B/s]

05/21/2023 11:14:41 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x000001F943C8E370>
05/21/2023 11:14:41 - INFO - __main__ -   Создаем фичи в директории ../our_gpt/cached
05/21/2023 11:14:55 - INFO - __main__ -   Сохраняем фичи в файл ../our_gpt/cached\gpt2_cached_lm_512
05/21/2023 11:14:57 - INFO - __main__ -   ***** Running training *****
05/21/2023 11:14:57 - INFO - __main__ -     Количество примеров = 53826
05/21/2023 11:14:57 - INFO - __main__ -     Эпоха = 20
05/21/2023 11:14:57 - INFO - __main__ -     Батч на GPU = 4
05/21/2023 11:14:57 - INFO - __main__ -     Общий батч (с кластером) = 4
05/21/2023 11:14:57 - INFO - __main__ -     Шаги аккамуляции градиента = 1
05/21/2023 11:14:57 - INFO - __main__ -     Всего шагов обучения = 269120


Эпоха:   0%|          | 0/20 [00:00<?, ?it/s]

Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 11:25:22 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-3500
05/21/2023 11:25:28 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-3500
05/21/2023 11:34:49 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-7000
05/21/2023 11:34:54 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-7000
05/21/2023 11:44:21 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-10500
05/21/2023 11:44:26 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-10500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 11:53:58 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-14000
05/21/2023 11:54:01 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-14000
05/21/2023 12:03:26 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-17500
05/21/2023 12:03:32 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-17500
05/21/2023 12:12:58 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-21000
05/21/2023 12:13:03 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-21000
05/21/2023 12:22:27 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-24500
05/21/2023 12:22:31 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-24500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 12:31:50 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-28000
05/21/2023 12:31:54 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-28000
05/21/2023 12:41:11 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-31500
05/21/2023 12:41:14 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-31500
05/21/2023 12:50:31 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-35000
05/21/2023 12:50:34 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-35000
05/21/2023 12:59:53 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-38500
05/21/2023 12:59:56 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-38500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 13:09:10 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-42000
05/21/2023 13:09:13 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-42000
05/21/2023 13:18:31 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-45500
05/21/2023 13:18:35 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-45500
05/21/2023 13:27:53 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-49000
05/21/2023 13:27:56 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-49000
05/21/2023 13:37:18 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-52500
05/21/2023 13:37:21 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-52500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 13:46:37 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-56000
05/21/2023 13:46:40 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-56000
05/21/2023 13:56:00 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-59500
05/21/2023 13:56:03 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-59500
05/21/2023 14:05:23 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-63000
05/21/2023 14:05:27 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-63000
05/21/2023 14:14:43 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-66500
05/21/2023 14:14:46 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-66500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 14:24:03 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-70000
05/21/2023 14:24:06 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-70000
05/21/2023 14:33:24 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-73500
05/21/2023 14:33:28 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-73500
05/21/2023 14:42:46 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-77000
05/21/2023 14:42:49 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-77000
05/21/2023 14:52:07 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-80500
05/21/2023 14:52:10 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-80500


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 15:01:28 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-84000
05/21/2023 15:01:31 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-84000
05/21/2023 15:10:50 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-87500
05/21/2023 15:10:52 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-87500
05/21/2023 15:20:12 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-91000
05/21/2023 15:20:15 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-91000


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 15:29:31 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-94500
05/21/2023 15:29:33 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-94500
05/21/2023 15:38:50 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-98000
05/21/2023 15:38:52 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-98000
05/21/2023 15:48:09 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-101500
05/21/2023 15:48:12 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-101500
05/21/2023 15:57:41 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-105000
05/21/2023 15:57:45 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-105000


Итерация:   0%|          | 0/13456 [00:00<?, ?it/s]

05/21/2023 16:07:11 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-108500
05/21/2023 16:07:15 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-108500
05/21/2023 16:16:43 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-112000
05/21/2023 16:16:48 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-112000
05/21/2023 16:26:14 - INFO - __main__ -   Сохраняю чекпоинт ../our_gpt/content/output-small\checkpoint-115500
05/21/2023 16:26:17 - INFO - __main__ -   Сохраняю оптимизер и шелдулер ../our_gpt/content/output-small\checkpoint-115500


# Валидируемся 🧾

In [13]:
# Base DialoGPT tokenizer plus the fine-tuned weights saved by main().
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead class.
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('../our_gpt/content/output-small')

In [14]:
# Interactive four-turn chat with the fine-tuned model.
for step in range(4):
    # Encode the user's message followed by the end-of-sequence token.
    user_text = input(">> User:")
    new_user_input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='pt')

    # From the second turn on, the prompt is the whole history plus the new message.
    if step > 0:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Sampling settings for the reply.
    generation_kwargs = dict(
        max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8,
    )
    chat_history_ids = model.generate(bot_input_ids, **generation_kwargs)

    # Print only the tokens generated after the prompt.
    reply_ids = chat_history_ids[:, bot_input_ids.shape[-1]:][0]
    print("ArifMan: {}".format(tokenizer.decode(reply_ids, skip_special_tokens=True)))

>> User:x=2y


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ArifMan: 2
>> User:Результат нужно умножить на 16


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ArifMan: 32
