In [None]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead

In [None]:
# Run the project's preprocessing script: it is given the chat history at
# data/result.json and an output path of data/data.csv (the file loaded below).
# NOTE(review): the script itself is not visible here — confirm its output schema.
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

In [None]:
# Location of the CSV that the dataset is read from.
DATA_PATH = "data/data.csv"

# Read the CSV through the `datasets` CSV loader and take its `train` split.
data = load_dataset(
    "csv",
    data_files=DATA_PATH,
    split="train",
)

In [None]:
# Summary of the freshly loaded dataset.
print(data)

In [None]:
# Keep only rows that have the most recent context message (`context_1`).
# `is not None` is the idiomatic identity check for None (PEP 8) and cannot be
# affected by a custom `__eq__`.
data = data.filter(lambda example: example["context_1"] is not None)
print(data)

In [None]:
# Same summary as printed right after filtering; kept for reference.
print(data)

In [None]:
# Look at the first row of the filtered dataset.
data[0]

In [None]:
# Fixed seed so the shuffled 80/20 split is the same on every
# Restart & Run All; without it the train/test membership changes per run.
SPLIT_SEED = 42
data = data.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)

In [None]:
# Peek at the first row of the train split (requested as a one-row slice).
data['train'][0:1]

In [None]:
# Speaker markers inserted in front of every utterance.
FIRST_SPEAKER_TOKEN = '@@ПЕРВЫЙ@@'
SECOND_SPEAKER_TOKEN = '@@ВТОРОЙ@@'

# Context columns in the order they are written into the dialog string,
# followed by the column holding the reply.
CONTEXT_COLS = ['context_3', 'context_2', 'context_1']
RESPONSE_COL = ['response']
SEP = ' '

def convert_to_dialog(sample: Dict[str, str]) -> Dict[str, str]:
    """
        Build a single dialog string out of one dataset row.

        Every context column listed in ``CONTEXT_COLS`` that is present in
        ``sample`` and not ``None`` is emitted as
        ``<speaker token><SEP><text><SEP>``. The speaker token is picked from
        the column's position in ``CONTEXT_COLS`` (even position -> first
        speaker, odd position -> second speaker), not from how many columns
        are actually filled. The response, when present and not ``None``, is
        appended as ``<second speaker token><SEP><text>`` with no trailing
        separator.

        Returns a dict with the single key ``'text'``.
    """
    pieces = []
    for position, column in enumerate(CONTEXT_COLS):
        # Skip context slots that are absent or empty in this row.
        if column not in sample or sample[column] is None:
            continue
        speaker = SECOND_SPEAKER_TOKEN if position % 2 else FIRST_SPEAKER_TOKEN
        pieces.append(speaker + SEP + sample[column] + SEP)

    answer_column = RESPONSE_COL[0]
    if answer_column in sample and sample[answer_column] is not None:
        # The reply always belongs to the second speaker.
        pieces.append(SECOND_SPEAKER_TOKEN + SEP + sample[answer_column])

    return {'text': ''.join(pieces)}

In [None]:
# Demo: a row with all three context turns plus a response.
convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

In [None]:
# Demo: a row with only the most recent context turn plus a response.
convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

In [None]:
# Sanity checks for convert_to_dialog.
# Full context: speakers alternate and the response is attributed to the second speaker.
assert convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет! @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

# Missing older context columns are skipped; `context_1` still maps to the first speaker.
assert convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [None]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

# Tokenizing with padding='max_length' requires a pad token. Some GPT-style
# tokenizers ship without one, so fall back to the EOS token in that case.
# This is a no-op when the checkpoint already defines a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_sample(sample: Dict[str, str]):
    """
        Tokenize the ``'text'`` field of one sample with the notebook-level
        ``tokenizer``, padding to the maximum length and truncating longer
        inputs. Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        sample['text'],
        padding='max_length',
        truncation=True,
    )
    return encoded

In [None]:
# Raw training row, before it is converted into a dialog string.
print(data['train'][0])

In [None]:
# Apply convert_to_dialog to both splits (train first, then test),
# which adds the `text` field the tokenizer step reads.
train, test = (
    data[split_name].map(convert_to_dialog)
    for split_name in ('train', 'test')
)

In [None]:
# Tokenize each split from its own source. The evaluation set must come from
# `test`; building it from `train` would score the model on data it was
# trained on.
tokenized_train = train.map(tokenize_sample)
tokenized_test = test.map(tokenize_sample)

In [None]:
import torch
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# `evaluate` is used below but was never imported anywhere in the notebook,
# so this cell raised NameError on a fresh kernel.
import evaluate

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('tinkoff-ai/ruDialoGPT-medium').to(device)
# TODO: F1 is a placeholder; pick a metric suited to language modelling.
metric = evaluate.load('f1')

In [None]:
# Hyper-parameters for the training run, kept as a plain keyword mapping.
# With 128 samples per device and 4 accumulation steps, one optimizer step
# covers 512 samples per device.
# NOTE(review): `eval_steps` presumably only takes effect when a step-based
# evaluation strategy is configured as well — confirm for the installed
# transformers version.
training_args = dict(
    output_dir='./training_output',   # where checkpoints are written
    per_device_train_batch_size=128,  # batch size per GPU/CPU during training
    gradient_accumulation_steps=4,    # batches accumulated before each optimizer step
    max_steps=500,                    # total number of optimizer.step() calls
    save_steps=100,                   # checkpoint interval, in steps
    eval_steps=100,                   # evaluation interval, in steps
    dataloader_num_workers=0,         # data-loading worker processes
    save_total_limit=2,               # keep at most this many checkpoints
)

In [None]:
def compute_metrics(eval_pred):
    """
        Token-level next-token accuracy for a causal language model.

        ``eval_pred`` unpacks into ``(logits, labels)`` where ``logits`` has a
        trailing vocabulary axis and ``labels`` holds the target token ids.
        The previous version forwarded the 2-D id arrays to a binary F1
        metric, which cannot score them, and compared position ``t`` with
        label ``t`` although a causal LM predicts token ``t + 1`` at position
        ``t``. This version is self-contained (numpy only):

        * predictions are the arg-max over the vocabulary axis;
        * predictions are shifted against the labels by one position;
        * positions labelled ``-100`` (the ignore index used for padding) are
          excluded.

        Returns ``{'accuracy': float}``; the value is ``0.0`` when no position
        is left to score.
    """
    logits, labels = eval_pred
    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    # Align "prediction made at position t" with "token at position t + 1".
    shifted_predictions = predictions[..., :-1]
    shifted_labels = labels[..., 1:]

    scored = shifted_labels != -100
    total = int(scored.sum())
    if total == 0:
        return {'accuracy': 0.0}

    correct = int(((shifted_predictions == shifted_labels) & scored).sum())
    return {'accuracy': correct / total}

In [None]:
# Trainer reads its settings as attributes of a TrainingArguments object, so a
# plain dict of keyword arguments has to be wrapped first.
trainer_args = (
    training_args
    if isinstance(training_args, TrainingArguments)
    else TrainingArguments(**training_args)
)

# The tokenized rows carry no `labels`. With mlm=False the collator derives
# them from `input_ids`, so the model can return a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
save_directory = './pt_save_pretrained'
# Store the tokenizer next to the weights so the directory can be reloaded
# with from_pretrained() on its own.
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# Alternatively, save through the trainer:
# trainer.save_model('CustomModels/CustomHamSpam')

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
# 	'./pt_save_pretrained')

In [None]:
from transformers import AutoModelWithLMHead

# Placeholder: point this at a real checkpoint directory before running.
checkpoint_path = 'path/to/your/checkpoint-100'

# NOTE(review): a Trainer checkpoint presumably contains tokenizer files only
# if the tokenizer was handed to the Trainer — confirm, or load the tokenizer
# from the base model name instead.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is
# the class this notebook already uses to load the model for training.
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)