In [None]:
import torch
import pandas as pd
import numpy as np
import math

from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline


In [None]:
# Authenticate with the Hugging Face Hub. Needed because the training run below
# is configured with push_to_hub=True and later calls trainer.push_to_hub().
notebook_login()

In [None]:
# Run the project's prepare_messages.py script on data/result.json, writing data/data.csv
# (the file the next cell loads). Presumably converts a Telegram history export into
# context/response rows -- TODO confirm against the script.
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

In [None]:
# Load the prepared CSV as a single Dataset; split="train" selects the one split
# that the csv loader produces for a lone data file.
data = load_dataset("csv", data_files="data/data.csv", split="train")

In [None]:
# Quick look at the loaded dataset (its printed summary).
print(data)

In [None]:
# Drop rows whose "context_1" value is missing. Missing values are compared with
# an identity test (`is not None`) rather than `!= None`, which is the idiomatic
# and unambiguous way to check for None.
data = data.filter(lambda example: example["context_1"] is not None)
data

In [None]:
FIRST_SPEAKER_TOKEN = '@@ПЕРВЫЙ@@'
SECOND_SPEAKER_TOKEN = '@@ВТОРОЙ@@'
CONTEXT_COLS = ['context_3', 'context_2', 'context_1']
RESPONSE_COL = ['response']
SEP = ' '


def convert_to_dialog(sample: Dict[str, str]) -> Dict[str, str]:
    """Flatten one context/response row into a single dialog string.

    Context columns are visited in ``CONTEXT_COLS`` order. Each column that is
    present and not ``None`` contributes ``<speaker token> <text> ``, where the
    speaker token is chosen by the column's position in ``CONTEXT_COLS``
    (even position -> first speaker, odd position -> second speaker). The
    response, when present and not ``None``, is appended after the
    second-speaker token without a trailing separator.

    Args:
        sample: Mapping from column name to message text (or ``None``).

    Returns:
        A dict with the single key ``'text'`` holding the assembled dialog;
        the string is empty when no column contributes.
    """
    pieces = []
    for position, column in enumerate(CONTEXT_COLS):
        if column not in sample or sample[column] is None:
            continue
        # Odd positions belong to the second speaker, even ones to the first.
        marker = SECOND_SPEAKER_TOKEN if position % 2 else FIRST_SPEAKER_TOKEN
        pieces.append(marker + SEP + sample[column] + SEP)

    reply_column = RESPONSE_COL[0]
    if reply_column in sample and sample[reply_column] is not None:
        pieces.append(SECOND_SPEAKER_TOKEN + SEP + sample[reply_column])

    return {'text': "".join(pieces)}

In [None]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so that a
# "Restart & Run All" reproduces the same train/test membership (and therefore
# comparable evaluation numbers) instead of a fresh random split on every run.
data = data.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
# Configure padding right away: the dataset is tokenized with padding='max_length'
# before the data-collator cell assigns the pad token. Setting it here guarantees
# that the ids used for padding during tokenization are the same pad id the
# collator later sees, and that padding works on a fresh kernel run.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Reference: Hugging Face Transformers language modeling task guide
# https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
# Apply convert_to_dialog to every row of both splits; the dict it returns adds a
# "text" column holding the flattened dialog.
dialog_data = data.map(convert_to_dialog)

In [None]:
# Sanity check: show the Python type returned when the whole "text" column of the
# train split is accessed.
type(dialog_data["train"]["text"])

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping that provides the texts under the ``"text"`` key
            (a batch when used with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=128)
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize both splits in batches across 4 worker processes.
# remove_columns is taken from dialog_data (the dataset actually being mapped) so
# that every raw column -- including the "text" column added by convert_to_dialog --
# is dropped and only the tokenizer outputs remain.
tokenized_data = dialog_data.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dialog_data["train"].column_names,
)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: build batches for causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Settings for the fine-tuning run; every option not listed here keeps its
# TrainingArguments default.
training_args = TrainingArguments(
    output_dir="model",              # local directory for the run's outputs
    push_to_hub=True,                # also publish to the Hugging Face Hub
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",     # evaluate once per epoch
)

In [None]:
# Load the pretrained ruDialoGPT-medium causal LM and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained('tinkoff-ai/ruDialoGPT-medium').to(device)

In [None]:
# Combine the model, training settings, tokenized splits and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_data["test"],
    train_dataset=tokenized_data["train"],
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the held-out split and report perplexity = exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Publish the trainer's model to the Hugging Face Hub (requires the login done at
# the top of the notebook).
trainer.push_to_hub()

In [None]:
# User message used to try out generation (Russian; roughly "MIPT or HSE").
prompt = "МФТИ или Вышка"

In [None]:
# The model is fine-tuned on strings built by convert_to_dialog, where every
# utterance is preceded by a speaker token. Wrap the prompt in the same layout and
# end with the second-speaker token so that generation continues with the reply
# instead of receiving raw text the model never saw during training.
dialog_prompt = SEP.join([FIRST_SPEAKER_TOKEN, prompt, SECOND_SPEAKER_TOKEN])
inputs = tokenizer(dialog_prompt, return_tensors="pt").input_ids

In [None]:
# Prefer the hand-picked checkpoint when it exists on disk. This notebook itself
# never writes that directory (TrainingArguments uses output_dir="model"), so on a
# fresh run fall back to the trainer's output directory.
# NOTE(review): the fallback assumes trainer.push_to_hub() has saved the final
# model into training_args.output_dir -- confirm for the installed transformers version.
checkpoint_dir = Path("my-ruDialoGPT-medium-model/checkpoint-10000/")
model_dir = checkpoint_dir if checkpoint_dir.is_dir() else Path(training_args.output_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)
# Sample up to 100 new tokens with top-k / nucleus sampling.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

In [None]:
# Decode the generated token ids back into text, skipping special tokens; the bare
# `response` on the last line displays the result.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
response