In [40]:
import torch
import pandas as pd
import numpy as np
import math

from pathlib import Path
from typing import Dict, List
from datasets import concatenate_datasets, load_dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline


In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

In [5]:
data = load_dataset("csv", data_files="data/data.csv", split="train")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
print(data)

In [7]:
data = data.filter(lambda example: example["context_1"] != None and example["response"] != None and \
    example["context_2"] != None and example["context_3"] != None)
data

Filter:   0%|          | 0/121829 [00:00<?, ? examples/s]

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 24221
})

In [8]:
data[250]

{'context_3': 'некоторых людей ещё на Волгоградском проспекте селят\nно вероятность незначительная',
 'context_2': 'Если ты в теории уедешь из общаги',
 'context_1': 'Не стоит так утвердительно заявлять все же',
 'response': 'И какое-то количество времени будешь снимать квартиру'}

In [9]:
data = data.train_test_split(test_size=0.2, shuffle=True)

In [10]:
data['train'][200]

{'context_3': 'Давайте ещё вспомним как в начале 90-х МГУ пытались отучить от антисемитизма при отборе абитуриентов на мехмат.',
 'context_2': 'Ну это известная история',
 'context_1': 'Так?',
 'response': 'Зачем'}

In [11]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [35]:
# data = data.flatten()

In [None]:
# https://huggingface.co/docs/transformers/tasks/language_modeling

In [20]:
data["train"][0]

{'context_3': 'Первая заточена под разработку на всяких там плюсах, шарпах и прочем барахле от микромягких\nВторой это тупо блокнот для хипстеров с возможностью обвесить всяким разным\nНу и он там дико кастомизируемый',
 'context_2': 'Опять вим',
 'context_1': 'А компилятор во "всякое разное" входит?',
 'response': 'Насколько я помню то запуск можно настроить в vscode, да'}

In [47]:
KEYS = ['context_3', 'context_2', 'context_1', 'response']


def preprocess_function(examples):
    concat_examples = []
    for key in keys:
        concat_examples.extend(examples[KEYS])
    return tokenizer([" ".join(x) for x in concat_examples])

In [48]:
tokenized_data = data.map(
    preprocess_function,
    batched = True,
    num_proc=4,
    remove_columns=data["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/19376 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4845 [00:00<?, ? examples/s]

In [49]:
block_size = 128


def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [50]:
lm_dataset = tokenized_data.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/77504 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/19380 [00:00<?, ? examples/s]

In [51]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('tinkoff-ai/ruDialoGPT-medium').to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    output_dir="my-ruDialoGPT-medium-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()

In [None]:
prompt = "Somatic hypermutation allows the immune system to"

In [None]:
generator = pipeline("text-generation", model="my-ruDialoGPT-medium-model")
generator(prompt)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("my-ruDialoGPT-medium-model")
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [None]:
model = AutoModelForCausalLM.from_pretrained("my-ruDialoGPT-medium-model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

In [None]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)