In [50]:
import torch
import pandas as pd
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead

In [51]:
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

In [52]:
DATA_PATH = "data/data.csv"

data = load_dataset("csv", data_files=DATA_PATH, split="train")

Downloading and preparing dataset csv/default to /Users/ninachely/.cache/huggingface/datasets/csv/default-d515fd3fd78f0310/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/ninachely/.cache/huggingface/datasets/csv/default-d515fd3fd78f0310/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


In [53]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 121829
})


In [54]:
data = data.filter(lambda example: example["context_1"] != None)
print(data)

Filter:   0%|          | 0/121829 [00:00<?, ? examples/s]

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 79105
})


In [55]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 79105
})


In [56]:
data[0]

{'context_3': None,
 'context_2': None,
 'context_1': 'норм цвет)',
 'response': 'Грустно видеть Диану с ником куратора\nто есть ты больше не наш куратор?(('}

In [57]:
data = data.train_test_split(test_size=0.2, shuffle=True)

In [58]:
data['train'][0:1]

{'context_3': ['Много чего на самом деле)\nНо меня именно от microsoft штуки интересуют, остальное от физтеховской есть (она гугловская)'],
 'context_2': ['Может быть будут работать студенческие подписки'],
 'context_1': ['Jetbrains же одной кнопкой продлевается\nБез всяких подтверждений статуса студента'],
 'response': ['Никто не в курсе, когда там приказы-то вообще появятся по коммерции?']}

In [84]:
FIRST_SPEAKER_TOKEN = '@@ПЕРВЫЙ@@'
SECOND_SPEAKER_TOKEN = '@@ВТОРОЙ@@'

CONTEXT_COLS = ['context_3', 'context_2', 'context_1']
RESPONSE_COL = ['response']
SEP = ' '

def convert_to_dialog(sample: Dict[str, str]) -> Dict[str, str]:
    """
        Convert sample row to dialogs str format
    """
    result_dict = dict()
    dialog = ""
    for i in range(len(CONTEXT_COLS)):
        key = CONTEXT_COLS[i]
        if key in sample and sample[key] is not None:
            speaker_token = FIRST_SPEAKER_TOKEN if i % 2 == 0 else SECOND_SPEAKER_TOKEN
            dialog += speaker_token + SEP + sample[key] + SEP
    
    response_key = RESPONSE_COL[0]
    if response_key in sample and sample[response_key] is not None:
        dialog += SECOND_SPEAKER_TOKEN + SEP + sample[response_key]

    result_dict['text'] = dialog

    return result_dict

In [77]:
convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

{'text': '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет! @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [78]:
convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

{'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [85]:
assert convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет! @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

assert convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [80]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 232a1a54-1dcd-4d13-92a1-b91cacc658a1)')' thrown while requesting HEAD https://huggingface.co/tinkoff-ai/ruDialoGPT-medium/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [86]:
def tokenize_sample(sample: Dict[str, str]):
    return tokenizer(sample['text'], padding='max_length', truncation=True)

In [87]:
print(data['train'][0])

{'context_3': 'Много чего на самом деле)\nНо меня именно от microsoft штуки интересуют, остальное от физтеховской есть (она гугловская)', 'context_2': 'Может быть будут работать студенческие подписки', 'context_1': 'Jetbrains же одной кнопкой продлевается\nБез всяких подтверждений статуса студента', 'response': 'Никто не в курсе, когда там приказы-то вообще появятся по коммерции?'}


In [88]:
train = data['train'].map(convert_to_dialog)
test = data['test'].map(convert_to_dialog)

Map:   0%|          | 0/63284 [00:00<?, ? examples/s]

Map:   0%|          | 0/15821 [00:00<?, ? examples/s]

In [89]:
train = train.map(tokenize_sample)
test = train.map(tokenize_sample)

Map:   0%|          | 0/63284 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/63284 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('tinkoff-ai/ruDialoGPT-medium').to(device)

In [None]:
arguments = {
    'output_dir': './training_output',  # path to save the model's checkpoints
    'per_device_train_batch_size': 16,  # batch size per GPU/CPU for training
    'gradient_accumulation_steps': 4,  # number of batches to accumulate gradient
    'max_steps': 500,  # total number of optimizer.step() calls
    'save_steps': 100,  # save every save_steps
    'eval_steps': 100,  # run evaluation every eval_steps
    'dataloader_num_workers': 0,  # number of workers for data loading (default: 0)
    'save_total_limit': 2,  # total number of checkpoints to save, delete older checkpoints when reached
}

trainer = ... # YOUR CODE HERE

In [None]:
from transformers import AutoModelWithLMHead

checkpoint_path = 'path/to/your/checkpoint-100'

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelWithLMHead.from_pretrained(checkpoint_path)