In [1]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead

In [2]:
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

/bin/bash: /home/consent-flower/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [3]:
DATA_PATH = "data/data.csv"

data = load_dataset("csv", data_files=DATA_PATH, split="train")

Downloading and preparing dataset csv/default to /home/consent-flower/.cache/huggingface/datasets/csv/default-8e5d4ec974263eda/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /home/consent-flower/.cache/huggingface/datasets/csv/default-8e5d4ec974263eda/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


In [4]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 121829
})


In [5]:
data = data.filter(lambda example: example["context_1"] != None)
print(data)

Filter:   0%|          | 0/121829 [00:00<?, ? examples/s]

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 79105
})


In [6]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 79105
})


In [7]:
data[0]

{'context_3': None,
 'context_2': None,
 'context_1': 'норм цвет)',
 'response': 'Грустно видеть Диану с ником куратора\nто есть ты больше не наш куратор?(('}

In [8]:
data = data.train_test_split(test_size=0.2, shuffle=True)

In [9]:
data['train'][0:1]

{'context_3': ['Г'],
 'context_2': ['Ирпень\nГостомель'],
 'context_1': ['Ереван'],
 'response': ['Ростов']}

In [10]:
FIRST_SPEAKER_TOKEN = '@@ПЕРВЫЙ@@'
SECOND_SPEAKER_TOKEN = '@@ВТОРОЙ@@'

CONTEXT_COLS = ['context_3', 'context_2', 'context_1']
RESPONSE_COL = ['response']
SEP = ' '

def convert_to_dialog(sample: Dict[str, str]) -> Dict[str, str]:
    """
        Convert sample row to dialogs str format
    """
    result_dict = dict()
    dialog = ""
    for i in range(len(CONTEXT_COLS)):
        key = CONTEXT_COLS[i]
        if key in sample and sample[key] is not None:
            speaker_token = FIRST_SPEAKER_TOKEN if i % 2 == 0 else SECOND_SPEAKER_TOKEN
            dialog += speaker_token + SEP + sample[key] + SEP
    
    response_key = RESPONSE_COL[0]
    if response_key in sample and sample[response_key] is not None:
        dialog += SECOND_SPEAKER_TOKEN + SEP + sample[response_key]

    result_dict['text'] = dialog

    return result_dict

In [11]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
print(data['train'][0])

{'context_3': 'Г', 'context_2': 'Ирпень\nГостомель', 'context_1': 'Ереван', 'response': 'Ростов'}


In [13]:
train = data['train'].map(convert_to_dialog)
test = data['test'].map(convert_to_dialog)

Map:   0%|          | 0/63284 [00:00<?, ? examples/s]

Map:   0%|          | 0/15821 [00:00<?, ? examples/s]

In [14]:
print(train[0])

{'context_3': 'Г', 'context_2': 'Ирпень\nГостомель', 'context_1': 'Ереван', 'response': 'Ростов', 'text': '@@ПЕРВЫЙ@@ Г @@ВТОРОЙ@@ Ирпень\nГостомель @@ПЕРВЫЙ@@ Ереван @@ВТОРОЙ@@ Ростов'}


In [15]:
def preprocess_function(examples):
    return tokenizer(text=[" ".join(x) for x in examples['text']], max_length=128, truncation=True, padding=True)

In [16]:
tokenized_train = train.map(
    preprocess_function,
    remove_columns=train.column_names
)

tokenized_test = test.map(
    preprocess_function,
    remove_columns=test.column_names
)

Map:   0%|          | 0/63284 [00:00<?, ? examples/s]

Map:   0%|          | 0/15821 [00:00<?, ? examples/s]

In [17]:
# https://huggingface.co/docs/transformers/tasks/language_modeling

block_size = 128


def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
lm_dataset_train = tokenized_train.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/63284 [00:00<?, ? examples/s]

In [30]:
len(lm_dataset_train[1])

3

In [19]:
lm_dataset_test = tokenized_test.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/15821 [00:00<?, ? examples/s]

In [20]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

2023-09-08 18:34:44.964848: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-08 18:34:44.988697: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_test,
    data_collator=data_collator,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mninachely[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/28185 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")