In [1]:
import torch
import pandas as pd
import numpy as np
import math

from pathlib import Path
from typing import Dict, List
from datasets import concatenate_datasets, load_dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline


2023-09-09 11:12:38.956953: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-09 11:12:38.984773: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

/bin/bash: /home/consent-flower/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [4]:
data = load_dataset("csv", data_files="data/data.csv", split="train")

Downloading and preparing dataset csv/default to /home/consent-flower/.cache/huggingface/datasets/csv/default-f6b61b4995cdb398/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /home/consent-flower/.cache/huggingface/datasets/csv/default-f6b61b4995cdb398/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


In [5]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 121829
})


In [6]:
data = data.filter(lambda example: example["context_1"] != None and example["response"] != None and \
    example["context_2"] != None and example["context_3"] != None)
data

Filter:   0%|          | 0/121829 [00:00<?, ? examples/s]

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 24221
})

In [7]:
data[250]

{'context_3': 'некоторых людей ещё на Волгоградском проспекте селят\nно вероятность незначительная',
 'context_2': 'Если ты в теории уедешь из общаги',
 'context_1': 'Не стоит так утвердительно заявлять все же',
 'response': 'И какое-то количество времени будешь снимать квартиру'}

In [8]:
data = data.train_test_split(test_size=0.2, shuffle=True)

In [9]:
data['train'][200]

{'context_3': 'ничесе\nизвиняюсь\nне знал',
 'context_2': 'больше 27 дюймов\nпод мак',
 'context_1': 'а рефарб есть такой моник?',
 'response': ')'}

In [10]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# data = data.flatten()

In [12]:
# https://huggingface.co/docs/transformers/tasks/language_modeling

In [13]:
data["train"][0]

{'context_3': 'Не призовут тебя правда',
 'context_2': 'А кто ты, если не либерал?',
 'context_1': 'Ты знаешь, кто такие либералы?',
 'response': 'Впервые слышц'}

In [16]:
KEYS = ['context_3', 'context_2', 'context_1', 'response']


def preprocess_function(examples):
    concat_examples = []
    for key in KEYS:
        concat_examples.extend(examples[key])
    return tokenizer([" ".join(x) for x in concat_examples])

In [17]:
tokenized_data = data.map(
    preprocess_function,
    batched = True,
    num_proc=4,
    remove_columns=data["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/19376 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4845 [00:00<?, ? examples/s]

In [18]:
block_size = 128


def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [19]:
lm_dataset = tokenized_data.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/77504 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/19380 [00:00<?, ? examples/s]

In [20]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('tinkoff-ai/ruDialoGPT-medium').to(device)

In [22]:
training_args = TrainingArguments(
    output_dir="my-ruDialoGPT-medium-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mninachely[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/10464 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.0691, 'learning_rate': 1.904434250764526e-05, 'epoch': 0.14}
{'loss': 1.7422, 'learning_rate': 1.808868501529052e-05, 'epoch': 0.29}
{'loss': 1.6382, 'learning_rate': 1.713302752293578e-05, 'epoch': 0.43}
{'loss': 1.5736, 'learning_rate': 1.617737003058104e-05, 'epoch': 0.57}
{'loss': 1.5385, 'learning_rate': 1.5221712538226302e-05, 'epoch': 0.72}
{'loss': 1.5214, 'learning_rate': 1.4266055045871561e-05, 'epoch': 0.86}


  0%|          | 0/843 [00:00<?, ?it/s]

{'eval_loss': 1.4632976055145264, 'eval_runtime': 44.3699, 'eval_samples_per_second': 151.927, 'eval_steps_per_second': 18.999, 'epoch': 1.0}
{'loss': 1.4983, 'learning_rate': 1.3310397553516821e-05, 'epoch': 1.0}
{'loss': 1.449, 'learning_rate': 1.235474006116208e-05, 'epoch': 1.15}
{'loss': 1.4461, 'learning_rate': 1.139908256880734e-05, 'epoch': 1.29}
{'loss': 1.4316, 'learning_rate': 1.04434250764526e-05, 'epoch': 1.43}
{'loss': 1.4214, 'learning_rate': 9.48776758409786e-06, 'epoch': 1.58}
{'loss': 1.4196, 'learning_rate': 8.53211009174312e-06, 'epoch': 1.72}
{'loss': 1.399, 'learning_rate': 7.57645259938838e-06, 'epoch': 1.86}


  0%|          | 0/843 [00:00<?, ?it/s]

{'eval_loss': 1.3927334547042847, 'eval_runtime': 44.275, 'eval_samples_per_second': 152.253, 'eval_steps_per_second': 19.04, 'epoch': 2.0}
{'loss': 1.3994, 'learning_rate': 6.620795107033639e-06, 'epoch': 2.01}
{'loss': 1.3696, 'learning_rate': 5.665137614678899e-06, 'epoch': 2.15}
{'loss': 1.3702, 'learning_rate': 4.70948012232416e-06, 'epoch': 2.29}
{'loss': 1.3524, 'learning_rate': 3.7538226299694192e-06, 'epoch': 2.44}
{'loss': 1.3578, 'learning_rate': 2.798165137614679e-06, 'epoch': 2.58}
{'loss': 1.3596, 'learning_rate': 1.8425076452599388e-06, 'epoch': 2.72}
{'loss': 1.3553, 'learning_rate': 8.868501529051989e-07, 'epoch': 2.87}


  0%|          | 0/843 [00:00<?, ?it/s]

{'eval_loss': 1.3729362487792969, 'eval_runtime': 44.3049, 'eval_samples_per_second': 152.15, 'eval_steps_per_second': 19.027, 'epoch': 3.0}
{'train_runtime': 2383.3382, 'train_samples_per_second': 35.12, 'train_steps_per_second': 4.39, 'train_loss': 1.4798526472272493, 'epoch': 3.0}


TrainOutput(global_step=10464, training_loss=1.4798526472272493, metrics={'train_runtime': 2383.3382, 'train_samples_per_second': 35.12, 'train_steps_per_second': 4.39, 'train_loss': 1.4798526472272493, 'epoch': 3.0})

In [24]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/843 [00:00<?, ?it/s]

Perplexity: 3.95


In [25]:
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

'https://huggingface.co/ninachely/my-ruDialoGPT-medium-model/tree/main/'

In [41]:
prompt = "МФТИ или Вышка"

In [42]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [43]:
model = AutoModelForCausalLM.from_pretrained("my-ruDialoGPT-medium-model/checkpoint-10000/")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [46]:
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
response

['МФТИ или Вышка   н е   т у д аА л г о с ы   д а л ь ш е   с о с и т ь,   к у м у л я т о р н оГ е н и й   к т о - н и б у д ь   н е   з н а е т? \n П о й д у   с в я з а т ь с я...Н е т,   б л и н']