In [1]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import pipeline

import pandas as pd
import numpy as np

import re 

import torch
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to C:\Users\Daria/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [2]:
model_checkpoint = "sberbank-ai/ruRoberta-large"
block_size = 128
batch_size = 2
num_epochs = 3

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [26]:
df = pd.read_csv("all_recepies_inter.csv", sep="\t", usecols=["name", "Инструкции"]).rename(columns={"Инструкции": "Instructions"})

df["Instructions"] = df["Instructions"].apply(lambda x: x.replace(u'\xa0', u' '))
df["Instructions"] = df["Instructions"].apply(lambda x: re.sub("\n|\r|\t", " ",  x))

In [7]:
ds = Dataset.from_pandas(df[["Instructions"]]).train_test_split(test_size=0.2)

In [8]:
def preprocess_function(examples):
    return tokenizer(examples["Instructions"], truncation=True, max_length=512, padding="max_length", return_tensors="pt")

In [9]:
tokenized_ds = ds.map(
    preprocess_function,
    batched=True,
    remove_columns=ds["train"].column_names,
)

Map:   0%|          | 0/22307 [00:00<?, ? examples/s]

Map:   0%|          | 0/5577 [00:00<?, ? examples/s]

In [10]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
ds = tokenized_ds.map(group_texts, batched=True)

Map:   0%|          | 0/22307 [00:00<?, ? examples/s]

Map:   0%|          | 0/5577 [00:00<?, ? examples/s]

In [12]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [13]:
logging_steps = len(ds["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-for-chat",
    overwrite_output_dir=False,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_epochs,
    gradient_accumulation_steps=8,
    logging_steps=logging_steps,
    push_to_hub=False,
    report_to="none",
    # optim=torch.optim.AdamW
)

In [14]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 89228
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 16728
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,
1,No log,
2,No log,


Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-500
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-500\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-500\pytorch_model.bin
Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-1000
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-1000\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-1500
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-1500\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-2000
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-2000\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-20

Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-12000
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-12000\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-12000\pytorch_model.bin
Deleting older checkpoint [ruRoberta-large-finetuned-for-chat\checkpoint-10500] due to args.save_total_limit
Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-12500
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-12500\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-12500\pytorch_model.bin
Deleting older checkpoint [ruRoberta-large-finetuned-for-chat\checkpoint-11000] due to args.save_total_limit
Saving model checkpoint to ruRoberta-large-finetuned-for-chat\checkpoint-13000
Configuration saved in ruRoberta-large-finetuned-for-chat\checkpoint-13000\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\checkpoint-13000\pytorch_model.bin
Delet

TrainOutput(global_step=16728, training_loss=538006.8943089431, metrics={'train_runtime': 21762.8498, 'train_samples_per_second': 12.3, 'train_steps_per_second': 0.769, 'total_flos': 6.237331564144435e+16, 'train_loss': 538006.8943089431, 'epoch': 3.0})

In [17]:
model.push_to_hub(f"{model_name}-finetuned-for-chat")

Configuration saved in ruRoberta-large-finetuned-for-chat\config.json
Model weights saved in ruRoberta-large-finetuned-for-chat\pytorch_model.bin
Uploading the following files to pankratozzi/ruRoberta-large-finetuned-for-chat: config.json,pytorch_model.bin


In [22]:
text = "Добавить ложку <mask>, полить сливочным соусом"
text1 = "Обжарьте <mask> на легком огне"

In [11]:
model = AutoModelForMaskedLM.from_pretrained(f"pankratozzi/{model_name}-finetuned-for-chat")

In [None]:
mask_filler = pipeline("fill-mask", f"pankratozzi/{model_name}-finetuned-for-chat")

In [23]:
mask_filler(text, top_k=3)

[{'score': 0.50724196434021,
  'token': 36734,
  'token_str': ' меда',
  'sequence': 'Добавить ложку меда, полить сливочным соусом'},
 {'score': 0.15127043426036835,
  'token': 45517,
  'token_str': ' сметаны',
  'sequence': 'Добавить ложку сметаны, полить сливочным соусом'},
 {'score': 0.09975725412368774,
  'token': 17841,
  'token_str': ' сахара',
  'sequence': 'Добавить ложку сахара, полить сливочным соусом'}]

In [24]:
mask_filler(text1, top_k=3)

[{'score': 0.10000195354223251,
  'token': 651,
  'token_str': ' их',
  'sequence': 'Обжарьте их на легком огне'},
 {'score': 0.08920028060674667,
  'token': 9816,
  'token_str': ' мясо',
  'sequence': 'Обжарьте мясо на легком огне'},
 {'score': 0.06923796236515045,
  'token': 24585,
  'token_str': ' овощи',
  'sequence': 'Обжарьте овощи на легком огне'}]

In [28]:
mask_filler(text1, top_k=3)[0]["sequence"]

'Обжарьте их на легком огне'

In [18]:
tokenizer.push_to_hub(f"pankratozzi/{model_name}-finetuned-for-chat")

In [17]:
# tokenizer = AutoTokenizer.from_pretrained(f"pankratozzi/{model_name}-finetuned-for-chat")
inputs = tokenizer(text, return_tensors="pt")
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# model = AutoModelForMaskedLM.from_pretrained("pankratozzi/{model_name}-finetuned-for-chat")
logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]

top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

Добавить ложку  меда, полить  меда соусом
Добавить ложку  сметаны, полить  сметаны соусом
Добавить ложку  сахара, полить  сахара соусом
