In [1]:
!pip install transformers
!pip install datasets
!pip install trl
!pip install git+https://github.com/huggingface/peft

In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import PeftModel

In [3]:
# Read the Excel workbook and discard every row that contains a missing value.
df = pd.read_excel('train_dataset_Датасет.xlsx').dropna()

# Write the cleaned rows to CSV (without the index column) and load that file
# back through the `datasets` CSV loader as the "train" split.
df.to_csv('dataset.csv', index=False)
dataset = load_dataset("csv", split="train", data_files="dataset.csv")

In [4]:
# Checkpoint shared by the model and the tokenizer; kept in one place so they cannot drift.
MODEL_NAME = "ai-forever/rugpt3small_based_on_gpt2"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token rather than registering a brand-new '[PAD]' token:
# adding a token enlarges the tokenizer vocabulary, which would have to be mirrored with
# model.resize_token_embeddings(len(tokenizer)), and the new token would be unused anyway
# once pad_token is pointed at EOS.
tokenizer.pad_token = tokenizer.eos_token
def formatting_prompts_func(example):
    """Render a batch of question/answer pairs as prompt strings.

    Args:
        example: Mapping with index-aligned 'QUESTION' and 'ANSWER' sequences.

    Returns:
        A list with one string per question: the text "### Question: ", the
        question, a newline, then " ### Answer: " followed by the answer at
        the same index.
    """
    return [
        f"### Question: {question}\n ### Answer: {example['ANSWER'][idx]}"
        for idx, question in enumerate(example['QUESTION'])
    ]
# Marker that introduces the answer part of each prompt; it has to stay in sync with
# the " ### Answer:" text produced by formatting_prompts_func.
response_template = " ### Answer:"

# Completion-only collator keyed on that marker.
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)
# Hyperparameters for the fine-tuning run, grouped by concern.
# NOTE(review): warmup_steps is a fixed 2000 optimizer steps — confirm the run has
# more total steps than that, otherwise the learning rate never reaches its peak.
training_args = TrainingArguments(
    # Where checkpoints and logs are written; only the 2 most recent checkpoints are kept.
    output_dir="./checkpoint",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    # Schedule: raise num_train_epochs here to train for longer.
    num_train_epochs=30,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=2000,
)
# Wire the model, data, prompt formatter and collator into the SFT trainer, then train.
# NOTE(review): eval_dataset is the same object as train_dataset, so the evaluation
# loss reflects fit on the training data rather than generalization — confirm intended.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    data_collator=collator,
    formatting_func=formatting_prompts_func,
    train_dataset=dataset,
    eval_dataset=dataset,
)
trainer.train()