In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

# Directory passed as `cache_dir` to the `from_pretrained` calls further down.
cache_dir = "hfcache_proj"

# Process-wide settings, applied before torch/transformers are imported in
# the next cell: order GPUs by PCI bus id, expose only device 1, and point
# the transformers cache at the project-local directory.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (it makes
# kernel launches synchronous); confirm it is still wanted for full runs.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': '1',
    'TRANSFORMERS_CACHE': './hfcache_proj',
    'CUDA_LAUNCH_BLOCKING': "1",
})

In [4]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datasets import load_metric, Dataset, DatasetDict

from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from peft import (
    LoraConfig,
    get_peft_model
)


from tqdm.auto import tqdm, trange
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.nn.functional as F

In [6]:
# Target device for the model and the dataset tensors.
# CUDA_VISIBLE_DEVICES is set to '1' at the top of the notebook, so the single
# exposed GPU is addressed as index 0 here.
# Fall back to the CPU when no GPU is visible so the notebook can still be
# loaded and smoke-tested; NOTE: training this model on CPU will be very slow.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
# Read only the pattern text and its category, renaming the two columns to
# the `text` / `label` names used by the rest of the notebook.
df = (
    pd.read_csv(
        '../../data/2_preprocessed_data.csv',
        usecols=['data_patterns', 'Category'],
    )
    .rename(columns={'data_patterns': 'text', 'Category': 'label'})
)

# Fit the encoder on the raw category values, then replace them with their
# integer ids. `le` stays in scope for later cells.
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

In [10]:
# Wrap the frame in a Hugging Face Dataset and cast `label` to a class-label
# feature; the splits below stratify on this column.
# NOTE: the column already holds integer ids, which get stringified before
# casting (see the progress output of this cell).
full_dataset = Dataset.from_pandas(df).class_encode_column("label")

Stringifying the column: 100%|█| 39772/39772 [00:00<00:00, 512555.68 ex
Casting to class labels: 100%|█| 39772/39772 [00:00<00:00, 633505.21 ex


In [11]:
# Fixed seed so both stratified splits contain the same rows on every run;
# without it the train/validation/test membership (and therefore every
# reported metric) changes each time the notebook is executed.
SPLIT_SEED = 42

# Hold out 15% of all rows as the test set, stratified by label.
dataset = full_dataset.train_test_split(
    test_size=0.15,
    stratify_by_column="label",
    seed=SPLIT_SEED,
)
test_dataset = dataset['test']

# Split 20% of the remaining rows off as the validation set, again stratified.
temp_dataset = dataset['train'].train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=SPLIT_SEED,
)
val_dataset = temp_dataset['test']
train_dataset = temp_dataset['train']

In [12]:
# Gather the three splits under their conventional names; the bare `dataset`
# at the end displays the resulting structure as the cell output.
dataset = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 27044
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 6762
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 5966
    })
})

In [13]:
model_name = "ai-forever/ruRoberta-large"

# Tokenizers hold no weights and have no device placement, so the stray
# `device_map` argument is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Size the classification head from the fitted label encoder instead of a
# hard-coded 4, so the model always matches the categories found in the data.
# The weights are loaded straight onto `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_),
    cache_dir=cache_dir,
    device_map=device,
    is_decoder=False,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# LoRA adapter settings: adapters are attached to the modules named "query"
# and "value"; the freshly initialised "classifier" head is listed in
# `modules_to_save` so that it is trained alongside the adapters.
config = LoraConfig(
    target_modules=["query", "value"],
    modules_to_save=["classifier"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# NOTE: `model` is rebound to its PEFT-wrapped version, so re-running this
# cell without reloading the base model would wrap it a second time.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 2,626,564 || all params: 357,990,408 || trainable%: 0.7336967531264134


In [16]:
def _tokenize_batch(batch):
    """Tokenize a batch of texts, truncating and padding each one to 300 tokens."""
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=300,
        padding='max_length',
    )


# Tokenize every split in batches, then drop the raw text column.
# NOTE: `dataset` is rebound here, so this cell cannot be re-run on its own
# result (the 'text' column is gone after the first run).
dataset = dataset.map(_tokenize_batch, batched=True).remove_columns('text')

# Have the dataset return torch tensors placed directly on `device`.
dataset.set_format(type='torch', device=device)

Map: 100%|██████████████| 27044/27044 [00:05<00:00, 5246.16 examples/s]
Map: 100%|████████████████| 6762/6762 [00:01<00:00, 6083.83 examples/s]
Map: 100%|████████████████| 5966/5966 [00:00<00:00, 6281.14 examples/s]


In [20]:
# Load the metric once, outside the metric function.
f1_metric = load_metric("f1")

# Training hyper-parameters.
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./results_skil_3_lora",  # directory for training results
    logging_dir="./runs",  # directory for TensorBoard logs
    # --- schedule and batch sizes ---
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=32,  # training batch size
    per_device_eval_batch_size=16,  # validation batch size
    warmup_steps=400,  # number of warm-up steps
    weight_decay=0.01,  # L2 regularisation strength
    # --- logging, evaluation and checkpoint cadence ---
    logging_steps=500,  # log every 500 steps
    evaluation_strategy="epoch",  # evaluate after each epoch
    save_strategy="epoch",  # save a checkpoint after each epoch
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    # Left disabled, as in the original cell:
    #   metric_for_best_model="f1"  (metric used to pick the best model)
    #   greater_is_better=True      (a larger F1 is better)
    # NOTE(review): with these unset the best checkpoint is presumably chosen
    # by validation loss (the Trainer default) - confirm that is intended.
    # --- device and input handling ---
    no_cuda=False,
    # presumably disabled because the dataset already yields tensors on `device`
    dataloader_pin_memory=False,
    # presumably needed so the Trainer finds the labels of the PEFT-wrapped model
    label_names=["labels"],
)

# Metric callback handed to the Trainer.
def compute_metrics(eval_pred):
    """Score one evaluation pass with the weighted F1 metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair; it is unpacked positionally.

    Returns:
        The result of ``f1_metric.compute`` for the arg-max predictions,
        using ``average="weighted"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return f1_metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )
    

# Wire the PEFT-wrapped model, the hyper-parameters, the train/validation
# splits and the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,  # training arguments
    model=model,  # model to train
    compute_metrics=compute_metrics,  # metric callback
    eval_dataset=dataset["validation"],  # validation split
    train_dataset=dataset["train"],  # training split
)

In [21]:
# Run the training loop configured above: 10 epochs, with evaluation and a
# checkpoint after every epoch. The returned TrainOutput is the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.5356,0.521923,0.798849
2,0.5226,0.507931,0.8087
3,0.4956,0.504487,0.810562
4,0.4858,0.496228,0.810801
5,0.4768,0.491256,0.809526
6,0.4635,0.487192,0.810751
7,0.4542,0.486137,0.815752
8,0.4522,0.487222,0.814514
9,0.4531,0.484713,0.814181
10,0.4523,0.483156,0.814824


TrainOutput(global_step=8460, training_loss=0.4772305066985723, metrics={'train_runtime': 3893.7573, 'train_samples_per_second': 69.455, 'train_steps_per_second': 2.173, 'total_flos': 1.48954305784896e+17, 'train_loss': 0.4772305066985723, 'epoch': 10.0})

In [22]:
# Evaluate on the held-out test split. The result is bound to a name so the
# logits, label ids and metrics can be reused by later cells without
# re-running inference; the trailing expression keeps the cell output as before.
test_output = trainer.predict(dataset["test"])
test_output

PredictionOutput(predictions=array([[ 2.6408105 ,  1.4095021 , -1.7025145 , -3.7913508 ],
       [ 1.1955377 , -1.013178  ,  0.3954386 , -1.9937655 ],
       [ 1.982578  ,  1.0721871 , -0.3338479 , -3.5502372 ],
       ...,
       [ 3.8960667 , -0.7571025 , -0.79035205, -3.175102  ],
       [ 1.5289737 ,  0.5212045 ,  0.37871343, -3.2820044 ],
       [-0.04937939,  2.91227   , -1.4072332 , -2.3378756 ]],
      dtype=float32), label_ids=array([1, 0, 0, ..., 0, 1, 1]), metrics={'test_loss': 0.4879555106163025, 'test_f1': 0.8136142085151359, 'test_runtime': 36.2537, 'test_samples_per_second': 164.563, 'test_steps_per_second': 10.289})

In [23]:
#  model_path = "./skil_save_3_lora"
#  trainer.save_model(model_path)