In [None]:
pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import os
import warnings

import evaluate
import numpy as np
import pandas as pd
import torch
import wandb
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

warnings.filterwarnings("ignore", category=FutureWarning)

In [9]:
# === Load the dataset ===
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data_path = "data/reranker_dataset.pkl"
df = pd.read_pickle(data_path)

# === Start the Weights & Biases run ===
wandb.init(
    project="CrossEncoder_Optimized",
    name="training_run",
    sync_tensorboard=True,
)

# === Choose the base checkpoint and load its tokenizer ===
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
# === Reproducibility ===
# The classifier and pooler weights are not part of the checkpoint and are randomly
# initialised by from_pretrained, so seed the RNGs before the model is built.
# 42 matches the random_state of the train/valid split and the TrainingArguments default seed.
set_seed(42)

# === Model configuration with dropout overrides ===
config = AutoConfig.from_pretrained(
    model_name,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)

# === Build the sequence-classification model from the checkpoint ===
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)


# === Metrics ===
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (scores that are arg-maxed
            over axis 1 to obtain class ids) and ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids

    scores = {}
    scores["accuracy"] = accuracy.compute(
        predictions=predicted, references=references
    )["accuracy"]
    # Weighted average: per-class F1 weighted by class support.
    scores["f1"] = f1.compute(
        predictions=predicted, references=references, average="weighted"
    )["f1"]
    return scores

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# === Train/validation split ===
# Hold out 15% of the rows for validation; random_state fixes the shuffle.
df_train, df_valid = train_test_split(df, test_size=0.15, random_state=42)

# === Wrap both splits in a Hugging Face DatasetDict ===
# reset_index(drop=True) discards the shuffled pandas index before conversion.
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
    "valid": Dataset.from_pandas(df_valid.reset_index(drop=True)),
})

# === Tokenization ===
def preprocess_data(examples):
    """Tokenize the ``"combined"`` field of a batch, truncating to at most 512 tokens."""
    texts = examples["combined"]
    return tokenizer(texts, max_length=512, truncation=True)

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits in batches, then expose the target column as "labels".
encoded_dataset = (
    dataset
    .map(preprocess_data, batched=True)
    .rename_column("label", "labels")
)
encoded_dataset.set_format("torch")

# === Training configuration ===
training_args = TrainingArguments(
    output_dir="CrossEncoder_Optimized",
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated (requires transformers >= 4.41).
    eval_strategy="epoch",
    save_strategy="epoch",  # same schedule as evaluation, as load_best_model_at_end needs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Reduced learning rate.
    # NOTE(review): 3e-7 is very small for fine-tuning; in the recorded run the training
    # loss only moves from ~0.699 to ~0.646 over 10 epochs — consider revisiting this value.
    learning_rate=3e-7,
    weight_decay=0.01,
    num_train_epochs=10,  # Upper bound on epochs; early stopping decides when to stop
    warmup_ratio=0.05,
    optim="adamw_torch",
    lr_scheduler_type="linear",
    logging_dir="./wandb",
    logging_steps=50,
    report_to="wandb",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
    gradient_accumulation_steps=2,  # effective batch = 16 * 2 per device
    gradient_checkpointing=False,
    metric_for_best_model="eval_loss",  # Select the best checkpoint by validation loss
    greater_is_better=False,  # Lower loss is better
)


Map:   0%|          | 0/12608 [00:00<?, ? examples/s]

Map:   0%|          | 0/2226 [00:00<?, ? examples/s]

In [None]:
# === Build the Trainer ===
# Early stopping halts training after 2 consecutive evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# === Baseline evaluation before any training ===
eval_results = trainer.evaluate()
print("\nПоказатели модели перед обучением:")
for key, value in eval_results.items():
    print("{}: {:.6f}".format(key, value))




Показатели модели перед обучением:
eval_loss: 0.693656
eval_model_preparation_time: 0.003300
eval_accuracy: 0.513926
eval_f1: 0.348921
eval_runtime: 7.486100
eval_samples_per_second: 297.351000
eval_steps_per_second: 18.701000


In [None]:
# === Run training ===
# The call is the last expression of the cell, so its return value is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1
1,0.6991,0.692865,0.0033,0.513926,0.348921
2,0.6956,0.691273,0.0033,0.521114,0.366269
3,0.6911,0.688556,0.0033,0.690027,0.686211
4,0.6882,0.683082,0.0033,0.691375,0.677121
5,0.6839,0.672095,0.0033,0.64735,0.611458
6,0.6699,0.658817,0.0033,0.660377,0.627463
7,0.6632,0.647404,0.0033,0.661725,0.62818
8,0.6498,0.638177,0.0033,0.677448,0.650138
9,0.6459,0.633967,0.0033,0.675651,0.647615
10,0.6456,0.632074,0.0033,0.679245,0.652462


TrainOutput(global_step=3940, training_loss=0.6751836597616903, metrics={'train_runtime': 1525.1911, 'train_samples_per_second': 82.665, 'train_steps_per_second': 2.583, 'total_flos': 1.2375699762039936e+16, 'train_loss': 0.6751836597616903, 'epoch': 10.0})

In [None]:
# === Save the model ===
# Write the model and the tokenizer into the "CrossEncoder_Optimized" directory.
model.save_pretrained("CrossEncoder_Optimized")
tokenizer.save_pretrained("CrossEncoder_Optimized")

print("Обучение завершено!")

# Close the Weights & Biases run that was opened with wandb.init.
wandb.finish()

Обучение завершено!


0,1
eval/accuracy,▁▇▇██
eval/f1,▁█▇██
eval/loss,█▁▆▅▃
eval/model_preparation_time,▁▁▁▁▁
eval/runtime,█▂▁▁▁
eval/samples_per_second,▁▇███
eval/steps_per_second,▁▇███
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▁▁▂▂▄▃▂▂▂▄▆▃▅▄█▃▆▄▅▅▂▄▅▅▅▇▃▄▃

0,1
eval/accuracy,0.75606
eval/f1,0.7447
eval/loss,0.63419
eval/model_preparation_time,0.0053
eval/runtime,6.5699
eval/samples_per_second,338.82
eval/steps_per_second,21.309
total_flos,4958178311455296.0
train/epoch,4.0
train/global_step,1576.0


In [10]:
# === Authenticate with the Hugging Face Hub ===
# Shows an interactive login widget, so the access token is entered at run time
# rather than stored in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Save the model and tokenizer locally
# NOTE(review): absolute path under /content — presumably a Colab runtime; confirm before running elsewhere.
model_path = "/content/RerankerModel_chat_bot"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Upload the model and the tokenizer to the same Hub repository.
model.push_to_hub("nikatonika/chatbot_reranker")
tokenizer.push_to_hub("nikatonika/chatbot_reranker")

print("Модель кросс-энкодера загружена в Hugging Face Hub!")

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Модель кросс-энкодера загружена в Hugging Face Hub!
