In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import evaluate
from sklearn.model_selection import train_test_split

# === Выбор оптимальной модели ===
model_name = "microsoft/deberta-v3-base"  # Альтернативы: "roberta-base", "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# === Загрузка данных ===
data_path = "cross_encoder_dataset.csv"
df = pd.read_csv(data_path)

# === Создание пар для обучения кросс-энкодера ===
reranker_data = []
for _, row in df.iterrows():
    reranker_data.append({"combined": f"{row['final_context']} [SEP] {row['question']} [SEP] {row['answer']}", "label": 1})
    reranker_data.append({"combined": f"{row['final_context']} [SEP] {row['question']} [SEP] {row['neg_answer']}", "label": 0})

# === Разбиение на train/valid ===
reranker_df = pd.DataFrame(reranker_data)
df_train, df_valid = train_test_split(reranker_df, test_size=0.15, random_state=42)

# === Создание DatasetDict для Hugging Face ===
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
    "valid": Dataset.from_pandas(df_valid.reset_index(drop=True)),
})

# === Оптимизированная токенизация ===
def preprocess_data(examples):
    return tokenizer(examples["combined"], truncation=True, max_length=512)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
encoded_dataset = dataset.map(preprocess_data, batched=True)
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format("torch")

# === Загрузка модели ===
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# === Определение метрик ===
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    preds = np.argmax(eval_pred.predictions, axis=1)
    acc_result = accuracy.compute(predictions=preds, references=eval_pred.label_ids)
    f1_result = f1.compute(predictions=preds, references=eval_pred.label_ids, average="weighted")
    return {"accuracy": acc_result["accuracy"], "f1": f1_result["f1"]}

# === Оптимальные параметры тренировки ===
training_args = TrainingArguments(
    output_dir="CrossEncoder_Optimized",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  # Увеличен размер батча
    per_device_eval_batch_size=16,
    learning_rate=2e-5,  # Оптимальная скорость обучения
    weight_decay=0.01,
    num_train_epochs=4,  # Снижено количество эпох
    warmup_ratio=0.1,
    optim="adamw_torch",
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    save_total_limit=1,
    push_to_hub=False,
    fp16=True,  # Включение 16-битных вычислений
    gradient_accumulation_steps=2,  # Увеличение эффективного размера батча
)

# === Запуск обучения ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# === Сохранение модели ===
model.save_pretrained("CrossEncoder_Optimized")
tokenizer.save_pretrained("CrossEncoder_Optimized")

print("Обучение завершено!")


In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [1]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import evaluate
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [2]:
# Загрузка триплетов
triplets_df = pd.read_pickle("/content/data/house_triplets.pkl")

# Создание данных для кросс-энкодера
reranker_data = []

for _, row in triplets_df.iterrows():
    # Добавление позитивного примера
    reranker_data.append({"combined": f"{row['anchor']} [SEP] {row['response']}", "label": 1})

    # Добавление негативного примера
    reranker_data.append({"combined": f"{row['anchor']} [SEP] {row['neg_response']}", "label": 0})

# Преобразование в DataFrame
reranker_df = pd.DataFrame(reranker_data)

# Сохранение
reranker_df.to_pickle("/content/data/scripts_for_reranker.pkl")

print(f"Создан датасет для кросс-энкодера: {len(reranker_df)} пар.")

Создан датасет для кросс-энкодера: 14834 пар.


In [3]:
# Настройки
model_name = "bert-base-uncased"
batch_size = 8
num_epochs = 8
learning_rate = 1e-5

# Загрузка данных
data_path = "/content/data/scripts_for_reranker.pkl"
df = pd.read_pickle(data_path)
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=42)

# Создание датасетов
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
        "valid": Dataset.from_pandas(df_valid.reset_index(drop=True)),
    }
)

In [4]:
# Токенизация
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(examples):
    return tokenizer(examples["combined"], truncation=True, padding=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

encoded_dataset = dataset.map(preprocess_data, batched=True)
encoded_dataset = encoded_dataset.remove_columns(["combined"])
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format("torch")


Map:   0%|          | 0/11867 [00:00<?, ? examples/s]

Map:   0%|          | 0/2967 [00:00<?, ? examples/s]

In [5]:
# === Определение модели ===
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# === Определение метрик ===
ACCURACY = evaluate.load("accuracy")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc_result = ACCURACY.compute(predictions=preds, references=p.label_ids)
    return {"accuracy": acc_result["accuracy"]}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Модель для векторизации
bi_encoder = SentenceTransformer("all-mpnet-base-v2")

# Кодирование данных для вычисления эмбеддингов
train_embeddings = bi_encoder.encode(df_train["combined"].tolist(), convert_to_numpy=True)
valid_embeddings = bi_encoder.encode(df_valid["combined"].tolist(), convert_to_numpy=True)

# Косинусное сходство между всеми эмбеддингами
cosine_similarities = cosine_similarity(train_embeddings, valid_embeddings)

# Cреднее и максимальное сходство
print(f"Среднее косинусное сходство между тренировочными и валидационными данными: {cosine_similarities.mean():.4f}")
print(f"Максимальное косинусное сходство: {cosine_similarities.max():.4f}")


Среднее косинусное сходство между тренировочными и валидационными данными: 0.2564
Максимальное косинусное сходство: 1.0000


In [7]:
# Проверка на одинаковые строки между тренировочным и валидационным наборами
intersection = set(df_train["combined"]).intersection(set(df_valid["combined"]))
print(f"Количество одинаковых примеров: {len(intersection)}")

Количество одинаковых примеров: 25


In [8]:
# Удаление одинаковых строки из валидационного набора
df_valid = df_valid[~df_valid["combined"].isin(df_train["combined"])]

# Проверка
intersection = set(df_train["combined"]).intersection(set(df_valid["combined"]))
print(f"Количество одинаковых примеров после удаления: {len(intersection)}")


Количество одинаковых примеров после удаления: 0


In [9]:
from transformers import EarlyStoppingCallback

# Ранняя остановка
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Параметры тренировки
training_args = TrainingArguments(
    output_dir=f"/content/RerankerModel_chat_bot",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # Оценка на каждую эпоху
    save_strategy="epoch",  # Сохранение на каждую эпоху
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.001,
    num_train_epochs=num_epochs,
    warmup_ratio=0.1,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,  # Загружаем лучшую модель в конце
    push_to_hub=False,
    report_to="none",
)

# Создаем тренера с ранней остановкой
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping]
)

trainer.train()

print("Обучение завершено!")


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5744,0.491759,0.752275
2,0.391,0.419127,0.803842
3,0.3105,0.657988,0.828109
4,0.1822,0.889713,0.830131


Обучение завершено!


In [9]:
# Сохранение модели локально
model_path = "/content/RerankerModel_chat_bot"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Загрузка в Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

model.push_to_hub("nikatonika/chatbot_reranker")
tokenizer.push_to_hub("nikatonika/chatbot_reranker")

print("Модель кросс-энкодера загружена в Hugging Face Hub!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Модель кросс-энкодера загружена в Hugging Face Hub!
