In [None]:
!pip -q install -U sentence-transformers peft bitsandbytes datasets==3.6.0

### Импорты и утилиты

In [None]:
import os, gc, random, math
import numpy as np

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

from peft import LoraConfig, get_peft_model, TaskType
from bitsandbytes.optim import AdamW8bit
from datasets import Dataset, load_dataset

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
def flush():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

### Подготовка датасета

In [None]:
def extract_pairs(ds, max_samples=None, seed=42):
    pairs = []
    for ex in ds:
        q = ex.get("question")
        pos = ex.get("context")
        pairs.append((q, pos))

    if seed is not None:
        random.seed(seed)
        random.shuffle(pairs)

    if max_samples is not None:
        pairs = pairs[:max_samples]
    return pairs


ds_train = load_dataset("kuznetsoffandrey/sberquad", split="train[:2000]")
ds_val = load_dataset("kuznetsoffandrey/sberquad", split="validation[:500]")

train_pairs = extract_pairs(ds_train)
val_pairs   = extract_pairs(ds_val)

print(f"Train pairs: {len(train_pairs)} | Val pairs: {len(val_pairs)}")
print("Sample train pair:", train_pairs[0])

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Train pairs: 2000 | Val pairs: 500
Sample train pair: ('где в основном российские метрополитены расположены', 'Кроме того, Максимом Горьким в Городе Жёлтого Дьявола было введено в русский язык слово-калька подземка . Оно прижилось, но преимущественно в качестве обозначения зарубежных метрополитенов (лондонская подземка, нью-йоркская подземка и т. д.), хотя в последнее время встречается в российской прессе и применительно к российским метрополитенам, проложенным в основном под землёй. Соответственно, преимущественно эстакадные метрополитены называют надземками , несмотря на то, что таких метрополитенов в России пока ещё нет.')


### Дообучение

In [None]:
flush()

base_name = "intfloat/multilingual-e5-small"
st_model = SentenceTransformer(base_name, device=device)

# Извлекаем базовый AutoModel
backbone = st_model[0].auto_model

# Включаем gradient checkpointing
if hasattr(backbone, "gradient_checkpointing_enable"):
    backbone.gradient_checkpointing_enable()

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value", "dense"],
    task_type=TaskType.FEATURE_EXTRACTION,
)
peft_backbone = get_peft_model(backbone, lora_cfg)
peft_backbone.print_trainable_parameters()

st_model[0].auto_model = peft_backbone

loss_fn = losses.MultipleNegativesRankingLoss(st_model)

trainable params: 1,339,392 || all params: 118,993,152 || trainable%: 1.1256


In [None]:
def embed(texts, model, batch_size=128, normalize=True):
    vectors = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=normalize,
        device=device,
        show_progress_bar=False,
    )
    return vectors

queries = [q for q,_ in val_pairs]
docs    = [d for _,d in val_pairs]

q_vecs = embed(queries, st_model)
d_vecs = embed(docs, st_model)

sims = np.matmul(q_vecs, d_vecs.T)
k = min(5, sims.shape[1])
topk_idx = np.argpartition(-sims, kth=k-1, axis=1)[:, :k]

true_idx = np.arange(len(val_pairs))
hits = (topk_idx == true_idx[:, None]).any(axis=1)
hit5 = hits.mean()
print(f"Hit@5: {hit5:.3f}")

Hit@5: 0.980


In [None]:
train_data = [
    {
        "anchor": q,
        "positive": d
    }
    for q, d in train_pairs
]
train_ds = Dataset.from_list(train_data)

val_data = [
    {
        "anchor": q,
        "positive": d
    }
    for q, d in val_pairs
]
val_ds = Dataset.from_list(val_data)

In [None]:
epochs = 5
batch_size = 32
gradient_accumulation_steps = 4
max_steps_cap = 120
warmup_ratio = 0.05

steps_per_epoch = min(math.ceil(len(train_ds) / batch_size), max_steps_cap)
total_steps = steps_per_epoch * epochs

loss_fn = losses.MultipleNegativesRankingLoss(st_model)

training_args = SentenceTransformerTrainingArguments(
    output_dir="st-encoder-qlora-out",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=2e-4,
    warmup_ratio=warmup_ratio,
    num_train_epochs=epochs,
    max_steps=total_steps,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=50,
    report_to="none",
    optim="paged_adamw_8bit",
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    seed=42,
)

trainer = SentenceTransformerTrainer(
    model=st_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=loss_fn,
)

trainer.train()

st_model.save("st-encoder-qlora-out/final_model")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
50,0.1937,0.039785
100,0.1447,0.035343
150,0.125,0.035241
200,0.1015,0.03462
250,0.1013,0.034605
300,0.0969,0.034676


In [None]:
q_vecs_after = embed(queries, st_model)
d_vecs_after = embed(docs, st_model)

sims_after = np.matmul(q_vecs_after, d_vecs_after.T)

k = min(5, sims_after.shape[1])
topk_idx = np.argpartition(-sims_after, kth=k-1, axis=1)[:, :k]

true_idx = np.arange(len(val_pairs))
hits = (topk_idx == true_idx[:, None]).any(axis=1)
hit5 = hits.mean()

print(f"Hit@5: {hit5:.3f}")

Hit@5: 0.986
