# In [None]:  (Jupyter cell marker left over from the notebook export)
# =============================================================================
# UNIVERSAL TEMPLATE: LoRA + RAG fine-tuning of Qwen-3-14B on tabular data
# + prediction on the test set and saving the submission
# Fully commented: earlier comments were restored where possible, new ones follow the same short, informal style
# =============================================================================

from datasets import load_dataset, Dataset, DatasetDict
from unsloth import FastLanguageModel
import torch
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import gc
import os

# =============================================================================
# 1. MAIN SETTINGS
# =============================================================================
# Directory that receives the trained model and tokenizer.
OUTPUT_DIR = "./qwen3-14b-rag-finetuned"
# Checkpoint name handed to FastLanguageModel.from_pretrained.
MODEL_NAME = "unsloth/Qwen3-14B-unsloth-bnb-4bit"
# Maximum sequence length used for model loading and for prompt truncation.
MAX_SEQ_LENGTH = 8192

# CSV inputs; the test file is optional and is only read when it exists.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
# Name of the column the model is trained to predict.
TARGET_COLUMN = "target"
TEXT_COLUMNS = None          # None = keep every column of the CSV as-is
K_RETRIEVAL = 5              # how many nearest neighbours to retrieve

# =============================================================================
# 2. Model loading + LoRA
# =============================================================================
# Load the base model and its tokenizer through Unsloth, requesting 4-bit
# loading with bfloat16 as dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = torch.bfloat16,
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

# Wrap the model with LoRA adapters (rank 64, alpha 128, dropout 0.05) on the
# listed projection modules; random_state fixes the adapter initialisation seed.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 128,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
)

# =============================================================================
# 3. Data loading
# =============================================================================
train_df = pd.read_csv(TRAIN_PATH)

# The test set is optional: read it only when a path is configured and the
# file is actually present, otherwise carry None through the rest of the script.
if TEST_PATH and os.path.exists(TEST_PATH):
    test_df = pd.read_csv(TEST_PATH)
else:
    test_df = None

# With an explicit feature list, narrow both frames to those columns
# (the training frame additionally keeps the target).
if TEXT_COLUMNS is not None:
    cols = TEXT_COLUMNS + [TARGET_COLUMN]
    train_df = train_df[cols]
    if test_df is not None:
        test_df = test_df[TEXT_COLUMNS]

# =============================================================================
# 4. RAG part
# =============================================================================
# Sentence-embedding model used to embed row texts for neighbour retrieval.
retriever = SentenceTransformer('all-MiniLM-L6-v2')

def row_to_text(row, include_target=False):
    """Render one row as a ``"col: value, col: value"`` string.

    Args:
        row: Any object whose ``.items()`` yields ``(column, value)`` pairs.
        include_target: When false (the default) the column named
            ``TARGET_COLUMN`` is left out of the text.

    Returns:
        The comma-separated text. Floats are printed with at most six
        decimals, with trailing zeros and a dangling decimal point removed;
        every other value goes through ``str()``.
    """
    def _render(value):
        if isinstance(value, float):
            return f"{value:.6f}".rstrip("0").rstrip(".")
        return str(value)

    return ", ".join(
        f"{name}: {_render(value)}"
        for name, value in row.items()
        if include_target or name != TARGET_COLUMN
    )

# Build one text document per training row (row_to_text is called with its
# default include_target=False, so the target is left out) and embed them all.
print("Создаём корпус и эмбеддинги (может занять 5-15 минут)...")
corpus = [row_to_text(row) for _, row in tqdm(train_df.iterrows(), total=len(train_df))]
corpus_embeddings = retriever.encode(corpus, batch_size=128, show_progress_bar=True, convert_to_numpy=True)

# Cosine-metric nearest-neighbour index over the row embeddings.
# NOTE(review): the corpus texts carry no target value, so the "retrieved
# examples" placed into prompts have no labels, and a training row retrieves
# itself as its closest neighbour — confirm whether this is intended.
nn = NearestNeighbors(n_neighbors=K_RETRIEVAL, metric='cosine')
nn.fit(corpus_embeddings)

def retrieve(query_text, k=K_RETRIEVAL):
    """Return the corpus texts of the ``k`` rows closest to ``query_text``.

    Args:
        query_text: Row text to embed and look up in the neighbour index.
        k: Number of neighbours to return; capped at the corpus size.

    Returns:
        A list of corpus strings in the order returned by the index
        (closest first). Empty when ``k`` is not positive or the corpus
        is empty.
    """
    n_neighbors = min(k, len(corpus))
    if n_neighbors <= 0:
        return []

    q_emb = retriever.encode([query_text], convert_to_numpy=True)
    # Pass the neighbour count explicitly: without it kneighbors() falls back
    # to the value the index was constructed with and ``k`` is silently ignored.
    _, indices = nn.kneighbors(q_emb, n_neighbors=n_neighbors)
    return [corpus[i] for i in indices[0]]

# =============================================================================
# 5. RAG formatting (processes rows in batches to keep memory usage down)
# =============================================================================
def apply_rag(examples, is_train=True):
    """Turn a batch of dataset rows into tokenized RAG prompts.

    Args:
        examples: Batch passed by ``Dataset.map(batched=True)``. It must
            contain the ``"__index__"`` column with positional row numbers
            into ``train_df`` (training) or ``test_df`` (inference).
        is_train: When true, the target value is appended to each prompt as
            the completion and ``labels`` are produced.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` (lists of
        variable-length integer lists). For training it also holds ``labels``
        of exactly the same per-example length as ``input_ids``: ``-100`` over
        the prompt tokens and the real token ids over the answer (plus EOS when
        the tokenizer defines one). Padding is left to the data collator.
    """
    frame = train_df if is_train else test_df
    prompts = []
    answers = [] if is_train else None

    # Rows are looked up by the positional index column written when the
    # Dataset was built, never by a feature value.
    for idx in examples["__index__"]:
        row = frame.iloc[idx]

        query = row_to_text(row, include_target=False)
        retrieved = retrieve(query)
        retrieved_text = "\n".join(f"- {doc}" for doc in retrieved)

        # NOTE(review): these role markers are parsed back by the inference
        # code, so the template must only be changed together with it.
        prompt = f"""<|system|>
You are a helpful assistant. Predict the target value using features and k nearest neighbors examples.
<|end|>
<|user|>
Retrieved examples:
{retrieved_text}

Current row:
{query}
Predict only the number (or class), nothing else.
<|end|>
<|assistant|>
"""

        prompts.append(prompt)
        if is_train:
            answers.append(str(row[TARGET_COLUMN]))

    if not prompts:
        empty = {"input_ids": [], "attention_mask": []}
        if is_train:
            empty["labels"] = []
        return empty

    # Tokenize the whole batch at once; padding is deferred to the collator.
    tokenized = tokenizer(
        prompts,
        padding=False,
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_attention_mask=True,
    )

    if not is_train:
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
        }

    # Answers are tokenized separately and without special tokens so they can
    # be concatenated directly after the prompt tokens.
    tokenized_answers = tokenizer(
        answers,
        add_special_tokens=False,
        padding=False,
        truncation=True,
        max_length=256,
    )
    eos_id = tokenizer.eos_token_id
    eos_tail = [] if eos_id is None else [eos_id]

    all_input_ids = []
    all_attention = []
    all_labels = []
    for prompt_ids, answer_ids in zip(tokenized["input_ids"], tokenized_answers["input_ids"]):
        completion = list(answer_ids) + eos_tail
        # Reserve room for the completion; if the prompt is too long, drop its
        # beginning so the current row and the assistant marker survive.
        room = max(MAX_SEQ_LENGTH - len(completion), 0)
        prompt_ids = list(prompt_ids)
        if len(prompt_ids) > room:
            prompt_ids = prompt_ids[len(prompt_ids) - room:]

        # The model has to see the answer tokens in input_ids; the loss is
        # masked (-100) everywhere except on the completion.
        all_input_ids.append(prompt_ids + completion)
        all_attention.append([1] * (len(prompt_ids) + len(completion)))
        all_labels.append([-100] * len(prompt_ids) + completion)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention,
        "labels": all_labels,
    }

# =============================================================================
# 6. Dataset preparation
# =============================================================================
# Expose each frame's index as an explicit "__index__" column so it travels
# with the rows into the Hugging Face Dataset.
train_ds = Dataset.from_pandas(
    train_df.reset_index().rename(columns={"index": "__index__"})
)
if test_df is None:
    test_ds = None
else:
    test_ds = Dataset.from_pandas(
        test_df.reset_index().rename(columns={"index": "__index__"})
    )

print("Форматируем трейн с RAG (это займёт время)...")
# Rows are formatted in batches of 16; the original columns are dropped so
# only the fields returned by apply_rag remain.
train_formatted = train_ds.map(
    apply_rag,
    fn_kwargs={"is_train": True},
    batched=True,
    batch_size=16,
    remove_columns=train_ds.column_names,
)

if test_ds is not None:
    print("Форматируем тест с RAG...")
    test_formatted = test_ds.map(
        apply_rag,
        fn_kwargs={"is_train": False},
        batched=True,
        batch_size=16,
        remove_columns=test_ds.column_names,
    )

# Hold out 10% of the formatted training data for validation (fixed seed).
split = train_formatted.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split["train"], split["test"]

# =============================================================================
# 7. TrainingArguments + Trainer
# =============================================================================
# Mixed precision has to match the dtype the model runs in: use bf16 when the
# GPU supports it and fall back to fp16 otherwise, instead of forcing fp16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=10,
    eval_steps=200,
    # load_best_model_at_end requires the save interval to be a round multiple
    # of the evaluation interval; 500 was rejected against eval_steps=200.
    save_steps=400,
    warmup_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=not use_bf16,
    bf16=use_bf16,
    dataloader_pin_memory=False,
    report_to="none",
    ddp_find_unused_parameters=False,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",               # paged 8-bit AdamW optimizer
)

# Pads every batch to its longest sequence, rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=8,
    padding="longest",
    return_tensors="pt",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Старт обучения — поехали!")
trainer.train()

# Persist the trained model together with the tokenizer.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Модель сохранена в {OUTPUT_DIR}")

# =============================================================================
# 8. Inference on the test set (runs in batches)
# =============================================================================
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

@torch.no_grad()
def predict_batch(batch):
    """Generate one prediction per tokenized prompt in ``batch``.

    Args:
        batch: Mapping with ``"input_ids"`` and ``"attention_mask"``, each a
            list of integer lists. The prompts may have different lengths.

    Returns:
        ``{"prediction": [...]}`` with one entry per prompt: a ``float`` when
        the generated text parses as a number (thousands separators removed),
        otherwise the stripped generated string.
    """
    sequences = batch["input_ids"]
    masks = batch["attention_mask"]
    if not sequences:
        return {"prediction": []}

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Prompts are ragged, so they cannot be stacked directly. Left-pad them to
    # a common width: with left padding every prompt ends at the same column,
    # which is what decoder-only generation needs and lets the continuation
    # be sliced off uniformly below.
    width = max(len(ids) for ids in sequences)
    input_ids = torch.full((len(sequences), width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((len(sequences), width), dtype=torch.long)
    for row, (ids, mask) in enumerate(zip(sequences, masks)):
        if ids:
            input_ids[row, width - len(ids):] = torch.tensor(ids, dtype=torch.long)
            attention_mask[row, width - len(mask):] = torch.tensor(mask, dtype=torch.long)

    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Greedy decoding; no temperature is passed because it has no effect
    # when sampling is disabled.
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=pad_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens instead of re-parsing the prompt.
    texts = tokenizer.batch_decode(generated[:, width:], skip_special_tokens=True)

    preds = []
    for text in texts:
        # Cut at the first end-of-turn marker in case it was emitted as text.
        pred = text.split("<|end|>")[0].split("<|im_end|>")[0].strip()
        try:
            pred = float(pred.replace(",", ""))
        except ValueError:
            pass  # not a number (classification) — keep the string
        preds.append(pred)
    return {"prediction": preds}

if test_ds is not None:
    print("Генерация предсказаний на тесте...")
    # Collect predictions in a plain Python list. predict_batch may return a
    # float for one row and a string for another; a Dataset.map column is
    # typed and fails on such a mix, so batching is done by hand here.
    infer_batch_size = 8                  # tune to the available GPU memory
    predictions = []
    for start in tqdm(range(0, len(test_formatted), infer_batch_size)):
        batch = test_formatted[start:start + infer_batch_size]
        predictions.extend(predict_batch(batch)["prediction"])

    if len(predictions) != len(test_df):
        raise RuntimeError(
            f"Expected {len(test_df)} predictions, got {len(predictions)}"
        )

    # Kept under the original name and key for any later code that reads it.
    test_preds = {"prediction": predictions}

    submission = pd.DataFrame({
        "id": test_df.index,              # replace with the real id column if there is one
        TARGET_COLUMN: test_preds["prediction"]
    })

    submission.to_csv("submission.csv", index=False)
    print("submission.csv сохранён — готово, братан!")
else:
    print("Тестового файла нет — инференс пропущен")