# AutoRAG-style RAG Pipeline

Этот ноутбук — автоконфигурируемый пайплайн в стиле AutoRAG: chunking → индекс → hybrid retrieval → cross-encoder rerank → LLM (Qwen) → submission.

## Block 0. Setup & конфиг

In [None]:
!pip install qdrant_client rank-bm25 sentence-transformers datasets nltk langchain --quiet

In [None]:
import os
import random
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import pipeline
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, CrossEncoder
from qdrant_client import QdrantClient, models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi

import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data needed by nltk.word_tokenize. Older NLTK releases read the
# "punkt" resource, newer ones look up "punkt_tab" instead, so request both to
# keep the notebook working on either. nltk.download reports an unknown
# resource name by returning False rather than raising.
nltk.download("punkt")
nltk.download("punkt_tab")

# Seed the Python, NumPy and torch (CPU + all CUDA devices) RNGs with one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# --- Data location ---
DATA_DIR = Path("/kaggle/input")
DOC_SOURCE = "hf"  # "hf" or "local"

# Hugging Face dataset id and splits; read by load_docs_and_qa when DOC_SOURCE == "hf".
HF_DATASET_NAME = "your/dataset"
HF_SPLIT_DOCS = "train"
HF_SPLIT_QA = "validation"

# Directory scanned for *.txt documents when DOC_SOURCE is not "hf".
RAW_DOCS_DIR = DATA_DIR / "docs"

# --- Chunking (default arguments of make_recursive_chunks) ---
# NOTE(review): sizes are presumably measured in characters — confirm against the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 250
MIN_CHARS = 50

# --- Retrieval ---
# TOP_K_RETRIEVAL: candidates requested from the first-stage retriever.
# TOP_K_FINAL: candidates kept after cross-encoder reranking.
TOP_K_RETRIEVAL = 10
TOP_K_FINAL = 5

# --- Models ---
EMBED_MODEL_ID = "intfloat/multilingual-e5-large"
CROSS_ENCODER_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# --- Generation (passed to the text-generation pipeline in llm_answer) ---
GEN_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/1.5b-instruct/1"
MAX_NEW_TOKENS = 256
TEMP = 0.2
TOP_P = 0.9

# --- Vector store ---
QDRANT_COLLECTION = "autorag_like_rag"
# Initial value only: EMBED_DIM is reassigned from a probe embedding once the
# embedding model has been loaded.
EMBED_DIM = 1024

## Block 1. Загрузка данных (corpus + QA)

In [None]:
def load_docs_and_qa() -> Tuple[List[Dict[str, Any]], pd.DataFrame]:
    """Load the document corpus and the QA table from the configured source.

    When ``DOC_SOURCE == "hf"`` both the documents and the QA table come from
    ``HF_DATASET_NAME`` (splits ``HF_SPLIT_DOCS`` / ``HF_SPLIT_QA``); the
    document text is read from the ``content`` field and every other field is
    kept as metadata. Otherwise every ``*.txt`` file in ``RAW_DOCS_DIR`` is
    read (in sorted filename order) and the QA table is read from
    ``DATA_DIR / "qa.csv"``.

    Documents whose text is missing or whitespace-only are skipped in both
    modes. ``doc_id`` is the position of the row/file in the source, so ids
    may have gaps where documents were skipped.

    Returns:
        A ``(docs, qa_df)`` tuple. ``docs`` is a list of dicts with the keys
        ``doc_id``, ``text`` and ``meta``; ``qa_df`` is a pandas DataFrame.
    """
    docs: List[Dict[str, Any]] = []

    if DOC_SOURCE == "hf":
        dataset_docs = load_dataset(HF_DATASET_NAME, split=HF_SPLIT_DOCS)
        for i, row in enumerate(dataset_docs):
            content = row.get("content")
            # A missing/None value must stay empty: str(None) would yield the
            # literal text "None" and slip through the emptiness check below.
            text = "" if content is None else str(content)
            if not text.strip():
                continue
            meta = {k: v for k, v in row.items() if k != "content"}
            docs.append({"doc_id": i, "text": text, "meta": meta})

        dataset_qa = load_dataset(HF_DATASET_NAME, split=HF_SPLIT_QA)
        qa_df = dataset_qa.to_pandas()
    else:
        for i, p in enumerate(sorted(RAW_DOCS_DIR.glob("*.txt"))):
            text = p.read_text(encoding="utf-8", errors="ignore")
            # Same rule as the "hf" branch: blank documents are not indexed.
            if not text.strip():
                continue
            docs.append({"doc_id": i, "text": text, "meta": {"filename": p.name}})
        qa_df = pd.read_csv(DATA_DIR / "qa.csv")

    print(f"Loaded {len(docs)} docs, QA shape = {qa_df.shape}")
    return docs, qa_df


# Load the corpus and the QA table once; `docs` feeds the chunking cell below.
docs, qa_df = load_docs_and_qa()

## Block 2. Chunking

In [None]:
def make_recursive_chunks(
    docs: List[Dict[str, Any]],
    chunk_size: int = CHUNK_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
    min_chars: int = MIN_CHARS
) -> List[Dict[str, Any]]:
    """Split every document into overlapping chunks.

    Each document's ``text`` is split with a ``RecursiveCharacterTextSplitter``
    configured with the given size/overlap and the separator cascade
    paragraph -> line -> sentence -> word -> character. Pieces whose stripped
    length is below ``min_chars`` are dropped.

    Args:
        docs: Dicts with at least ``doc_id`` and ``text``; ``meta`` is optional.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        min_chars: Minimum stripped length a piece needs to be kept.

    Returns:
        A list of dicts with the keys ``doc_id``, ``chunk_id``
        (``"<doc_id>_<position>"``, where the position counts dropped pieces
        too), ``text`` and ``meta``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )

    collected: List[Dict[str, Any]] = []
    for document in tqdm(docs, desc="Chunking docs (recursive)"):
        parent_id = document["doc_id"]
        pieces = text_splitter.split_text(document["text"])
        collected.extend(
            {
                "doc_id": parent_id,
                "chunk_id": f"{parent_id}_{position}",
                "text": piece,
                "meta": document.get("meta", {}),
            }
            for position, piece in enumerate(pieces)
            if len(piece.strip()) >= min_chars
        )

    print("Total chunks:", len(collected))
    return collected


# Module-level chunk list; the indexing, BM25 and lookup cells below all read it.
chunks = make_recursive_chunks(docs)

## Block 3. Эмбеддинги + Qdrant + BM25

In [None]:
# Sentence-embedding model shared by chunk indexing and query encoding.
embedding_model = SentenceTransformer(
    EMBED_MODEL_ID,
    device=str(device),
)
embedding_model.eval()

# Encode a probe string to read the vector size from the model itself; this
# replaces the EMBED_DIM value set in the config cell.
test_vec = embedding_model.encode(["test"], normalize_embeddings=True)
EMBED_DIM = test_vec.shape[1]
print("Embedding dim:", EMBED_DIM)

# NOTE(review): ":memory:" presumably selects an in-process, non-persistent
# Qdrant instance — confirm against the qdrant-client docs.
client = QdrantClient(":memory:")

# Create the collection with cosine distance and the probed vector size.
# NOTE(review): recreate_collection presumably drops an existing collection of
# the same name first and is reportedly deprecated in newer qdrant-client
# releases — confirm for the installed version.
client.recreate_collection(
    collection_name=QDRANT_COLLECTION,
    vectors_config=models.VectorParams(
        size=EMBED_DIM,
        distance=models.Distance.COSINE,
    ),
    on_disk_payload=False,
)
print("Qdrant collection created:", QDRANT_COLLECTION)

In [None]:
def index_chunks_qdrant(chunks: List[Dict[str, Any]]) -> None:
    """Embed every chunk and upsert it into the Qdrant collection.

    Point ids are the positions of the chunks in ``chunks``. The payload holds
    the chunk's metadata plus ``doc_id`` and ``chunk_id``; the two id fields
    are written last so a metadata field with the same name can never
    overwrite them (the retrievers use them to look the chunk text up again).

    Points are sent in fixed-size batches instead of one request so a large
    corpus does not have to be serialised in a single call. An empty ``chunks``
    list is a no-op.

    NOTE(review): the configured E5 embedding model reportedly expects
    "passage: " / "query: " prefixes on its inputs; none are added here or in
    semantic_search — confirm against the model card and, if needed, change
    indexing and querying together.

    Args:
        chunks: Dicts with ``doc_id``, ``chunk_id``, ``text`` and optional ``meta``.
    """
    if not chunks:
        print("Indexed to Qdrant:", 0)
        return

    texts = [c["text"] for c in chunks]
    vectors = embedding_model.encode(
        texts,
        batch_size=64,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype(np.float32)

    points = [
        models.PointStruct(
            id=i,
            vector=vector.tolist(),
            payload={
                **(ch.get("meta") or {}),
                # Written after the metadata on purpose: these keys must win.
                "doc_id": ch["doc_id"],
                "chunk_id": ch["chunk_id"],
            },
        )
        for i, (ch, vector) in enumerate(zip(chunks, vectors))
    ]

    upsert_batch_size = 256
    for start in range(0, len(points), upsert_batch_size):
        client.upsert(
            collection_name=QDRANT_COLLECTION,
            points=points[start:start + upsert_batch_size],
        )
    print("Indexed to Qdrant:", len(points))


# Populate the Qdrant collection; point ids are positions in `chunks`.
index_chunks_qdrant(chunks)

In [None]:
# Tokenize every chunk once and build the BM25 index. Positions in
# `tokenized_chunks` match positions in `chunks`, which is what lets
# bm25_search map a score index back to chunks[idx].
# NOTE(review): tokens are not lower-cased here, and word_tokenize presumably
# defaults to English rules — confirm this suits the corpus language. The
# query side in bm25_search must use the same tokenization.
tokenized_chunks = [word_tokenize(c["text"]) for c in chunks]
bm25 = BM25Okapi(tokenized_chunks)
print("BM25 corpus ready:", len(tokenized_chunks))

## Block 4. Ретриверы (semantic / BM25 / hybrid)

In [None]:
def semantic_search(query: str, top_k: int = TOP_K_RETRIEVAL):
    """Return the ``top_k`` nearest chunks to ``query`` from the Qdrant collection.

    The query is encoded with the embedding model (normalized, float32) and
    searched against ``QDRANT_COLLECTION``.

    NOTE(review): the configured E5 embedding model reportedly expects
    "query: " / "passage: " prefixes on its inputs; none are added here or at
    indexing time — confirm against the model card and, if needed, change
    both sides together.

    Returns:
        A list of dicts with the keys ``score_semantic``, ``chunk_id`` and
        ``doc_id``, in the order the client returned the hits.
    """
    encoded = embedding_model.encode([query], normalize_embeddings=True)
    query_vec = encoded.astype(np.float32)[0].tolist()

    hits = client.search(
        collection_name=QDRANT_COLLECTION,
        query_vector=query_vec,
        limit=top_k,
    )
    return [
        {
            "score_semantic": hit.score,
            "chunk_id": hit.payload.get("chunk_id"),
            "doc_id": hit.payload.get("doc_id"),
        }
        for hit in hits
    ]


def bm25_search(query: str, top_k: int = TOP_K_RETRIEVAL):
    """Return the ``top_k`` chunks with the highest BM25 score for ``query``.

    The query is tokenized with ``word_tokenize`` and scored against the
    module-level ``bm25`` index; score positions are mapped back to ``chunks``.

    Returns:
        A list of dicts with the keys ``score_bm25``, ``chunk_id`` and
        ``doc_id``, sorted by descending score (ties keep corpus order).
    """
    query_tokens = word_tokenize(query)
    ranked = sorted(
        enumerate(bm25.get_scores(query_tokens)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        {
            "score_bm25": float(score),
            "chunk_id": chunks[position]["chunk_id"],
            "doc_id": chunks[position]["doc_id"],
        }
        for position, score in ranked[:top_k]
    ]


def hybrid_search(query: str, alpha: float = 0.5, top_k: int = TOP_K_RETRIEVAL):
    """Fuse semantic and BM25 retrieval into one ranked candidate list.

    Both retrievers are asked for ``top_k * 2`` candidates. Each score is
    divided by the best score of its retriever and the two normalized scores
    are blended as ``alpha * semantic + (1 - alpha) * bm25``; a chunk missing
    from one retriever contributes 0 for that side.

    When a retriever's best score is not positive (e.g. BM25 returns all
    zeros because no query token occurs in the corpus) its scores are left
    unscaled. Dividing would either raise ``ZeroDivisionError`` or, for a
    negative maximum, invert that retriever's ranking.

    Args:
        query: Search query.
        alpha: Weight of the semantic score; ``1 - alpha`` goes to BM25.
        top_k: Number of fused results to return.

    Returns:
        Up to ``top_k`` dicts with the keys ``score_hybrid``, ``doc_id`` and
        ``chunk_id``, sorted by descending fused score.
    """
    sem = semantic_search(query, top_k=top_k * 2)
    bm = bm25_search(query, top_k=top_k * 2)

    sem_scores = {(r["doc_id"], r["chunk_id"]): r["score_semantic"] for r in sem}
    bm_scores = {(r["doc_id"], r["chunk_id"]): r["score_bm25"] for r in bm}

    max_sem = max(sem_scores.values(), default=0.0)
    max_bm = max(bm_scores.values(), default=0.0)
    # Only a positive maximum is a meaningful normalizer.
    sem_norm = max_sem if max_sem > 0 else 1.0
    bm_norm = max_bm if max_bm > 0 else 1.0

    # dict.fromkeys de-duplicates while keeping first-seen order, so ties in
    # the fused score are broken the same way on every run (a set would not
    # guarantee that).
    all_keys = dict.fromkeys([*sem_scores, *bm_scores])

    results = []
    for doc_id, chunk_id in all_keys:
        key = (doc_id, chunk_id)
        sem_s = sem_scores.get(key, 0.0) / sem_norm
        bm_s = bm_scores.get(key, 0.0) / bm_norm
        results.append(
            {
                "score_hybrid": float(alpha * sem_s + (1 - alpha) * bm_s),
                "doc_id": doc_id,
                "chunk_id": chunk_id,
            }
        )

    results.sort(key=lambda x: x["score_hybrid"], reverse=True)
    return results[:top_k]

## Block 5. Cross-encoder reranker

In [None]:
# Cross-encoder that rerank_with_cross_encoder uses to score (query, chunk) pairs.
# NOTE(review): the ms-marco MiniLM checkpoint is presumably trained on English
# data while the embedding model is multilingual — confirm it suits the corpus.
cross_encoder = CrossEncoder(CROSS_ENCODER_ID, device=str(device))


def get_chunks_by_ids(chunk_keys: List[Tuple[int, str]]) -> List[str]:
    """Return the chunk texts for the given ``(doc_id, chunk_id)`` keys, in order.

    The lookup table is built from the module-level ``chunks`` list on every
    call.

    Raises:
        KeyError: If a key does not correspond to any chunk.
    """
    text_by_key = {}
    for chunk in chunks:
        text_by_key[(chunk["doc_id"], chunk["chunk_id"])] = chunk["text"]
    return [text_by_key[key] for key in chunk_keys]


def rerank_with_cross_encoder(query: str, candidates: List[Dict[str, Any]], top_k: int = TOP_K_FINAL):
    """Re-score retrieval candidates with the cross-encoder and keep the best.

    Args:
        query: The user question.
        candidates: Dicts carrying at least ``doc_id`` and ``chunk_id``.
        top_k: Number of reranked candidates to return.

    Returns:
        Up to ``top_k`` dicts with the keys ``doc_id``, ``chunk_id`` and
        ``score_ce``, sorted by descending cross-encoder score. An empty
        candidate list yields an empty list without calling the model.
    """
    if not candidates:
        return []

    keys = [(cand["doc_id"], cand["chunk_id"]) for cand in candidates]
    pair_inputs = [(query, passage) for passage in get_chunks_by_ids(keys)]
    scores = cross_encoder.predict(pair_inputs)

    ranked = sorted(
        (
            {"doc_id": doc_id, "chunk_id": chunk_id, "score_ce": float(score)}
            for (doc_id, chunk_id), score in zip(keys, scores)
        ),
        key=lambda item: item["score_ce"],
        reverse=True,
    )
    return ranked[:top_k]

## Block 6. LLM-генерация (Qwen) + RAG-answer

In [None]:
# Text-generation pipeline for the local generator checkpoint. Runs on GPU 0
# in float16 when CUDA is available, otherwise on the CPU (-1) in float32.
generation_pipeline = pipeline(
    "text-generation",
    model=GEN_MODEL_PATH,
    device=0 if device.type == "cuda" else -1,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)

# Instruction block that build_prompt places at the very top of every prompt.
SYSTEM_PROMPT = """You are a helpful assistant that answers questions using the provided context.
Answer in the same language as the question.
If the answer cannot be found in the context, say that you don't know and avoid hallucinations.
"""


def build_prompt(query: str, context_chunks: List[str]) -> str:
    """Assemble the generation prompt: instructions, context, question, answer cue.

    The sections are separated by blank lines; context chunks are likewise
    joined with a blank line. The prompt ends with ``Answer:`` so the model
    continues with the answer text.
    """
    joined_context = "\n\n".join(context_chunks)
    context_section = f"Context:\n{joined_context}"
    question_section = f"Question:\n{query}"
    return "\n\n".join([SYSTEM_PROMPT, context_section, question_section, "Answer:"])


def llm_answer(query: str, context_chunks: List[str]) -> str:
    """Generate an answer to ``query`` grounded in ``context_chunks``.

    The prompt is built with ``build_prompt`` and sent to the text-generation
    pipeline with sampling enabled (``TEMP`` / ``TOP_P``) and at most
    ``MAX_NEW_TOKENS`` new tokens.

    Only the newly generated continuation is requested
    (``return_full_text=False``). Cutting the echoed prompt off by searching
    for the first "Answer:" marker is unreliable: a context chunk or question
    that itself contains "Answer:" would make the split land inside the
    prompt and return prompt text instead of the model's answer.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    prompt = build_prompt(query, context_chunks)
    out = generation_pipeline(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMP,
        top_p=TOP_P,
        pad_token_id=generation_pipeline.tokenizer.eos_token_id,
        return_full_text=False,
    )
    return out[0]["generated_text"].strip()

## Block 7. End-to-End RAG pipeline

In [None]:
def rag_answer_with_context_and_refs(
    query: str,
    retrieval_mode: str = "hybrid",
):
    """Run the whole RAG chain for one question.

    Steps: first-stage retrieval (``TOP_K_RETRIEVAL`` candidates), cross-encoder
    reranking (``TOP_K_FINAL`` kept), text lookup, then LLM generation.

    Args:
        query: The user question.
        retrieval_mode: ``"semantic"`` or ``"bm25"``; any other value selects
            hybrid retrieval.

    Returns:
        A tuple ``(answer, context_text, refs_dict)`` where ``context_text`` is
        the used chunks joined by a ``---`` separator line and ``refs_dict``
        holds ``retrieval_mode``, ``top_k`` and the list of used ``chunks``
        (``doc_id``, ``chunk_id``, ``score_ce``).
    """
    retrieve = (
        semantic_search
        if retrieval_mode == "semantic"
        else bm25_search
        if retrieval_mode == "bm25"
        else hybrid_search
    )
    initial = retrieve(query, top_k=TOP_K_RETRIEVAL)
    reranked = rerank_with_cross_encoder(query, initial, top_k=TOP_K_FINAL)

    text_by_key = {(c["doc_id"], c["chunk_id"]): c["text"] for c in chunks}
    context_chunks = []
    refs = []
    for item in reranked:
        passage = text_by_key.get((item["doc_id"], item["chunk_id"]))
        # Unknown keys and empty texts contribute neither context nor a reference.
        if passage:
            context_chunks.append(passage)
            refs.append(
                {
                    "doc_id": int(item["doc_id"]),
                    "chunk_id": str(item["chunk_id"]),
                    "score_ce": item["score_ce"],
                }
            )

    answer = llm_answer(query, context_chunks)
    return (
        answer,
        "\n\n---\n\n".join(context_chunks),
        {
            "retrieval_mode": retrieval_mode,
            "top_k": TOP_K_FINAL,
            "chunks": refs,
        },
    )

## Block 8. Submission (вопросы → ответы)

In [None]:
# Column names read from the QA table (QA_*) and written to the submission
# file (SUBM_*) by build_submission.
QA_ID_COL = "id"
QA_QUESTION_COL = "question"
SUBM_ID_COL = "id"
SUBM_ANSWER_COL = "answer"
SUBM_REFS_COL = "refs_json"


def build_submission(
    qa_df: pd.DataFrame,
    retrieval_mode: str = "hybrid",
    output_path: str = "submission.csv",
) -> pd.DataFrame:
    """Answer every question in ``qa_df`` and write the submission CSV.

    A failure on a single question does not abort the run: the exception's
    repr is stored both in the answer column (prefixed with ``error: ``) and
    in the refs JSON.

    Args:
        qa_df: Table with the ``QA_ID_COL`` and ``QA_QUESTION_COL`` columns.
        retrieval_mode: Forwarded to ``rag_answer_with_context_and_refs``.
        output_path: Destination of the CSV file (written without the index).

    Returns:
        The submission DataFrame with the id, answer and refs-JSON columns.
    """
    records = []
    progress = tqdm(qa_df.iterrows(), total=len(qa_df), desc="Building submission")
    for _, record in progress:
        question_id = record[QA_ID_COL]
        question = str(record[QA_QUESTION_COL])

        try:
            answer, _context, refs_dict = rag_answer_with_context_and_refs(
                question,
                retrieval_mode=retrieval_mode,
            )
        except Exception as exc:
            # Best-effort: keep going and record the failure in the row itself.
            answer = f"error: {repr(exc)}"
            refs_dict = {"error": repr(exc)}

        records.append(
            {
                SUBM_ID_COL: question_id,
                SUBM_ANSWER_COL: answer,
                SUBM_REFS_COL: json.dumps(refs_dict, ensure_ascii=False),
            }
        )

    subm_df = pd.DataFrame(records)
    subm_df.to_csv(output_path, index=False)
    print("Saved submission to:", output_path)
    return subm_df


# раскомментируй, когда qa_df будет готова
# submission_df = build_submission(qa_df, retrieval_mode="hybrid", output_path="submission.csv")
# submission_df.head()

## Block 9. (Опционально) простая offline-оценка

In [None]:
# QA-table column holding the ground-truth answer; read by evaluate_on_small_subset.
GT_ANSWER_COL = "answer"


def simple_exact_match(pred: str, gt: str) -> float:
    """Return 1.0 when ``pred`` equals ``gt`` ignoring case and outer whitespace, else 0.0."""
    normalized_pred = pred.strip().lower()
    normalized_gt = gt.strip().lower()
    return 1.0 if normalized_pred == normalized_gt else 0.0


def evaluate_on_small_subset(
    qa_df: pd.DataFrame,
    n: int = 50,
    retrieval_mode: str = "hybrid",
) -> float:
    """Estimate exact-match accuracy on a random sample of the QA table.

    Up to ``n`` rows are sampled with ``random_state=SEED``; each question is
    answered by the full RAG chain and compared with the ground-truth answer
    via ``simple_exact_match``.

    Args:
        qa_df: Table with the ``QA_QUESTION_COL`` and ``GT_ANSWER_COL`` columns.
        n: Maximum number of rows to evaluate.
        retrieval_mode: Forwarded to ``rag_answer_with_context_and_refs``.

    Returns:
        The mean exact-match score, which is also printed. For an empty sample
        the result is ``nan``, produced without the NumPy empty-mean warning.
    """
    sub = qa_df.sample(min(n, len(qa_df)), random_state=SEED)
    scores = []
    for _, row in tqdm(sub.iterrows(), total=len(sub), desc="Eval"):
        q = str(row[QA_QUESTION_COL])
        gt = str(row[GT_ANSWER_COL])
        pred, _, _ = rag_answer_with_context_and_refs(q, retrieval_mode=retrieval_mode)
        scores.append(simple_exact_match(pred, gt))

    # np.mean([]) emits a RuntimeWarning; report the undefined mean explicitly.
    exact_match = float(np.mean(scores)) if scores else float("nan")
    print("Exact match:", exact_match)
    return exact_match


# пример вызова:
# evaluate_on_small_subset(qa_df, n=30, retrieval_mode="hybrid")