# Simple RAG Playground

Этот ноутбук — лёгкий, автономный playground для RAG:
- без Qdrant, всё в памяти;
- dense + BM25 + гибридный ретривер;
- отладка качества: смотрим, какие куски текста достаются под вопрос;
- простая оффлайн-оценка, если есть столбец с правильным ответом.

Структура блоков:
0) Setup & конфиг
1) Загрузка данных (docs + QA)
2) Chunking
3) Эмбеддинги + in-memory индекс + BM25
4) Ретриверы + гибрид
5) LLM-генерация (локальная HF-модель)
6) Debug для одного вопроса
7) Оффлайн-оценка на подвыборке
8) Быстрое построение submission


## Block 0. Setup & конфиг

In [None]:
!pip install sentence-transformers rank-bm25 datasets nltk --quiet


In [None]:
import os
import random
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import pipeline
from datasets import load_dataset

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data used by `word_tokenize`. Newer NLTK releases look up the
# 'punkt_tab' resource instead of 'punkt', so request both. A resource that
# the installed NLTK does not know is expected to be reported and skipped
# rather than raised (raise_on_error is left at its default).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Seed every random number generator the notebook touches.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

# === Data config ===
DATA_DIR = Path('/kaggle/input')  # adjust to your environment
DOC_SOURCE = 'hf'  # 'hf' or 'local'; any value other than 'hf' takes the local-CSV path

# HF dataset (e.g. one that holds both the texts and the QA pairs)
HF_DATASET_NAME = 'your/dataset'   # replace with a real dataset id
HF_SPLIT_DOCS = 'train'            # split loaded as the document table
HF_SPLIT_QA = 'validation'         # split loaded as the QA table
HF_DOC_TEXT_COL = 'content'        # text column in the HF dataset

# Local variant (used when DOC_SOURCE='local')
LOCAL_DOCS_CSV = DATA_DIR / 'docs.csv'  # docs.csv: id, text
LOCAL_QA_CSV = DATA_DIR / 'qa.csv'      # qa.csv: id, question, [answer]
LOCAL_DOC_ID_COL = 'id'
LOCAL_DOC_TEXT_COL = 'text'

# === Chunking ===
CHUNK_SIZE = 800      # passed to the text splitter as chunk_size
CHUNK_OVERLAP = 200   # passed to the text splitter as chunk_overlap
MIN_CHARS = 50        # chunks shorter than this (after strip) are dropped

# === Retrieval ===
TOP_K_DENSE = 15  # dense candidates taken by the hybrid retriever
TOP_K_BM25 = 15   # BM25 candidates taken by the hybrid retriever
TOP_K_FINAL = 8   # chunks finally handed to the LLM

# === Embedder + LLM ===
EMBED_MODEL_ID = 'intfloat/multilingual-e5-large'
GEN_MODEL_PATH = '/kaggle/input/qwen2.5/transformers/1.5b-instruct/1'  # replace if needed
MAX_NEW_TOKENS = 256
TEMP = 0.2   # sampling temperature passed to the generation pipeline
TOP_P = 0.9  # nucleus-sampling threshold passed to the generation pipeline

# === QA / submission columns ===
QA_ID_COL = 'id'
QA_QUESTION_COL = 'question'
QA_GT_ANSWER_COL = 'answer'  # can be ignored when there is no ground truth

SUBM_ID_COL = 'id'
SUBM_ANSWER_COL = 'answer'
SUBM_REFS_COL = 'refs_json'


## Block 1. Загрузка данных (docs + QA)

In [None]:
def load_docs_and_qa() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the document table and the QA table as pandas DataFrames.

    When ``DOC_SOURCE`` equals ``'hf'`` both tables are pulled from the
    Hugging Face dataset ``HF_DATASET_NAME`` (splits ``HF_SPLIT_DOCS`` and
    ``HF_SPLIT_QA``). Any other value reads ``LOCAL_DOCS_CSV`` and
    ``LOCAL_QA_CSV`` instead. The shapes of both tables are printed.

    Returns:
        A ``(docs_df, qa_df)`` tuple.
    """
    read_local = DOC_SOURCE != 'hf'
    if read_local:
        docs_df = pd.read_csv(LOCAL_DOCS_CSV)
        qa_df = pd.read_csv(LOCAL_QA_CSV)
    else:
        docs_df = load_dataset(HF_DATASET_NAME, split=HF_SPLIT_DOCS).to_pandas()
        qa_df = load_dataset(HF_DATASET_NAME, split=HF_SPLIT_QA).to_pandas()

    for label, frame in (('Docs shape:', docs_df), ('QA shape:', qa_df)):
        print(label, frame.shape)
    return docs_df, qa_df

docs_df, qa_df = load_docs_and_qa()


## Block 2. Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def make_chunks_from_docs(
    docs_df: pd.DataFrame,
    text_col: str,
    id_col: str = 'doc_id',
    chunk_size: int = CHUNK_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
    min_chars: int = MIN_CHARS,
) -> pd.DataFrame:
    """Split every document into overlapping text chunks.

    Args:
        docs_df: Table with one document per row.
        text_col: Column of ``docs_df`` holding the document text.
        id_col: Column holding the document id. When that column is absent
            the row's index label is used as the id instead.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        min_chars: Chunks whose stripped length is below this are dropped.

    Returns:
        A DataFrame with columns ``doc_id``, ``chunk_id`` and ``text``.
        The columns are present even when no chunk was produced, so
        downstream code can always select ``chunks_df['text']``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['\n\n', '\n', '. ', ' ', ''],
    )
    has_id_col = id_col in docs_df.columns
    rows = []
    for row_label, row in tqdm(docs_df.iterrows(), total=len(docs_df), desc='Chunking docs'):
        raw_text = row[text_col]
        # Missing values would otherwise be stringified to 'None' / 'nan'
        # and could leak into the index as fake chunks.
        if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
            continue
        text = str(raw_text)
        if not text.strip():
            continue
        doc_id = row[id_col] if has_id_col else row_label
        # idx counts splitter output before filtering, so a chunk_id stays
        # stable regardless of how many short pieces are dropped.
        for idx, piece in enumerate(splitter.split_text(text)):
            piece = piece.strip()
            if len(piece) < min_chars:
                continue
            rows.append({'doc_id': doc_id, 'chunk_id': f'{doc_id}_{idx}', 'text': piece})
    chunks_df = pd.DataFrame(rows, columns=['doc_id', 'chunk_id', 'text'])
    print('Total chunks:', chunks_df.shape)
    return chunks_df

# Chunk the loaded documents. The HF path reads HF_DOC_TEXT_COL and prefers an
# 'id' column when the dataset has one; every other source uses the LOCAL_*
# column names.
chunks_df = (
    make_chunks_from_docs(
        docs_df,
        text_col=HF_DOC_TEXT_COL,
        id_col='id' if 'id' in docs_df.columns else 'doc_id',
    )
    if DOC_SOURCE == 'hf'
    else make_chunks_from_docs(
        docs_df,
        text_col=LOCAL_DOC_TEXT_COL,
        id_col=LOCAL_DOC_ID_COL,
    )
)


## Block 3. Эмбеддинги + in-memory индекс + BM25

In [None]:
# Load the sentence-embedding model named by EMBED_MODEL_ID onto the selected
# device; it is shared by the index builder and by dense_search.
embedding_model = SentenceTransformer(EMBED_MODEL_ID, device=str(device))
# Put the underlying module into evaluation mode (no training happens here).
embedding_model.eval()
print('Loaded embedder:', EMBED_MODEL_ID)

def build_dense_index(chunks_df: pd.DataFrame) -> np.ndarray:
    """Embed every chunk text and return the vectors as a float32 array.

    The texts come from ``chunks_df['text']`` and are encoded with
    ``normalize_embeddings=True``; row order follows ``chunks_df``.

    NOTE(review): E5-family models are documented to expect ``'passage: '``
    / ``'query: '`` prefixes. None is added here or on the query side —
    confirm whether that is intended for the configured model.
    """
    encoded = embedding_model.encode(
        chunks_df['text'].tolist(),
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return np.array(encoded, dtype=np.float32)

# In-memory dense index: one row per chunk, aligned with chunks_df.
dense_matrix = build_dense_index(chunks_df)
print('dense_matrix shape:', dense_matrix.shape)


In [None]:
# Tokenise every chunk once and build the BM25 index over the token lists.
# Positions in the BM25 corpus line up with the rows of chunks_df.
tokenized_chunks = list(map(word_tokenize, chunks_df['text'].tolist()))
bm25 = BM25Okapi(tokenized_chunks)
print('BM25 corpus size:', len(tokenized_chunks))


## Block 4. Ретриверы (dense / BM25 / hybrid)

In [None]:
def dense_search(query: str, top_k: int = TOP_K_DENSE) -> pd.DataFrame:
    """Return the ``top_k`` chunks with the highest dense similarity to ``query``.

    The query is embedded with the same model as the index (normalised
    vectors), scored against ``dense_matrix`` with a dot product, and the
    best rows of ``chunks_df`` are returned as a copy with an added
    ``score_dense`` column, best match first.

    NOTE(review): no ``'query: '`` prefix is added for E5-family models —
    confirm against how the index texts were encoded.
    """
    query_vec = embedding_model.encode([query], normalize_embeddings=True).astype(np.float32)[0]
    similarities = dense_matrix @ query_vec
    # Negating the scores turns the ascending argsort into a descending one.
    best_positions = np.argsort(-similarities)[:top_k]
    hits = chunks_df.iloc[best_positions].copy()
    hits['score_dense'] = similarities[best_positions]
    return hits

def bm25_search(query: str, top_k: int = TOP_K_BM25) -> pd.DataFrame:
    """Return the ``top_k`` chunks with the highest BM25 score for ``query``.

    The query is tokenised with ``word_tokenize`` (the same tokenizer used
    for the corpus) and scored against every chunk. The result is a copy of
    the best ``chunks_df`` rows with an added ``score_bm25`` column, best
    match first.
    """
    query_tokens = word_tokenize(query)
    all_scores = bm25.get_scores(query_tokens)
    # Negating the scores turns the ascending argsort into a descending one.
    best_positions = np.argsort(-all_scores)[:top_k]
    hits = chunks_df.iloc[best_positions].copy()
    hits['score_bm25'] = all_scores[best_positions]
    return hits

def hybrid_search(query: str, alpha: float = 0.5, top_k: int = TOP_K_FINAL) -> pd.DataFrame:
    """Combine dense and BM25 retrieval into one ranked list.

    ``TOP_K_DENSE`` dense hits and ``TOP_K_BM25`` BM25 hits are outer-joined
    on ``(doc_id, chunk_id)``. Each score column is divided by its maximum
    and the two are blended as
    ``alpha * dense + (1 - alpha) * bm25``; a chunk missing from one
    retriever contributes 0 for that retriever.

    Args:
        query: The question text.
        alpha: Weight of the dense score; ``1 - alpha`` goes to BM25.
        top_k: Number of rows to return.

    Returns:
        Up to ``top_k`` rows, best first, with the raw scores, the scaled
        scores (``score_dense_n``, ``score_bm25_n``), ``score_hybrid`` and
        the chunk columns from ``chunks_df``.
    """
    keys = ['doc_id', 'chunk_id']
    dense_hits = dense_search(query, top_k=TOP_K_DENSE)
    bm25_hits = bm25_search(query, top_k=TOP_K_BM25)

    merged = pd.merge(
        dense_hits[keys + ['score_dense']],
        bm25_hits[keys + ['score_bm25']],
        on=keys,
        how='outer',
    )

    def _scale_by_peak(scores: pd.Series) -> pd.Series:
        # Dividing by a zero or negative maximum would produce NaN/inf or
        # invert the ranking, so only a positive peak is used as divisor.
        peak = scores.max()
        if pd.isna(peak) or peak <= 0:
            return scores
        return scores / peak

    merged['score_dense_n'] = _scale_by_peak(merged['score_dense'])
    merged['score_bm25_n'] = _scale_by_peak(merged['score_bm25'])
    merged['score_hybrid'] = (
        alpha * merged['score_dense_n'].fillna(0)
        + (1 - alpha) * merged['score_bm25_n'].fillna(0)
    )

    # Re-attach the chunk text (and any other chunk columns) to the candidates.
    out = pd.merge(merged, chunks_df, on=keys, how='left')
    out = out.sort_values('score_hybrid', ascending=False).head(top_k).reset_index(drop=True)
    return out


## Block 5. LLM-генерация (Qwen / любая HF-модель)

In [None]:
# Build the text-generation pipeline from the model stored at GEN_MODEL_PATH.
generation_pipeline = pipeline(
    'text-generation',
    model=GEN_MODEL_PATH,
    # Device index 0 when CUDA is available, -1 otherwise.
    device=0 if device.type == 'cuda' else -1,
    # Half precision on GPU, full precision on CPU.
    torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32,
)
print('Loaded generation model from:', GEN_MODEL_PATH)

# Instructions placed at the top of every prompt sent to the generator.
SYSTEM_PROMPT = (
    "You are a helpful assistant that answers questions using the provided context.\n"
    "Answer in the same language as the question.\n"
    "If the answer cannot be found in the context, say that you don't know and avoid hallucinations.\n"
)


def build_prompt(query: str, context_chunks: List[str]) -> str:
    """Assemble the full generation prompt.

    The prompt consists of ``SYSTEM_PROMPT``, a ``Context:`` section with
    the chunks separated by blank lines, a ``Question:`` section with
    ``query``, and a trailing ``Answer:`` marker; sections are separated by
    a blank line.
    """
    joined_context = '\n\n'.join(context_chunks)
    sections = (
        f'{SYSTEM_PROMPT}',
        f'Context:\n{joined_context}',
        f'Question:\n{query}',
        'Answer:',
    )
    return '\n\n'.join(sections)

def llm_answer(query: str, context_chunks: List[str]) -> str:
    """Generate an answer to ``query`` grounded in ``context_chunks``.

    The prompt is built with ``build_prompt`` and sent to
    ``generation_pipeline`` with sampling enabled (``TEMP``, ``TOP_P``,
    ``MAX_NEW_TOKENS``).

    Returns:
        The model's completion with the prompt removed and surrounding
        whitespace stripped.
    """
    prompt = build_prompt(query, context_chunks)
    out = generation_pipeline(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMP,
        top_p=TOP_P,
        pad_token_id=generation_pipeline.tokenizer.eos_token_id,
    )
    text = out[0]['generated_text']
    # The pipeline output starts with the prompt itself. Cut it off by
    # length instead of searching for 'Answer:', because the retrieved
    # context or the question may contain that marker too and a first-match
    # split would then return part of the prompt.
    if text.startswith(prompt):
        return text[len(prompt):].strip()
    # Fallback for outputs that do not echo the prompt verbatim.
    if 'Answer:' in text:
        text = text.split('Answer:', 1)[1].strip()
    return text


## Block 6. Debug одного вопроса

In [None]:
def debug_question(
    query: str,
    mode: str = 'hybrid',  # 'dense', 'bm25', 'hybrid'
    top_k: int = TOP_K_FINAL,
) -> Dict[str, Any]:
    """Run retrieval + generation for one question and print a debug report.

    Args:
        query: The question text.
        mode: ``'dense'`` or ``'bm25'`` select that retriever; any other
            value uses the hybrid retriever.
        top_k: Number of chunks to retrieve and show.

    Returns:
        A dict with ``query``, ``answer``, the ``retrieved`` DataFrame and
        ``refs`` (one dict per shown chunk with its ids and whichever
        scores the chosen retriever produced; absent scores are ``None``).
    """
    if mode == 'dense':
        retrieved = dense_search(query, top_k=top_k)
    elif mode == 'bm25':
        retrieved = bm25_search(query, top_k=top_k)
    else:
        retrieved = hybrid_search(query, top_k=top_k)

    context_chunks = retrieved['text'].tolist()
    answer = llm_answer(query, context_chunks)

    shown = retrieved.head(top_k)

    print('=== QUESTION ===')
    print(query)
    print('\n=== TOP CHUNKS ===')
    # Number chunks by rank. The DataFrame index is only a rank for the
    # hybrid retriever; dense/BM25 results keep the chunks_df index, which
    # made the label meaningless in those modes.
    for rank, (_, row) in enumerate(shown.iterrows()):
        print(f"--- chunk #{rank} | doc_id={row['doc_id']} | chunk_id={row['chunk_id']}")
        if 'score_hybrid' in row and not pd.isna(row['score_hybrid']):
            print('score_hybrid =', row['score_hybrid'])
        elif 'score_dense' in row and not pd.isna(row['score_dense']):
            print('score_dense =', row['score_dense'])
        elif 'score_bm25' in row and not pd.isna(row['score_bm25']):
            print('score_bm25 =', row['score_bm25'])
        print(row['text'][:600])
        print('')

    print('=== ANSWER ===')
    print(answer)

    refs = [
        {
            'doc_id': row['doc_id'],
            'chunk_id': row['chunk_id'],
            'score_hybrid': row.get('score_hybrid', None),
            'score_dense': row.get('score_dense', None),
            'score_bm25': row.get('score_bm25', None),
        }
        for _, row in shown.iterrows()
    ]

    return {
        'query': query,
        'answer': answer,
        'retrieved': retrieved,
        'refs': refs,
    }

# пример интерактивного вызова:
# debug_question('What is the main idea of the document?')


## Block 7. Оффлайн-оценка на подвыборке (если есть GT-ответы)

In [None]:
def simple_f1(pred: str, gt: str) -> float:
    """Token-level F1 between a predicted and a ground-truth answer.

    Both strings are lower-cased and split on whitespace. The overlap is
    counted as a multiset intersection, so repeated tokens are matched as
    many times as they occur on both sides; this keeps precision and recall
    within [0, 1] and gives identical strings a score of 1.0.

    Returns:
        The F1 score, or 0.0 when either side has no tokens or nothing
        overlaps.
    """
    from collections import Counter  # local import keeps this cell self-contained

    pred_tokens = pred.lower().split()
    gt_tokens = gt.lower().split()
    if not pred_tokens or not gt_tokens:
        return 0.0
    # Counter & Counter keeps the minimum count of each shared token.
    overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate_subset(
    qa_df: pd.DataFrame,
    n: int = 30,
    mode: str = 'hybrid',
) -> pd.DataFrame:
    """Score the RAG pipeline on a random sample of QA rows.

    Args:
        qa_df: QA table; must contain ``QA_GT_ANSWER_COL``.
        n: Maximum number of rows to sample (seeded with ``SEED``).
        mode: ``'dense'`` or ``'bm25'`` select that retriever; any other
            value uses the hybrid retriever.

    Returns:
        One row per evaluated question with its id, ``question``, ``gt``,
        ``pred`` and ``f1``. The mean F1 is printed.

    Raises:
        ValueError: If ``qa_df`` has no ground-truth answer column.
    """
    if QA_GT_ANSWER_COL not in qa_df.columns:
        raise ValueError(f'Колонка с GT ответом {QA_GT_ANSWER_COL} не найдена в qa_df')

    # Rows without a ground-truth answer cannot be scored; keeping them
    # would compare predictions against the literal string 'nan'.
    scorable = qa_df[qa_df[QA_GT_ANSWER_COL].notna()]
    sub = scorable.sample(min(n, len(scorable)), random_state=SEED)
    rows = []
    for _, row in tqdm(sub.iterrows(), total=len(sub), desc='Eval subset'):
        q = str(row[QA_QUESTION_COL])
        gt = str(row[QA_GT_ANSWER_COL])

        if mode == 'dense':
            retrieved = dense_search(q, top_k=TOP_K_FINAL)
        elif mode == 'bm25':
            retrieved = bm25_search(q, top_k=TOP_K_FINAL)
        else:
            retrieved = hybrid_search(q, top_k=TOP_K_FINAL)
        ctx = retrieved['text'].tolist()
        pred = llm_answer(q, ctx)

        f1 = simple_f1(pred, gt)
        rows.append({
            QA_ID_COL: row[QA_ID_COL],
            'question': q,
            'gt': gt,
            'pred': pred,
            'f1': f1,
        })

    # Explicit columns keep res_df['f1'] valid even when nothing was scored.
    res_df = pd.DataFrame(rows, columns=[QA_ID_COL, 'question', 'gt', 'pred', 'f1'])
    print('Mean F1:', res_df['f1'].mean())
    return res_df

# пример вызова:
# res_eval = evaluate_subset(qa_df, n=20, mode='hybrid')
# res_eval.head()


## Block 8. Быстрое построение submission

In [None]:
def _json_safe(value: Any) -> Any:
    """Convert a cell value into something ``json.dumps`` emits as valid JSON."""
    if isinstance(value, np.generic):
        # numpy scalars are not all JSON-serialisable; unwrap to Python types.
        value = value.item()
    if isinstance(value, float) and value != value:
        # NaN would be written as the non-standard token `NaN`.
        return None
    return value


def build_submission(
    qa_df: pd.DataFrame,
    mode: str = 'hybrid',
    output_path: str = 'submission.csv',
) -> pd.DataFrame:
    """Answer every question in ``qa_df`` and write a submission CSV.

    Args:
        qa_df: QA table with ``QA_ID_COL`` and ``QA_QUESTION_COL``.
        mode: ``'dense'`` or ``'bm25'`` select that retriever; any other
            value uses the hybrid retriever.
        output_path: Where the CSV is written (without the index).

    Returns:
        The submission DataFrame with the id, the answer and a JSON string
        listing the retrieved chunks and their scores. A score that the
        chosen retriever did not produce is stored as ``null``.
    """
    rows = []
    for _, row in tqdm(qa_df.iterrows(), total=len(qa_df), desc='Build submission'):
        q_id = row[QA_ID_COL]
        q_text = str(row[QA_QUESTION_COL])

        if mode == 'dense':
            retrieved = dense_search(q_text, top_k=TOP_K_FINAL)
        elif mode == 'bm25':
            retrieved = bm25_search(q_text, top_k=TOP_K_FINAL)
        else:
            retrieved = hybrid_search(q_text, top_k=TOP_K_FINAL)

        ctx = retrieved['text'].tolist()
        # Best effort: one failed generation must not abort the whole run,
        # so the error is recorded as that question's answer.
        try:
            answer = llm_answer(q_text, ctx)
        except Exception as e:
            answer = f'error: {repr(e)}'

        refs = [
            {
                'doc_id': _json_safe(r['doc_id']),
                'chunk_id': _json_safe(r['chunk_id']),
                'score_hybrid': _json_safe(r.get('score_hybrid', None)),
                'score_dense': _json_safe(r.get('score_dense', None)),
                'score_bm25': _json_safe(r.get('score_bm25', None)),
            }
            for _, r in retrieved.iterrows()
        ]

        rows.append({
            SUBM_ID_COL: q_id,
            SUBM_ANSWER_COL: answer,
            SUBM_REFS_COL: json.dumps(refs, ensure_ascii=False),
        })

    subm_df = pd.DataFrame(rows)
    subm_df.to_csv(output_path, index=False)
    print('Saved submission to:', output_path)
    return subm_df

# пример:
# submission_df = build_submission(qa_df, mode='hybrid', output_path='submission.csv')
# submission_df.head()
