# RAG Final Baseline

Структурированный ноутбук для финала RAG:
- единый конфиг в начале;
- загрузка `book.pdf` + `queries.json`;
- чанкинг текста;
- dense (FAISS) + BM25 + гибридный ретрив;
- LLM-ответы и сабмит `submission.csv`.


## Block 0. Конфиг, пути, сиды, девайс

In [None]:
import os
import json
import math
import random
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch

# --- Directory layout, resolved against the current working directory ---
BASE_DIR = Path('.').resolve()
DATA_DIR = BASE_DIR.joinpath('data')
DOCS_DIR = DATA_DIR.joinpath('docs')
OUTPUT_DIR = BASE_DIR.joinpath('outputs')

# Create every working directory up front so later cells can write freely.
for _required_dir in (DATA_DIR, DOCS_DIR, OUTPUT_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# --- Input / output files ---
BOOK_PATH = DOCS_DIR.joinpath('book.pdf')
QUERIES_PATH = DATA_DIR.joinpath('queries.json')
SUBMISSION_PATH = OUTPUT_DIR.joinpath('submission.csv')

# --- Reproducibility ---
SEED = 42

# --- Chunking and context budget (all measured in characters) ---
CHUNK_SIZE, CHUNK_OVERLAP = 512, 128
MAX_CONTEXT_CHARS = 4000

# --- Model identifiers; each can be overridden through an environment variable ---
EMBED_MODEL_ID = os.environ.get('RAG_EMBED_MODEL', 'intfloat/multilingual-e5-large').strip()
HF_LLM_ID = os.environ.get('RAG_LLM_ID', 'Qwen/Qwen2.5-1.5B-Instruct').strip()
# Set this when the LLM weights are stored locally.
LOCAL_LLM_PATH = os.environ.get('RAG_LLM_PATH', '').strip()

# --- Retrieval depth per retriever and after merging ---
TOP_K_DENSE = int(os.environ.get('RAG_TOPK_DENSE', '32'))
TOP_K_BM25 = int(os.environ.get('RAG_TOPK_SPARSE', '32'))
TOP_K_MERGED = int(os.environ.get('RAG_TOPK_MERGED', '40'))

# --- Weights applied to the normalised dense / sparse scores in the hybrid score ---
W_DENSE = float(os.environ.get('RAG_W_DENSE', '0.6'))
W_SPARSE = float(os.environ.get('RAG_W_SPARSE', '0.4'))

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # Seed the CUDA generators as well, but only when CUDA is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Make every later cell reproducible before any random numbers are drawn.
set_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
_has_cuda = torch.cuda.is_available()
DEVICE = torch.device('cuda' if _has_cuda else 'cpu')
if _has_cuda:
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

# Echo the effective configuration so a run can be reconstructed from its log.
for _label, _value in (
    ('BASE_DIR   :', BASE_DIR),
    ('DATA_DIR   :', DATA_DIR),
    ('DOCS_DIR   :', DOCS_DIR),
    ('OUTPUT_DIR :', OUTPUT_DIR),
    ('EMBED_MODEL_ID:', EMBED_MODEL_ID),
    ('HF_LLM_ID      :', HF_LLM_ID),
    ('LOCAL_LLM_PATH :', LOCAL_LLM_PATH or '(not set)'),
):
    print(_label, _value)


## Block 1. Копирование `book.pdf` в `data/docs/` и `queries.json` в `data/`

In [None]:
import shutil

# Source files are expected next to the notebook; destinations come from Block 0.
src_book, dst_book = BASE_DIR / 'book.pdf', BOOK_PATH
src_queries, dst_queries = BASE_DIR / 'queries.json', QUERIES_PATH

# One row per file: (source, destination, success message, missing-file message).
for _src, _dst, _copied_msg, _missing_msg in (
    (src_book, dst_book, 'Скопировал book.pdf →', 'НЕ НАЙДЕН book.pdf в корне:'),
    (src_queries, dst_queries, 'Скопировал queries.json →', 'НЕ НАЙДЕН queries.json в корне:'),
):
    if not _src.exists():
        # A missing source is only reported; any file already at the destination is untouched.
        print(_missing_msg, _src)
        continue
    shutil.copy2(_src, _dst)
    print(_copied_msg, _dst)


## Block 2. Документы: поиск и загрузка текста

In [None]:
from typing import List, Dict, Any
from pypdf import PdfReader

SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md']

def list_documents(docs_dir: Path, exts=None) -> pd.DataFrame:
    """List the supported documents found under ``docs_dir`` (recursively).

    Args:
        docs_dir: Directory to scan. It is created when it does not exist yet.
        exts: Lower-case file suffixes (with the leading dot) to accept.
            Defaults to ``SUPPORTED_EXTENSIONS``.

    Returns:
        A DataFrame with the columns ``doc_id``, ``path`` and ``ext``. The
        columns are present even when no document matches, so callers can
        select them without special-casing an empty result. ``doc_id`` is the
        position of the file in the sorted listing of accepted files.
    """
    columns = ['doc_id', 'path', 'ext']
    if exts is None:
        exts = SUPPORTED_EXTENSIONS
    if not docs_dir.exists():
        docs_dir.mkdir(parents=True, exist_ok=True)
        return pd.DataFrame(columns=columns)
    docs = []
    # Sorting makes doc_id assignment independent of filesystem listing order.
    for path in sorted(docs_dir.rglob('*')):
        if not path.is_file():
            continue
        ext = path.suffix.lower()
        if ext in exts:
            docs.append({'doc_id': len(docs), 'path': path, 'ext': ext})
    # Passing the columns explicitly keeps the schema stable for an empty list.
    return pd.DataFrame(docs, columns=columns)

def load_pdf(path: Path) -> str:
    """Extract the text of every page of a PDF and join the pages.

    Extraction is best-effort: a page whose text cannot be extracted
    contributes an empty string, and a ``RuntimeWarning`` naming the page is
    issued so the loss does not go unnoticed.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts joined with a blank line between consecutive pages.
    """
    import warnings  # local import: only needed on the failure path

    reader = PdfReader(str(path))
    parts = []
    for page_no, page in enumerate(reader.pages, start=1):
        try:
            txt = page.extract_text() or ''
        except Exception as exc:
            # Keep going with the remaining pages, but report which page failed.
            warnings.warn(
                f'{path}: failed to extract text from page {page_no}: {exc!r}',
                RuntimeWarning,
                stacklevel=2,
            )
            txt = ''
        parts.append(txt)
    return '\n\n'.join(parts)

def load_text_file(path: Path, encoding: str = 'utf-8') -> str:
    """Read a whole text file, dropping bytes that are invalid in ``encoding``.

    Args:
        path: File to read.
        encoding: Text encoding used for decoding.

    Returns:
        The decoded file content.
    """
    return Path(path).read_text(encoding=encoding, errors='ignore')

def load_single_document(doc_row: pd.Series) -> Dict[str, Any]:
    """Load the text of one document described by a row of the documents table.

    Args:
        doc_row: Row providing ``doc_id``, ``path`` and ``ext``.

    Returns:
        A dict with ``doc_id``, ``path`` (as a string), ``ext`` (lower-cased),
        the loaded ``text`` and its length ``n_chars``.
    """
    identifier = int(doc_row['doc_id'])
    source = Path(doc_row['path'])
    suffix = str(doc_row['ext']).lower()
    # PDFs go through the PDF reader; everything else is read as plain text.
    reader = load_pdf if suffix == '.pdf' else load_text_file
    content = reader(source)
    return dict(
        doc_id=identifier,
        path=str(source),
        ext=suffix,
        text=content,
        n_chars=len(content),
    )

# Discover the documents, then load the text of each one.
docs_df = list_documents(DOCS_DIR)
print('Найдено документов:', len(docs_df))
print(docs_df.head())

raw_docs: List[Dict[str, Any]] = []
if len(docs_df) == 0:
    print('Нет документов в DOCS_DIR, дальше RAG не заработает.')
else:
    for _, row in tqdm(docs_df.iterrows(), total=len(docs_df), desc='Загрузка документов'):
        raw_docs.append(load_single_document(row))

# Explicit columns keep the frame well-formed when nothing was loaded, so the
# column selection below (and the emptiness check in Block 3) works instead of
# raising KeyError on a column-less DataFrame.
raw_docs_df = pd.DataFrame(raw_docs, columns=['doc_id', 'path', 'ext', 'text', 'n_chars'])
print('raw_docs_df shape:', raw_docs_df.shape)
print(raw_docs_df[['doc_id', 'ext', 'n_chars']].head())


## Block 3. Чанкинг текста документов

In [None]:
from typing import List

def split_text_to_chunks(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart (at
    least 1). Splitting stops as soon as a chunk reaches the end of the text,
    so no trailing chunk is emitted that would only repeat the tail of the
    previous one.

    Args:
        text: Text to split. An empty text yields an empty list.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Number of characters shared by neighbouring chunks. Negative
            values are treated as 0 so that no text is skipped between chunks.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``text`` is non-empty and ``chunk_size`` is not positive.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    step = max(1, chunk_size - max(0, overlap))
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start : start + chunk_size])
        # This chunk already covers the end of the text: anything further
        # would be a pure suffix of it.
        if start + chunk_size >= total:
            break
    return chunks

# Split every loaded document into overlapping chunks and collect them in one table.
all_chunks: List[Dict[str, Any]] = []

if raw_docs_df is None or len(raw_docs_df) == 0:
    print('raw_docs_df пуст — сначала нужно загрузить документы (Block 2).')
else:
    for _, row in tqdm(raw_docs_df.iterrows(), total=len(raw_docs_df), desc='Chunking'):
        doc_id = int(row['doc_id'])
        text = str(row['text'])
        path = str(row['path'])
        ext = str(row['ext'])
        chunks = split_text_to_chunks(text, CHUNK_SIZE, CHUNK_OVERLAP)
        for local_idx, ch in enumerate(chunks):
            all_chunks.append(
                {
                    # Global id = position in all_chunks, assigned on insertion.
                    'chunk_id': len(all_chunks),
                    'doc_id': doc_id,
                    'chunk_idx_in_doc': local_idx,
                    'path': path,
                    'ext': ext,
                    'text': ch,
                    'n_chars': len(ch),
                }
            )

# Explicit columns keep the frame well-formed when no chunk was produced, so
# the column selection below does not raise KeyError on an empty run.
chunks_df = pd.DataFrame(
    all_chunks,
    columns=['chunk_id', 'doc_id', 'chunk_idx_in_doc', 'path', 'ext', 'text', 'n_chars'],
)
print('chunks_df shape:', chunks_df.shape)
print(chunks_df[['chunk_id', 'doc_id', 'chunk_idx_in_doc', 'n_chars']].head())


## Block 4. Эмбеддинги чанков и FAISS-индекс

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Load the sentence-embedding model onto the device selected in Block 0.
print('Загружаю embedding-модель:', EMBED_MODEL_ID)
embed_model = SentenceTransformer(EMBED_MODEL_ID, device=str(DEVICE))

# Encode one probe string to read the embedding width from the output shape.
_probe_options = dict(convert_to_numpy=True, show_progress_bar=False)
test_emb = embed_model.encode(['test'], **_probe_options)
_, _embedding_width = test_emb.shape
EMBED_DIM = int(_embedding_width)
print('EMBED_DIM:', EMBED_DIM)

def embed_texts(texts, batch_size: int = 64):
    """Encode ``texts`` with the global ``embed_model`` and return float32 vectors.

    The encoder is asked for NumPy output with normalised embeddings and shows
    its own progress bar on every call.

    Args:
        texts: Strings to encode.
        batch_size: Batch size handed to the encoder.

    Returns:
        The encoder output cast to ``float32``.
    """
    encode_options = dict(
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    vectors = embed_model.encode(texts, **encode_options)
    return vectors.astype('float32')

# Embed every chunk and build an inner-product FAISS index over the vectors.
if chunks_df is not None and len(chunks_df) > 0:
    texts = chunks_df['text'].tolist()
    batch_size = 256
    # Encode in slices of batch_size texts; one float32 matrix per slice.
    embs_list = [
        embed_texts(texts[start : start + batch_size])
        for start in tqdm(range(0, len(texts), batch_size), desc='Embedding chunks')
    ]
    embeddings_np = np.vstack(embs_list).astype('float32')
    print('embeddings_np shape:', embeddings_np.shape)

    # Both names refer to the same index object; later cells use faiss_index.
    faiss_index = index = faiss.IndexFlatIP(EMBED_DIM)
    faiss_index.add(embeddings_np)
    print('FAISS index ready, n_vectors =', faiss_index.ntotal)
else:
    print('chunks_df пуст — сначала нужно сделать чанкинг.')


## Block 5. BM25 по чанкам

In [None]:
import re
from rank_bm25 import BM25Okapi

# Compiled once: runs of word characters (letters, digits, underscore).
_BM25_TOKEN_RE = re.compile(r"\w+", flags=re.UNICODE)


def bm25_tokenize(text: str):
    """Lower-case ``text`` and split it into word tokens for BM25.

    Args:
        text: Raw text of a chunk or a query.

    Returns:
        A list of lower-cased tokens; punctuation and whitespace are dropped.
    """
    return _BM25_TOKEN_RE.findall(text.lower())

# Build the BM25 index over the chunk texts. The guard mirrors Block 4: with
# no chunks there is nothing to index, and building the index would fail
# instead of leaving the notebook in a usable state. sparse_search() already
# returns no hits when `bm25` has not been defined.
if chunks_df is None or len(chunks_df) == 0:
    print('chunks_df пуст — сначала нужно сделать чанкинг.')
else:
    bm25_corpus_tokens = [bm25_tokenize(t) for t in chunks_df['text'].tolist()]
    # Position in the BM25 corpus -> chunk_id, used to translate search hits.
    bm25_chunk_ids = chunks_df['chunk_id'].tolist()
    bm25 = BM25Okapi(bm25_corpus_tokens)
    print('BM25 corpus size:', len(bm25_corpus_tokens))


## Block 6. Ретривер: dense, BM25, hybrid

In [None]:
def _minmax_normalize(arr: np.ndarray) -> np.ndarray:
    """Rescale ``arr`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        arr: Scores to rescale.

    Returns:
        The rescaled array. An all-zero array shaped like ``arr`` is returned
        when ``arr`` is empty, when its minimum or maximum is not finite, or
        when all values are equal (the range is empty).
    """
    if arr is None or len(arr) == 0:
        return np.zeros_like(arr)
    lowest, highest = float(np.min(arr)), float(np.max(arr))
    has_usable_range = np.isfinite(lowest) and np.isfinite(highest) and highest > lowest
    if not has_usable_range:
        return np.zeros_like(arr)
    return (arr - lowest) / (highest - lowest)

def dense_search(query: str):
    """Return the nearest chunks to ``query`` from the FAISS index.

    Args:
        query: Question text; it is embedded with ``embed_texts``.

    Returns:
        A list of ``(chunk_id, raw_score, normalised_score)`` tuples in the
        order FAISS returned them. The normalised score is the min-max scaled
        raw score over the valid hits of this query. The list is empty when
        the index has not been built or holds no vectors.
    """
    index = globals().get('faiss_index')
    if index is None:
        return []
    # Never ask for more neighbours than the index holds.
    k = min(TOP_K_DENSE, int(index.ntotal))
    if k <= 0:
        return []
    q_emb = embed_texts([query], batch_size=1)
    D, I = index.search(q_emb, k)
    # Drop slots carrying a negative id BEFORE normalising: they are not real
    # hits, and their scores would otherwise take part in the min-max scaling
    # and distort the normalised scores of the real hits.
    valid = I[0] >= 0
    ids = I[0][valid]
    scores = D[0][valid]
    scores_norm = _minmax_normalize(scores)
    return [
        (int(cid), float(s), float(sn))
        for cid, s, sn in zip(ids, scores, scores_norm)
    ]

def sparse_search(query: str):
    """Return the best BM25 matches for ``query``.

    Args:
        query: Question text; it is tokenised with ``bm25_tokenize``.

    Returns:
        A list of ``(chunk_id, raw_score, normalised_score)`` tuples ordered
        by descending BM25 score. The normalised score is the min-max scaled
        raw score over the returned hits. The list is empty when the BM25
        index has not been built, the query has no tokens, or
        ``TOP_K_BM25`` is not positive.
    """
    index = globals().get('bm25')
    if index is None:
        return []
    tokens = bm25_tokenize(query)
    if not tokens:
        # Without tokens every chunk scores the same, so any "top" selection
        # would be arbitrary.
        return []
    scores = np.asarray(index.get_scores(tokens))
    top_k = min(TOP_K_BM25, scores.shape[0])
    if top_k <= 0:
        # Guard needed because scores[-0:] would select every chunk.
        return []
    top_idx = np.argsort(scores)[-top_k:][::-1]
    top_scores = scores[top_idx]
    scores_norm = _minmax_normalize(top_scores)
    return [
        # Translate the corpus position into the chunk id it was built from.
        (int(bm25_chunk_ids[int(i)]), float(s), float(sn))
        for i, s, sn in zip(top_idx, top_scores, scores_norm)
    ]

def hybrid_search_one(qid: int, query: str):
    """Merge dense and BM25 hits for one query into a ranked candidate list.

    Each candidate gets ``W_DENSE * dense_norm + W_SPARSE * bm25_norm`` as its
    hybrid score; a retriever that did not return the chunk contributes 0.

    Args:
        qid: Identifier stored with every returned row.
        query: Question text passed to both retrievers.

    Returns:
        At most ``TOP_K_MERGED`` dicts with the keys ``qid``, ``query``,
        ``chunk_id``, ``dense_score``, ``dense_norm``, ``bm25_score``,
        ``bm25_norm`` and ``hybrid_score``, ordered by descending hybrid score
        (ties broken by ascending ``chunk_id``).
    """
    # Both retrievers yield (chunk_id, raw_score, normalised_score) triples.
    dense_dict = {cid: (s_raw, s_norm) for cid, s_raw, s_norm in dense_search(query)}
    sparse_dict = {cid: (s_raw, s_norm) for cid, s_raw, s_norm in sparse_search(query)}
    missing = (0.0, 0.0)
    rows = []
    for cid in dense_dict.keys() | sparse_dict.keys():
        d_raw, d_norm = dense_dict.get(cid, missing)
        s_raw, s_norm = sparse_dict.get(cid, missing)
        hybrid = W_DENSE * d_norm + W_SPARSE * s_norm
        rows.append({
            'qid': int(qid),
            'query': str(query),
            'chunk_id': int(cid),
            'dense_score': float(d_raw),
            'dense_norm': float(d_norm),
            'bm25_score': float(s_raw),
            'bm25_norm': float(s_norm),
            'hybrid_score': float(hybrid),
        })
    # The secondary key makes the order of equal scores reproducible.
    rows.sort(key=lambda r: (-r['hybrid_score'], r['chunk_id']))
    return rows[:TOP_K_MERGED]

def hybrid_search_batch(queries, query_ids=None) -> pd.DataFrame:
    """Run ``hybrid_search_one`` for every query and stack the results.

    Args:
        queries: Question texts.
        query_ids: Identifier for each query; defaults to ``0 .. len(queries) - 1``.

    Returns:
        A DataFrame with one row per (query, candidate chunk) pair.
    """
    if query_ids is None:
        query_ids = list(range(len(queries)))
    pairs = list(zip(query_ids, queries))
    progress = tqdm(pairs, total=len(queries), desc='Hybrid search')
    hits = [hit for qid, text in progress for hit in hybrid_search_one(qid, text)]
    return pd.DataFrame(hits)


## Block 7. Загрузка LLM (Qwen или другая HF-модель)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# A local checkpoint path, when set, takes precedence over the Hub model id.
model_path = LOCAL_LLM_PATH or HF_LLM_ID

print('Загружаю LLM из:', model_path)

# Half precision on GPU, full precision on CPU.
LLM_DTYPE = torch.float16 if DEVICE.type == 'cuda' else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_path)
# The dtype is requested at load time. NOTE(review): `pipeline(torch_dtype=...)`
# is documented as a shortcut for model_kwargs, which presumably apply only
# when the pipeline loads the model itself; this model is loaded beforehand,
# so the dtype is set here — confirm against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=LLM_DTYPE)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

gen_pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if DEVICE.type == 'cuda' else -1,
    torch_dtype=LLM_DTYPE,
)

print('LLM готов.')


## Block 8. RAG: retrieve_relevant_chunks + генерация ответа

In [None]:
# Instruction placed at the top of every prompt sent to the LLM.
SYSTEM_PROMPT = (
    'You are an expert in document analysis and retrieval-augmented generation (RAG). '
    'You receive a CONTEXT (fragments from documents) and a QUESTION. '
    'Answer strictly based on the context. '
    'If the context is not sufficient, say that you do not know and avoid hallucinations.'
)


def build_prompt(query: str, context_text: str) -> str:
    """Assemble the full LLM prompt for one question.

    Args:
        query: The user question.
        context_text: Retrieved document fragments, already joined into one string.

    Returns:
        The system prompt followed by the ``Context:``, ``Question:`` and
        ``Answer:`` sections, separated by blank lines; the text ends with
        ``Answer:``.
    """
    sections = [
        SYSTEM_PROMPT,
        'Context:\n' + context_text,
        'Question:\n' + query,
        'Answer:',
    ]
    return '\n\n'.join(sections)

def generate_answer(query: str, context_text: str, max_new_tokens: int = 256, temperature: float = 0.2, top_p: float = 0.9) -> str:
    """Generate an answer to ``query`` grounded in ``context_text``.

    Args:
        query: The user question.
        context_text: Retrieved fragments placed into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 or below (or ``None``)
            switches to greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; used only when sampling.

    Returns:
        The generated continuation with surrounding whitespace removed.
    """
    prompt = build_prompt(query, context_text)
    gen_kwargs = {'max_new_tokens': max_new_tokens}
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a positive temperature; decode greedily instead.
        gen_kwargs['do_sample'] = False
    out = gen_pipe(prompt, **gen_kwargs)
    text = out[0]['generated_text']
    if text.startswith(prompt):
        # Cut the echoed prompt off by length. Searching for the first
        # 'Answer:' marker breaks when the context or the question contains
        # that marker themselves.
        text = text[len(prompt):]
    elif 'Answer:' in text:
        # Fallback for output that does not echo the prompt verbatim.
        text = text.split('Answer:', 1)[1]
    return text.strip()

def retrieve_relevant_chunks(query: str, top_k: int = 8):
    """Retrieve the best chunks for ``query`` and build the LLM context.

    Args:
        query: The user question.
        top_k: Maximum number of chunks considered for the context.

    Returns:
        A dict with two keys. ``context_text`` holds the chunk texts joined
        with a ``---`` separator and cut to ``MAX_CONTEXT_CHARS`` characters.
        ``chunks`` holds one ``{chunk_id, doc_id, score, path}`` dict per chunk
        that contributes at least one character to ``context_text``. Both are
        empty when nothing was retrieved.
    """
    hits = pd.DataFrame(hybrid_search_one(0, query))
    if len(hits) == 0:
        return {'context_text': '', 'chunks': []}
    # Inner join: a hit whose chunk_id is unknown to chunks_df has no text to
    # contribute and would otherwise put NaN into the join below.
    merged = hits.merge(chunks_df, on='chunk_id', how='inner')
    merged = merged.sort_values('hybrid_score', ascending=False).head(top_k)
    if len(merged) == 0:
        return {'context_text': '', 'chunks': []}

    separator = '\n\n---\n\n'
    texts = [str(t) for t in merged['text'].tolist()]
    context_text = separator.join(texts)[:MAX_CONTEXT_CHARS]

    chunk_infos = []
    offset = 0  # start of the current chunk inside the untruncated context
    for chunk_text, (_, r) in zip(texts, merged.iterrows()):
        if offset >= MAX_CONTEXT_CHARS:
            # This chunk and all following ones were cut off entirely, so they
            # must not be reported as references.
            break
        chunk_infos.append(
            {
                'chunk_id': int(r['chunk_id']),
                'doc_id': int(r['doc_id']),
                'score': float(r['hybrid_score']),
                'path': r['path'],
            }
        )
        offset += len(chunk_text) + len(separator)
    return {'context_text': context_text, 'chunks': chunk_infos}

def rag_answer_with_context_and_refs(query: str, max_new_tokens: int = 256, temperature: float = 0.2, top_p: float = 0.9):
    """Answer ``query`` with RAG and report which chunks were used.

    Args:
        query: The user question.
        max_new_tokens: Forwarded to ``generate_answer``.
        temperature: Forwarded to ``generate_answer``.
        top_p: Forwarded to ``generate_answer``.

    Returns:
        A tuple ``(answer, context_text, refs_json, refs_dict)``. ``refs_dict``
        is ``{'chunks': [...]}`` and ``refs_json`` is its JSON serialisation.
        When the retrieved context is blank, the LLM is not called and a fixed
        "not enough information" answer is returned.
    """
    retrieval = retrieve_relevant_chunks(query)
    context_text = retrieval['context_text']
    refs_dict = {'chunks': retrieval['chunks']}
    refs_json = json.dumps(refs_dict, ensure_ascii=False)

    if context_text.strip():
        answer = generate_answer(
            query,
            context_text,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )
    else:
        answer = 'I could not find enough relevant information in the documents to answer this question.'
    return answer, context_text, refs_json, refs_dict


## Block 9. Загрузка вопросов и формирование submission.csv

In [None]:
# Load the questions, answer each one with the RAG pipeline and write the submission.
if not QUERIES_PATH.exists():
    raise FileNotFoundError(f'Не найден файл с вопросами: {QUERIES_PATH}')

with open(QUERIES_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

qa_df = pd.DataFrame(data)
print('qa_df columns:', list(qa_df.columns))
print(qa_df.head())

# Accept `query` as an alternative name for the question column.
if 'question' not in qa_df.columns and 'query' in qa_df.columns:
    qa_df.rename(columns={'query': 'question'}, inplace=True)

# Validate both required columns up front, so a malformed file fails with a
# clear message instead of a bare KeyError inside the loop.
if 'id' not in qa_df.columns:
    raise KeyError('В queries.json должна быть колонка id.')
if 'question' not in qa_df.columns:
    raise KeyError('В queries.json должна быть колонка question (или query).')

rows = []
n_failed = 0
for _, row in tqdm(qa_df.iterrows(), total=len(qa_df), desc='RAG answers'):
    q_id = row['id']
    q_text = str(row['question'])
    try:
        answer, context_text, refs_json, refs_dict = rag_answer_with_context_and_refs(
            q_text,
            max_new_tokens=256,
            temperature=0.2,
            top_p=0.9,
        )
    except Exception as e:
        # One failing question must not abort the whole run: the error is
        # stored in the row and also reported, so it is visible right away.
        n_failed += 1
        print(f'RAG failed for id={q_id}: {e!r}')
        answer = f'error: {repr(e)}'
        refs_dict = {'error': repr(e)}
        refs_json = json.dumps(refs_dict, ensure_ascii=False)
    rows.append({'id': q_id, 'answer': answer, 'refs_json': refs_json})

if n_failed:
    print(f'Questions answered with an error: {n_failed} of {len(qa_df)}')

# Explicit columns keep the CSV header intact even when there are no questions.
submission_df = pd.DataFrame(rows, columns=['id', 'answer', 'refs_json'])
submission_df.to_csv(SUBMISSION_PATH, index=False)
print('Saved submission to:', SUBMISSION_PATH)
submission_df.head()
