In [None]:
!pip install -q bitsandbytes peft accelerate


In [None]:
!pip -q install huggingface_hub==0.25.2 faiss-cpu ir_datasets >/dev/null


In [None]:
!pip install rank_bm25


In [None]:
!pip -q install ir_datasets


In [None]:
!pip install transformers -U

In [1]:
import os, json, math, random
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType

TRAIN_JSONL   = "/kaggle/input/final-dataset-train/instructions_one_best_per_id.jsonl"
OUT_DIR       = "/kaggle/working/mE5_lora8bit_infoNCE_v1"
BASE_MODEL    = "intfloat/multilingual-e5-base"    

SEED          = 42
EPOCHS        = 1
BATCH_SIZE    = 8        
ACCUM_STEPS   = 1
MAX_LEN       = 256
LR            = 1e-4      
WARMUP_RATIO  = 0.1
WEIGHT_DECAY  = 0.0
TEMP          = 0.05
MIX_HARD_NEGS = True
MAX_HNEG      = 4

LORA_R        = 16
LORA_ALPHA    = 32
LORA_DROPOUT  = 0.05
LORA_TARGETS  = ["query","key","value","dense"]   

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
random.seed(SEED); torch.manual_seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)

os.makedirs(OUT_DIR, exist_ok=True)
print("Device:", DEVICE)

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try:
                out.append(json.loads(line))
            except:
                pass
    return out

rows = load_jsonl(TRAIN_JSONL)
print("Loaded rows:", len(rows))

class JSONLRetrievalDataset(Dataset):
    
    def __init__(self, data: List[Dict[str,Any]], max_hard_negs: int = MAX_HNEG):
        self.items = []
        for r in data:
            q = str(r.get("query_ru") or "").strip()
            inst = str(r.get("instruction") or "").strip()
            p = str(r.get("positive_ru") or "").strip()
            if not q or not p:
                continue
            negs = r.get("hard_negs_ru") or []
            negs = [str(x).strip() for x in negs if str(x).strip()][:max_hard_negs]
            self.items.append({
                "q": f"query: {q} {inst}".strip(),
                "p": f"passage: {p}".strip(),
                "negs": [f"passage: {t}" for t in negs]
            })
    def __len__(self): return len(self.items)
    def __getitem__(self, idx): return self.items[idx]

dataset = JSONLRetrievalDataset(rows)
print("Samples:", len(dataset))

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

@dataclass
class Batch:
    q: Dict[str, torch.Tensor]
    p: Dict[str, torch.Tensor]
    n: Optional[Dict[str, torch.Tensor]]
    B: int

def collate(items: List[Dict[str,Any]]) -> Batch:
    q_texts = [it["q"] for it in items]
    p_texts = [it["p"] for it in items]
    n_texts = [n for it in items for n in it["negs"]] if MIX_HARD_NEGS else []

    q = tok(q_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    p = tok(p_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    n = tok(n_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt") if n_texts else None
    return Batch(q, p, n, len(items))

idx = list(range(len(dataset)))
random.Random(SEED).shuffle(idx)
valN = max(200, int(0.1 * len(idx)))
val_ids, train_ids = idx[:valN], idx[valN:]
train_loader = DataLoader(Subset(dataset, train_ids), batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=collate)
val_loader   = DataLoader(Subset(dataset, val_ids),   batch_size=32,       shuffle=False,                 collate_fn=collate)

def load_lora_8bit(model_name: str):
    torch.cuda.empty_cache()
    base = AutoModel.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map="auto",          
        low_cpu_mem_usage=True,
    )
    lcfg = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,   
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGETS,             
        bias="none",
    )
    peft_model = get_peft_model(base, lcfg)
    peft_model.print_trainable_parameters()
    return peft_model

model = load_lora_8bit(BASE_MODEL)
model.train()

def mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

def encode_batch(enc_dict: Dict[str, torch.Tensor]) -> torch.Tensor:
    dev = next(model.parameters()).device
    for k in enc_dict:
        enc_dict[k] = enc_dict[k].to(dev)
    out = model(**enc_dict).last_hidden_state
    emb = mean_pooling(out, enc_dict["attention_mask"])
    return F.normalize(emb, p=2, dim=1)

trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = bnb.optim.AdamW8bit(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY)

steps_per_epoch = max(1, len(train_loader) // ACCUM_STEPS)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, int(WARMUP_RATIO * total_steps), total_steps)

ce = nn.CrossEntropyLoss()

def local_validate():
    model.eval()
    hit1=rec=mrr=ndcg=n=0.0
    with torch.no_grad():
        for b in val_loader:
            q = encode_batch(b.q)   # [B,d]
            p = encode_batch(b.p)   # [B,d]
            sims = (q @ p.t())      # [B,B]
            for i in range(sims.size(0)):
                row = sims[i]
                ranks = torch.argsort(row, descending=True)
                pos_rank = (ranks == i).nonzero(as_tuple=False).item() + 1
                k = 10
                hit1 += 1.0 if pos_rank == 1 else 0.0
                rec  += 1.0 if pos_rank <= k else 0.0
                mrr  += (1.0 / pos_rank) if pos_rank <= k else 0.0
                ndcg += (1.0 / math.log2(1 + pos_rank)) if pos_rank <= k else 0.0
                n += 1
    model.train()
    if n == 0: return {}
    return {"N": int(n), "Hit@1": hit1/n, "Recall@10": rec/n, "MRR@10": mrr/n, "nDCG@10": ndcg/n}

best_mrr = -1.0
global_step = 0

for epoch in range(1, EPOCHS+1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    running = 0.0
    optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(enumerate(train_loader, start=1), total=len(train_loader))

    for step_idx, b in pbar:
        q = encode_batch(b.q)                
        p = encode_batch(b.p)               
        logits = (q @ p.t()) / TEMP          
        labels = torch.arange(b.B, device=logits.device)
        loss = ce(logits, labels)

        if MIX_HARD_NEGS and b.n is not None and b.n["input_ids"].numel() > 0:
            n_emb = encode_batch(b.n)                       
            cand  = torch.cat([p, n_emb], dim=0)            
            logits2 = (q @ cand.t()) / TEMP
            loss2 = ce(logits2, labels)
            loss = 0.5 * loss + 0.5 * loss2

        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)

        if (step_idx % ACCUM_STEPS) == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

        running += loss.item()
        if step_idx % 50 == 0:
            pbar.set_postfix_str(f"loss={running/50:.4f} lr={scheduler.get_last_lr()[0]:.2e}")
            running = 0.0

    ep_dir = os.path.join(OUT_DIR, f"epoch{epoch}")
    os.makedirs(ep_dir, exist_ok=True)
    model.save_pretrained(ep_dir)
    tok.save_pretrained(ep_dir)

    metrics = local_validate()
    print("[VAL]", {k:(round(v,4) if isinstance(v,float) else v) for k,v in metrics.items()})
    if metrics and metrics.get("MRR@10", -1) > best_mrr:
        best_mrr = metrics["MRR@10"]
        best_dir = os.path.join(OUT_DIR, "best")
        os.makedirs(best_dir, exist_ok=True)
        model.save_pretrained(best_dir)
        tok.save_pretrained(best_dir)
        print(f"[BEST] MRR@10={best_mrr:.4f} saved -> {best_dir}")

model.save_pretrained(OUT_DIR)
tok.save_pretrained(OUT_DIR)
print("Saved to:", OUT_DIR)

2025-09-10 00:18:12.721125: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757463492.951430     147 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757463493.015057     147 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cuda
Loaded rows: 2052
Samples: 2052


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

trainable params: 2,678,784 || all params: 280,722,432 || trainable%: 0.9542

Epoch 1/1


100%|██████████| 230/230 [03:06<00:00,  1.23it/s, loss=0.1943 lr=1.45e-05]


[VAL] {'N': 205, 'Hit@1': 0.9951, 'Recall@10': 1.0, 'MRR@10': 0.9976, 'nDCG@10': 0.9982}
[BEST] MRR@10=0.9976 saved -> /kaggle/working/mE5_lora8bit_infoNCE_v1/best
Saved to: /kaggle/working/mE5_lora8bit_infoNCE_v1


In [2]:
import os, json, math, random
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType

TRAIN_JSONL   = "/kaggle/input/final-dataset-train/instructions_one_best_per_id.jsonl"
OUT_DIR       = "/kaggle/working/mE5_lora8bit_queryonly_v1"
BASE_MODEL    = "intfloat/multilingual-e5-base"  

SEED          = 42
EPOCHS        = 1
BATCH_SIZE    = 8          
ACCUM_STEPS   = 1
MAX_LEN       = 256        
LR            = 1e-4       
WARMUP_RATIO  = 0.1
WEIGHT_DECAY  = 0.0
TEMP          = 0.05
MIX_HARD_NEGS = True
MAX_HNEG      = 4

LORA_R        = 16
LORA_ALPHA    = 32
LORA_DROPOUT  = 0.05
LORA_TARGETS  = ["query","key","value","dense"]

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
random.seed(SEED); torch.manual_seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)

os.makedirs(OUT_DIR, exist_ok=True)
print("Device:", DEVICE)

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try: out.append(json.loads(line))
            except: pass
    return out

rows = load_jsonl(TRAIN_JSONL)
print("Loaded rows:", len(rows))

class QueryOnlyRetrievalDataset(Dataset):

    def __init__(self, data: List[Dict[str,Any]], max_hard_negs: int = MAX_HNEG):
        self.items = []
        for r in data:
            q = str(r.get("query_ru") or "").strip()
            p = str(r.get("positive_ru") or "").strip()
            if not q or not p:
                continue
            negs = r.get("hard_negs_ru") or []
            negs = [str(x).strip() for x in negs if str(x).strip()][:max_hard_negs]
            self.items.append({
                "q": f"query: {q}".strip(),         
                "p": f"passage: {p}".strip(),
                "negs": [f"passage: {t}" for t in negs]
            })
    def __len__(self): return len(self.items)
    def __getitem__(self, idx): return self.items[idx]

dataset = QueryOnlyRetrievalDataset(rows)
print("Samples:", len(dataset))

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

@dataclass
class Batch:
    q: Dict[str, torch.Tensor]
    p: Dict[str, torch.Tensor]
    n: Optional[Dict[str, torch.Tensor]]
    B: int

def collate(items: List[Dict[str,Any]]) -> Batch:
    q_texts = [it["q"] for it in items]
    p_texts = [it["p"] for it in items]
    n_texts = [n for it in items for n in it["negs"]] if MIX_HARD_NEGS else []

    q = tok(q_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    p = tok(p_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    n = tok(n_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt") if n_texts else None
    return Batch(q, p, n, len(items))

idx = list(range(len(dataset)))
random.Random(SEED).shuffle(idx)
valN = max(200, int(0.1 * len(idx)))
val_ids, train_ids = idx[:valN], idx[valN:]
train_loader = DataLoader(Subset(dataset, train_ids), batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=collate)
val_loader   = DataLoader(Subset(dataset, val_ids),   batch_size=32,       shuffle=False,                 collate_fn=collate)

def load_lora_8bit(model_name: str):
    torch.cuda.empty_cache()
    base = AutoModel.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    lcfg = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGETS,
        bias="none",
    )
    peft_model = get_peft_model(base, lcfg)
    peft_model.print_trainable_parameters()
    return peft_model

model = load_lora_8bit(BASE_MODEL)
model.train()

def mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

def encode_batch(enc_dict: Dict[str, torch.Tensor]) -> torch.Tensor:
    dev = next(model.parameters()).device
    for k in enc_dict:
        enc_dict[k] = enc_dict[k].to(dev)
    out = model(**enc_dict).last_hidden_state
    emb = mean_pooling(out, enc_dict["attention_mask"])
    return F.normalize(emb, p=2, dim=1)

trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = bnb.optim.AdamW8bit(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY)

steps_per_epoch = max(1, len(train_loader) // ACCUM_STEPS)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, int(WARMUP_RATIO * total_steps), total_steps)

ce = nn.CrossEntropyLoss()

def local_validate():
    model.eval()
    hit1=rec=mrr=ndcg=n=0.0
    with torch.no_grad():
        for b in val_loader:
            q = encode_batch(b.q)   # [B,d]
            p = encode_batch(b.p)   # [B,d]
            sims = (q @ p.t())      # [B,B]
            for i in range(sims.size(0)):
                row = sims[i]
                ranks = torch.argsort(row, descending=True)
                pos_rank = (ranks == i).nonzero(as_tuple=False).item() + 1
                k = 10
                hit1 += 1.0 if pos_rank == 1 else 0.0
                rec  += 1.0 if pos_rank <= k else 0.0
                mrr  += (1.0 / pos_rank) if pos_rank <= k else 0.0
                ndcg += (1.0 / math.log2(1 + pos_rank)) if pos_rank <= k else 0.0
                n += 1
    model.train()
    if n == 0: return {}
    return {"N": int(n), "Hit@1": hit1/n, "Recall@10": rec/n, "MRR@10": mrr/n, "nDCG@10": ndcg/n}

best_mrr = -1.0
global_step = 0

for epoch in range(1, EPOCHS+1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    running = 0.0
    optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(enumerate(train_loader, start=1), total=len(train_loader))

    for step_idx, b in pbar:
        q = encode_batch(b.q)                
        p = encode_batch(b.p)                
        logits = (q @ p.t()) / TEMP          
        labels = torch.arange(b.B, device=logits.device)
        loss = ce(logits, labels)

        if MIX_HARD_NEGS and b.n is not None and b.n["input_ids"].numel() > 0:
            n_emb = encode_batch(b.n)                       
            cand  = torch.cat([p, n_emb], dim=0)            
            logits2 = (q @ cand.t()) / TEMP
            loss2 = ce(logits2, labels)
            loss = 0.5 * loss + 0.5 * loss2

        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)

        if (step_idx % ACCUM_STEPS) == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

        running += loss.item()
        if step_idx % 50 == 0:
            pbar.set_postfix_str(f"loss={running/50:.4f} lr={scheduler.get_last_lr()[0]:.2e}")
            running = 0.0

    ep_dir = os.path.join(OUT_DIR, f"epoch{epoch}")
    os.makedirs(ep_dir, exist_ok=True)
    model.save_pretrained(ep_dir)
    tok.save_pretrained(ep_dir)

    metrics = local_validate()
    print("[VAL]", {k:(round(v,4) if isinstance(v,float) else v) for k,v in metrics.items()})
    if metrics and metrics.get("MRR@10", -1) > best_mrr:
        best_mrr = metrics["MRR@10"]
        best_dir = os.path.join(OUT_DIR, "best")
        os.makedirs(best_dir, exist_ok=True)
        model.save_pretrained(best_dir)
        tok.save_pretrained(best_dir)
        print(f"[BEST] MRR@10={best_mrr:.4f} saved -> {best_dir}")

model.save_pretrained(OUT_DIR)
tok.save_pretrained(OUT_DIR)
print("Saved to:", OUT_DIR)

Device: cuda
Loaded rows: 2052
Samples: 2052


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 2,678,784 || all params: 280,722,432 || trainable%: 0.9542

Epoch 1/1


100%|██████████| 230/230 [03:03<00:00,  1.25it/s, loss=0.4411 lr=1.45e-05]


[VAL] {'N': 205, 'Hit@1': 0.9854, 'Recall@10': 0.9951, 'MRR@10': 0.9902, 'nDCG@10': 0.9915}
[BEST] MRR@10=0.9902 saved -> /kaggle/working/mE5_lora8bit_queryonly_v1/best
Saved to: /kaggle/working/mE5_lora8bit_queryonly_v1


In [3]:
import ir_datasets as irds, json

out_dir = "mrtydi_ru_prepared"
import os; os.makedirs(out_dir, exist_ok=True)

ds = irds.load("mr-tydi/ru/test")  

with open(f"{out_dir}/corpus.jsonl", "w", encoding="utf-8") as fc:
    for d in ds.docs_iter():
        j = {"_id": str(d.doc_id), "title": getattr(d, "title", "") or "", "text": d.text or ""}
        fc.write(json.dumps(j, ensure_ascii=False) + "\n")

with open(f"{out_dir}/queries.jsonl", "w", encoding="utf-8") as fq:
    for q in ds.queries_iter():
        j = {"_id": str(q.query_id), "text": q.text}
        fq.write(json.dumps(j, ensure_ascii=False) + "\n")

with open(f"{out_dir}/qrels.tsv", "w", encoding="utf-8") as fr:
    for r in ds.qrels_iter():
        rel = int(getattr(r, "relevance", 1) > 0)
        fr.write(f"{r.query_id}\t{r.doc_id}\t{rel}\n")

print("Wrote:", out_dir, "->", os.listdir(out_dir))

[INFO] If you have a local copy of https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-russian.tar.gz, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/fab64459133bc93a0bec2f0559bfb423
[INFO] [starting] https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-russian.tar.gz
[INFO] [finished] https://git.uwaterloo.ca/jimmylin/mr.tydi/-/raw/master/data/mrtydi-v1.0-russian.tar.gz: [00:25] [1.55GB] [61.0MB/s]
[INFO] [starting] extracting from tar file                                                                         
[INFO] [finished] extracting from tar file [2.52s]


Wrote: mrtydi_ru_prepared -> ['corpus.jsonl', 'qrels.tsv', 'queries.jsonl']


In [7]:
import os, json, random

DATA_DIR = "/kaggle/working/mrtydi_ru_prepared"  # same as in your eval script
Q_PATH   = os.path.join(DATA_DIR, "queries.jsonl")
OUT_PATH = os.path.join(DATA_DIR, "instructions.jsonl")

TEMPLATES = [
    "Ты — поисковая система. Найди наиболее релевантные документы, которые помогают ответить на запрос. Возвращай документы, а не готовый ответ.",
    "Найди статьи/пассажи, содержащие информацию для ответа на вопрос. Отдавай приоритет точности и контекстной релевантности.",
    "Выполни поиск по корпусу и верни документы с наибольшей вероятностью содержать ответ. Избегай нерелевантных совпадений."
]

random.seed(42)
n = 0
with open(Q_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        j = json.loads(line)
        qid = str(j.get("_id") or j.get("id"))
        if not qid: 
            continue
        instr = TEMPLATES[hash(qid) % len(TEMPLATES)]   # stable pseudo-random choice per qid
        out = {"_id": qid, "instruction": instr}
        fout.write(json.dumps(out, ensure_ascii=False) + "\n")
        n += 1

print(f"Wrote {n} instructions to:", OUT_PATH)


Wrote 995 instructions to: /kaggle/working/mrtydi_ru_prepared/instructions.jsonl


In [8]:
# Mr.TyDi-RU eval (BM25 + Base + LoRA adapters)
#  - p-MRR@k = MRR@k(q+instr) − MRR@k(q) 
# ===========================================================
import os, io, json, gzip, math, random, re, time, hashlib
from collections import defaultdict
from typing import Dict, List, Tuple

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel

DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE_DOC = 64
BATCH_SIZE_Q   = 64
MAX_LEN        = 256
K_LIST         = [1, 5, 10]
SEED           = 42

DOC_CAP     = 20000     
FAST_Q_CAP  = None       

USE_PREFIXES = True

# Models
BASE_MODEL          = "intfloat/multilingual-e5-base"
FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"       
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"    

LOCAL_DATA_DIR = "/kaggle/working/mrtydi_ru_prepared"
INSTR_JSONL    = os.path.join(LOCAL_DATA_DIR, "instructions.jsonl")

ROOT_OUT  = "/kaggle/working/mrtydi_ru_eval"
EMB_CACHE = os.path.join(ROOT_OUT, "emb_cache")
os.makedirs(ROOT_OUT, exist_ok=True)
os.makedirs(EMB_CACHE, exist_ok=True)

random.seed(SEED); np.random.seed(SEED)
print(f"Device: {DEVICE} | DOC_CAP={DOC_CAP} | FAST_Q_CAP={FAST_Q_CAP}")

def tic(msg: str):
    print(msg, end="", flush=True)
    return time.time()

def toc(t0: float, prefix: str = "done"):
    dt = time.time() - t0
    print(f" {prefix} in {dt:.1f}s.")

def smart_open(path):
    if path.endswith(".gz"):
        f = gzip.open(path, "rb")
        return io.TextIOWrapper(f, encoding="utf-8")
    return open(path, "r", encoding="utf-8")

def _tqdm_lines(path: str, desc: str):
    """Yield lines and update tqdm by bytes read to show ETA without pre-counting lines."""
    total = os.path.getsize(path) if os.path.exists(path) else None
    with smart_open(path) as f, tqdm(total=total, unit="B", unit_scale=True, desc=desc) as pbar:
        for line in f:
            pbar.update(len(line.encode("utf-8")))
            yield line

def load_corpus(path: str) -> Dict[str, str]:
    corpus = {}
    for line in _tqdm_lines(path, "Loading corpus"):
        j = json.loads(line)
        cid   = str(j.get("_id") or j.get("id"))
        title = (j.get("title") or "").strip()
        text  = (j.get("text")  or "").strip()
        corpus[cid] = (title + " " + text).strip()
    return corpus

def load_queries(path: str) -> Dict[str, str]:
    queries = {}
    for line in _tqdm_lines(path, "Loading queries"):
        j = json.loads(line)
        qid  = str(j.get("_id") or j.get("id"))
        text = (j.get("text") or "").strip()
        queries[qid] = text
    return queries

def load_instructions_optional(path: str) -> Dict[str, str]:
    """Optional JSONL with instruction text per qid. Tries several key names."""
    if not os.path.exists(path):
        return {}
    inst_map = {}
    for line in _tqdm_lines(path, "Loading instructions"):
        j = json.loads(line)
        qid = str(j.get("_id") or j.get("id") or j.get("qid") or "").strip()
        instr = (j.get("instruction") or j.get("instruction_og") or j.get("instr") or "").strip()
        if qid and instr:
            inst_map[qid] = instr
    return inst_map

def load_qrels_tsv(path: str) -> Dict[str, Dict[str, int]]:
    qrels = defaultdict(dict)
    for row in _tqdm_lines(path, "Loading qrels"):
        row = row.strip()
        if not row or row.startswith("#"): continue
        parts = row.split("\t")
        if len(parts) < 3: continue
        qid, did, rel = parts[0].strip(), parts[1].strip(), parts[2].strip()
        try:
            rel = int(float(rel))
        except:
            continue
        if rel > 0:
            qrels[qid][did] = 1
    return dict(qrels)

for fn in ["corpus.jsonl", "queries.jsonl", "qrels.tsv"]:
    if not os.path.exists(os.path.join(LOCAL_DATA_DIR, fn)):
        raise FileNotFoundError(f"Missing {fn} under {LOCAL_DATA_DIR}")

t0 = tic("Loading dataset files…")
corpus  = load_corpus(os.path.join(LOCAL_DATA_DIR, "corpus.jsonl"))
queries = load_queries(os.path.join(LOCAL_DATA_DIR, "queries.jsonl"))
qrels   = load_qrels_tsv(os.path.join(LOCAL_DATA_DIR, "qrels.tsv"))
instructions = load_instructions_optional(INSTR_JSONL)  # may be empty
toc(t0, prefix=f"Loaded (docs={len(corpus):,}, queries={len(queries):,}, qrels_q={len(qrels):,}, instr_q={len(instructions):,})")

rng = random.Random(SEED)

qids_all = [qid for qid in qrels.keys() if qid in queries]

all_pos_doc_ids = set()
for qid in qids_all:
    all_pos_doc_ids.update(qrels[qid].keys())

pos_doc_ids = [d for d in all_pos_doc_ids if d in corpus]

if DOC_CAP is None:
    doc_ids = list(corpus.keys())
    neg_added = "N/A"
else:
    if len(pos_doc_ids) >= DOC_CAP:
        rng.shuffle(pos_doc_ids)
        doc_ids = pos_doc_ids[:DOC_CAP]
        neg_added = 0
    else:
        need = DOC_CAP - len(pos_doc_ids)
        neg_pool = [d for d in corpus.keys() if d not in all_pos_doc_ids]
        rng.shuffle(neg_pool)
        neg_added = min(need, len(neg_pool))
        doc_ids = pos_doc_ids + neg_pool[:neg_added]

corpus = {d: corpus[d] for d in doc_ids}
doc_texts_raw  = [corpus[d] for d in doc_ids]

qids_all = [qid for qid in qids_all if any((did in corpus) for did in qrels[qid].keys())]

if FAST_Q_CAP and FAST_Q_CAP < len(qids_all):
    rng.shuffle(qids_all)
    qids_all = qids_all[:FAST_Q_CAP]

num_q_with_pos = len(qids_all)
print(
    f"Eval set -> docs: {len(doc_ids):,} "
    f"(positives kept: {len(pos_doc_ids):,}, random negatives added: {neg_added}) "
    f"| queries: {num_q_with_pos:,}"
)
if num_q_with_pos == 0:
    raise RuntimeError("No queries left after subsetting. Increase DOC_CAP, disable caps, or verify your qrels/doc ids.")

def add_prefix(s: str, kind: str) -> str:
    if not USE_PREFIXES: return s
    return (("query: " if kind == "query" else "passage: ") + s).strip()

doc_texts_pref = [add_prefix(t, "passage") for t in doc_texts_raw]
query_map_raw  = {qid: queries[qid] for qid in qids_all}
query_map_pref_plain = {qid: add_prefix(queries[qid], "query") for qid in qids_all}

qids_with_instr = [qid for qid in qids_all if qid in instructions]
query_map_pref_instr = {qid: add_prefix((queries[qid] + " " + instructions[qid]).strip(), "query")
                        for qid in qids_with_instr}

def eval_multi_k(run, qrels_map, ks):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats.update({f"Recall@{k}":0.0, f"MRR@{k}":0.0, f"nDCG@{k}":0.0})
    n = 0
    for qid, ranking in run.items():
        if qid not in qrels_map: continue
        n += 1
        rels = qrels_map[qid]
        pos_ranks = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels]
        pos_ranks.sort()
        if pos_ranks and pos_ranks[0] == 1:
            stats["Hit@1"] += 1.0
        for k in ks:
            stats[f"Recall@{k}"] += float(any(r <= k for r in pos_ranks))
            stats[f"MRR@{k}"]    += (1.0/pos_ranks[0]) if (pos_ranks and pos_ranks[0] <= k) else 0.0
            dcg  = sum(1.0/math.log2(r+1) for r in pos_ranks if r <= k)
            m    = len(rels)
            idcg = sum(1.0/math.log2(r+1) for r in range(1, min(m, k)+1))
            stats[f"nDCG@{k}"] += (dcg/idcg) if idcg > 0 else 0.0
    if n == 0:
        return stats
    stats["N"] = n
    for k in ["Hit@1"]+[f"Recall@{x}" for x in ks]+[f"MRR@{x}" for x in ks]+[f"nDCG@{x}" for x in ks]:
        stats[k] /= n
    return stats

def fmt(d): 
    return {k:(round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

def pmrr(metrics_plain: Dict[str,float], metrics_instr: Dict[str,float], ks: List[int]) -> Dict[str,float]:
    out = {}
    for k in ks:
        a = metrics_plain.get(f"MRR@{k}", 0.0)
        b = metrics_instr.get(f"MRR@{k}", 0.0)
        out[f"pMRR@{k}"] = b - a
    return out

def mean_pool(last_hidden, attention_mask):
    m = attention_mask.unsqueeze(-1).expand_as(last_hidden).float()
    return (last_hidden * m).sum(1) / m.sum(1).clamp(min=1e-9)

def corpus_signature(doc_ids: List[str]) -> str:
    head = "||".join(doc_ids[:50]); tail = "||".join(doc_ids[-50:]) if len(doc_ids) > 50 else ""
    return hashlib.md5((head + "|" + tail + f"|{len(doc_ids)}").encode()).hexdigest()[:8]

def signature(model_or_adapter: str) -> str:
    name = os.path.basename(model_or_adapter.rstrip("/")) if os.path.isdir(model_or_adapter) else model_or_adapter.replace("/", "_")
    return f"{corpus_signature(doc_ids)}__L{MAX_LEN}__{name}"

def load_tok_model_any(model_or_adapter: str, base_model: str):
    """Load tokenizer + model"""
    is_adapter = os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "adapter_config.json"))
    tok_src = model_or_adapter if (os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "tokenizer.json"))) else base_model
    tok = AutoTokenizer.from_pretrained(tok_src, use_fast=True)

    def _load_base(to_device: str):
        dtype = torch.float16 if to_device == "cuda" else None
        return AutoModel.from_pretrained(base_model, torch_dtype=dtype, low_cpu_mem_usage=True).to(to_device).eval()

    try:
        base = _load_base("cuda" if DEVICE == "cuda" else "cpu")
    except RuntimeError as e:
        print(f"[warn] GPU load failed ({type(e).__name__}): falling back to CPU.")
        base = _load_base("cpu")

    if is_adapter:
        mdl = PeftModel.from_pretrained(base, model_or_adapter).eval()
    else:
        mdl = base
    return tok, mdl

@torch.no_grad()
def encode_texts(mdl, tok, texts: List[str], bs: int, max_len: int) -> torch.Tensor:
    dev = next(mdl.parameters()).device
    outs = []
    for i in tqdm(range(0, len(texts), bs), total=(len(texts)+bs-1)//bs, desc=f"Encode@{dev}"):
        batch = texts[i:i+bs]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(dev)
        last = mdl(**enc).last_hidden_state
        pooled = mean_pool(last, enc["attention_mask"])
        outs.append(F.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(outs, 0) if outs else torch.empty(0, mdl.config.hidden_size)

def build_index(doc_embs_cpu: torch.Tensor):
    try:
        import faiss
        index = faiss.IndexFlatIP(doc_embs_cpu.shape[1])
        index.add(doc_embs_cpu.numpy().astype("float32"))
        return index, True
    except Exception:
        return doc_embs_cpu, False

@torch.no_grad()
def search_index(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        import faiss
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    sims = torch.mm(q_embs, index.t())
    D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
    return I.numpy(), D.numpy()

def eval_model(model_or_adapter: str, label: str,
               queries_map_pref: Dict[str,str], qids_list: List[str]):
    print(f"\n=== {label} ===")
    t0 = tic("Loading model/tokenizer…")
    tok, mdl = load_tok_model_any(model_or_adapter, BASE_MODEL)
    toc(t0)

    sig = signature(model_or_adapter)
    cache_path = os.path.join(EMB_CACHE, f"docs__{sig}.pt")

    if os.path.exists(cache_path):
        doc_embs = torch.load(cache_path, map_location="cpu")
        print(f"[cache] doc embs loaded: {doc_embs.shape}")
    else:
        t1 = tic(f"Encoding docs (N={len(doc_texts_pref):,})…")
        doc_embs = encode_texts(mdl, tok, doc_texts_pref, BATCH_SIZE_DOC, MAX_LEN)
        toc(t1)
        torch.save(doc_embs, cache_path)
        print(f"[cache] saved -> {cache_path}")

    index, is_faiss = build_index(doc_embs)

    q_texts = [queries_map_pref[qid] for qid in qids_list]
    t2 = tic(f"Encoding queries (N={len(q_texts):,})…")
    q_embs  = encode_texts(mdl, tok, q_texts, BATCH_SIZE_Q, MAX_LEN)
    toc(t2)

    t3 = tic("Searching…")
    I, D = search_index(index, is_faiss, q_embs, topk=max(K_LIST))
    toc(t3)

    run = {}
    for i, qid in enumerate(qids_list):
        ids = I[i]; scores = D[i]
        run[qid] = [(doc_ids[j], float(scores[k])) for k, j in enumerate(ids)]

    m = eval_multi_k(run, qrels, K_LIST)
    print("Metrics:", fmt(m))
    return m

# BM25 (raw text, no prefixes)
token_re = re.compile(r"[a-zа-яё0-9]+", re.IGNORECASE)
def ru_tok(s: str): return token_re.findall(s.lower())

def bm25_run(doc_texts: List[str], doc_ids: List[str], qids: List[str], queries_map: Dict[str, str], topk: int):
    toks = []
    for t in tqdm(doc_texts, desc="BM25 tokenize docs", total=len(doc_texts)):
        toks.append(ru_tok(t))
    t0 = tic("Building BM25 index…")
    bm = BM25Okapi(toks)
    toc(t0)

    run = {}
    for qid in tqdm(qids, desc="BM25 search", total=len(qids)):
        q_tokens = ru_tok(queries_map[qid])
        scores = bm.get_scores(q_tokens)
        if topk >= len(doc_ids):
            idx = np.argsort(scores)[::-1]
        else:
            part = np.argpartition(scores, -topk)[-topk:]
            idx = part[np.argsort(scores[part])[::-1]]
        run[qid] = [(doc_ids[j], float(scores[j])) for j in idx]
    return run

results = {}

# 1) BM25 (plain)
print("\n>>> BM25 (plain)")
bm_run_plain = bm25_run(doc_texts_raw, doc_ids, qids_all, query_map_raw, topk=max(K_LIST))
results["bm25_plain"] = eval_multi_k(bm_run_plain, qrels, K_LIST)
print("BM25 (plain) ->", fmt(results["bm25_plain"]))

if qids_with_instr:
    print("\n>>> BM25 (q+instr)  [on queries with instructions only]")
    raw_instr_map = {qid: (queries[qid] + " " + instructions[qid]).strip() for qid in qids_with_instr}
    bm_run_instr = bm25_run(doc_texts_raw, doc_ids, qids_with_instr, raw_instr_map, topk=max(K_LIST))
    results["bm25_instr"] = eval_multi_k(bm_run_instr, qrels, K_LIST)
    print("BM25 (instr) ->", fmt(results["bm25_instr"]))

# 2) Baseline encoder (no adapters) — plain
results["base_plain"] = eval_model(BASE_MODEL, "Baseline (mE5-base) :: plain", query_map_pref_plain, qids_all)

# 2b) Baseline encoder — q+instr 
if qids_with_instr:
    results["base_instr"] = eval_model(BASE_MODEL, "Baseline (mE5-base) :: q+instr", query_map_pref_instr, qids_with_instr)

# 3) LoRA adapters
if os.path.isdir(FINETUNED_INST_DIR):
    results["ft_inst_plain"] = eval_model(FINETUNED_INST_DIR, "Finetuned (instruction-aware, LoRA) :: plain", query_map_pref_plain, qids_all)
    if qids_with_instr:
        results["ft_inst_instr"] = eval_model(FINETUNED_INST_DIR, "Finetuned (instruction-aware, LoRA) :: q+instr", query_map_pref_instr, qids_with_instr)
else:
    print(f"[skip] {FINETUNED_INST_DIR} not found.")

if os.path.isdir(FINETUNED_QUERY_DIR):
    results["ft_query_plain"] = eval_model(FINETUNED_QUERY_DIR, "Finetuned (query-only, LoRA) :: plain", query_map_pref_plain, qids_all)
    if qids_with_instr:
        results["ft_query_instr"] = eval_model(FINETUNED_QUERY_DIR, "Finetuned (query-only, LoRA) :: q+instr", query_map_pref_instr, qids_with_instr)
else:
    print(f"[skip] {FINETUNED_QUERY_DIR} not found.")

# Summary
print("\n=== SUMMARY (Mr.TyDi-RU) ===")
for k, v in results.items():
    print(f"{k:14s} ->", fmt(v))

# p-MRR 
if qids_with_instr:
    print("\n=== p-MRR (instr − plain) on queries with instructions only ===")
    def safe_pmrr(tag_plain, tag_instr):
        if tag_plain in results and tag_instr in results:
            return pmrr(results[tag_plain], results[tag_instr], K_LIST)
        return None

    pmrr_base = safe_pmrr("base_plain", "base_instr")
    if pmrr_base:
        print("Baseline:", fmt(pmrr_base))

    pmrr_ft_inst = safe_pmrr("ft_inst_plain", "ft_inst_instr")
    if pmrr_ft_inst:
        print("FT-Inst:", fmt(pmrr_ft_inst))

    pmrr_ft_q = safe_pmrr("ft_query_plain", "ft_query_instr")
    if pmrr_ft_q:
        print("FT-Query:", fmt(pmrr_ft_q))

    pmrr_bm25 = safe_pmrr("bm25_plain", "bm25_instr")
    if pmrr_bm25:
        print("BM25:   ", fmt(pmrr_bm25))
else:
    print("\n[p-MRR] No instructions.jsonl found or no overlap with qrels — skipping p-MRR.")


Device: cuda | DOC_CAP=20000 | FAST_Q_CAP=None
Loading dataset files…

Loading corpus: 100%|██████████| 5.84G/5.84G [01:19<00:00, 73.4MB/s]
Loading queries: 100%|██████████| 113k/113k [00:00<00:00, 29.2MB/s]
Loading qrels: 100%|██████████| 19.0k/19.0k [00:00<00:00, 8.20MB/s]
Loading instructions: 100%|██████████| 268k/268k [00:00<00:00, 56.0MB/s]


 Loaded (docs=9,597,504, queries=995, qrels_q=995, instr_q=995) in 79.6s.
Eval set -> docs: 20,000 (positives kept: 1,100, random negatives added: 18900) | queries: 995

>>> BM25 (plain)


BM25 tokenize docs: 100%|██████████| 20000/20000 [00:00<00:00, 20717.27it/s]

Building BM25 index…




 done in 0.4s.


BM25 search: 100%|██████████| 995/995 [00:38<00:00, 25.53it/s]


BM25 (plain) -> {'N': 995, 'Hit@1': 0.4171, 'Recall@1': 0.4171, 'MRR@1': 0.4171, 'nDCG@1': 0.4171, 'Recall@5': 0.5548, 'MRR@5': 0.4713, 'nDCG@5': 0.4676, 'Recall@10': 0.599, 'MRR@10': 0.477, 'nDCG@10': 0.4813}

>>> BM25 (q+instr)  [on queries with instructions only]


BM25 tokenize docs: 100%|██████████| 20000/20000 [00:00<00:00, 45590.95it/s]

Building BM25 index…




 done in 0.5s.


BM25 search: 100%|██████████| 995/995 [02:04<00:00,  7.97it/s]

BM25 (instr) -> {'N': 995, 'Hit@1': 0.3789, 'Recall@1': 0.3789, 'MRR@1': 0.3789, 'nDCG@1': 0.3789, 'Recall@5': 0.5025, 'MRR@5': 0.4265, 'nDCG@5': 0.4215, 'Recall@10': 0.5457, 'MRR@10': 0.4322, 'nDCG@10': 0.4346}

=== Baseline (mE5-base) :: plain ===
Loading model/tokenizer…




 done in 3.2s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:00<00:00, 31.24it/s]

 done in 0.5s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'nDCG@10': 0.8782}

=== Baseline (mE5-base) :: q+instr ===
Loading model/tokenizer… done in 3.1s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:00<00:00, 18.40it/s]

 done in 0.9s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.7779, 'Recall@1': 0.7779, 'MRR@1': 0.7779, 'nDCG@1': 0.7779, 'Recall@5': 0.9156, 'MRR@5': 0.8352, 'nDCG@5': 0.8426, 'Recall@10': 0.9357, 'MRR@10': 0.8381, 'nDCG@10': 0.8507}

=== Finetuned (instruction-aware, LoRA) :: plain ===
Loading model/tokenizer… done in 1.8s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:01<00:00, 14.88it/s]

 done in 1.1s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8975, 'MRR@5': 0.803, 'nDCG@5': 0.8124, 'Recall@10': 0.9276, 'MRR@10': 0.8071, 'nDCG@10': 0.8233}

=== Finetuned (instruction-aware, LoRA) :: q+instr ===
Loading model/tokenizer… done in 1.7s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:01<00:00,  8.43it/s]

 done in 1.9s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.7477, 'Recall@1': 0.7477, 'MRR@1': 0.7477, 'nDCG@1': 0.7477, 'Recall@5': 0.8894, 'MRR@5': 0.8084, 'nDCG@5': 0.8135, 'Recall@10': 0.9216, 'MRR@10': 0.8126, 'nDCG@10': 0.8257}

=== Finetuned (query-only, LoRA) :: plain ===
Loading model/tokenizer… done in 1.8s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:01<00:00, 14.99it/s]

 done in 1.1s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.7518, 'Recall@1': 0.7518, 'MRR@1': 0.7518, 'nDCG@1': 0.7518, 'Recall@5': 0.8975, 'MRR@5': 0.8131, 'nDCG@5': 0.8201, 'Recall@10': 0.9286, 'MRR@10': 0.8171, 'nDCG@10': 0.8308}

=== Finetuned (query-only, LoRA) :: q+instr ===
Loading model/tokenizer… done in 1.8s.
[cache] doc embs loaded: torch.Size([20000, 768])
Encoding queries (N=995)…

Encode@cuda:0: 100%|██████████| 16/16 [00:01<00:00,  8.39it/s]

 done in 1.9s.
Searching…




 done in 0.1s.
Metrics: {'N': 995, 'Hit@1': 0.7427, 'Recall@1': 0.7427, 'MRR@1': 0.7427, 'nDCG@1': 0.7427, 'Recall@5': 0.8915, 'MRR@5': 0.8046, 'nDCG@5': 0.8118, 'Recall@10': 0.9166, 'MRR@10': 0.8081, 'nDCG@10': 0.8217}

=== SUMMARY (Mr.TyDi-RU) ===
bm25_plain     -> {'N': 995, 'Hit@1': 0.4171, 'Recall@1': 0.4171, 'MRR@1': 0.4171, 'nDCG@1': 0.4171, 'Recall@5': 0.5548, 'MRR@5': 0.4713, 'nDCG@5': 0.4676, 'Recall@10': 0.599, 'MRR@10': 0.477, 'nDCG@10': 0.4813}
bm25_instr     -> {'N': 995, 'Hit@1': 0.3789, 'Recall@1': 0.3789, 'MRR@1': 0.3789, 'nDCG@1': 0.3789, 'Recall@5': 0.5025, 'MRR@5': 0.4265, 'nDCG@5': 0.4215, 'Recall@10': 0.5457, 'MRR@10': 0.4322, 'nDCG@10': 0.4346}
base_plain     -> {'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'nDCG@10': 0.8782}
base_instr     -> {'N': 995, 'Hit@1': 0.7779, 'Recall@1': 0.7779, 'MRR@1': 0.7779, 'nDCG@1': 0.7779, 'Recall@5': 0.

In [17]:
# RuBQ_2.0 
from pathlib import Path
import requests, zipfile, io, shutil, os

TARGET = Path("/kaggle/working/RuBQ_2.0")

def ensure_rubq_2_0(target: Path = TARGET) -> Path:
    target.mkdir(parents=True, exist_ok=True)
    has_any = any(target.rglob("*.json")) or any(target.rglob("*.jsonl"))
    if has_any:
        print(f"[ok] RuBQ_2.0 already present at: {target}")
        return target

    url = "https://codeload.github.com/vladislavneon/RuBQ/zip/refs/heads/master"
    print("[fetch] downloading RuBQ repo zip…")
    r = requests.get(url, timeout=90)
    r.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        top_levels = {name.split("/")[0] for name in zf.namelist() if "/" in name}
        if not top_levels:
            raise RuntimeError("Unexpected zip layout (no top-level folder).")
        root_dir = sorted(top_levels)[0]  # e.g., 'RuBQ-master'
        inner_prefix = f"{root_dir}/RuBQ_2.0/"
        members = [m for m in zf.namelist() if m.startswith(inner_prefix)]
        if not members:
            raise RuntimeError("Couldn’t find 'RuBQ_2.0/' inside the repo zip.")
        extract_base = Path("/kaggle/working/_rubq_tmp")
        if extract_base.exists():
            shutil.rmtree(extract_base, ignore_errors=True)
        extract_base.mkdir(parents=True, exist_ok=True)
        zf.extractall(extract_base)

    src = extract_base / inner_prefix
    if target.exists():
        shutil.rmtree(target, ignore_errors=True)
    shutil.move(str(src), str(target))
    shutil.rmtree(extract_base, ignore_errors=True)

    print(f"[ok] RuBQ_2.0 ready at: {target}")
    return target

RUBQ_DIR = str(ensure_rubq_2_0())

json_count  = sum(1 for _ in Path(RUBQ_DIR).rglob("*.json"))
jsonl_count = sum(1 for _ in Path(RUBQ_DIR).rglob("*.jsonl"))
print(f"[scan] Found JSON: {json_count:,} | JSONL: {jsonl_count:,}")
print("Set RUBQ_DIR =", RUBQ_DIR)


[fetch] downloading RuBQ repo zip…
[ok] RuBQ_2.0 ready at: /kaggle/working/RuBQ_2.0
[scan] Found JSON: 3 | JSONL: 0
Set RUBQ_DIR = /kaggle/working/RuBQ_2.0


In [22]:
# RuBQ_2.0 -> Minimal IR Eval (BM25 + Base + LoRA)

import os, json, re, math, random, hashlib
from collections import defaultdict, Counter
from typing import Dict, List, Tuple

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import numpy as np
from tqdm import tqdm
from rank_bm25 import BM25Okapi

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel

DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE_DOC = 64
BATCH_SIZE_Q   = 64
MAX_LEN        = 256
K_LIST         = [1, 5, 10]
SEED           = 42

DOC_CAP        = None     
FAST_Q_CAP     = None      

USE_PREFIXES   = True

BASE_MODEL          = "intfloat/multilingual-e5-base"
FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"      # instruction-aware
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"    # query-only

RUBQ_DIR = "/kaggle/working/RuBQ_2.0"

ROOT_OUT  = "/kaggle/working/rubq_ir_eval"
EMB_CACHE = os.path.join(ROOT_OUT, "emb_cache")
os.makedirs(ROOT_OUT, exist_ok=True)
os.makedirs(EMB_CACHE, exist_ok=True)

random.seed(SEED); np.random.seed(SEED)
print(f"Device: {DEVICE} | DOC_CAP={DOC_CAP} | FAST_Q_CAP={FAST_Q_CAP}")

def add_prefix(s: str, kind: str) -> str:
    if not USE_PREFIXES: return s
    return (("query: " if kind == "query" else "passage: ") + s).strip()

def fmt(d): return {k:(round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

def signature(doc_ids: List[str], name: str):
    head="||".join(doc_ids[:50]); tail="||".join(doc_ids[-50:]) if len(doc_ids)>50 else ""
    return hashlib.md5((head+"|"+tail+f"|{len(doc_ids)}").encode()).hexdigest()[:8] + f"__L{MAX_LEN}__{name.replace('/','_')}"

# Parse RuBQ JSONs
def first_nonempty(d: dict, keys: List[str]):
    for k in keys:
        v = d.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip()
    return None

def extract_evidence_texts(item) -> List[str]:
    texts = []
    for k in ["evidence", "evidences"]:
        if k in item:
            ev = item[k]
            if isinstance(ev, str) and ev.strip():
                texts.append(ev.strip())
            elif isinstance(ev, list):
                for e in ev:
                    if isinstance(e, str) and e.strip():
                        texts.append(e.strip())
                    elif isinstance(e, dict):
                        t = first_nonempty(e, ["text", "snippet", "abstract", "passage", "content"])
                        if t: texts.append(t)
    return texts

def extract_answer_texts(item) -> List[str]:
    out = []
    if "answers" in item and isinstance(item["answers"], list):
        for a in item["answers"]:
            if isinstance(a, str) and a.strip():
                out.append(a.strip())
            elif isinstance(a, dict):
                t = first_nonempty(a, ["label", "text", "name", "description"])
                if t: out.append(t)
    if "answer" in item and isinstance(item["answer"], dict):
        t = first_nonempty(item["answer"], ["label", "text", "name", "description"])
        if t: out.append(t)
    for k in ["label", "description"]:
        if isinstance(item.get(k), str) and item[k].strip():
            out.append(item[k].strip())
    uniq = []
    seen = set()
    for s in out:
        s2 = s.strip()
        if s2 and s2 not in seen:
            seen.add(s2); uniq.append(s2)
    return uniq

def scan_jsons(root: str) -> List[dict]:
    all_items = []
    if not os.path.isdir(root):
        return []
    for dirpath, _, files in os.walk(root):
        for fn in files:
            if not fn.lower().endswith(".json"): continue
            fp = os.path.join(dirpath, fn)
            try:
                data = json.load(open(fp, "r", encoding="utf-8"))
                if isinstance(data, list):
                    all_items.extend(data)
                elif isinstance(data, dict):
                    for v in data.values():
                        if isinstance(v, list):
                            all_items.extend(v)
            except Exception as e:
                print(f"[warn] bad JSON: {fp} ({type(e).__name__})")
    return all_items

print("\n[prep] Scanning RuBQ JSON…")
raw_items = scan_jsons(RUBQ_DIR)
print(f"[prep] Loaded raw entries: {len(raw_items)}")

corpus: Dict[str, str] = {}
queries: Dict[str, str] = {}
qrels: Dict[str, Dict[str, int]] = defaultdict(dict)

def add_doc_get_id(text: str, text2id: Dict[str, str], prefix="D") -> str:
    h = hashlib.md5(text.encode("utf-8")).hexdigest()[:16]
    did = f"{prefix}_{h}"
    text2id[did] = text
    return did

q_missing = 0
has_evidence = 0
used_fallback = 0

for i, item in enumerate(tqdm(raw_items, desc="Parsing rows")):
    q = first_nonempty(item, ["question", "question_text", "question_ru", "text"])
    if not q:
        q_missing += 1
        continue
    qid = f"Q_{i:08d}"
    queries[qid] = q

    ev_txts = extract_evidence_texts(item)
    if ev_txts:
        has_evidence += 1
        for t in ev_txts:
            did = add_doc_get_id(t, corpus, prefix="E")
            qrels[qid][did] = 1
        continue

    ans_txts = extract_answer_texts(item)
    if ans_txts:
        used_fallback += 1
        for t in ans_txts:
            did = add_doc_get_id(t, corpus, prefix="A")
            qrels[qid][did] = 1

print(f"[prep] Done. Docs={len(corpus)} | Queries={len(queries)} | QrelsQ={len(qrels)}")
print(f"[prep] Items without question: {q_missing}")
print(f"[prep] Items with 'evidence*': {has_evidence} | using fallback(Label/Desc): {used_fallback}")

if len(corpus) == 0 or len(queries) == 0 or len(qrels) == 0:
    print("\n[STOP] Parsed dataset is too small/empty for IR.")
    print(" - Ensure your RuBQ JSONs include 'evidence/evidences' or answer labels/descriptions.")
    raise SystemExit(0)

doc_ids = list(corpus.keys())
if DOC_CAP and DOC_CAP < len(doc_ids):
    rng = random.Random(SEED); rng.shuffle(doc_ids); doc_ids = doc_ids[:DOC_CAP]
corpus = {d: corpus[d] for d in doc_ids}

qids_all = [qid for qid, rels in qrels.items() if any(d in corpus for d in rels)]
if FAST_Q_CAP and FAST_Q_CAP < len(qids_all):
    rng = random.Random(SEED); rng.shuffle(qids_all); qids_all = qids_all[:FAST_Q_CAP]

print(f"\n[eval] Final set -> docs={len(doc_ids)} | queries={len(qids_all)}")

# Metrics
def eval_multi_k(run, qrels_map, ks):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats.update({
            f"Recall@{k}": 0.0,
            f"MRR@{k}":    0.0,
            f"nDCG@{k}":   0.0,
            f"MAP@{k}":    0.0,   
        })

    n = 0
    for qid, ranking in run.items():
        if qid not in qrels_map:
            continue
        n += 1
        rels = qrels_map[qid]                 
        pos_ranks = [r for r, (did, _) in enumerate(ranking, start=1) if did in rels]
        pos_ranks.sort()

        # Hit@1
        if pos_ranks and pos_ranks[0] == 1:
            stats["Hit@1"] += 1.0

        m = len(rels)                         

        for k in ks:
            stats[f"Recall@{k}"] += float(any(r <= k for r in pos_ranks))

            stats[f"MRR@{k}"] += (1.0 / pos_ranks[0]) if (pos_ranks and pos_ranks[0] <= k) else 0.0

            dcg  = sum(1.0 / math.log2(r + 1) for r in pos_ranks if r <= k)
            idcg = sum(1.0 / math.log2(r + 1) for r in range(1, min(m, k) + 1))
            stats[f"nDCG@{k}"] += (dcg / idcg) if idcg > 0 else 0.0

            denom = min(k, m)
            if denom > 0:
                ap_k = 0.0
                rel_found_so_far = 0
                j = 0
                for r in pos_ranks:
                    if r > k:
                        break
                    j += 1                    
                    rel_found_so_far = j
                    prec_at_r = rel_found_so_far / r
                    ap_k += prec_at_r
                stats[f"MAP@{k}"] += ap_k / denom
    if n == 0:
        return stats

    stats["N"] = n
    for key in ["Hit@1"] + \
               [f"Recall@{x}" for x in ks] + \
               [f"MRR@{x}" for x in ks] + \
               [f"nDCG@{x}" for x in ks] + \
               [f"MAP@{x}" for x in ks]:
        stats[key] /= n
    return stats

# BM25 
token_re = re.compile(r"[a-zа-яё0-9]+", re.IGNORECASE)
def ru_tok(s: str): return token_re.findall(s.lower())

def bm25_run(doc_texts: List[str], doc_ids: List[str], qids: List[str], qmap: Dict[str,str], topk: int):
    print("\n[BM25] Building index over docs…")
    bm = BM25Okapi([ru_tok(t) for t in tqdm(doc_texts, desc="BM25 build")])
    run={}
    for qid in tqdm(qids, desc="BM25 search"):
        toks = ru_tok(qmap[qid])
        scores = bm.get_scores(toks)
        if topk >= len(doc_ids): idx = np.argsort(scores)[::-1]
        else:
            part = np.argpartition(scores, -topk)[-topk:]
            idx = part[np.argsort(scores[part])[::-1]]
        run[qid]=[(doc_ids[j], float(scores[j])) for j in idx]
    return run

def mean_pool(last_hidden, attention_mask):
    m = attention_mask.unsqueeze(-1).expand_as(last_hidden).float()
    return (last_hidden * m).sum(1) / m.sum(1).clamp(min=1e-9)

def load_tok_model_any(model_or_adapter: str, base_model: str):
    is_adapter = os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "adapter_config.json"))
    tok_src = model_or_adapter if (os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "tokenizer.json"))) else base_model
    tok = AutoTokenizer.from_pretrained(tok_src, use_fast=True)
    if DEVICE=="cuda":
        base = AutoModel.from_pretrained(base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True).to("cuda").eval()
    else:
        base = AutoModel.from_pretrained(base_model, low_cpu_mem_usage=True).to("cpu").eval()
    mdl = PeftModel.from_pretrained(base, model_or_adapter).eval() if is_adapter else base
    return tok, mdl

@torch.no_grad()
def encode_texts(mdl, tok, texts: List[str], bs: int, max_len: int, device: str):
    outs=[]
    for i in tqdm(range(0,len(texts),bs), desc=f"Encode@{device}"):
        batch=texts[i:i+bs]
        enc=tok(batch,padding=True,truncation=True,max_length=max_len,return_tensors="pt")
        for k in enc: enc[k]=enc[k].to(device)
        last=mdl(**enc).last_hidden_state
        pooled=mean_pool(last, enc["attention_mask"])
        outs.append(F.normalize(pooled,p=2,dim=1).cpu())
    return torch.cat(outs,0) if outs else torch.empty(0, mdl.config.hidden_size)

def build_index(doc_embs_cpu: torch.Tensor):
    try:
        import faiss
        index = faiss.IndexFlatIP(doc_embs_cpu.shape[1])
        index.add(doc_embs_cpu.numpy().astype("float32"))
        return index, True
    except Exception:
        return doc_embs_cpu, False

@torch.no_grad()
def search_index(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        import faiss
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    sims = torch.mm(q_embs, index.t())
    D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
    return I.numpy(), D.numpy()

def eval_model(model_or_adapter: str, label: str, doc_texts_pref: List[str], doc_ids: List[str],
               qids: List[str], qmap_pref: Dict[str,str]):
    print(f"\n=== {label} ===")
    tok, mdl = load_tok_model_any(model_or_adapter, BASE_MODEL)
    dev = str(next(mdl.parameters()).device)
    sig = signature(doc_ids, os.path.basename(model_or_adapter) if os.path.isdir(model_or_adapter) else model_or_adapter)
    cache_p = os.path.join(EMB_CACHE, f"docs__{sig}.pt")
    if os.path.exists(cache_p):
        doc_embs = torch.load(cache_p, map_location="cpu")
        print(f"[cache] doc embs: {doc_embs.shape}")
    else:
        doc_embs = encode_texts(mdl, tok, doc_texts_pref, BATCH_SIZE_DOC, MAX_LEN, dev)
        torch.save(doc_embs, cache_p)
        print(f"[cache] saved doc embs: {doc_embs.shape}")
    index, is_faiss = build_index(doc_embs)
    q_texts = [qmap_pref[qid] for qid in qids]
    q_embs  = encode_texts(mdl, tok, q_texts, BATCH_SIZE_Q, MAX_LEN, dev)
    I, D = search_index(index, is_faiss, q_embs, topk=max(K_LIST))
    run={}
    for i,qid in enumerate(qids):
        ids = I[i]; scores = D[i]
        run[qid]=[(doc_ids[j], float(scores[k])) for k,j in enumerate(ids)]
    m = eval_multi_k(run, qrels, K_LIST)
    print(fmt(m))
    return m

doc_ids = list(corpus.keys())
doc_texts_raw  = [corpus[d] for d in doc_ids]
doc_texts_pref = [add_prefix(t, "passage") for t in doc_texts_raw]

qids = list(qids_all)
query_map_pref = {qid: add_prefix(queries[qid], "query") for qid in qids}

# 1) BM25 ----------------
raw_query_map = {qid: queries[qid] for qid in qids}
bm_run = bm25_run(doc_texts_raw, doc_ids, qids, raw_query_map, topk=max(K_LIST))
bm25_metrics = eval_multi_k(bm_run, qrels, K_LIST)
print("\nBM25 ->", fmt(bm25_metrics))

# 2) Baseline encoder ----------------
base_metrics = eval_model(BASE_MODEL, "Baseline (mE5-base)", doc_texts_pref, doc_ids, qids, query_map_pref)

# 3) Adapters
if os.path.isdir(FINETUNED_INST_DIR):
    inst_metrics = eval_model(FINETUNED_INST_DIR, "Finetuned (instruction-aware, LoRA)", doc_texts_pref, doc_ids, qids, query_map_pref)
else:
    inst_metrics = None

if os.path.isdir(FINETUNED_QUERY_DIR):
    qo_metrics = eval_model(FINETUNED_QUERY_DIR, "Finetuned (query-only, LoRA)", doc_texts_pref, doc_ids, qids, query_map_pref)
else:
    qo_metrics = None

print("\n=== SUMMARY (RuBQ-derived IR) ===")
print("BM25   ->", fmt(bm25_metrics))
print("BASE   ->", fmt(base_metrics))
if inst_metrics is not None: print("FT-inst->", fmt(inst_metrics))
if qo_metrics   is not None: print("FT-qonly->", fmt(qo_metrics))

Device: cuda | DOC_CAP=None | FAST_Q_CAP=None

[prep] Scanning RuBQ JSON…
[prep] Loaded raw entries: 59862


Parsing rows: 100%|██████████| 59862/59862 [00:00<00:00, 388857.46it/s]


[prep] Done. Docs=3139 | Queries=59862 | QrelsQ=2400
[prep] Items without question: 0
[prep] Items with 'evidence*': 0 | using fallback(Label/Desc): 2400

[eval] Final set -> docs=3139 | queries=2400

[BM25] Building index over docs…


BM25 build: 100%|██████████| 3139/3139 [00:00<00:00, 512791.44it/s]
BM25 search: 100%|██████████| 2400/2400 [00:07<00:00, 328.87it/s]



BM25 -> {'N': 2400, 'Hit@1': 0.0229, 'Recall@1': 0.0229, 'MRR@1': 0.0229, 'nDCG@1': 0.0229, 'MAP@1': 0.0229, 'Recall@5': 0.0396, 'MRR@5': 0.029, 'nDCG@5': 0.0246, 'MAP@5': 0.0207, 'Recall@10': 0.0425, 'MRR@10': 0.0294, 'nDCG@10': 0.025, 'MAP@10': 0.0206}

=== Baseline (mE5-base) ===
[cache] doc embs: torch.Size([3139, 768])


Encode@cuda:0: 100%|██████████| 38/38 [00:01<00:00, 36.23it/s]


{'N': 2400, 'Hit@1': 0.0758, 'Recall@1': 0.0758, 'MRR@1': 0.0758, 'nDCG@1': 0.0758, 'MAP@1': 0.0758, 'Recall@5': 0.1787, 'MRR@5': 0.1135, 'nDCG@5': 0.1117, 'MAP@5': 0.0925, 'Recall@10': 0.2308, 'MRR@10': 0.1203, 'nDCG@10': 0.128, 'MAP@10': 0.0987}

=== Finetuned (instruction-aware, LoRA) ===
[cache] doc embs: torch.Size([3139, 768])


Encode@cuda:0: 100%|██████████| 38/38 [00:02<00:00, 16.30it/s]


{'N': 2400, 'Hit@1': 0.0692, 'Recall@1': 0.0692, 'MRR@1': 0.0692, 'nDCG@1': 0.0692, 'MAP@1': 0.0692, 'Recall@5': 0.1671, 'MRR@5': 0.1045, 'nDCG@5': 0.1023, 'MAP@5': 0.0841, 'Recall@10': 0.2175, 'MRR@10': 0.1113, 'nDCG@10': 0.118, 'MAP@10': 0.0902}

=== Finetuned (query-only, LoRA) ===
[cache] doc embs: torch.Size([3139, 768])


Encode@cuda:0: 100%|██████████| 38/38 [00:02<00:00, 16.49it/s]


{'N': 2400, 'Hit@1': 0.0779, 'Recall@1': 0.0779, 'MRR@1': 0.0779, 'nDCG@1': 0.0779, 'MAP@1': 0.0779, 'Recall@5': 0.1825, 'MRR@5': 0.1164, 'nDCG@5': 0.1139, 'MAP@5': 0.0944, 'Recall@10': 0.2267, 'MRR@10': 0.1221, 'nDCG@10': 0.1279, 'MAP@10': 0.0998}

=== SUMMARY (RuBQ-derived IR) ===
BM25   -> {'N': 2400, 'Hit@1': 0.0229, 'Recall@1': 0.0229, 'MRR@1': 0.0229, 'nDCG@1': 0.0229, 'MAP@1': 0.0229, 'Recall@5': 0.0396, 'MRR@5': 0.029, 'nDCG@5': 0.0246, 'MAP@5': 0.0207, 'Recall@10': 0.0425, 'MRR@10': 0.0294, 'nDCG@10': 0.025, 'MAP@10': 0.0206}
BASE   -> {'N': 2400, 'Hit@1': 0.0758, 'Recall@1': 0.0758, 'MRR@1': 0.0758, 'nDCG@1': 0.0758, 'MAP@1': 0.0758, 'Recall@5': 0.1787, 'MRR@5': 0.1135, 'nDCG@5': 0.1117, 'MAP@5': 0.0925, 'Recall@10': 0.2308, 'MRR@10': 0.1203, 'nDCG@10': 0.128, 'MAP@10': 0.0987}
FT-inst-> {'N': 2400, 'Hit@1': 0.0692, 'Recall@1': 0.0692, 'MRR@1': 0.0692, 'nDCG@1': 0.0692, 'MAP@1': 0.0692, 'Recall@5': 0.1671, 'MRR@5': 0.1045, 'nDCG@5': 0.1023, 'MAP@5': 0.0841, 'Recall@10': 0.217

In [23]:
import os, requests

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")  

def ensure_lang_file(lang: str, out_dir: str) -> str:
    os.makedirs(out_dir, exist_ok=True)
    fp = os.path.join(out_dir, f"{lang}.json")
    if os.path.exists(fp):
        return fp

    url = f"https://raw.githubusercontent.com/google-research-datasets/lareqa/master/xquad-r/{lang}.json"
    print(f"[fetch] downloading {lang}.json …")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(fp, "wb") as f:
        f.write(r.content)
    print(f"[fetch] saved -> {fp}")
    return fp

LAREQA_DIR = "/kaggle/working/lareqa_xquad_r"  
ru_path = ensure_lang_file("ru", LAREQA_DIR)
en_path = ensure_lang_file("en", LAREQA_DIR)
print("Files ready:", ru_path, en_path)

[fetch] downloading ru.json …
[fetch] saved -> /kaggle/working/lareqa_xquad_r/ru.json
[fetch] downloading en.json …
[fetch] saved -> /kaggle/working/lareqa_xquad_r/en.json
Files ready: /kaggle/working/lareqa_xquad_r/ru.json /kaggle/working/lareqa_xquad_r/en.json


In [24]:
# LAReQA (XQuAD-R) RU↔EN — BM25 + Base + LoRA adapters

import os, io, re, json, math, time, random, hashlib, requests
from collections import defaultdict
from typing import Dict, List, Tuple

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F

from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel

DEVICE           = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE_DOC   = 64
BATCH_SIZE_Q     = 64
MAX_LEN          = 256
K_LIST           = [1, 5, 10]
SEED             = 42

DOC_CAP_TARGET   = None   
FAST_Q_CAP       = None  
USE_PREFIXES     = True   

FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"
BASE_MODEL          = "intfloat/multilingual-e5-base"

LAREQA_DIR = "/kaggle/working/lareqa_xquad_r"   
INSTR_JSONL = os.path.join(LAREQA_DIR, "instructions.jsonl")  
os.makedirs(LAREQA_DIR, exist_ok=True)

def tic(msg): print(f"\n{msg}"); return time.time()
def toc(t0, prefix="Elapsed"): print(f"{prefix}: {time.time()-t0:.1f}s")

random.seed(SEED); np.random.seed(SEED)
print(f"Device: {DEVICE} | DOC_CAP_TARGET={DOC_CAP_TARGET} | FAST_Q_CAP={FAST_Q_CAP}")

RAW_URL = "https://raw.githubusercontent.com/google-research-datasets/lareqa/master/dataset/xquad-r/{lang}.json"

def ensure_lang_file(lang: str, out_dir: str) -> str:
    fp = os.path.join(out_dir, f"{lang}.json")
    if os.path.exists(fp):
        return fp
    try:
        r = requests.get(RAW_URL.format(lang=lang), timeout=30)
        r.raise_for_status()
        with open(fp, "w", encoding="utf-8") as f:
            f.write(r.text)
        return fp
    except Exception as e:
        print(f"[warn] Could not download {lang}.json automatically: {e}")
        print("       Please upload en.json and ru.json into", out_dir)
        return fp  

ru_path = ensure_lang_file("ru", LAREQA_DIR)
en_path = ensure_lang_file("en", LAREQA_DIR)
if not (os.path.exists(ru_path) and os.path.exists(en_path)):
    raise FileNotFoundError("Missing ru.json or en.json in LAREQA_DIR. Upload them and re-run.")

def split_sentences(context: str, sent_field) -> List[str]:
    if isinstance(sent_field, list) and len(sent_field) > 0 and isinstance(sent_field[0], list):
        sents = []
        for start, end in sent_field:
            try:
                s = context[start:end].strip()
                if s: sents.append(s)
            except Exception:
                pass
        return sents
    if isinstance(sent_field, list) and sent_field and isinstance(sent_field[0], str):
        return [s.strip() for s in sent_field if str(s).strip()]
    return re.split(r'\s*(?<=\.|\?|!)\s+', context.strip())

def load_xquadr(path: str):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    qa_map = {}     
    para_sents = []
    for art in tqdm(data.get("data", []), desc=f"Parse {os.path.basename(path)}"):
        for par in art.get("paragraphs", []):
            context = par.get("context", "") or ""
            sents   = split_sentences(context, par.get("sentences"))
            base = len(para_sents)
            para_sents.extend(sents)
            for qa in par.get("qas", []):
                qid   = qa.get("id") or qa.get("qid") or ""
                qtext = qa.get("question", "").strip()
                answers = [a.get("text", "").strip() for a in qa.get("answers", []) if str(a.get("text","")).strip()]
                qa_map[qid] = {'question': qtext, 'answers': answers}
    return para_sents, qa_map

t0 = tic("[load] Reading ru.json & en.json…")
ru_sents, ru_qas = load_xquadr(ru_path)
en_sents, en_qas = load_xquadr(en_path)
toc(t0)

print(f"Sents: RU={len(ru_sents):,} | EN={len(en_sents):,}")
print(f"QAs:   RU={len(ru_qas):,} | EN={len(en_qas):,}")

def normalize(s): return re.sub(r"\s+", " ", s.strip().lower())

def build_corpus_and_qrels(q_lang: str, cand_lang: str,
                           qas_src: Dict[str,dict], answers_tgt: Dict[str,dict],
                           cand_sents: List[str],
                           doc_cap_target: int = DOC_CAP_TARGET,
                           fast_q_cap: int = FAST_Q_CAP):
    doc_ids = [f"{cand_lang}-s{ix}" for ix in range(len(cand_sents))]
    if doc_cap_target and doc_cap_target < len(doc_ids):
        rng = random.Random(SEED)
        order = list(range(len(doc_ids))); rng.shuffle(order)
        keep = sorted(order[:doc_cap_target])
        cand_sents = [cand_sents[i] for i in keep]
        doc_ids    = [doc_ids[i]    for i in keep]

    qids_all = sorted(set(qas_src.keys()) & set(answers_tgt.keys()))
    if fast_q_cap and fast_q_cap < len(qids_all):
        rng = random.Random(SEED); rng.shuffle(qids_all)
        qids_all = sorted(qids_all[:fast_q_cap])

    q_texts = {qid: qas_src[qid]['question'] for qid in qids_all}

    cand_norm = [normalize(x) for x in cand_sents]
    qrels = defaultdict(dict)
    for qid in tqdm(qids_all, desc=f"qrels {q_lang}->{cand_lang}"):
        golds = [normalize(a) for a in answers_tgt[qid].get('answers', []) if a.strip()]
        if not golds: 
            continue
        for si, s in enumerate(cand_norm):
            if any(g and g in s for g in golds):
                qrels[qid][doc_ids[si]] = 1
    qids = [qid for qid in qids_all if qrels.get(qid)]
    qrels = {qid: qrels[qid] for qid in qids}
    return doc_ids, cand_sents, qids, q_texts, qrels

# RU->EN
doc_ids_en, docs_en, qids_ru2en, q_ru_map, qrels_ru2en = build_corpus_and_qrels(
    "ru", "en", qas_src=ru_qas, answers_tgt=en_qas, cand_sents=en_sents,
    doc_cap_target=DOC_CAP_TARGET, fast_q_cap=FAST_Q_CAP
)
print(f"[RU→EN] candidates={len(doc_ids_en):,} | queries={len(qids_ru2en):,} | qrels_q={len(qrels_ru2en):,}")

# EN->RU
doc_ids_ru, docs_ru, qids_en2ru, q_en_map, qrels_en2ru = build_corpus_and_qrels(
    "en", "ru", qas_src=en_qas, answers_tgt=ru_qas, cand_sents=ru_sents,
    doc_cap_target=DOC_CAP_TARGET, fast_q_cap=FAST_Q_CAP
)
print(f"[EN→RU] candidates={len(doc_ids_ru):,} | queries={len(qids_en2ru):,} | qrels_q={len(qrels_en2ru):,}")

def load_instructions(jsonl_path: str) -> Dict[str, str]:
    """
    Accepts flexible fields per line:
      {"id": "QID", "instruction": "..."}  OR
      {"qid": "QID", "inst": "..."}        OR
      {"qid": "...", "instruction": "..."} etc.
    """
    m = {}
    if not os.path.exists(jsonl_path):
        return m
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line: continue
            try:
                j = json.loads(line)
            except:
                continue
            qid = str(j.get("qid") or j.get("id") or "").strip()
            inst = str(j.get("instruction") or j.get("inst") or "").strip()
            if qid and inst:
                m[qid] = inst
    return m

instr_map = load_instructions(INSTR_JSONL)
if instr_map:
    print(f"[instr] Loaded instructions for {len(instr_map):,} qids from {INSTR_JSONL}")
else:
    print("[instr] No instructions.jsonl found or empty — p-MRR will be skipped.")

def add_prefix(s: str, kind: str) -> str:
    if not USE_PREFIXES: return s
    return (("query: " if kind=="query" else "passage: ") + s).strip()

def fmt(d):
    return {k:(round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

def mean_pool(last_hidden_state, attention_mask):
    m = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    return (last_hidden_state * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-9)

def eval_multi_k(run, qrels: Dict[str, Dict[str,int]], ks: List[int]):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats.update({f"Recall@{k}":0.0, f"MRR@{k}":0.0, f"nDCG@{k}":0.0, f"MAP@{k}":0.0})
    n=0
    for qid, ranking in run.items():
        if qid not in qrels: continue
        n += 1
        rels = qrels[qid]
        pos = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels]
        pos.sort()

        if pos and pos[0]==1:
            stats["Hit@1"] += 1.0

        m_rel = len(rels)
        for k in ks:
            stats[f"Recall@{k}"] += float(any(r<=k for r in pos))
            stats[f"MRR@{k}"]    += (1.0/pos[0]) if (pos and pos[0]<=k) else 0.0
            dcg  = sum(1.0/math.log2(r+1) for r in pos if r<=k)
            idcg = sum(1.0/math.log2(r+1) for r in range(1, min(m_rel,k)+1))
            stats[f"nDCG@{k}"] += (dcg/idcg) if idcg>0 else 0.0

            hits=0; ap_sum=0.0
            for r,(did,_) in enumerate(ranking[:k], start=1):
                if did in rels:
                    hits += 1
                    ap_sum += hits / r
            denom = float(min(m_rel, k)) if m_rel>0 else 1.0
            stats[f"MAP@{k}"] += (ap_sum/denom) if denom>0 else 0.0

    if n==0: return stats
    stats["N"]=n
    for k in ["Hit@1"]+[f"Recall@{x}" for x in ks]+[f"MRR@{x}" for x in ks]+[f"nDCG@{x}" for x in ks]+[f"MAP@{x}" for x in ks]:
        stats[k]/=n
    return stats

# BM25
token_re = re.compile(r"[a-z\u0400-\u04FF0-9]+", re.IGNORECASE)
def tok(s): return token_re.findall(s.lower())

def bm25_eval(doc_texts: List[str], doc_ids: List[str], qids: List[str], qmap: Dict[str,str], qrels: Dict[str,Dict[str,int]], topk: int):
    bm = BM25Okapi([tok(t) for t in tqdm(doc_texts, desc="BM25 build")])
    run = {}
    for qid in tqdm(qids, desc="BM25 search"):
        scores = bm.get_scores(tok(qmap[qid]))
        if topk >= len(doc_ids):
            idx = np.argsort(scores)[::-1]
        else:
            part = np.argpartition(scores, -topk)[-topk:]
            idx  = part[np.argsort(scores[part])[::-1]]
        run[qid] = [(doc_ids[j], float(scores[j])) for j in idx]
    return eval_multi_k(run, qrels, K_LIST)

def load_tok_and_model(model_or_adapter: str, base_model: str):
    is_adapter_dir = os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "adapter_config.json"))
    tok_src = model_or_adapter if (os.path.isdir(model_or_adapter) and os.path.exists(os.path.join(model_or_adapter, "tokenizer.json"))) else base_model
    tok = AutoTokenizer.from_pretrained(tok_src, use_fast=True)
    if DEVICE=="cuda":
        base = AutoModel.from_pretrained(base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True).to("cuda").eval()
    else:
        base = AutoModel.from_pretrained(base_model, low_cpu_mem_usage=True).to("cpu").eval()
    mdl = PeftModel.from_pretrained(base, model_or_adapter).eval() if is_adapter_dir else base
    return tok, mdl

@torch.no_grad()
def encode_many(mdl, tok, texts: List[str], bs: int, max_len: int, device: str):
    out = []
    for i in tqdm(range(0, len(texts), bs), desc=f"Encode@{device}"):
        batch = texts[i:i+bs]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(device)
        last = mdl(**enc).last_hidden_state
        pooled = mean_pool(last, enc["attention_mask"])
        out.append(F.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(out, 0) if out else torch.empty(0, mdl.config.hidden_size)

def build_index(doc_embs_cpu: torch.Tensor):
    try:
        import faiss
        index = faiss.IndexFlatIP(doc_embs_cpu.shape[1])
        index.add(doc_embs_cpu.numpy().astype("float32"))
        return index, True
    except Exception:
        return doc_embs_cpu, False

@torch.no_grad()
def search(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        import faiss
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    sims = torch.mm(q_embs, index.t())
    D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
    return I.numpy(), D.numpy()

def neural_eval(model_or_adapter: str, base_model: str,
                doc_texts: List[str], doc_ids: List[str],
                qids: List[str], qmap: Dict[str,str], qrels: Dict[str,Dict[str,int]]):
    print(f"\n[neural] {os.path.basename(model_or_adapter) if os.path.isdir(model_or_adapter) else model_or_adapter}")
    tok, mdl = load_tok_and_model(model_or_adapter, base_model)
    dev = next(mdl.parameters()).device

    doc_texts_pref = [add_prefix(t, "passage") for t in doc_texts]
    q_texts_pref   = [add_prefix(qmap[q], "query") for q in qids]

    t0 = tic("Encode docs…")
    d_embs = encode_many(mdl, tok, doc_texts_pref, BATCH_SIZE_DOC, MAX_LEN, str(dev))
    toc(t0)

    index, is_faiss = build_index(d_embs)

    t0 = tic("Encode queries…")
    q_embs = encode_many(mdl, tok, q_texts_pref, BATCH_SIZE_Q, MAX_LEN, str(dev))
    toc(t0)

    I, D = search(index, is_faiss, q_embs, topk=max(K_LIST))
    run = {}
    for i, q in enumerate(qids):
        ids = I[i]; scores = D[i]
        run[q] = [(doc_ids[j], float(scores[k])) for k, j in enumerate(ids)]
    m = eval_multi_k(run, qrels, K_LIST)
    print("Metrics:", fmt(m))
    return m

def p_mrr(stats_plain: dict, stats_instr: dict) -> dict:
    out = {}
    for k in list(stats_plain.keys()):
        if k.startswith("MRR@") and k in stats_instr:
            out[f"p{ k }"] = float(stats_instr[k] - stats_plain[k])
    return out

# Run: RU→EN & EN→RU ----------------
def run_one_direction(tag, doc_ids, docs, qids, qmap, qrels):
    print(f"\n=== Direction: {tag} ===")
    print(f"Docs={len(doc_ids):,} | Queries={len(qids):,} | QrelsQ={len(qrels):,}")

    # 1) BM25 (plain)
    bm_plain = bm25_eval(docs, doc_ids, qids, qmap, qrels, topk=max(K_LIST))
    print("BM25 (plain) ->", fmt(bm_plain))

    # 2) Base / LoRAs (plain)
    base_plain = neural_eval(BASE_MODEL, BASE_MODEL, docs, doc_ids, qids, qmap, qrels)
    ft_i_plain = neural_eval(FINETUNED_INST_DIR, BASE_MODEL, docs, doc_ids, qids, qmap, qrels) if os.path.isdir(FINETUNED_INST_DIR) else None
    ft_q_plain = neural_eval(FINETUNED_QUERY_DIR, BASE_MODEL, docs, doc_ids, qids, qmap, qrels) if os.path.isdir(FINETUNED_QUERY_DIR) else None

    # Optional p-MRR with instructions ---
    pmrr_block = {}
    if instr_map:
        qids_with_inst = [qid for qid in qids if qid in instr_map and instr_map[qid].strip()]
        if qids_with_inst:
            qmap_instr = {qid: (qmap[qid] + " " + instr_map[qid]).strip() for qid in qids_with_inst}
            qrels_sub  = {qid: qrels[qid] for qid in qids_with_inst}
            # BM25/BASE/LoRA with instruction-augmented queries
            bm_instr   = bm25_eval(docs, doc_ids, qids_with_inst, qmap_instr, qrels_sub, topk=max(K_LIST))
            base_instr = neural_eval(BASE_MODEL, BASE_MODEL, docs, doc_ids, qids_with_inst, qmap_instr, qrels_sub)
            ft_i_instr = neural_eval(FINETUNED_INST_DIR, BASE_MODEL, docs, doc_ids, qids_with_inst, qmap_instr, qrels_sub) if os.path.isdir(FINETUNED_INST_DIR) else None
            ft_q_instr = neural_eval(FINETUNED_QUERY_DIR, BASE_MODEL, docs, doc_ids, qids_with_inst, qmap_instr, qrels_sub) if os.path.isdir(FINETUNED_QUERY_DIR) else None

            pmrr_block["BM25"] = p_mrr({k:v for k,v in bm_plain.items() if k.startswith("MRR@")}, {k:v for k,v in bm_instr.items() if k.startswith("MRR@")})
            pmrr_block["BASE"] = p_mrr({k:v for k,v in base_plain.items() if k.startswith("MRR@")}, {k:v for k,v in base_instr.items() if k.startswith("MRR@")})
            if ft_i_plain and ft_i_instr:
                pmrr_block["FT_INST"] = p_mrr({k:v for k,v in ft_i_plain.items() if k.startswith("MRR@")}, {k:v for k,v in ft_i_instr.items() if k.startswith("MRR@")})
            if ft_q_plain and ft_q_instr:
                pmrr_block["FT_QUERY"] = p_mrr({k:v for k,v in ft_q_plain.items() if k.startswith("MRR@")}, {k:v for k,v in ft_q_instr.items() if k.startswith("MRR@")})
        else:
            print("[p-MRR] Instructions provided, but none overlap with current qids — skipping p-MRR.")
    else:
        print("[p-MRR] No instructions available — skipping p-MRR.")

    # Summary
    print("\n-- Summary:", tag)
    print("BM25:", fmt(bm_plain))
    print("BASE:", fmt(base_plain))
    if ft_i_plain: print("FT_INST:", fmt(ft_i_plain))
    if ft_q_plain: print("FT_QUERY:", fmt(ft_q_plain))
    if pmrr_block:
        print("\n=== p-MRR (instr − plain) ===")
        for name, d in pmrr_block.items():
            print(name + ":", fmt(d))

# RU→EN
run_one_direction("RU→EN", doc_ids_en, docs_en, qids_ru2en, q_ru_map, qrels_ru2en)

# EN→RU
run_one_direction("EN→RU", doc_ids_ru, docs_ru, qids_en2ru, q_en_map, qrels_en2ru)


Device: cuda | DOC_CAP_TARGET=None | FAST_Q_CAP=None

[load] Reading ru.json & en.json…


Parse ru.json: 100%|██████████| 48/48 [00:00<00:00, 18651.71it/s]
Parse en.json: 100%|██████████| 48/48 [00:00<00:00, 18991.28it/s]


Elapsed: 0.0s
Sents: RU=1,219 | EN=1,180
QAs:   RU=1,190 | EN=1,190


qrels ru->en: 100%|██████████| 1190/1190 [00:00<00:00, 1683.61it/s]


[RU→EN] candidates=1,180 | queries=1,187 | qrels_q=1,187


qrels en->ru: 100%|██████████| 1190/1190 [00:00<00:00, 1706.06it/s]


[EN→RU] candidates=1,219 | queries=1,188 | qrels_q=1,188
[instr] No instructions.jsonl found or empty — p-MRR will be skipped.

=== Direction: RU→EN ===
Docs=1,180 | Queries=1,187 | QrelsQ=1,187


BM25 build: 100%|██████████| 1180/1180 [00:00<00:00, 88013.78it/s]
BM25 search: 100%|██████████| 1187/1187 [00:02<00:00, 480.93it/s]


BM25 (plain) -> {'N': 1187, 'Hit@1': 0.0885, 'Recall@1': 0.0885, 'MRR@1': 0.0885, 'nDCG@1': 0.0885, 'MAP@1': 0.0885, 'Recall@5': 0.1457, 'MRR@5': 0.1083, 'nDCG@5': 0.1005, 'MAP@5': 0.0885, 'Recall@10': 0.1685, 'MRR@10': 0.1115, 'nDCG@10': 0.1036, 'MAP@10': 0.0887}

[neural] intfloat/multilingual-e5-base

Encode docs…


Encode@cuda:0: 100%|██████████| 19/19 [00:01<00:00, 12.31it/s]


Elapsed: 1.5s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:00<00:00, 34.64it/s]


Elapsed: 0.6s
Metrics: {'N': 1187, 'Hit@1': 0.7666, 'Recall@1': 0.7666, 'MRR@1': 0.7666, 'nDCG@1': 0.7666, 'MAP@1': 0.7666, 'Recall@5': 0.9334, 'MRR@5': 0.8353, 'nDCG@5': 0.7423, 'MAP@5': 0.6889, 'Recall@10': 0.9646, 'MRR@10': 0.8396, 'nDCG@10': 0.7488, 'MAP@10': 0.6877}

[neural] best

Encode docs…


Encode@cuda:0: 100%|██████████| 19/19 [00:03<00:00,  5.33it/s]


Elapsed: 3.6s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:01<00:00, 15.01it/s]


Elapsed: 1.3s
Metrics: {'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'nDCG@1': 0.7195, 'MAP@1': 0.7195, 'Recall@5': 0.9208, 'MRR@5': 0.8029, 'nDCG@5': 0.7181, 'MAP@5': 0.6617, 'Recall@10': 0.9596, 'MRR@10': 0.8081, 'nDCG@10': 0.7266, 'MAP@10': 0.6614}

[neural] best

Encode docs…


Encode@cuda:0: 100%|██████████| 19/19 [00:03<00:00,  5.27it/s]


Elapsed: 3.6s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:01<00:00, 14.94it/s]


Elapsed: 1.3s
Metrics: {'N': 1187, 'Hit@1': 0.7439, 'Recall@1': 0.7439, 'MRR@1': 0.7439, 'nDCG@1': 0.7439, 'MAP@1': 0.7439, 'Recall@5': 0.9301, 'MRR@5': 0.8198, 'nDCG@5': 0.736, 'MAP@5': 0.6805, 'Recall@10': 0.9638, 'MRR@10': 0.8242, 'nDCG@10': 0.7417, 'MAP@10': 0.6787}
[p-MRR] No instructions available — skipping p-MRR.

-- Summary: RU→EN
BM25: {'N': 1187, 'Hit@1': 0.0885, 'Recall@1': 0.0885, 'MRR@1': 0.0885, 'nDCG@1': 0.0885, 'MAP@1': 0.0885, 'Recall@5': 0.1457, 'MRR@5': 0.1083, 'nDCG@5': 0.1005, 'MAP@5': 0.0885, 'Recall@10': 0.1685, 'MRR@10': 0.1115, 'nDCG@10': 0.1036, 'MAP@10': 0.0887}
BASE: {'N': 1187, 'Hit@1': 0.7666, 'Recall@1': 0.7666, 'MRR@1': 0.7666, 'nDCG@1': 0.7666, 'MAP@1': 0.7666, 'Recall@5': 0.9334, 'MRR@5': 0.8353, 'nDCG@5': 0.7423, 'MAP@5': 0.6889, 'Recall@10': 0.9646, 'MRR@10': 0.8396, 'nDCG@10': 0.7488, 'MAP@10': 0.6877}
FT_INST: {'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'nDCG@1': 0.7195, 'MAP@1': 0.7195, 'Recall@5': 0.9208, 'MRR@5': 0.8029, '

BM25 build: 100%|██████████| 1219/1219 [00:00<00:00, 29365.16it/s]
BM25 search: 100%|██████████| 1188/1188 [00:02<00:00, 398.95it/s]


BM25 (plain) -> {'N': 1188, 'Hit@1': 0.0581, 'Recall@1': 0.0581, 'MRR@1': 0.0581, 'nDCG@1': 0.0581, 'MAP@1': 0.0581, 'Recall@5': 0.1347, 'MRR@5': 0.0864, 'nDCG@5': 0.0853, 'MAP@5': 0.0719, 'Recall@10': 0.1709, 'MRR@10': 0.091, 'nDCG@10': 0.0959, 'MAP@10': 0.076}

[neural] intfloat/multilingual-e5-base

Encode docs…


Encode@cuda:0: 100%|██████████| 20/20 [00:01<00:00, 10.25it/s]


Elapsed: 2.0s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:00<00:00, 36.78it/s]


Elapsed: 0.5s
Metrics: {'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'nDCG@1': 0.7449, 'MAP@1': 0.7449, 'Recall@5': 0.9276, 'MRR@5': 0.8207, 'nDCG@5': 0.7613, 'MAP@5': 0.7149, 'Recall@10': 0.9596, 'MRR@10': 0.8251, 'nDCG@10': 0.7705, 'MAP@10': 0.7168}

[neural] best

Encode docs…


Encode@cuda:0: 100%|██████████| 20/20 [00:04<00:00,  4.52it/s]


Elapsed: 4.4s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:01<00:00, 16.08it/s]


Elapsed: 1.2s
Metrics: {'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'nDCG@1': 0.6995, 'MAP@1': 0.6995, 'Recall@5': 0.915, 'MRR@5': 0.786, 'nDCG@5': 0.7346, 'MAP@5': 0.6829, 'Recall@10': 0.9537, 'MRR@10': 0.7913, 'nDCG@10': 0.7469, 'MAP@10': 0.6866}

[neural] best

Encode docs…


Encode@cuda:0: 100%|██████████| 20/20 [00:04<00:00,  4.49it/s]


Elapsed: 4.5s

Encode queries…


Encode@cuda:0: 100%|██████████| 19/19 [00:01<00:00, 15.99it/s]


Elapsed: 1.2s
Metrics: {'N': 1188, 'Hit@1': 0.7222, 'Recall@1': 0.7222, 'MRR@1': 0.7222, 'nDCG@1': 0.7222, 'MAP@1': 0.7222, 'Recall@5': 0.9285, 'MRR@5': 0.8044, 'nDCG@5': 0.7511, 'MAP@5': 0.7005, 'Recall@10': 0.9588, 'MRR@10': 0.8086, 'nDCG@10': 0.7602, 'MAP@10': 0.7027}
[p-MRR] No instructions available — skipping p-MRR.

-- Summary: EN→RU
BM25: {'N': 1188, 'Hit@1': 0.0581, 'Recall@1': 0.0581, 'MRR@1': 0.0581, 'nDCG@1': 0.0581, 'MAP@1': 0.0581, 'Recall@5': 0.1347, 'MRR@5': 0.0864, 'nDCG@5': 0.0853, 'MAP@5': 0.0719, 'Recall@10': 0.1709, 'MRR@10': 0.091, 'nDCG@10': 0.0959, 'MAP@10': 0.076}
BASE: {'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'nDCG@1': 0.7449, 'MAP@1': 0.7449, 'Recall@5': 0.9276, 'MRR@5': 0.8207, 'nDCG@5': 0.7613, 'MAP@5': 0.7149, 'Recall@10': 0.9596, 'MRR@10': 0.8251, 'nDCG@10': 0.7705, 'MAP@10': 0.7168}
FT_INST: {'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'nDCG@1': 0.6995, 'MAP@1': 0.6995, 'Recall@5': 0.915, 'MRR@5': 0.786, 'nDC

In [25]:
# eval_mfollowir_full_with_pMRR.py
# - Evaluates BASE and FINETUNED on mFollowIR (RU)
# - Tests plain and (q + instruction_og) against qrels_og
# - OOM-safe loading + caching doc embeddings
# - Adds p-MRR@k (paired uplift: instr MRR - plain MRR)

import os, math, random, hashlib
from collections import defaultdict
from typing import Dict, List, Tuple
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE_DOC = 32
BATCH_SIZE_Q   = 64
MAX_LEN        = 256
K_LIST         = [1, 10, 20]
SEED           = 42
FAST_Q_CAP     = None
DOC_CAP        = None
FP16_ON_GPU    = True
CACHE_DIR      = "/kaggle/working/mfollowir_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

BASE_MODEL         = "intfloat/multilingual-e5-base"
FINETUNED_INST_DIR = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"  # LoRA adapters folder or full model dir

def add_prefix(s, kind): 
    return (("query: " if kind == "query" else "passage: ") + s).strip()

@torch.no_grad()
def encode_texts(model, tok, texts, batch_size, max_len, device):
    outs = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Encoding@{device}", leave=False):
        T = texts[i:i+batch_size]
        enc = tok(T, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(device)
        out  = model(**enc).last_hidden_state
        mask = enc["attention_mask"].unsqueeze(-1).expand_as(out).float()
        pooled = (out * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
        outs.append(F.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(outs, 0) if outs else torch.empty(0)

def build_index(embs):
    import faiss
    idx = faiss.IndexFlatIP(embs.shape[1])
    idx.add(embs.numpy().astype("float32"))
    return idx

def search(idx, q_embs, topk):
    import faiss
    D, I = idx.search(q_embs.numpy().astype("float32"), topk)
    return I, D

def is_adapter_dir(path: str) -> bool:
    return (os.path.isdir(path) and os.path.exists(os.path.join(path, "adapter_config.json")))

def safe_load(model_or_adapter_path: str, base_model: str = BASE_MODEL):
    
    tok_src = model_or_adapter_path if (os.path.isdir(model_or_adapter_path) and os.path.exists(os.path.join(model_or_adapter_path, "tokenizer.json"))) else base_model
    tok = AutoTokenizer.from_pretrained(tok_src, use_fast=True)

    try:
        mdl_base = AutoModel.from_pretrained(
            base_model if is_adapter_dir(model_or_adapter_path) else model_or_adapter_path,
            torch_dtype=(torch.float16 if (DEVICE == "cuda" and FP16_ON_GPU) else None),
            low_cpu_mem_usage=True
        ).to(DEVICE if DEVICE == "cuda" else "cpu").eval()
    except RuntimeError:
        mdl_base = AutoModel.from_pretrained(
            base_model if is_adapter_dir(model_or_adapter_path) else model_or_adapter_path,
            low_cpu_mem_usage=True
        ).to("cpu").eval()

    if is_adapter_dir(model_or_adapter_path):
        mdl = PeftModel.from_pretrained(mdl_base, model_or_adapter_path).eval()
    else:
        mdl = mdl_base
    dev = "cuda" if next(mdl.parameters()).is_cuda else "cpu"
    return tok, mdl, dev

def eval_multi_k_and_perquery(run, qrels, ks) -> Tuple[Dict, Dict[int, Dict[str, float]]]:

    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks: stats.update({f"Recall@{k}":0.0, f"MRR@{k}":0.0, f"nDCG@{k}":0.0})
    per_mrr = {k:{} for k in ks}
    n = 0
    for qid, ranking in run.items():
        if qid not in qrels: continue
        n += 1
        rels = qrels[qid]
        pos = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels and rels[did] > 0]
        pos.sort()
        if pos and pos[0] == 1: stats["Hit@1"] += 1.0
        for k in ks:
            mrr_q = (1.0 / pos[0]) if (pos and pos[0] <= k) else 0.0
            per_mrr[k][qid] = mrr_q
            stats[f"Recall@{k}"] += float(any(r <= k for r in pos))
            stats[f"MRR@{k}"]    += mrr_q
            dcg  = sum(1.0/math.log2(r+1) for r in pos if r <= k)
            m    = sum(1 for _ in rels if rels[_] > 0)
            idcg = sum(1.0/math.log2(r+1) for r in range(1, min(m, k)+1))
            stats[f"nDCG@{k}"] += (dcg/idcg) if idcg > 0 else 0.0
    if n == 0: return stats, per_mrr
    stats["N"] = n
    for k in ["Hit@1"]+[f"Recall@{x}" for x in ks]+[f"MRR@{x}" for x in ks]+[f"nDCG@{x}" for x in ks]:
        stats[k] /= n
    return stats, per_mrr

def compute_p_mrr(per_mrr_plain: Dict[str,float], per_mrr_instr: Dict[str,float]) -> float:

    qids = sorted(set(per_mrr_plain.keys()) & set(per_mrr_instr.keys()))
    if not qids: return 0.0
    diffs = [(per_mrr_instr[q] - per_mrr_plain[q]) for q in qids]
    return float(np.mean(diffs))

print("Loading mFollowIR (rus)…")
ds_q = load_dataset("jhu-clsp/mFollowIR-parquet", "queries-rus")["queries"]
ds_d = load_dataset("jhu-clsp/mFollowIR-parquet", "corpus-rus")["corpus"]
ds_og = load_dataset("jhu-clsp/mFollowIR-parquet", "qrels_og-rus")["test"]

doc_ids = [r["_id"] for r in ds_d]
doc_txt = [((r.get("title") or "") + " " + (r.get("text") or "")).strip() for r in ds_d]
if DOC_CAP:
    rng = random.Random(SEED)
    idx = list(range(len(doc_ids))); rng.shuffle(idx); keep = sorted(idx[:DOC_CAP])
    doc_ids = [doc_ids[i] for i in keep]
    doc_txt = [doc_txt[i] for i in keep]
doc_txt = [add_prefix(t, "passage") for t in doc_txt]
doc_set = set(doc_ids)

def to_qrels(rows):
    qd = defaultdict(dict)
    for r in rows:
        if r["corpus-id"] in doc_set:
            qd[r["query-id"]][r["corpus-id"]] = float(r["score"])
    return {k:v for k,v in qd.items() if v}

qrels_og = to_qrels(ds_og)

q_plain_map = {r["_id"]: add_prefix(r["text"].strip(), "query") for r in ds_q}
q_og_map    = {r["_id"]: add_prefix((r["text"] + " " + r["instruction_og"]).strip(), "query") for r in ds_q}
qids = [qid for qid in q_plain_map if qid in qrels_og]
if FAST_Q_CAP:
    rng = random.Random(SEED); rng.shuffle(qids); qids = qids[:FAST_Q_CAP]
print(f"Docs={len(doc_ids)}, Queries={len(qids)}")

def evaluate(model_or_adapter_path: str, label: str, qmap: Dict[str,str]):
    print(f"\n=== {label} ===")
    tok, mdl, dev = safe_load(model_or_adapter_path, BASE_MODEL)

    sig = hashlib.md5(("|".join(doc_ids[:50])+f"|{len(doc_ids)}").encode()).hexdigest()[:8]
    base_name = os.path.basename(model_or_adapter_path) if os.path.isdir(model_or_adapter_path) else model_or_adapter_path.replace("/","_")
    dcache = os.path.join(CACHE_DIR, f"docs__{base_name}__{sig}__L{MAX_LEN}.pt")
    if os.path.exists(dcache):
        doc_embs = torch.load(dcache, map_location="cpu")
    else:
        doc_embs = encode_texts(mdl, tok, doc_txt, BATCH_SIZE_DOC, MAX_LEN, dev)
        torch.save(doc_embs, dcache)

    index = build_index(doc_embs)

    q_texts = [qmap[qid] for qid in qids]
    q_embs  = encode_texts(mdl, tok, q_texts, BATCH_SIZE_Q, MAX_LEN, dev)

    I, D = search(index, q_embs, max(K_LIST))
    run = { qid: [(doc_ids[j], float(D[i][k])) for k, j in enumerate(I[i])] for i, qid in enumerate(qids) }

    metrics, per_mrr = eval_multi_k_and_perquery(run, qrels_og, K_LIST)
    print({k:(round(v,4) if isinstance(v,float) else v) for k,v in metrics.items()})
    return metrics, per_mrr

res = {}
per = {}

# Baseline (plain vs q+inst)
res["baseline_plain"], per["baseline_plain"] = evaluate(BASE_MODEL, "BASELINE :: plain vs qrels_og", q_plain_map)
res["baseline_instr"], per["baseline_instr"] = evaluate(BASE_MODEL, "BASELINE :: q+inst vs qrels_og", q_og_map)

# Finetuned (instruction-aware LoRA) 
res["ft_plain"],     per["ft_plain"]     = evaluate(FINETUNED_INST_DIR, "FINETUNED :: plain vs qrels_og", q_plain_map)
res["ft_instr"],     per["ft_instr"]     = evaluate(FINETUNED_INST_DIR, "FINETUNED :: q+inst vs qrels_og", q_og_map)

# p-MRR@k = mean_q ( MRR_instr@k(q) - MRR_plain@k(q) )
def p_mrr_block(tag_plain: str, tag_instr: str) -> Dict[str, float]:
    out = {}
    for k in K_LIST:
        pmrr = compute_p_mrr(per[tag_plain][k], per[tag_instr][k])
        out[f"pMRR@{k}"] = round(pmrr, 6)
    return out

p_baseline = p_mrr_block("baseline_plain", "baseline_instr")
p_ft       = p_mrr_block("ft_plain",       "ft_instr")

print("\n=== SUMMARY (mFollowIR-RU, qrels_og) ===")
for k,v in res.items():
    print(f"{k:16s} ->", {kk:(round(vv,4) if isinstance(vv,float) else vv) for kk,vv in v.items()})

print("\n=== p-MRR (instr − plain) ===")
print("Baseline:", p_baseline)
print("Finetuned:", p_ft)

Loading mFollowIR (rus)…


README.md: 0.00B [00:00, ?B/s]

queries-rus/queries.parquet:   0%|          | 0.00/46.6k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/40 [00:00<?, ? examples/s]

corpus-rus/corpus.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/39326 [00:00<?, ? examples/s]

qrels_og-rus/test.parquet:   0%|          | 0.00/467k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/12067 [00:00<?, ? examples/s]

Docs=39326, Queries=40

=== BASELINE :: plain vs qrels_og ===


                                                                  

{'N': 40, 'Hit@1': 0.475, 'Recall@1': 0.475, 'MRR@1': 0.475, 'nDCG@1': 0.475, 'Recall@10': 0.825, 'MRR@10': 0.597, 'nDCG@10': 0.3762, 'Recall@20': 0.875, 'MRR@20': 0.6004, 'nDCG@20': 0.3877}

=== BASELINE :: q+inst vs qrels_og ===


                                                            

{'N': 40, 'Hit@1': 0.575, 'Recall@1': 0.575, 'MRR@1': 0.575, 'nDCG@1': 0.575, 'Recall@10': 0.775, 'MRR@10': 0.6308, 'nDCG@10': 0.389, 'Recall@20': 0.85, 'MRR@20': 0.6369, 'nDCG@20': 0.4076}

=== FINETUNED :: plain vs qrels_og ===


                                                                  

{'N': 40, 'Hit@1': 0.55, 'Recall@1': 0.55, 'MRR@1': 0.55, 'nDCG@1': 0.55, 'Recall@10': 0.8, 'MRR@10': 0.6184, 'nDCG@10': 0.4079, 'Recall@20': 0.875, 'MRR@20': 0.6232, 'nDCG@20': 0.4013}

=== FINETUNED :: q+inst vs qrels_og ===


                                                            

{'N': 40, 'Hit@1': 0.575, 'Recall@1': 0.575, 'MRR@1': 0.575, 'nDCG@1': 0.575, 'Recall@10': 0.8, 'MRR@10': 0.6276, 'nDCG@10': 0.4282, 'Recall@20': 0.85, 'MRR@20': 0.6312, 'nDCG@20': 0.4364}

=== SUMMARY (mFollowIR-RU, qrels_og) ===
baseline_plain   -> {'N': 40, 'Hit@1': 0.475, 'Recall@1': 0.475, 'MRR@1': 0.475, 'nDCG@1': 0.475, 'Recall@10': 0.825, 'MRR@10': 0.597, 'nDCG@10': 0.3762, 'Recall@20': 0.875, 'MRR@20': 0.6004, 'nDCG@20': 0.3877}
baseline_instr   -> {'N': 40, 'Hit@1': 0.575, 'Recall@1': 0.575, 'MRR@1': 0.575, 'nDCG@1': 0.575, 'Recall@10': 0.775, 'MRR@10': 0.6308, 'nDCG@10': 0.389, 'Recall@20': 0.85, 'MRR@20': 0.6369, 'nDCG@20': 0.4076}
ft_plain         -> {'N': 40, 'Hit@1': 0.55, 'Recall@1': 0.55, 'MRR@1': 0.55, 'nDCG@1': 0.55, 'Recall@10': 0.8, 'MRR@10': 0.6184, 'nDCG@10': 0.4079, 'Recall@20': 0.875, 'MRR@20': 0.6232, 'nDCG@20': 0.4013}
ft_instr         -> {'N': 40, 'Hit@1': 0.575, 'Recall@1': 0.575, 'MRR@1': 0.575, 'nDCG@1': 0.575, 'Recall@10': 0.8, 'MRR@10': 0.6276, 'nDCG@10

In [26]:
# rus-XQuAD / rus-NFCorpus / WikiFacts-Articles — HF qrels evaluation

import os, io, gzip, glob, json, math, shutil, random, csv
from typing import Dict, List, Tuple
from collections import defaultdict

import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import snapshot_download
from tqdm import tqdm

DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE    = 64
MAX_LEN       = 256
K_LIST        = [2, 5, 10]
SEED          = 42
random.seed(SEED); np.random.seed(SEED)

FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"  
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"               
BASE_MODEL          = "intfloat/multilingual-e5-base"

FAST_Q_CAP   = 300     
DOC_CAP      = None    
USE_PREFIXES = False  

CACHE_DIR    = "/kaggle/working/rusbeir_eval_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

OUT_ROOT     = "/kaggle/working/rusbeir_eval_new"
HF_CACHE     = "/kaggle/working/hf_cache"
os.makedirs(OUT_ROOT, exist_ok=True)
os.makedirs(HF_CACHE,  exist_ok=True)

print("Device:", DEVICE)

SETS = {
    "rus-xquad": {
        "data_repo":  "kaengreg/rus-xquad",
        "qrels_repo": "kaengreg/rus-xquad-qrels",
        "qrels_file": "dev.tsv",
    },
    "rus-nfcorpus": {
        "data_repo":  "kaengreg/rus-nfcorpus",
        "qrels_repo": "kaengreg/rus-nfcorpus-qrels",
        "qrels_file": "test.tsv",
    },
    "wikifacts-articles": {
        "data_repo":  "kaengreg/wikifacts-articles",
        "qrels_repo": "kaengreg/wikifacts-articles-qrels",
        "qrels_file": "dev.tsv",
    },
}

def _copy(src, dst):
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copy2(src, dst)

def _maybe_gunzip(src, dst):
    if src.endswith(".gz"):
        with gzip.open(src, "rt", encoding="utf-8") as f_in, open(dst, "w", encoding="utf-8") as f_out:
            for line in f_in: f_out.write(line)
    else:
        _copy(src, dst)

def fetch_raw_dataset(name: str, data_repo: str) -> str:
    out_dir = os.path.join(OUT_ROOT, name)
    os.makedirs(out_dir, exist_ok=True)
    root = snapshot_download(repo_id=data_repo, repo_type="dataset",
                             allow_patterns=["corpus.jsonl","queries.jsonl","*.gz"],
                             local_dir=HF_CACHE, local_dir_use_symlinks=False)
    cand_corpus  = glob.glob(os.path.join(root, "**", "corpus.jsonl*"),  recursive=True)
    cand_queries = glob.glob(os.path.join(root, "**", "queries.jsonl*"), recursive=True)
    if not cand_corpus or not cand_queries:
        raise FileNotFoundError(f"Could not find corpus.jsonl/queries.jsonl in {data_repo}")
    _maybe_gunzip(sorted(cand_corpus)[-1],  os.path.join(out_dir, "corpus.jsonl"))
    _maybe_gunzip(sorted(cand_queries)[-1], os.path.join(out_dir, "queries.jsonl"))
    return out_dir

def fetch_qrels(name: str, qrels_repo: str, qrels_file: str) -> str:
    out_dir = os.path.join(OUT_ROOT, name)
    os.makedirs(out_dir, exist_ok=True)
    root = snapshot_download(repo_id=qrels_repo, repo_type="dataset",
                             allow_patterns=[qrels_file],
                             local_dir=HF_CACHE, local_dir_use_symlinks=False)
    src = glob.glob(os.path.join(root, "**", qrels_file), recursive=True)
    if not src:
        raise FileNotFoundError(f"{qrels_file} not found in {qrels_repo}")
    dst = os.path.join(out_dir, "qrels.tsv")
    _copy(sorted(src)[-1], dst)
    return dst

def smart_open(path):
    if path.endswith(".gz"):
        return io.TextIOWrapper(gzip.open(path, "rb"), encoding="utf-8")
    return open(path, "r", encoding="utf-8")

def load_corpus(path: str) -> Dict[str, str]:
    corpus = {}
    with smart_open(path) as f:
        for line in f:
            j = json.loads(line)
            cid   = str(j.get("_id") or j.get("id"))
            title = (j.get("title") or "").strip()
            text  = (j.get("text")  or "").strip()
            corpus[cid] = (title + " " + text).strip()
    return corpus

def load_queries(path: str) -> Dict[str, str]:
    queries = {}
    with smart_open(path) as f:
        for line in f:
            j = json.loads(line)
            qid  = str(j.get("_id") or j.get("id"))
            text = (j.get("text") or "").strip()
            queries[qid] = text
    return queries

def load_qrels_tsv(path: str) -> Dict[str, Dict[str, int]]:
    qrels = defaultdict(dict)
    with open(path, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t")
        header = None
        qid_idx, did_idx, rel_idx = 0, 1, 2
        for i, row in enumerate(reader):
            if not row or all((c.strip() == "" for c in row)):
                continue
            if i == 0:
                lower = [c.strip().lower() for c in row]
                if any(k in lower for k in ("score", "relevance", "label", "rel")):
                    header = lower
                    def _idx(names, default):
                        for n in names:
                            if n in header:
                                return header.index(n)
                        return default
                    qid_idx = _idx(("query-id", "qid", "query_id"), 0)
                    did_idx = _idx(("corpus-id", "docid", "doc_id", "did"), 1)
                    rel_idx = _idx(("score", "relevance", "label", "rel"), 2)
                    continue
            try:
                qid = row[qid_idx].strip()
                did = row[did_idx].strip()
                rel_str = row[rel_idx].strip()
            except IndexError:
                continue
            try:
                rel_val = int(float(rel_str))
            except ValueError:
                continue
            rel = 1 if rel_val > 0 else 0
            if rel > 0:
                qrels[qid][did] = rel
    return qrels

def add_prefix(s: str, kind: str, mode: str = "plain") -> str:
    if not USE_PREFIXES:
        return s
    if kind == "query":
        if mode == "neutral_inst":
            return f"query: {s} Используй точное соответствие смысла запроса и отрывка; не делай выводов."
        else:
            return f"query: {s}"
    else:
        return f"passage: {s}"

@torch.no_grad()
def encode_texts(model, tok, texts: List[str], batch_size=BATCH_SIZE, max_len=MAX_LEN, device=DEVICE):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", leave=False):
        batch = texts[i:i+batch_size]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(device)
        out = model(**enc).last_hidden_state
        mask = enc["attention_mask"].unsqueeze(-1).expand_as(out).float()
        pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        pooled = F.normalize(pooled, p=2, dim=1)
        all_embs.append(pooled.detach().cpu())
    return torch.cat(all_embs, dim=0) if all_embs else torch.empty(0, model.config.hidden_size)

def build_index(embs_cpu: torch.Tensor):
    try:
        import faiss
        index = faiss.IndexFlatIP(embs_cpu.shape[1])
        index.add(embs_cpu.numpy().astype("float32"))
        return index, True
    except Exception:
        return embs_cpu, False

@torch.no_grad()
def search(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        import faiss
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    q = q_embs
    d = index
    I_all, D_all = [], []
    chunk = 256
    for i in tqdm(range(0, q.shape[0], chunk), desc="Searching", leave=False):
        sims = torch.mm(q[i:i+chunk], d.t())
        D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
        I_all.append(I.numpy()); D_all.append(D.numpy())
    return np.vstack(I_all), np.vstack(D_all)

def eval_multi_k(run, qrels: Dict[str, Dict[str,int]], ks: List[int]):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats[f"Recall@{k}"] = 0.0
        stats[f"MRR@{k}"]    = 0.0
        stats[f"nDCG@{k}"]   = 0.0
        stats[f"MAP@{k}"]    = 0.0   

    n = 0
    for qid, ranking in run.items():
        if qid not in qrels: 
            continue
        n += 1
        rels = qrels[qid]  
        pos_ranks = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels and rels[did] > 0]
        pos_ranks.sort()

        if pos_ranks and pos_ranks[0] == 1:
            stats["Hit@1"] += 1.0

        m_total = sum(1 for _ in rels if rels[_] > 0)  

        for k in ks:
            stats[f"Recall@{k}"] += float(any(r <= k for r in pos_ranks))
            stats[f"MRR@{k}"] += (1.0/pos_ranks[0]) if (pos_ranks and pos_ranks[0] <= k) else 0.0
            dcg = sum(1.0 / math.log2(r+1) for r in pos_ranks if r <= k)
            idcg = sum(1.0 / math.log2(r+1) for r in range(1, min(m_total, k)+1))
            stats[f"nDCG@{k}"] += (dcg / idcg) if idcg > 0 else 0.0

            # MAP@k
            m_k = min(m_total, k)
            if m_k > 0:
                hits = 0
                ap_sum = 0.0
                for r in pos_ranks:
                    if r > k:
                        break
                    hits += 1
                    ap_sum += hits / r
                stats[f"MAP@{k}"] += ap_sum / m_k
            else:
                stats[f"MAP@{k}"] += 0.0

    if n == 0:
        return stats
    stats["N"] = n
    stats["Hit@1"] /= n
    for k in ks:
        stats[f"Recall@{k}"] /= n
        stats[f"MRR@{k}"]    /= n
        stats[f"nDCG@{k}"]   /= n
        stats[f"MAP@{k}"]    /= n
    return stats

@torch.no_grad()
def evaluate(model_dir_or_name: str, tag: str, doc_texts: List[str], q_texts: List[str],
             qids_eval: List[str], qrels: Dict[str, Dict[str,int]], ks: List[int], doc_ids: List[str],
             cache_key: str = None):
    print(f"\n=== {tag} ===")
    tok = AutoTokenizer.from_pretrained(model_dir_or_name, use_fast=True)
    mdl = AutoModel.from_pretrained(model_dir_or_name).to(DEVICE).eval()

    cache_path = None
    if cache_key:
        cache_path = os.path.join(CACHE_DIR, f"{cache_key}__{tag.replace(' ','_')}.pt")
    if cache_path and os.path.exists(cache_path):
        doc_embs = torch.load(cache_path, map_location="cpu")
    else:
        doc_embs = encode_texts(mdl, tok, doc_texts)
        if cache_path: torch.save(doc_embs, cache_path)

    index, is_faiss = build_index(doc_embs)

    q_embs = encode_texts(mdl, tok, q_texts)
    kmax = max(ks)
    I, D = search(index, is_faiss, q_embs, kmax)

    run = {}
    for i, qid in enumerate(qids_eval):
        ids = I[i]; scores = D[i]
        run[qid] = [(doc_ids[j], float(scores[k])) for k, j in enumerate(ids)]
    m = eval_multi_k(run, qrels, ks)
    print({k:(round(v,4) if isinstance(v,float) else v) for k,v in m.items()})
    return m

def run_on_dataset(name: str, spec: dict):
    base_dir = fetch_raw_dataset(name, spec["data_repo"])
    qrels_path = fetch_qrels(name, spec["qrels_repo"], spec["qrels_file"])

    corpus  = load_corpus(os.path.join(base_dir, "corpus.jsonl"))
    queries = load_queries(os.path.join(base_dir, "queries.jsonl"))
    if DOC_CAP:
        rng = random.Random(SEED); doc_ids_all = list(corpus.keys()); rng.shuffle(doc_ids_all)
        keep = set(doc_ids_all[:DOC_CAP])
        corpus = {k:v for k,v in corpus.items() if k in keep}

    qrels = load_qrels_tsv(qrels_path)
    qrels = {qid: {did:rel for did,rel in dd.items() if did in corpus} for qid,dd in qrels.items()}
    qrels = {qid: dd for qid,dd in qrels.items() if dd}

    qids = [qid for qid in qrels.keys() if qid in queries]
    if FAST_Q_CAP and len(qids) > FAST_Q_CAP:
        rng = random.Random(SEED); rng.shuffle(qids); qids = qids[:FAST_Q_CAP]

    print(f"Dataset: {name} | Eval queries: {len(qids)} | Docs: {len(corpus)}")

    doc_ids   = list(corpus.keys())
    doc_texts = [add_prefix(corpus[d], "passage") for d in doc_ids]
    q_plain   = [add_prefix(queries[q], "query", mode="plain") for q in qids]
    q_neutral = [add_prefix(queries[q], "query", mode="neutral_inst") for q in qids]

    results = {}

    if FINETUNED_INST_DIR and os.path.exists(FINETUNED_INST_DIR):
        results["finetuned_instruction"] = evaluate(
            FINETUNED_INST_DIR, "Finetuned (instruction-aware)",
            doc_texts, q_neutral, qids, qrels, K_LIST, doc_ids,
            cache_key=f"{name}"
        )

    if FINETUNED_QUERY_DIR and os.path.exists(FINETUNED_QUERY_DIR):
        results["finetuned_queryonly"] = evaluate(
            FINETUNED_QUERY_DIR, "Finetuned (query-only)",
            doc_texts, q_plain, qids, qrels, K_LIST, doc_ids,
            cache_key=f"{name}"
        )

    results["baseline"] = evaluate(
        BASE_MODEL, "Baseline (zero-shot)",
        doc_texts, q_plain, qids, qrels, K_LIST, doc_ids,
        cache_key=f"{name}"
    )

    print("\n--- Summary:", name, "---")
    for k, v in results.items():
        print(k, "->", {kk:(round(vv,3) if isinstance(vv,float) else vv) for kk,vv in (v or {}).items()})
    return results

all_results = {}
for name, spec in SETS.items():
    print(f"\n=== Fetch & Evaluate: {name} ===")
    all_results[name] = run_on_dataset(name, spec)

print("\n=== GRAND SUMMARY ===")
for ds, res in all_results.items():
    print("\n", ds)
    for k, v in res.items():
        print(" ", k, "->", {kk:(round(vv,3) if isinstance(vv,float) else vv) for kk,vv in (v or {}).items()})


Device: cuda

=== Fetch & Evaluate: rus-xquad ===


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


corpus.jsonl: 0.00B [00:00, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

dev.tsv: 0.00B [00:00, ?B/s]

Dataset: rus-xquad | Eval queries: 300 | Docs: 240

=== Finetuned (instruction-aware) ===


                                                       

{'N': 300, 'Hit@1': 0.8767, 'Recall@2': 0.94, 'MRR@2': 0.9083, 'nDCG@2': 0.9166, 'MAP@2': 0.9083, 'Recall@5': 0.99, 'MRR@5': 0.9224, 'nDCG@5': 0.9395, 'MAP@5': 0.9224, 'Recall@10': 0.9933, 'MRR@10': 0.923, 'nDCG@10': 0.9407, 'MAP@10': 0.923}

=== Finetuned (query-only) ===


                                                       

{'N': 300, 'Hit@1': 0.84, 'Recall@2': 0.93, 'MRR@2': 0.885, 'nDCG@2': 0.8968, 'MAP@2': 0.885, 'Recall@5': 0.9733, 'MRR@5': 0.8978, 'nDCG@5': 0.9171, 'MAP@5': 0.8978, 'Recall@10': 0.9833, 'MRR@10': 0.8991, 'nDCG@10': 0.9203, 'MAP@10': 0.8991}

=== Baseline (zero-shot) ===


                                                       

{'N': 300, 'Hit@1': 0.8933, 'Recall@2': 0.95, 'MRR@2': 0.9217, 'nDCG@2': 0.9291, 'MAP@2': 0.9217, 'Recall@5': 0.98, 'MRR@5': 0.9302, 'nDCG@5': 0.9429, 'MAP@5': 0.9302, 'Recall@10': 0.9933, 'MRR@10': 0.932, 'nDCG@10': 0.9472, 'MAP@10': 0.932}

--- Summary: rus-xquad ---
finetuned_instruction -> {'N': 300, 'Hit@1': 0.877, 'Recall@2': 0.94, 'MRR@2': 0.908, 'nDCG@2': 0.917, 'MAP@2': 0.908, 'Recall@5': 0.99, 'MRR@5': 0.922, 'nDCG@5': 0.939, 'MAP@5': 0.922, 'Recall@10': 0.993, 'MRR@10': 0.923, 'nDCG@10': 0.941, 'MAP@10': 0.923}
finetuned_queryonly -> {'N': 300, 'Hit@1': 0.84, 'Recall@2': 0.93, 'MRR@2': 0.885, 'nDCG@2': 0.897, 'MAP@2': 0.885, 'Recall@5': 0.973, 'MRR@5': 0.898, 'nDCG@5': 0.917, 'MAP@5': 0.898, 'Recall@10': 0.983, 'MRR@10': 0.899, 'nDCG@10': 0.92, 'MAP@10': 0.899}
baseline -> {'N': 300, 'Hit@1': 0.893, 'Recall@2': 0.95, 'MRR@2': 0.922, 'nDCG@2': 0.929, 'MAP@2': 0.922, 'Recall@5': 0.98, 'MRR@5': 0.93, 'nDCG@5': 0.943, 'MAP@5': 0.93, 'Recall@10': 0.993, 'MRR@10': 0.932, 'nDCG@10'

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

corpus.jsonl:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

test.tsv: 0.00B [00:00, ?B/s]

Dataset: rus-nfcorpus | Eval queries: 300 | Docs: 3633

=== Finetuned (instruction-aware) ===


                                                         

{'N': 300, 'Hit@1': 0.37, 'Recall@2': 0.46, 'MRR@2': 0.415, 'nDCG@2': 0.3412, 'MAP@2': 0.31, 'Recall@5': 0.5667, 'MRR@5': 0.4433, 'nDCG@5': 0.2999, 'MAP@5': 0.2357, 'Recall@10': 0.63, 'MRR@10': 0.4524, 'nDCG@10': 0.2688, 'MAP@10': 0.1873}

=== Finetuned (query-only) ===


                                                         

{'N': 300, 'Hit@1': 0.3633, 'Recall@2': 0.4433, 'MRR@2': 0.4033, 'nDCG@2': 0.3216, 'MAP@2': 0.2892, 'Recall@5': 0.5467, 'MRR@5': 0.4313, 'nDCG@5': 0.2831, 'MAP@5': 0.2183, 'Recall@10': 0.6233, 'MRR@10': 0.4419, 'nDCG@10': 0.2552, 'MAP@10': 0.1736}

=== Baseline (zero-shot) ===


                                                         

{'N': 300, 'Hit@1': 0.37, 'Recall@2': 0.4833, 'MRR@2': 0.4267, 'nDCG@2': 0.3468, 'MAP@2': 0.3117, 'Recall@5': 0.5767, 'MRR@5': 0.4516, 'nDCG@5': 0.3018, 'MAP@5': 0.2343, 'Recall@10': 0.6433, 'MRR@10': 0.4607, 'nDCG@10': 0.268, 'MAP@10': 0.1841}

--- Summary: rus-nfcorpus ---
finetuned_instruction -> {'N': 300, 'Hit@1': 0.37, 'Recall@2': 0.46, 'MRR@2': 0.415, 'nDCG@2': 0.341, 'MAP@2': 0.31, 'Recall@5': 0.567, 'MRR@5': 0.443, 'nDCG@5': 0.3, 'MAP@5': 0.236, 'Recall@10': 0.63, 'MRR@10': 0.452, 'nDCG@10': 0.269, 'MAP@10': 0.187}
finetuned_queryonly -> {'N': 300, 'Hit@1': 0.363, 'Recall@2': 0.443, 'MRR@2': 0.403, 'nDCG@2': 0.322, 'MAP@2': 0.289, 'Recall@5': 0.547, 'MRR@5': 0.431, 'nDCG@5': 0.283, 'MAP@5': 0.218, 'Recall@10': 0.623, 'MRR@10': 0.442, 'nDCG@10': 0.255, 'MAP@10': 0.174}
baseline -> {'N': 300, 'Hit@1': 0.37, 'Recall@2': 0.483, 'MRR@2': 0.427, 'nDCG@2': 0.347, 'MAP@2': 0.312, 'Recall@5': 0.577, 'MRR@5': 0.452, 'nDCG@5': 0.302, 'MAP@5': 0.234, 'Recall@10': 0.643, 'MRR@10': 0.461, '

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

corpus.jsonl:   0%|          | 0.00/639M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

dev.tsv: 0.00B [00:00, ?B/s]

Dataset: wikifacts-articles | Eval queries: 300 | Docs: 12848

=== Finetuned (instruction-aware) ===


                                                           

{'N': 300, 'Hit@1': 0.49, 'Recall@2': 0.6567, 'MRR@2': 0.5733, 'nDCG@2': 0.511, 'MAP@2': 0.4708, 'Recall@5': 0.74, 'MRR@5': 0.5975, 'nDCG@5': 0.5559, 'MAP@5': 0.4987, 'Recall@10': 0.8033, 'MRR@10': 0.6062, 'nDCG@10': 0.585, 'MAP@10': 0.5143}

=== Finetuned (query-only) ===


                                                           

{'N': 300, 'Hit@1': 0.42, 'Recall@2': 0.5633, 'MRR@2': 0.4917, 'nDCG@2': 0.4389, 'MAP@2': 0.4042, 'Recall@5': 0.69, 'MRR@5': 0.5274, 'nDCG@5': 0.4914, 'MAP@5': 0.4322, 'Recall@10': 0.73, 'MRR@10': 0.5327, 'nDCG@10': 0.5146, 'MAP@10': 0.4452}

=== Baseline (zero-shot) ===


                                                           

{'N': 300, 'Hit@1': 0.4467, 'Recall@2': 0.6067, 'MRR@2': 0.5267, 'nDCG@2': 0.4712, 'MAP@2': 0.4342, 'Recall@5': 0.6967, 'MRR@5': 0.5519, 'nDCG@5': 0.5172, 'MAP@5': 0.4624, 'Recall@10': 0.78, 'MRR@10': 0.5629, 'nDCG@10': 0.5501, 'MAP@10': 0.4787}

--- Summary: wikifacts-articles ---
finetuned_instruction -> {'N': 300, 'Hit@1': 0.49, 'Recall@2': 0.657, 'MRR@2': 0.573, 'nDCG@2': 0.511, 'MAP@2': 0.471, 'Recall@5': 0.74, 'MRR@5': 0.598, 'nDCG@5': 0.556, 'MAP@5': 0.499, 'Recall@10': 0.803, 'MRR@10': 0.606, 'nDCG@10': 0.585, 'MAP@10': 0.514}
finetuned_queryonly -> {'N': 300, 'Hit@1': 0.42, 'Recall@2': 0.563, 'MRR@2': 0.492, 'nDCG@2': 0.439, 'MAP@2': 0.404, 'Recall@5': 0.69, 'MRR@5': 0.527, 'nDCG@5': 0.491, 'MAP@5': 0.432, 'Recall@10': 0.73, 'MRR@10': 0.533, 'nDCG@10': 0.515, 'MAP@10': 0.445}
baseline -> {'N': 300, 'Hit@1': 0.447, 'Recall@2': 0.607, 'MRR@2': 0.527, 'nDCG@2': 0.471, 'MAP@2': 0.434, 'Recall@5': 0.697, 'MRR@5': 0.552, 'nDCG@5': 0.517, 'MAP@5': 0.462, 'Recall@10': 0.78, 'MRR@10': 

In [27]:
# Evaluate: rus-arguana & rus-scifact (RU) with LoRA adapters
# - Baseline E5-base, LoRA instruction-aware, LoRA query-only
# - Rebuild qrels from BEIR via ir_datasets, filtered to RU IDs
# - Caches doc embeddings per (dataset, model)

import os, io, gzip, glob, json, math, random, sys, hashlib
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import snapshot_download
from tqdm import tqdm

import ir_datasets as irds

try:
    import faiss
    FAISS_OK = True
except Exception:
    FAISS_OK = False

os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE    = 64           
MAX_LEN       = 256
K_LIST        = [1, 5, 10]
SEED          = 42
FAST_Q_CAP    = 300           
DOC_CAP       = None         

FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"    
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"   
BASE_MODEL          = "intfloat/multilingual-e5-base"                    

USE_PREFIXES       = True
USE_NEUTRAL_INST   = True  
NEUTRAL_RU         = "Используй точное соответствие смысла запроса и отрывка; не делай выводов."

CACHE_DIR   = "/kaggle/working/rusbeir_eval_cache"
OUT_ROOT    = "/kaggle/working/rusbeir_eval_runs"
HF_CACHE    = "/kaggle/working/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(OUT_ROOT,  exist_ok=True)
os.makedirs(HF_CACHE,  exist_ok=True)

print(f"Device: {DEVICE} | BATCH={BATCH_SIZE} | MAX_LEN={MAX_LEN} | FAST_Q_CAP={FAST_Q_CAP} | DOC_CAP={DOC_CAP}")

SETS = {
    "rus-arguana": {
        "data_repo": "kaengreg/rus-arguana",
        "beir_ids":  ["beir/arguana", "beir-v1.0.0/arguana", "arguana"],   # try in order
    },
    "rus-scifact": {
        "data_repo": "kaengreg/rus-scifact",
        "beir_ids":  ["beir/scifact/test", "beir/scifact"],                # try in order
    },
}

def _copy(src, dst):
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    if src == dst:
        return
    with open(src, "rb") as f_in, open(dst, "wb") as f_out:
        f_out.write(f_in.read())

def _maybe_gunzip(src, dst):
    if src.endswith(".gz"):
        with gzip.open(src, "rt", encoding="utf-8") as f_in, open(dst, "w", encoding="utf-8") as f_out:
            for line in f_in: f_out.write(line)
    else:
        _copy(src, dst)

def fetch_raw_dataset(name: str, data_repo: str) -> str:
    out_dir = os.path.join(OUT_ROOT, name)
    os.makedirs(out_dir, exist_ok=True)
    root = snapshot_download(repo_id=data_repo, repo_type="dataset",
                             allow_patterns=["corpus.jsonl", "queries.jsonl", "*.gz"],
                             local_dir=HF_CACHE, local_dir_use_symlinks=False)
    cand_corpus  = glob.glob(os.path.join(root, "**", "corpus.jsonl*"),  recursive=True)
    cand_queries = glob.glob(os.path.join(root, "**", "queries.jsonl*"), recursive=True)
    if not cand_corpus or not cand_queries:
        raise FileNotFoundError(f"Could not find corpus.jsonl/queries.jsonl in {data_repo}")
    _maybe_gunzip(sorted(cand_corpus)[-1],  os.path.join(out_dir, "corpus.jsonl"))
    _maybe_gunzip(sorted(cand_queries)[-1], os.path.join(out_dir, "queries.jsonl"))
    return out_dir

def smart_open(path):
    if path.endswith(".gz"):
        return io.TextIOWrapper(gzip.open(path, "rb"), encoding="utf-8")
    return open(path, "r", encoding="utf-8")

def load_corpus(path: str) -> Dict[str, str]:
    corpus = {}
    with smart_open(path) as f:
        for line in f:
            j = json.loads(line)
            cid   = str(j.get("_id") or j.get("id"))
            title = (j.get("title") or "").strip()
            text  = (j.get("text")  or "").strip()
            corpus[cid] = (title + " " + text).strip()
    return corpus

def load_queries(path: str) -> Dict[str, str]:
    queries = {}
    with smart_open(path) as f:
        for line in f:
            j = json.loads(line)
            qid  = str(j.get("_id") or j.get("id"))
            text = (j.get("text") or "").strip()
            queries[qid] = text
    return queries

def build_qrels_from_beir(beir_ids: List[str], hf_qids: set, hf_docs: set):
    ds = None; chosen = None
    tried = []
    for dsid in beir_ids:
        tried.append(dsid)
        try:
            ds = irds.load(dsid)
            chosen = dsid
            break
        except KeyError:
            continue
    if ds is None:
        raise KeyError(f"None of these IRDS ids exist: {tried}")

    qrels = defaultdict(dict)
    kept = skipped = 0
    for q in ds.qrels_iter():
        qid = str(q.query_id)
        did = str(q.doc_id)
        rel = int(getattr(q, "relevance", 1))
        if (qid in hf_qids) and (did in hf_docs) and rel > 0:
            qrels[qid][did] = 1
            kept += 1
        else:
            skipped += 1
    print(f"  qrels from {chosen} -> kept {kept:,}, skipped {skipped:,}")
    return dict(qrels)

def add_prefix_query(s: str, for_inst_model: bool) -> str:
    if not USE_PREFIXES:
        return s
    q = f"query: {s}".strip()
    if for_inst_model and USE_NEUTRAL_INST and NEUTRAL_RU:
        q = f"{q} {NEUTRAL_RU}".strip()
    return q

def add_prefix_passage(s: str) -> str:
    if not USE_PREFIXES:
        return s
    return f"passage: {s}".strip()

def corpus_signature(ids: List[str]) -> str:
    key = "||".join(ids[:50]) + "||" + "||".join(ids[-50:]) + f"||{len(ids)}"
    return hashlib.md5(key.encode()).hexdigest()[:8]

@torch.no_grad()
def encode_texts(model, tok, texts: List[str], batch_size=BATCH_SIZE, max_len=MAX_LEN, device=DEVICE):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Encoding@{device}", leave=False):
        batch = texts[i:i+batch_size]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(device)
        out = model(**enc).last_hidden_state   # [B, L, H]
        mask = enc["attention_mask"].unsqueeze(-1).expand_as(out).float()
        pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)  # mean pool
        pooled = F.normalize(pooled, p=2, dim=1)
        all_embs.append(pooled.detach().cpu())
    return torch.cat(all_embs, dim=0) if all_embs else torch.empty(0, model.config.hidden_size)

def build_index(embs_cpu: torch.Tensor):
    if FAISS_OK:
        index = faiss.IndexFlatIP(embs_cpu.shape[1])
        index.add(embs_cpu.numpy().astype("float32"))
        return index, True
    return embs_cpu, False

@torch.no_grad()
def search(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    # torch fallback
    q = q_embs; d = index
    I_all, D_all = [], []
    chunk = 256
    for i in tqdm(range(0, q.shape[0], chunk), desc="Searching", leave=False):
        sims = torch.mm(q[i:i+chunk], d.t())
        D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
        I_all.append(I.numpy()); D_all.append(D.numpy())
    return np.vstack(I_all), np.vstack(D_all)

def eval_multi_k(run, qrels: Dict[str, Dict[str,int]], ks: List[int]):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats[f"Recall@{k}"] = 0.0
        stats[f"MRR@{k}"]    = 0.0
        stats[f"nDCG@{k}"]   = 0.0
        stats[f"MAP@{k}"]    = 0.0   

    n = 0
    for qid, ranking in run.items():
        if qid not in qrels: 
            continue
        n += 1
        rels = qrels[qid]
        pos_ranks = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels and rels[did] > 0]
        pos_ranks.sort()

        if pos_ranks and pos_ranks[0] == 1:
            stats["Hit@1"] += 1.0

        m_total = sum(1 for _ in rels if rels[_] > 0)  

        for k in ks:
            stats[f"Recall@{k}"] += float(any(r <= k for r in pos_ranks))
            stats[f"MRR@{k}"]    += (1.0/pos_ranks[0]) if (pos_ranks and pos_ranks[0] <= k) else 0.0
            dcg  = sum(1.0/math.log2(r+1) for r in pos_ranks if r <= k)
            idcg = sum(1.0/math.log2(r+1) for r in range(1, min(m_total, k)+1))
            stats[f"nDCG@{k}"] += (dcg/idcg) if idcg > 0 else 0.0
            m_k = min(m_total, k)
            if m_k > 0:
                hits = 0
                ap_sum = 0.0
                for r in pos_ranks:
                    if r > k:
                        break
                    hits += 1
                    ap_sum += hits / r
                stats[f"MAP@{k}"] += ap_sum / m_k
            else:
                stats[f"MAP@{k}"] += 0.0

    if n == 0:
        return stats
    stats["N"] = n
    stats["Hit@1"] /= n
    for k in ks:
        stats[f"Recall@{k}"] /= n
        stats[f"MRR@{k}"]    /= n
        stats[f"nDCG@{k}"]   /= n
        stats[f"MAP@{k}"]    /= n
    return stats

def fmt(d):  
    return {k:(round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

def safe_load_peft_for_inference(base_model_name: str, adapter_dir: str, device: str):

    from peft import PeftModel
    torch.cuda.empty_cache()
    dtype = torch.float16 if (device=="cuda") else None
    try:
        base = AutoModel.from_pretrained(base_model_name, torch_dtype=dtype, low_cpu_mem_usage=True)
        base = base.to(device).eval()
    except RuntimeError as e:
        print(f"[warn] base on GPU failed ({type(e).__name__}): using CPU fp32")
        base = AutoModel.from_pretrained(base_model_name, low_cpu_mem_usage=True).to("cpu").eval()
        device = "cpu"
    try:
        tok = AutoTokenizer.from_pretrained(adapter_dir, use_fast=True)
    except Exception:
        tok = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
    model = PeftModel.from_pretrained(base, adapter_dir).eval()
    return tok, model, device

def safe_load_baseline(model_name: str, device: str):
    dtype = torch.float16 if (device=="cuda") else None
    try:
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        mdl = AutoModel.from_pretrained(model_name, torch_dtype=dtype, low_cpu_mem_usage=True).to(device).eval()
        return tok, mdl, device
    except RuntimeError as e:
        print(f"[warn] baseline on GPU failed ({type(e).__name__}): using CPU fp32")
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        mdl = AutoModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cpu").eval()
        return tok, mdl, "cpu"

def evaluate_model(tag: str, tok, mdl, dev: str,
                   doc_ids: List[str], doc_texts: List[str],
                   qids_eval: List[str], q_texts: List[str],
                   qrels: Dict[str, Dict[str,int]],
                   cache_key: str):
    print(f"\n=== {tag} ===")

    cache_path = os.path.join(CACHE_DIR, f"docs__{cache_key}.pt")
    if os.path.exists(cache_path):
        doc_embs = torch.load(cache_path, map_location="cpu")
        print(f"[cache] loaded docs {doc_embs.shape} from {cache_path}")
    else:
        doc_embs = encode_texts(mdl, tok, doc_texts, batch_size=BATCH_SIZE, max_len=MAX_LEN, device=dev)
        torch.save(doc_embs, cache_path)
        print(f"[cache] saved docs {doc_embs.shape} -> {cache_path}")

    index, is_faiss = build_index(doc_embs)

    q_embs = encode_texts(mdl, tok, q_texts, batch_size=BATCH_SIZE, max_len=MAX_LEN, device=dev)

    I, D = search(index, is_faiss, q_embs, topk=max(K_LIST))

    run = {}
    for i, qid in enumerate(qids_eval):
        ids = I[i]; scores = D[i]
        run[qid] = [(doc_ids[j], float(scores[k])) for k, j in enumerate(ids)]

    m = eval_multi_k(run, qrels, K_LIST)
    print(fmt(m))
    return m

def run_on_dataset(name: str, spec: dict):
    base_dir = fetch_raw_dataset(name, spec["data_repo"])
    corpus   = load_corpus(os.path.join(base_dir, "corpus.jsonl"))
    queries  = load_queries(os.path.join(base_dir, "queries.jsonl"))

    if DOC_CAP:
        rng = random.Random(SEED)
        keys = list(corpus.keys()); rng.shuffle(keys)
        keep = set(keys[:DOC_CAP])
        corpus = {k:v for k,v in corpus.items() if k in keep}

    hf_qids = set(queries.keys())
    hf_docs = set(corpus.keys())
    qrels   = build_qrels_from_beir(spec["beir_ids"], hf_qids, hf_docs)

    qids = [qid for qid in queries if qid in qrels]
    if FAST_Q_CAP and len(qids) > FAST_Q_CAP:
        rng = random.Random(SEED)
        rng.shuffle(qids)
        qids = qids[:FAST_Q_CAP]

    print(f"\nDataset: {name} | Eval queries: {len(qids)} | Docs: {len(corpus)}")

    doc_ids   = list(corpus.keys())
    doc_texts = [add_prefix_passage(corpus[d]) for d in doc_ids]

    q_texts_base   = [add_prefix_query(queries[q], for_inst_model=False) for q in qids]
    q_texts_inst   = [add_prefix_query(queries[q], for_inst_model=True ) for q in qids]
    q_texts_qonly  = [add_prefix_query(queries[q], for_inst_model=False) for q in qids]  

    results = {}

    tok_b, mdl_b, dev_b = safe_load_baseline(BASE_MODEL, DEVICE)
    cache_key_b = f"{name}__baseline__{corpus_signature(doc_ids)}__L{MAX_LEN}"
    results["baseline"] = evaluate_model("Baseline (mE5-base)", tok_b, mdl_b, dev_b,
                                         doc_ids, doc_texts, qids, q_texts_base, qrels, cache_key_b)
    del mdl_b; torch.cuda.empty_cache()

    if FINETUNED_INST_DIR and os.path.exists(FINETUNED_INST_DIR):
        tok_i, mdl_i, dev_i = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_INST_DIR, DEVICE)
        cache_key_i = f"{name}__inst__{corpus_signature(doc_ids)}__L{MAX_LEN}"
        results["finetuned_instruction"] = evaluate_model("LoRA (instruction-aware)", tok_i, mdl_i, dev_i,
                                                          doc_ids, doc_texts, qids, q_texts_inst, qrels, cache_key_i)
        del mdl_i; torch.cuda.empty_cache()
    else:
        print("(!) Skipping finetuned_instruction — dir not found.")

    if FINETUNED_QUERY_DIR and os.path.exists(FINETUNED_QUERY_DIR):
        tok_q, mdl_q, dev_q = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_QUERY_DIR, DEVICE)
        cache_key_q = f"{name}__queryonly__{corpus_signature(doc_ids)}__L{MAX_LEN}"
        results["finetuned_queryonly"] = evaluate_model("LoRA (query-only)", tok_q, mdl_q, dev_q,
                                                        doc_ids, doc_texts, qids, q_texts_qonly, qrels, cache_key_q)
        del mdl_q; torch.cuda.empty_cache()
    else:
        print("(!) Skipping finetuned_queryonly — dir not found.")

    print("\n--- Summary:", name, "---")
    for k, v in results.items():
        print(k, "->", fmt(v))
    return results

all_results = {}
for name, spec in SETS.items():
    all_results[name] = run_on_dataset(name, spec)

print("\n=== GRAND SUMMARY ===")
for ds, res in all_results.items():
    print("\n", ds)
    for k, v in res.items():
        print(" ", k, "->", fmt(v))


Device: cuda | BATCH=64 | MAX_LEN=256 | FAST_Q_CAP=300 | DOC_CAP=None


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

queries.jsonl: 0.00B [00:00, ?B/s]

corpus.jsonl:   0%|          | 0.00/33.8M [00:00<?, ?B/s]

[INFO] [starting] opening zip file
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/arguana.zip
[INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/arguana.zip: [00:00] [3.77MB] [4.83MB/s]
[INFO] [finished] opening zip file [5.25s]                                                                 


  qrels from beir/arguana -> kept 1,401, skipped 5

Dataset: rus-arguana | Eval queries: 300 | Docs: 8674

=== Baseline (mE5-base) ===


                                                                

[cache] saved docs torch.Size([8674, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-arguana__baseline__9369b9db__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.49, 'MRR@5': 0.1803, 'nDCG@5': 0.2572, 'MAP@5': 0.1803, 'Recall@10': 0.6733, 'MRR@10': 0.2045, 'nDCG@10': 0.3163, 'MAP@10': 0.2045}

=== LoRA (instruction-aware) ===


                                                                

[cache] saved docs torch.Size([8674, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-arguana__inst__9369b9db__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.6067, 'MRR@5': 0.2337, 'nDCG@5': 0.3272, 'MAP@5': 0.2337, 'Recall@10': 0.7667, 'MRR@10': 0.2554, 'nDCG@10': 0.3793, 'MAP@10': 0.2554}

=== LoRA (query-only) ===


                                                                

[cache] saved docs torch.Size([8674, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-arguana__queryonly__9369b9db__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.55, 'MRR@5': 0.2108, 'nDCG@5': 0.2957, 'MAP@5': 0.2108, 'Recall@10': 0.73, 'MRR@10': 0.236, 'nDCG@10': 0.3551, 'MAP@10': 0.236}

--- Summary: rus-arguana ---
baseline -> {'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.49, 'MRR@5': 0.1803, 'nDCG@5': 0.2572, 'MAP@5': 0.1803, 'Recall@10': 0.6733, 'MRR@10': 0.2045, 'nDCG@10': 0.3163, 'MAP@10': 0.2045}
finetuned_instruction -> {'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.6067, 'MRR@5': 0.2337, 'nDCG@5': 0.3272, 'MAP@5': 0.2337, 'Recall@10': 0.7667, 'MRR@10': 0.2554, 'nDCG@10': 0.3793, 'MAP@10': 0.2554}
finetuned_queryonly -> {'N': 300, 'Hit@1': 0.0, 'Recall@1': 0.0, 'MRR@1': 0.0, 'nDCG@1': 0.0, 'MAP@1': 0.0, 'Recall@5': 0.55, 'MRR@5': 0.2108, 'nDCG@5': 0.2957, 'MAP@5': 0.2108, 'Recall@10': 0.73, 'MRR@10': 0.236, 'nDCG@10': 0.3551, 'MAP@10':

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

corpus.jsonl:   0%|          | 0.00/28.7M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

[INFO] [starting] opening zip file
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip
[INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip: [00:00] [2.82MB] [4.17MB/s]
[INFO] [finished] opening zip file [1.70s]                                                                 


  qrels from beir/scifact/test -> kept 339, skipped 0

Dataset: rus-scifact | Eval queries: 300 | Docs: 5183

=== Baseline (mE5-base) ===


                                                              

[cache] saved docs torch.Size([5183, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-scifact__baseline__fe705ae6__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.49, 'Recall@1': 0.49, 'MRR@1': 0.49, 'nDCG@1': 0.49, 'MAP@1': 0.49, 'Recall@5': 0.69, 'MRR@5': 0.5646, 'nDCG@5': 0.5845, 'MAP@5': 0.5496, 'Recall@10': 0.75, 'MRR@10': 0.5729, 'nDCG@10': 0.6073, 'MAP@10': 0.5608}

=== LoRA (instruction-aware) ===


                                                              

[cache] saved docs torch.Size([5183, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-scifact__inst__fe705ae6__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.45, 'Recall@1': 0.45, 'MRR@1': 0.45, 'nDCG@1': 0.45, 'MAP@1': 0.45, 'Recall@5': 0.6667, 'MRR@5': 0.5336, 'nDCG@5': 0.5567, 'MAP@5': 0.5198, 'Recall@10': 0.7267, 'MRR@10': 0.542, 'nDCG@10': 0.58, 'MAP@10': 0.5315}

=== LoRA (query-only) ===


                                                              

[cache] saved docs torch.Size([5183, 768]) -> /kaggle/working/rusbeir_eval_cache/docs__rus-scifact__queryonly__fe705ae6__L256.pt


                                                            

{'N': 300, 'Hit@1': 0.4133, 'Recall@1': 0.4133, 'MRR@1': 0.4133, 'nDCG@1': 0.4133, 'MAP@1': 0.4133, 'Recall@5': 0.63, 'MRR@5': 0.494, 'nDCG@5': 0.5155, 'MAP@5': 0.4782, 'Recall@10': 0.6933, 'MRR@10': 0.5029, 'nDCG@10': 0.54, 'MAP@10': 0.4905}

--- Summary: rus-scifact ---
baseline -> {'N': 300, 'Hit@1': 0.49, 'Recall@1': 0.49, 'MRR@1': 0.49, 'nDCG@1': 0.49, 'MAP@1': 0.49, 'Recall@5': 0.69, 'MRR@5': 0.5646, 'nDCG@5': 0.5845, 'MAP@5': 0.5496, 'Recall@10': 0.75, 'MRR@10': 0.5729, 'nDCG@10': 0.6073, 'MAP@10': 0.5608}
finetuned_instruction -> {'N': 300, 'Hit@1': 0.45, 'Recall@1': 0.45, 'MRR@1': 0.45, 'nDCG@1': 0.45, 'MAP@1': 0.45, 'Recall@5': 0.6667, 'MRR@5': 0.5336, 'nDCG@5': 0.5567, 'MAP@5': 0.5198, 'Recall@10': 0.7267, 'MRR@10': 0.542, 'nDCG@10': 0.58, 'MAP@10': 0.5315}
finetuned_queryonly -> {'N': 300, 'Hit@1': 0.4133, 'Recall@1': 0.4133, 'MRR@1': 0.4133, 'nDCG@1': 0.4133, 'MAP@1': 0.4133, 'Recall@5': 0.63, 'MRR@5': 0.494, 'nDCG@5': 0.5155, 'MAP@5': 0.4782, 'Recall@10': 0.6933, 'MRR@10'

In [28]:
# Unified Robustness / Prompt-Transfer Eval 
#  - Datasets: Mr.TyDi-RU (prepared), mFollowIR-RU, LAReQA XQuAD-R (RU↔EN)
#  - Models:   mE5-base, optional LoRA inst-aware, LoRA query-only
#  - Metrics:  Hit@1, Recall@K, MRR@K, MAP@K, nDCG@K, p-MRR@K
#  - Robustness: Prompt ablation, Paraphrase variance (5x), Light noise stress

import os, io, re, json, math, time, random, hashlib, gzip, glob
from collections import defaultdict
from typing import Dict, List, Tuple

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

try:
    import faiss
    FAISS_OK = True
except Exception:
    FAISS_OK = False

DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE_D  = 64
BATCH_SIZE_Q  = 64
MAX_LEN       = 256
K_LIST        = [1, 5, 10]
SEED          = 42

DO_PROMPT_ABLATION   = True    
DO_PARAPHRASE_ROBUST = True    
DO_NOISE_STRESS      = True    

BASE_MODEL          = "intfloat/multilingual-e5-base"
FINETUNED_INST_DIR  = "/kaggle/working/mE5_lora8bit_infoNCE_v1/best"     
FINETUNED_QUERY_DIR = "/kaggle/working/mE5_lora8bit_queryonly_v1/best"   

USE_PREFIXES = True  

ROOT_WORK         = "/kaggle/working"
CACHE_DIR_EMB     = os.path.join(ROOT_WORK, "robust_eval_cache"); os.makedirs(CACHE_DIR_EMB, exist_ok=True)

MRTYDI_DIR        = os.path.join(ROOT_WORK, "mrtydi_ru_prepared")
LAREQA_DIR        = os.path.join(ROOT_WORK, "lareqa_xquad_r"); os.makedirs(LAREQA_DIR, exist_ok=True)

MRTYDI_DOC_CAP    = 20000
MRTYDI_FAST_Q     = None
MFOLLOWIR_DOC_CAP = None
MFOLLOWIR_FAST_Q  = None
LAREQA_DOC_CAP    = None
LAREQA_FAST_Q     = None

random.seed(SEED); np.random.seed(SEED)
print(f"Device: {DEVICE} | K={K_LIST} | MAX_LEN={MAX_LEN}")

PROMPTS_RU = {
    "plain": "",
    "minimal_ru": "Верни релевантные документы.",
    "qa_ru":      "Найди предложение, отвечающее на вопрос.",
    "search_ru":  "Найди веб-страницы, связанные с запросом.",
    "mfoll_ru":   "Ответь кратко и по делу. Найди наиболее релевантные документы."
}
PROMPTS_EN = {
    "plain": "",
    "minimal_en": "Return relevant documents.",
    "qa_en":      "Find the sentence that answers the question.",
    "search_en":  "Find web pages related to the query.",
    "mfoll_en":   "Answer briefly and to the point. Retrieve the most relevant documents."
}

PARAPHRASES_RU = [
    "Дай наиболее релевантные документы.",
    "Найди лучшие соответствующие тексты.",
    "Верни документы, максимально подходящие запросу.",
    "Подбери самые уместные тексты по запросу.",
    "Найди тексты, которые лучше всего отвечают запросу.",
]
PARAPHRASES_EN = [
    "Return the most relevant documents.",
    "Find the best matching texts.",
    "Provide documents that best fit the query.",
    "Select the most suitable texts for the query.",
    "Retrieve texts that best answer the query.",
]

# Instruction BANK (still used for p-MRR and transfer)
BANK = [
  "Ты — поисковая система. Найди наиболее релевантные документы, помогающие ответить на запрос. Возвращай документы, а не готовый ответ.",
  "Найди пассажи, содержащие факты для ответа на вопрос. Отдавай приоритет точности и контекстной релевантности.",
  "Выполни поиск по корпусу и верни документы, где максимально вероятно встретится ответ. Избегай нерелевантных совпадений."
]
def bank_for_qid(qid: str) -> str:
    return BANK[int(hashlib.md5(qid.encode()).hexdigest(), 16) % len(BANK)]

def add_prefix(s: str, kind: str) -> str:
    if not USE_PREFIXES: return s
    return (("query: " if kind == "query" else "passage: ") + s).strip()

def fmt(d): return {k:(round(v,4) if isinstance(v,float) else v) for k,v in d.items()}

def corpus_signature(ids: List[str]) -> str:
    key = "||".join(ids[:50]) + "||" + "||".join(ids[-50:]) + f"||{len(ids)}"
    return hashlib.md5(key.encode()).hexdigest()[:8]

def noise_variants(q: str) -> Dict[str, str]:
    base = re.sub(r"[^\w\s]", " ", q)
    base = re.sub(r"\s+", " ", base).strip()
    toks = base.split()
    swapped = base
    if len(toks) >= 2:
        t = toks[:]
        i = random.randrange(0, len(t)-1)
        t[i], t[i+1] = t[i+1], t[i]
        swapped = " ".join(t)
    return {"clean": q, "nopunct": base, "swap": swapped, "lower": q.lower()}

def mean_pool(last_hidden_state, attention_mask):
    m = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
    return (last_hidden_state * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-9)

@torch.no_grad()
def encode_texts(model, tok, texts: List[str], batch_size: int, max_len: int, device: str):
    outs=[]
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Encode@{device}", leave=False):
        batch = texts[i:i+batch_size]
        enc = tok(batch, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        for k in enc: enc[k] = enc[k].to(device)
        last = model(**enc).last_hidden_state
        pooled = mean_pool(last, enc["attention_mask"])
        outs.append(F.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(outs, 0) if outs else torch.empty(0, model.config.hidden_size)

def build_index(doc_embs_cpu: torch.Tensor):
    if FAISS_OK:
        index = faiss.IndexFlatIP(doc_embs_cpu.shape[1])
        index.add(doc_embs_cpu.numpy().astype("float32"))
        return index, True
    return doc_embs_cpu, False

@torch.no_grad()
def search_index(index, is_faiss: bool, q_embs: torch.Tensor, topk: int):
    if is_faiss:
        D, I = index.search(q_embs.numpy().astype("float32"), topk)
        return I, D
    sims = torch.mm(q_embs, index.t())
    D, I = torch.topk(sims, k=min(topk, sims.size(1)), dim=1)
    return I.numpy(), D.numpy()

def eval_multi_k_and_perquery(run, qrels: Dict[str, Dict[str,int]], ks: List[int]):
    ks = sorted(set(ks))
    stats = {"N": 0, "Hit@1": 0.0}
    for k in ks:
        stats.update({f"Recall@{k}":0.0, f"MRR@{k}":0.0, f"MAP@{k}":0.0, f"nDCG@{k}":0.0})
    per_mrr = {k:{} for k in ks}
    per_ap  = {k:{} for k in ks}
    n=0
    for qid, ranking in run.items():
        if qid not in qrels: continue
        n += 1
        rels = qrels[qid]
        pos = [r for r,(did,_) in enumerate(ranking, start=1) if did in rels]
        pos.sort()
        if pos and pos[0]==1: stats["Hit@1"] += 1.0
        m_rel = len(rels)
        for k in ks:
            stats[f"Recall@{k}"] += float(any(r<=k for r in pos))
            mrr_q = (1.0/pos[0]) if (pos and pos[0]<=k) else 0.0
            stats[f"MRR@{k}"] += mrr_q
            per_mrr[k][qid] = mrr_q
            dcg  = sum(1.0/math.log2(r+1) for r in pos if r<=k)
            idcg = sum(1.0/math.log2(r+1) for r in range(1, min(m_rel,k)+1))
            stats[f"nDCG@{k}"] += (dcg/idcg) if idcg>0 else 0.0
            hits=0; ap_sum=0.0
            for r,(did,_) in enumerate(ranking[:k], start=1):
                if did in rels:
                    hits += 1; ap_sum += hits / r
            denom = float(min(m_rel, k)) if m_rel>0 else 1.0
            ap_q = (ap_sum/denom) if denom>0 else 0.0
            stats[f"MAP@{k}"] += ap_q
            per_ap[k][qid] = ap_q
    if n==0: return stats, per_mrr, per_ap
    stats["N"]=n; stats["Hit@1"] /= n
    for k in ks:
        stats[f"Recall@{k}"] /= n; stats[f"MRR@{k}"]/=n; stats[f"MAP@{k}"]/=n; stats[f"nDCG@{k}"]/=n
    return stats, per_mrr, per_ap

def paired_diff_mean(per_a: Dict[str,float], per_b: Dict[str,float]) -> float:
    qids = sorted(set(per_a.keys()) & set(per_b.keys()))
    if not qids: return 0.0
    return float(np.mean([per_b[q]-per_a[q] for q in qids]))

from transformers import AutoTokenizer, AutoModel
def safe_load_baseline(model_name: str, device: str):
    dtype = torch.float16 if (device=="cuda") else None
    try:
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        mdl = AutoModel.from_pretrained(model_name, torch_dtype=dtype, low_cpu_mem_usage=True).to(device).eval()
    except RuntimeError:
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        mdl = AutoModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cpu").eval()
        device = "cpu"
    return tok, mdl, device

def safe_load_peft_for_inference(base_model_name: str, adapter_dir: str, device: str):
    try:
        import peft.import_utils as _iu
        _iu.is_bnb_available = lambda: False
    except Exception:
        pass
    from peft import PeftModel
    dtype = torch.float16 if (device=="cuda") else None
    try:
        base = AutoModel.from_pretrained(base_model_name, torch_dtype=dtype, low_cpu_mem_usage=True).to(device).eval()
    except RuntimeError:
        base = AutoModel.from_pretrained(base_model_name, low_cpu_mem_usage=True).to("cpu").eval()
        device = "cpu"
    try:
        tok = AutoTokenizer.from_pretrained(adapter_dir, use_fast=True)
    except Exception:
        tok = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
    model = PeftModel.from_pretrained(base, adapter_dir).eval()
    return tok, model, device

def evaluate_model(tag: str, tok, mdl, dev: str,
                   doc_ids: List[str], doc_texts_pref: List[str],
                   qids: List[str], q_texts_pref: List[str],
                   qrels: Dict[str, Dict[str,int]],
                   cache_key: str):
    print(f"\n=== {tag} ===")
    dcache = os.path.join(CACHE_DIR_EMB, f"docs__{cache_key}.pt")
    if os.path.exists(dcache):
        doc_embs = torch.load(dcache, map_location="cpu"); print(f"[cache] docs {doc_embs.shape} loaded")
    else:
        doc_embs = encode_texts(mdl, tok, doc_texts_pref, BATCH_SIZE_D, MAX_LEN, dev)
        torch.save(doc_embs, dcache); print(f"[cache] docs {doc_embs.shape} saved -> {dcache}")
    index, is_faiss = build_index(doc_embs)
    q_embs = encode_texts(mdl, tok, q_texts_pref, BATCH_SIZE_Q, MAX_LEN, dev)
    I, D = search_index(index, is_faiss, q_embs, topk=max(K_LIST))
    run = { qid: [(doc_ids[j], float(D[i][k])) for k, j in enumerate(I[i])] for i, qid in enumerate(qids) }
    stats, per_mrr, per_ap = eval_multi_k_and_perquery(run, qrels, K_LIST)
    print(fmt(stats))
    return stats, per_mrr, per_ap

def build_prompted_queries(qids: List[str], qmap: Dict[str,str], prompt_text: str):
    if prompt_text.strip()=="":
        return [add_prefix(qmap[q], "query") for q in qids]
    return [add_prefix((qmap[q] + " " + prompt_text).strip(), "query") for q in qids]

def paraphrase_suite(qids: List[str], qmap: Dict[str,str], paraphrases: List[str]):
    return {f"para{i+1}": [add_prefix((qmap[q] + " " + p).strip(), "query") for q in qids]
            for i,p in enumerate(paraphrases)}

def noise_suite(qids: List[str], qmap: Dict[str,str]):
    suite = {"clean": [add_prefix(qmap[q], "query") for q in qids]}
    # compute per-q variants and then stitch column-wise
    per = {q: noise_variants(qmap[q]) for q in qids}
    for key in ["nopunct","swap","lower"]:
        suite[key] = [add_prefix(per[q][key], "query") for q in qids]
    return suite

def agg_mean_std(stats_list: List[Dict[str,float]]):
    if not stats_list: return {}
    keys = stats_list[0].keys()
    out = {}
    for k in keys:
        if isinstance(stats_list[0][k], float):
            vals = [s[k] for s in stats_list]
            out[f"{k}_mean"] = float(np.mean(vals))
            out[f"{k}_std"]  = float(np.std(vals))
    return out

# A) Mr.TyDi-RU (prepared)

def prepare_mrtydi_ru(out_dir: str):
    if all(os.path.exists(os.path.join(out_dir, fn)) for fn in ["corpus.jsonl","queries.jsonl","qrels.tsv"]):
        print("[MrTyDi] prepared files already present."); return
    print("[MrTyDi] preparing via ir_datasets …")
    import ir_datasets as irds
    os.makedirs(out_dir, exist_ok=True)
    ds = irds.load("mr-tydi/ru/test")
    with open(os.path.join(out_dir, "corpus.jsonl"), "w", encoding="utf-8") as fc:
        for d in ds.docs_iter():
            j = {"_id": str(d.doc_id), "title": getattr(d, "title", "") or "", "text": d.text or ""}
            fc.write(json.dumps(j, ensure_ascii=False) + "\n")
    with open(os.path.join(out_dir, "queries.jsonl"), "w", encoding="utf-8") as fq:
        for q in ds.queries_iter():
            j = {"_id": str(q.query_id), "text": q.text}
            fq.write(json.dumps(j, ensure_ascii=False) + "\n")
    with open(os.path.join(out_dir, "qrels.tsv"), "w", encoding="utf-8") as fr:
        for r in ds.qrels_iter():
            rel = int(getattr(r, "relevance", 1) > 0)
            fr.write(f"{r.query_id}\t{r.doc_id}\t{rel}\n")
    print("[MrTyDi] wrote corpus.jsonl, queries.jsonl, qrels.tsv")

def load_jsonl_map(path: str, key="_id", text="text"):
    m={}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            j = json.loads(line); i = str(j.get(key) or j.get("id")); t=(j.get(text) or "").strip()
            if i and t: m[i]=t
    return m

def load_qrels_tsv(path: str):
    qrels=defaultdict(dict)
    with open(path, "r", encoding="utf-8") as f:
        for row in f:
            parts=row.strip().split("\t")
            if len(parts)<3: continue
            qid,did,rel=parts[0],parts[1],parts[2]
            try: r = int(float(rel))
            except: continue
            if r>0: qrels[qid][did]=1
    return dict(qrels)

def run_mrtydi():
    print("\n========== Mr.TyDi-RU ==========")
    prepare_mrtydi_ru(MRTYDI_DIR)
    # load
    corpus_raw = {}
    with open(os.path.join(MRTYDI_DIR,"corpus.jsonl"), "r", encoding="utf-8") as f:
        for line in f:
            j=json.loads(line); cid=str(j.get("_id") or j.get("id"))
            title=(j.get("title") or "").strip(); text=(j.get("text") or "").strip()
            corpus_raw[cid]=(title+" "+text).strip()
    queries = load_jsonl_map(os.path.join(MRTYDI_DIR,"queries.jsonl"))
    qrels   = load_qrels_tsv(os.path.join(MRTYDI_DIR,"qrels.tsv"))
    # subset: keep positives + fill negatives
    rng=random.Random(SEED)
    qids_all=[qid for qid in qrels if qid in queries]
    all_pos=set()
    for qid in qids_all: all_pos.update(qrels[qid].keys())
    pos_doc_ids=[d for d in all_pos if d in corpus_raw]
    if MRTYDI_DOC_CAP is None:
        doc_ids=list(corpus_raw.keys()); neg_added="N/A"
    else:
        if len(pos_doc_ids)>=MRTYDI_DOC_CAP:
            rng.shuffle(pos_doc_ids); doc_ids=pos_doc_ids[:MRTYDI_DOC_CAP]; neg_added=0
        else:
            need=MRTYDI_DOC_CAP-len(pos_doc_ids)
            neg_pool=[d for d in corpus_raw if d not in all_pos]; rng.shuffle(neg_pool)
            neg_added=min(need,len(neg_pool)); doc_ids=pos_doc_ids+neg_pool[:neg_added]
    corpus={d:corpus_raw[d] for d in doc_ids}
    doc_texts=[add_prefix(corpus[d],"passage") for d in doc_ids]
    qids=[qid for qid in qids_all if any((did in corpus) for did in qrels[qid])]
    if MRTYDI_FAST_Q and len(qids)>MRTYDI_FAST_Q:
        rng.shuffle(qids); qids=qids[:MRTYDI_FAST_Q]
    print(f"Docs={len(doc_ids):,} | Queries={len(qids):,} | PosDocs={len(pos_doc_ids):,} | NegAdded={neg_added}")

    loaders=[]
    tok_b, mdl_b, dev_b = safe_load_baseline(BASE_MODEL, DEVICE)
    loaders.append(("BASE", tok_b, mdl_b, dev_b, f"mrtydi__base__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
    if FINETUNED_INST_DIR and os.path.isdir(FINETUNED_INST_DIR):
        tok_i, mdl_i, dev_i = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_INST_DIR, DEVICE)
        loaders.append(("FT_INST", tok_i, mdl_i, dev_i, f"mrtydi__inst__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
    if FINETUNED_QUERY_DIR and os.path.isdir(FINETUNED_QUERY_DIR):
        tok_q, mdl_q, dev_q = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_QUERY_DIR, DEVICE)
        loaders.append(("FT_QUERY", tok_q, mdl_q, dev_q, f"mrtydi__qonly__{corpus_signature(doc_ids)}__L{MAX_LEN}"))

    # base query maps
    qmap = {qid: queries[qid] for qid in qids}
    results = {}

    for tag, tok, mdl, dev, sigbase in loaders:
        # plain
        q_plain = [add_prefix(qmap[q],"query") for q in qids]
        stats_plain, per_plain, _ = evaluate_model(f"MrTyDi | {tag} | plain", tok, mdl, dev, doc_ids, doc_texts, qids, q_plain, qrels, sigbase)

        # BANK (for p-MRR)
        q_bank  = [add_prefix((qmap[q]+" "+bank_for_qid(q)).strip(),"query") for q in qids]
        stats_bank, per_bank, _ = evaluate_model(f"MrTyDi | {tag} | q+BANK", tok, mdl, dev, doc_ids, doc_texts, qids, q_bank, qrels, sigbase)

        print(f"\n--- MrTyDi p-MRR (BANK − plain) :: {tag} ---")
        print({f"pMRR@{k}": round(paired_diff_mean(per_plain[k], per_bank[k]), 6) for k in K_LIST})

        # Prompt ablation
        if DO_PROMPT_ABLATION:
            print(f"\n--- Prompt ablation :: {tag} ---")
            for pname, prompt in PROMPTS_RU.items():
                qv = build_prompted_queries(qids, qmap, prompt)
                s,_,_ = evaluate_model(f"MrTyDi | {tag} | prompt:{pname}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)

        # Paraphrase robustness (mean/std)
        if DO_PARAPHRASE_ROBUST:
            print(f"\n--- Paraphrase robustness (5x) :: {tag} ---")
            suite = paraphrase_suite(qids, qmap, PARAPHRASES_RU)
            stats_list=[]
            for key, qv in suite.items():
                s,_,_ = evaluate_model(f"MrTyDi | {tag} | {key}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)
                stats_list.append(s)
            agg = agg_mean_std(stats_list)
            print("Paraphrase mean/std:", fmt(agg))

        # Noise stress
        if DO_NOISE_STRESS:
            print(f"\n--- Noise stress :: {tag} ---")
            suite = noise_suite(qids, qmap)
            base_stats=None
            for key, qv in suite.items():
                s,_,_ = evaluate_model(f"MrTyDi | {tag} | noise:{key}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)
                if key=="clean": base_stats=s
                else:
                    # report MRR@k drop vs clean
                    drops = {f"ΔMRR@{k}": round(s.get(f"MRR@{k}",0)-base_stats.get(f"MRR@{k}",0), 6) for k in K_LIST}
                    print(f"  drop vs clean ({key}):", drops)

        results[tag]=stats_plain  # keep at least plain for summary

        del mdl; torch.cuda.empty_cache()

    print("\n=== Mr.TyDi SUMMARY (plain only) ===")
    for k,v in results.items():
        print(k, "->", fmt(v))
    return results

# B) mFollowIR-RU

def run_mfollowir():
    print("\n========== mFollowIR-RU ==========")
    from datasets import load_dataset
    ds_q = load_dataset("jhu-clsp/mFollowIR-parquet","queries-rus")["queries"]
    ds_d = load_dataset("jhu-clsp/mFollowIR-parquet","corpus-rus")["corpus"]
    ds_og= load_dataset("jhu-clsp/mFollowIR-parquet","qrels_og-rus")["test"]

    doc_ids=[r["_id"] for r in ds_d]
    doc_txt=[add_prefix(((r.get("title") or "")+" "+(r.get("text") or "")).strip(),"passage") for r in ds_d]
    if MFOLLOWIR_DOC_CAP:
        rng=random.Random(SEED); idx=list(range(len(doc_ids))); rng.shuffle(idx); keep=sorted(idx[:MFOLLOWIR_DOC_CAP])
        doc_ids=[doc_ids[i] for i in keep]; doc_txt=[doc_txt[i] for i in keep]
    doc_set=set(doc_ids)
    qrels=defaultdict(dict)
    for r in ds_og:
        if r["corpus-id"] in doc_set and float(r["score"])>0:
            qrels[r["query-id"]][r["corpus-id"]]=1
    qrels={k:v for k,v in qrels.items() if v}

    qmap = {r["_id"]: r["text"].strip() for r in ds_q}
    qids = [qid for qid in qmap if qid in qrels]
    if MFOLLOWIR_FAST_Q and len(qids)>MFOLLOWIR_FAST_Q:
        rng=random.Random(SEED); rng.shuffle(qids); qids=qids[:MFOLLOWIR_FAST_Q]

    loaders=[]
    tok_b, mdl_b, dev_b = safe_load_baseline(BASE_MODEL, DEVICE)
    loaders.append(("BASE", tok_b, mdl_b, dev_b, f"mfollowir__base__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
    if FINETUNED_INST_DIR and os.path.isdir(FINETUNED_INST_DIR):
        tok_i, mdl_i, dev_i = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_INST_DIR, DEVICE)
        loaders.append(("FT_INST", tok_i, mdl_i, dev_i, f"mfollowir__inst__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
    if FINETUNED_QUERY_DIR and os.path.isdir(FINETUNED_QUERY_DIR):
        tok_q, mdl_q, dev_q = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_QUERY_DIR, DEVICE)
        loaders.append(("FT_QUERY", tok_q, mdl_q, dev_q, f"mfollowir__qonly__{corpus_signature(doc_ids)}__L{MAX_LEN}"))

    for tag, tok, mdl, dev, sigbase in loaders:
        # plain
        q_plain=[add_prefix(qmap[q],"query") for q in qids]
        stats_plain, per_plain, _ = evaluate_model(f"mFollowIR | {tag} | plain", tok, mdl, dev, doc_ids, doc_txt, qids, q_plain, qrels, sigbase)
        # BANK
        q_bank=[add_prefix((qmap[q]+" "+bank_for_qid(q)).strip(),"query") for q in qids]
        stats_bank, per_bank, _ = evaluate_model(f"mFollowIR | {tag} | q+BANK", tok, mdl, dev, doc_ids, doc_txt, qids, q_bank, qrels, sigbase)

        print(f"\n--- mFollowIR p-MRR (BANK − plain) :: {tag} ---")
        print({f"pMRR@{k}": round(paired_diff_mean(per_plain[k], per_bank[k]),6) for k in K_LIST})

        if DO_PROMPT_ABLATION:
            print(f"\n--- Prompt ablation :: {tag} ---")
            for pname, prompt in PROMPTS_RU.items():
                qv = build_prompted_queries(qids, qmap, prompt)
                evaluate_model(f"mFollowIR | {tag} | prompt:{pname}", tok, mdl, dev, doc_ids, doc_txt, qids, qv, qrels, sigbase)
        if DO_PARAPHRASE_ROBUST:
            print(f"\n--- Paraphrase robustness (5x) :: {tag} ---")
            suite=paraphrase_suite(qids, qmap, PARAPHRASES_RU)
            stats_list=[]
            for key,qv in suite.items():
                s,_,_ = evaluate_model(f"mFollowIR | {tag} | {key}", tok, mdl, dev, doc_ids, doc_txt, qids, qv, qrels, sigbase)
                stats_list.append(s)
            agg=agg_mean_std(stats_list); print("Paraphrase mean/std:", fmt(agg))
        if DO_NOISE_STRESS:
            print(f"\n--- Noise stress :: {tag} ---")
            suite=noise_suite(qids, qmap)
            base_stats=None
            for key,qv in suite.items():
                s,_,_ = evaluate_model(f"mFollowIR | {tag} | noise:{key}", tok, mdl, dev, doc_ids, doc_txt, qids, qv, qrels, sigbase)
                if key=="clean": base_stats=s
                else:
                    drops={f"ΔMRR@{k}": round(s.get(f"MRR@{k}",0)-base_stats.get(f"MRR@{k}",0),6) for k in K_LIST}
                    print(f"  drop vs clean ({key}):", drops)

        del mdl; torch.cuda.empty_cache()

    return {"done": True}

# C) LAReQA XQuAD-R RU↔EN

import requests
def ensure_lang_file(lang: str, out_dir: str) -> str:
    fp=os.path.join(out_dir, f"{lang}.json")
    if os.path.exists(fp): return fp
    url=f"https://raw.githubusercontent.com/google-research-datasets/lareqa/master/xquad-r/{lang}.json"
    print(f"[fetch] {lang}.json …"); r=requests.get(url,timeout=30); r.raise_for_status()
    with open(fp,"wb") as f: f.write(r.content); return fp

def split_sentences(context: str, sent_field):
    if isinstance(sent_field, list) and sent_field and isinstance(sent_field[0], list):
        sents=[]; 
        for s,e in sent_field:
            try:
                t=context[s:e].strip()
                if t: sents.append(t)
            except: pass
        return sents
    if isinstance(sent_field, list) and sent_field and isinstance(sent_field[0], str):
        return [s.strip() for s in sent_field if str(s).strip()]
    return re.split(r'\s*(?<=\.|\?|!)\s+', context.strip())

def load_xquadr(path: str):
    with open(path,"r",encoding="utf-8") as f: data=json.load(f)
    qa_map={}; sents=[]
    for art in data.get("data",[]):
        for par in art.get("paragraphs",[]):
            context=par.get("context","") or ""
            ss=split_sentences(context, par.get("sentences")); base=len(sents); sents.extend(ss)
            for qa in par.get("qas",[]):
                qid=qa.get("id") or qa.get("qid") or ""
                qtext=qa.get("question","").strip()
                answers=[a.get("text","").strip() for a in qa.get("answers",[]) if str(a.get("text","")).strip()]
                qa_map[qid]={"question":qtext,"answers":answers}
    return sents, qa_map

def normalize(s): return re.sub(r"\s+"," ", s.strip().lower())

def build_xling_ir(qas_src, answers_tgt, cand_sents, cand_lang: str, doc_cap: int, fast_q: int):
    doc_ids=[f"{cand_lang}-s{ix}" for ix in range(len(cand_sents))]
    if doc_cap and doc_cap<len(doc_ids):
        rng=random.Random(SEED); idx=list(range(len(doc_ids))); rng.shuffle(idx); keep=sorted(idx[:doc_cap])
        cand_sents=[cand_sents[i] for i in keep]; doc_ids=[doc_ids[i] for i in keep]
    qids_all=sorted(set(qas_src.keys()) & set(answers_tgt.keys()))
    if fast_q and fast_q<len(qids_all):
        rng=random.Random(SEED); rng.shuffle(qids_all); qids_all=sorted(qids_all[:fast_q])
    q_texts={qid: qas_src[qid]["question"] for qid in qids_all}
    cand_norm=[normalize(x) for x in cand_sents]
    qrels=defaultdict(dict)
    for qid in qids_all:
        golds=[normalize(a) for a in answers_tgt[qid].get("answers",[]) if a.strip()]
        if not golds: continue
        for i,s in enumerate(cand_norm):
            if any(g and g in s for g in golds):
                qrels[qid][doc_ids[i]]=1
    qids=[qid for qid in qids_all if qrels.get(qid)]
    qrels={qid:qrels[qid] for qid in qids}
    return doc_ids, cand_sents, qids, q_texts, qrels

def run_lareqa():
    print("\n========== LAReQA XQuAD-R ==========")
    ru_path=ensure_lang_file("ru", LAREQA_DIR); en_path=ensure_lang_file("en", LAREQA_DIR)
    ru_sents, ru_qas = load_xquadr(ru_path); en_sents, en_qas = load_xquadr(en_path)

    def run_dir(tag, qas_src, ans_tgt, cand_sents, cand_lang, prompts, paraphrases):
        doc_ids, docs, qids, qmap, qrels = build_xling_ir(qas_src, ans_tgt, cand_sents, cand_lang, LAREQA_DOC_CAP, LAREQA_FAST_Q)
        print(f"\n--- {tag} --- Docs={len(doc_ids):,} | Queries={len(qids):,} | QrelsQ={len(qrels):,}")
        doc_texts=[add_prefix(t,"passage") for t in docs]
        loaders=[]
        tok_b, mdl_b, dev_b = safe_load_baseline(BASE_MODEL, DEVICE)
        loaders.append(("BASE", tok_b, mdl_b, dev_b, f"lareqa_{tag}__base__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
        if FINETUNED_INST_DIR and os.path.isdir(FINETUNED_INST_DIR):
            tok_i, mdl_i, dev_i = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_INST_DIR, DEVICE)
            loaders.append(("FT_INST", tok_i, mdl_i, dev_i, f"lareqa_{tag}__inst__{corpus_signature(doc_ids)}__L{MAX_LEN}"))
        if FINETUNED_QUERY_DIR and os.path.isdir(FINETUNED_QUERY_DIR):
            tok_q, mdl_q, dev_q = safe_load_peft_for_inference(BASE_MODEL, FINETUNED_QUERY_DIR, DEVICE)
            loaders.append(("FT_QUERY", tok_q, mdl_q, dev_q, f"lareqa_{tag}__qonly__{corpus_signature(doc_ids)}__L{MAX_LEN}"))

        for mtag, tok, mdl, dev, sigbase in loaders:
            # plain
            q_plain=[add_prefix(qmap[q],"query") for q in qids]
            stats_plain, per_plain, _ = evaluate_model(f"LAReQA {tag} | {mtag} | plain", tok, mdl, dev, doc_ids, doc_texts, qids, q_plain, qrels, sigbase)
            # BANK
            q_bank=[add_prefix((qmap[q]+" "+bank_for_qid(q)).strip(),"query") for q in qids]
            stats_bank, per_bank, _ = evaluate_model(f"LAReQA {tag} | {mtag} | q+BANK", tok, mdl, dev, doc_ids, doc_texts, qids, q_bank, qrels, sigbase)
            print(f"\n--- LAReQA p-MRR (BANK − plain) :: {tag} :: {mtag} ---")
            print({f"pMRR@{k}": round(paired_diff_mean(per_plain[k], per_bank[k]),6) for k in K_LIST})

            if DO_PROMPT_ABLATION:
                print(f"\n--- Prompt ablation :: {tag} :: {mtag} ---")
                for pname, prompt in prompts.items():
                    qv = build_prompted_queries(qids, qmap, prompt)
                    evaluate_model(f"LAReQA {tag} | {mtag} | prompt:{pname}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)
            if DO_PARAPHRASE_ROBUST:
                print(f"\n--- Paraphrase robustness (5x) :: {tag} :: {mtag} ---")
                suite=paraphrase_suite(qids, qmap, paraphrases)
                stats_list=[]
                for key,qv in suite.items():
                    s,_,_ = evaluate_model(f"LAReQA {tag} | {mtag} | {key}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)
                    stats_list.append(s)
                agg=agg_mean_std(stats_list); print("Paraphrase mean/std:", fmt(agg))
            if DO_NOISE_STRESS:
                print(f"\n--- Noise stress :: {tag} :: {mtag} ---")
                suite=noise_suite(qids, qmap)
                base_stats=None
                for key,qv in suite.items():
                    s,_,_ = evaluate_model(f"LAReQA {tag} | {mtag} | noise:{key}", tok, mdl, dev, doc_ids, doc_texts, qids, qv, qrels, sigbase)
                    if key=="clean": base_stats=s
                    else:
                        drops={f"ΔMRR@{k}": round(s.get(f"MRR@{k}",0)-base_stats.get(f"MRR@{k}",0),6) for k in K_LIST}
                        print(f"  drop vs clean ({key}):", drops)
            del mdl; torch.cuda.empty_cache()

    # RU→EN: RU queries -> use RU prompts/paraphrases
    run_dir("RU→EN", ru_qas, en_qas, en_sents, "en", PROMPTS_RU, PARAPHRASES_RU)
    # EN→RU: EN queries -> use EN prompts/paraphrases
    run_dir("EN→RU", en_qas, ru_qas, ru_sents, "ru", PROMPTS_EN, PARAPHRASES_EN)
    return {"done": True}


all_results = {}
all_results["MrTyDi-RU"]  = run_mrtydi()
all_results["mFollowIR"]  = run_mfollowir()
all_results["LAReQA"]     = run_lareqa()

print("\n=============== DONE ===============")


Device: cuda | K=[1, 5, 10] | MAX_LEN=256

[MrTyDi] prepared files already present.
Docs=20,000 | Queries=995 | PosDocs=1,100 | NegAdded=18900

=== MrTyDi | BASE | plain ===


                                                              

[cache] docs torch.Size([20000, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mrtydi__base__363db7cc__L256.pt


                                                            

{'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'MAP@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'MAP@5': 0.8446, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'MAP@10': 0.8494, 'nDCG@10': 0.8782}

=== MrTyDi | BASE | q+BANK ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7859, 'Recall@1': 0.7859, 'MRR@1': 0.7859, 'MAP@1': 0.7859, 'nDCG@1': 0.7859, 'Recall@5': 0.9095, 'MRR@5': 0.8357, 'MAP@5': 0.8151, 'nDCG@5': 0.8394, 'Recall@10': 0.9347, 'MRR@10': 0.8391, 'MAP@10': 0.8203, 'nDCG@10': 0.85}

--- MrTyDi p-MRR (BANK − plain) :: BASE ---
{'pMRR@1': -0.022111, 'pMRR@5': -0.026382, 'pMRR@10': -0.026228}

--- Prompt ablation :: BASE ---

=== MrTyDi | BASE | prompt:plain ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'MAP@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'MAP@5': 0.8446, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'MAP@10': 0.8494, 'nDCG@10': 0.8782}

=== MrTyDi | BASE | prompt:minimal_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.8, 'Recall@1': 0.8, 'MRR@1': 0.8, 'MAP@1': 0.8, 'nDCG@1': 0.8, 'Recall@5': 0.9347, 'MRR@5': 0.8548, 'MAP@5': 0.8378, 'nDCG@5': 0.8629, 'Recall@10': 0.9548, 'MRR@10': 0.8575, 'MAP@10': 0.842, 'nDCG@10': 0.8715}

=== MrTyDi | BASE | prompt:qa_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.803, 'Recall@1': 0.803, 'MRR@1': 0.803, 'MAP@1': 0.803, 'nDCG@1': 0.803, 'Recall@5': 0.9256, 'MRR@5': 0.8526, 'MAP@5': 0.8352, 'nDCG@5': 0.8589, 'Recall@10': 0.9437, 'MRR@10': 0.8552, 'MAP@10': 0.8389, 'nDCG@10': 0.8665}

=== MrTyDi | BASE | prompt:search_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.794, 'Recall@1': 0.794, 'MRR@1': 0.794, 'MAP@1': 0.794, 'nDCG@1': 0.794, 'Recall@5': 0.9266, 'MRR@5': 0.8486, 'MAP@5': 0.8313, 'nDCG@5': 0.8558, 'Recall@10': 0.9457, 'MRR@10': 0.8512, 'MAP@10': 0.8355, 'nDCG@10': 0.8641}

=== MrTyDi | BASE | prompt:mfoll_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.796, 'Recall@1': 0.796, 'MRR@1': 0.796, 'MAP@1': 0.796, 'nDCG@1': 0.796, 'Recall@5': 0.9226, 'MRR@5': 0.8483, 'MAP@5': 0.8293, 'nDCG@5': 0.8538, 'Recall@10': 0.9467, 'MRR@10': 0.8516, 'MAP@10': 0.8343, 'nDCG@10': 0.8638}

--- Paraphrase robustness (5x) :: BASE ---

=== MrTyDi | BASE | para1 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.8, 'Recall@1': 0.8, 'MRR@1': 0.8, 'MAP@1': 0.8, 'nDCG@1': 0.8, 'Recall@5': 0.9307, 'MRR@5': 0.8535, 'MAP@5': 0.8361, 'nDCG@5': 0.8608, 'Recall@10': 0.9518, 'MRR@10': 0.8563, 'MAP@10': 0.8401, 'nDCG@10': 0.8693}

=== MrTyDi | BASE | para2 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.8, 'Recall@1': 0.8, 'MRR@1': 0.8, 'MAP@1': 0.8, 'nDCG@1': 0.8, 'Recall@5': 0.9337, 'MRR@5': 0.8548, 'MAP@5': 0.836, 'nDCG@5': 0.8615, 'Recall@10': 0.9487, 'MRR@10': 0.8567, 'MAP@10': 0.8394, 'nDCG@10': 0.8684}

=== MrTyDi | BASE | para3 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.8, 'Recall@1': 0.8, 'MRR@1': 0.8, 'MAP@1': 0.8, 'nDCG@1': 0.8, 'Recall@5': 0.9236, 'MRR@5': 0.8509, 'MAP@5': 0.8326, 'nDCG@5': 0.8565, 'Recall@10': 0.9457, 'MRR@10': 0.854, 'MAP@10': 0.8368, 'nDCG@10': 0.8652}

=== MrTyDi | BASE | para4 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.794, 'Recall@1': 0.794, 'MRR@1': 0.794, 'MAP@1': 0.794, 'nDCG@1': 0.794, 'Recall@5': 0.9236, 'MRR@5': 0.8478, 'MAP@5': 0.8296, 'nDCG@5': 0.8542, 'Recall@10': 0.9427, 'MRR@10': 0.8503, 'MAP@10': 0.8332, 'nDCG@10': 0.8618}

=== MrTyDi | BASE | para5 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7849, 'Recall@1': 0.7849, 'MRR@1': 0.7849, 'MAP@1': 0.7849, 'nDCG@1': 0.7849, 'Recall@5': 0.9206, 'MRR@5': 0.84, 'MAP@5': 0.8201, 'nDCG@5': 0.8462, 'Recall@10': 0.9387, 'MRR@10': 0.8425, 'MAP@10': 0.8241, 'nDCG@10': 0.8541}
Paraphrase mean/std: {'Hit@1_mean': 0.7958, 'Hit@1_std': 0.0059, 'Recall@1_mean': 0.7958, 'Recall@1_std': 0.0059, 'MRR@1_mean': 0.7958, 'MRR@1_std': 0.0059, 'MAP@1_mean': 0.7958, 'MAP@1_std': 0.0059, 'nDCG@1_mean': 0.7958, 'nDCG@1_std': 0.0059, 'Recall@5_mean': 0.9264, 'Recall@5_std': 0.0049, 'MRR@5_mean': 0.8494, 'MRR@5_std': 0.0052, 'MAP@5_mean': 0.8309, 'MAP@5_std': 0.0059, 'nDCG@5_mean': 0.8559, 'nDCG@5_std': 0.0055, 'Recall@10_mean': 0.9455, 'Recall@10_std': 0.0046, 'MRR@10_mean': 0.852, 'MRR@10_std': 0.0052, 'MAP@10_mean': 0.8347, 'MAP@10_std': 0.0058, 'nDCG@10_mean': 0.8638, 'nDCG@10_std': 0.0055}

--- Noise stress :: BASE ---

=== MrTyDi | BASE | noise:clean ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'MAP@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'MAP@5': 0.8446, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'MAP@10': 0.8494, 'nDCG@10': 0.8782}

=== MrTyDi | BASE | noise:nopunct ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7799, 'Recall@1': 0.7799, 'MRR@1': 0.7799, 'MAP@1': 0.7799, 'nDCG@1': 0.7799, 'Recall@5': 0.9126, 'MRR@5': 0.8343, 'MAP@5': 0.8167, 'nDCG@5': 0.8414, 'Recall@10': 0.9337, 'MRR@10': 0.8372, 'MAP@10': 0.8209, 'nDCG@10': 0.8502}
  drop vs clean (nopunct): {'ΔMRR@1': -0.028141, 'ΔMRR@5': -0.027772, 'ΔMRR@10': -0.028132}

=== MrTyDi | BASE | noise:swap ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7678, 'Recall@1': 0.7678, 'MRR@1': 0.7678, 'MAP@1': 0.7678, 'nDCG@1': 0.7678, 'Recall@5': 0.8985, 'MRR@5': 0.8225, 'MAP@5': 0.8027, 'nDCG@5': 0.8278, 'Recall@10': 0.9226, 'MRR@10': 0.8258, 'MAP@10': 0.8078, 'nDCG@10': 0.8381}
  drop vs clean (swap): {'ΔMRR@1': -0.040201, 'ΔMRR@5': -0.039497, 'ΔMRR@10': -0.039501}

=== MrTyDi | BASE | noise:lower ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.792, 'Recall@1': 0.792, 'MRR@1': 0.792, 'MAP@1': 0.792, 'nDCG@1': 0.792, 'Recall@5': 0.9166, 'MRR@5': 0.8454, 'MAP@5': 0.8266, 'nDCG@5': 0.8505, 'Recall@10': 0.9407, 'MRR@10': 0.8487, 'MAP@10': 0.8317, 'nDCG@10': 0.8609}
  drop vs clean (lower): {'ΔMRR@1': -0.01608, 'ΔMRR@5': -0.016667, 'ΔMRR@10': -0.016637}

=== MrTyDi | FT_INST | plain ===


                                                              

[cache] docs torch.Size([20000, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mrtydi__inst__363db7cc__L256.pt


                                                            

{'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'MAP@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8975, 'MRR@5': 0.803, 'MAP@5': 0.7828, 'nDCG@5': 0.8124, 'Recall@10': 0.9276, 'MRR@10': 0.8071, 'MAP@10': 0.7877, 'nDCG@10': 0.8233}

=== MrTyDi | FT_INST | q+BANK ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7417, 'Recall@1': 0.7417, 'MRR@1': 0.7417, 'MAP@1': 0.7417, 'nDCG@1': 0.7417, 'Recall@5': 0.8884, 'MRR@5': 0.8044, 'MAP@5': 0.7846, 'nDCG@5': 0.8112, 'Recall@10': 0.9236, 'MRR@10': 0.8091, 'MAP@10': 0.7907, 'nDCG@10': 0.8246}

--- MrTyDi p-MRR (BANK − plain) :: FT_INST ---
{'pMRR@1': 0.00603, 'pMRR@5': 0.001323, 'pMRR@10': 0.002024}

--- Prompt ablation :: FT_INST ---

=== MrTyDi | FT_INST | prompt:plain ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'MAP@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8975, 'MRR@5': 0.803, 'MAP@5': 0.7828, 'nDCG@5': 0.8124, 'Recall@10': 0.9276, 'MRR@10': 0.8071, 'MAP@10': 0.7877, 'nDCG@10': 0.8233}

=== MrTyDi | FT_INST | prompt:minimal_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7407, 'Recall@1': 0.7407, 'MRR@1': 0.7407, 'MAP@1': 0.7407, 'nDCG@1': 0.7407, 'Recall@5': 0.8945, 'MRR@5': 0.8073, 'MAP@5': 0.7865, 'nDCG@5': 0.8146, 'Recall@10': 0.9256, 'MRR@10': 0.8116, 'MAP@10': 0.7921, 'nDCG@10': 0.8266}

=== MrTyDi | FT_INST | prompt:qa_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7397, 'Recall@1': 0.7397, 'MRR@1': 0.7397, 'MAP@1': 0.7397, 'nDCG@1': 0.7397, 'Recall@5': 0.8955, 'MRR@5': 0.8068, 'MAP@5': 0.7864, 'nDCG@5': 0.8145, 'Recall@10': 0.9246, 'MRR@10': 0.8108, 'MAP@10': 0.7917, 'nDCG@10': 0.826}

=== MrTyDi | FT_INST | prompt:search_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7427, 'Recall@1': 0.7427, 'MRR@1': 0.7427, 'MAP@1': 0.7427, 'nDCG@1': 0.7427, 'Recall@5': 0.8955, 'MRR@5': 0.8086, 'MAP@5': 0.7885, 'nDCG@5': 0.8163, 'Recall@10': 0.9246, 'MRR@10': 0.8124, 'MAP@10': 0.7935, 'nDCG@10': 0.8274}

=== MrTyDi | FT_INST | prompt:mfoll_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7497, 'Recall@1': 0.7497, 'MRR@1': 0.7497, 'MAP@1': 0.7497, 'nDCG@1': 0.7497, 'Recall@5': 0.8945, 'MRR@5': 0.8129, 'MAP@5': 0.7921, 'nDCG@5': 0.8188, 'Recall@10': 0.9256, 'MRR@10': 0.8168, 'MAP@10': 0.7977, 'nDCG@10': 0.8308}

--- Paraphrase robustness (5x) :: FT_INST ---

=== MrTyDi | FT_INST | para1 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7347, 'Recall@1': 0.7347, 'MRR@1': 0.7347, 'MAP@1': 0.7347, 'nDCG@1': 0.7347, 'Recall@5': 0.8894, 'MRR@5': 0.802, 'MAP@5': 0.7818, 'nDCG@5': 0.81, 'Recall@10': 0.9236, 'MRR@10': 0.8065, 'MAP@10': 0.7871, 'nDCG@10': 0.8221}

=== MrTyDi | FT_INST | para2 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7377, 'Recall@1': 0.7377, 'MRR@1': 0.7377, 'MAP@1': 0.7377, 'nDCG@1': 0.7377, 'Recall@5': 0.8985, 'MRR@5': 0.8059, 'MAP@5': 0.7864, 'nDCG@5': 0.8152, 'Recall@10': 0.9246, 'MRR@10': 0.8095, 'MAP@10': 0.7911, 'nDCG@10': 0.8253}

=== MrTyDi | FT_INST | para3 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'MAP@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8935, 'MRR@5': 0.8033, 'MAP@5': 0.7827, 'nDCG@5': 0.8114, 'Recall@10': 0.9256, 'MRR@10': 0.8077, 'MAP@10': 0.7882, 'nDCG@10': 0.8235}

=== MrTyDi | FT_INST | para4 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7417, 'Recall@1': 0.7417, 'MRR@1': 0.7417, 'MAP@1': 0.7417, 'nDCG@1': 0.7417, 'Recall@5': 0.8955, 'MRR@5': 0.8083, 'MAP@5': 0.7886, 'nDCG@5': 0.8163, 'Recall@10': 0.9296, 'MRR@10': 0.8129, 'MAP@10': 0.7944, 'nDCG@10': 0.8291}

=== MrTyDi | FT_INST | para5 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7447, 'Recall@1': 0.7447, 'MRR@1': 0.7447, 'MAP@1': 0.7447, 'nDCG@1': 0.7447, 'Recall@5': 0.8985, 'MRR@5': 0.8099, 'MAP@5': 0.79, 'nDCG@5': 0.8181, 'Recall@10': 0.9226, 'MRR@10': 0.8132, 'MAP@10': 0.7945, 'nDCG@10': 0.8276}
Paraphrase mean/std: {'Hit@1_mean': 0.7389, 'Hit@1_std': 0.0038, 'Recall@1_mean': 0.7389, 'Recall@1_std': 0.0038, 'MRR@1_mean': 0.7389, 'MRR@1_std': 0.0038, 'MAP@1_mean': 0.7389, 'MAP@1_std': 0.0038, 'nDCG@1_mean': 0.7389, 'nDCG@1_std': 0.0038, 'Recall@5_mean': 0.8951, 'Recall@5_std': 0.0034, 'MRR@5_mean': 0.8059, 'MRR@5_std': 0.003, 'MAP@5_mean': 0.7859, 'MAP@5_std': 0.0032, 'nDCG@5_mean': 0.8142, 'nDCG@5_std': 0.003, 'Recall@10_mean': 0.9252, 'Recall@10_std': 0.0024, 'MRR@10_mean': 0.8099, 'MRR@10_std': 0.0027, 'MAP@10_mean': 0.7911, 'MAP@10_std': 0.0031, 'nDCG@10_mean': 0.8255, 'nDCG@10_std': 0.0025}

--- Noise stress :: FT_INST ---

=== MrTyDi | FT_INST | noise:clean ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'MAP@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8975, 'MRR@5': 0.803, 'MAP@5': 0.7828, 'nDCG@5': 0.8124, 'Recall@10': 0.9276, 'MRR@10': 0.8071, 'MAP@10': 0.7877, 'nDCG@10': 0.8233}

=== MrTyDi | FT_INST | noise:nopunct ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7106, 'Recall@1': 0.7106, 'MRR@1': 0.7106, 'MAP@1': 0.7106, 'nDCG@1': 0.7106, 'Recall@5': 0.8663, 'MRR@5': 0.7727, 'MAP@5': 0.7521, 'nDCG@5': 0.782, 'Recall@10': 0.9015, 'MRR@10': 0.7775, 'MAP@10': 0.7579, 'nDCG@10': 0.7947}
  drop vs clean (nopunct): {'ΔMRR@1': -0.025126, 'ΔMRR@5': -0.030369, 'ΔMRR@10': -0.02958}

=== MrTyDi | FT_INST | noise:swap ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.6915, 'Recall@1': 0.6915, 'MRR@1': 0.6915, 'MAP@1': 0.6915, 'nDCG@1': 0.6915, 'Recall@5': 0.8613, 'MRR@5': 0.7583, 'MAP@5': 0.7382, 'nDCG@5': 0.77, 'Recall@10': 0.8915, 'MRR@10': 0.7625, 'MAP@10': 0.7437, 'nDCG@10': 0.7818}
  drop vs clean (swap): {'ΔMRR@1': -0.044221, 'ΔMRR@5': -0.044791, 'ΔMRR@10': -0.044536}

=== MrTyDi | FT_INST | noise:lower ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.6935, 'Recall@1': 0.6935, 'MRR@1': 0.6935, 'MAP@1': 0.6935, 'nDCG@1': 0.6935, 'Recall@5': 0.8603, 'MRR@5': 0.7622, 'MAP@5': 0.7426, 'nDCG@5': 0.7732, 'Recall@10': 0.9015, 'MRR@10': 0.7678, 'MAP@10': 0.7491, 'nDCG@10': 0.7877}
  drop vs clean (lower): {'ΔMRR@1': -0.042211, 'ΔMRR@5': -0.040888, 'ΔMRR@10': -0.039269}

=== MrTyDi | FT_QUERY | plain ===


                                                              

[cache] docs torch.Size([20000, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mrtydi__qonly__363db7cc__L256.pt


                                                            

{'N': 995, 'Hit@1': 0.7869, 'Recall@1': 0.7869, 'MRR@1': 0.7869, 'MAP@1': 0.7869, 'nDCG@1': 0.7869, 'Recall@5': 0.9075, 'MRR@5': 0.8376, 'MAP@5': 0.82, 'nDCG@5': 0.8429, 'Recall@10': 0.9367, 'MRR@10': 0.8417, 'MAP@10': 0.8245, 'nDCG@10': 0.853}

=== MrTyDi | FT_QUERY | q+BANK ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7789, 'Recall@1': 0.7789, 'MRR@1': 0.7789, 'MAP@1': 0.7789, 'nDCG@1': 0.7789, 'Recall@5': 0.9055, 'MRR@5': 0.8318, 'MAP@5': 0.8129, 'nDCG@5': 0.8374, 'Recall@10': 0.9317, 'MRR@10': 0.8356, 'MAP@10': 0.8173, 'nDCG@10': 0.847}

--- MrTyDi p-MRR (BANK − plain) :: FT_QUERY ---
{'pMRR@1': -0.00804, 'pMRR@5': -0.005829, 'pMRR@10': -0.006134}

--- Prompt ablation :: FT_QUERY ---

=== MrTyDi | FT_QUERY | prompt:plain ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7869, 'Recall@1': 0.7869, 'MRR@1': 0.7869, 'MAP@1': 0.7869, 'nDCG@1': 0.7869, 'Recall@5': 0.9075, 'MRR@5': 0.8376, 'MAP@5': 0.82, 'nDCG@5': 0.8429, 'Recall@10': 0.9367, 'MRR@10': 0.8417, 'MAP@10': 0.8245, 'nDCG@10': 0.853}

=== MrTyDi | FT_QUERY | prompt:minimal_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7779, 'Recall@1': 0.7779, 'MRR@1': 0.7779, 'MAP@1': 0.7779, 'nDCG@1': 0.7779, 'Recall@5': 0.9085, 'MRR@5': 0.8329, 'MAP@5': 0.8158, 'nDCG@5': 0.8402, 'Recall@10': 0.9347, 'MRR@10': 0.8366, 'MAP@10': 0.82, 'nDCG@10': 0.8493}

=== MrTyDi | FT_QUERY | prompt:qa_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7799, 'Recall@1': 0.7799, 'MRR@1': 0.7799, 'MAP@1': 0.7799, 'nDCG@1': 0.7799, 'Recall@5': 0.9106, 'MRR@5': 0.8345, 'MAP@5': 0.8157, 'nDCG@5': 0.8407, 'Recall@10': 0.9337, 'MRR@10': 0.8377, 'MAP@10': 0.8195, 'nDCG@10': 0.849}

=== MrTyDi | FT_QUERY | prompt:search_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7809, 'Recall@1': 0.7809, 'MRR@1': 0.7809, 'MAP@1': 0.7809, 'nDCG@1': 0.7809, 'Recall@5': 0.9116, 'MRR@5': 0.8348, 'MAP@5': 0.8168, 'nDCG@5': 0.8418, 'Recall@10': 0.9337, 'MRR@10': 0.838, 'MAP@10': 0.8207, 'nDCG@10': 0.8501}

=== MrTyDi | FT_QUERY | prompt:mfoll_ru ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7819, 'Recall@1': 0.7819, 'MRR@1': 0.7819, 'MAP@1': 0.7819, 'nDCG@1': 0.7819, 'Recall@5': 0.9075, 'MRR@5': 0.8352, 'MAP@5': 0.8169, 'nDCG@5': 0.8409, 'Recall@10': 0.9327, 'MRR@10': 0.8387, 'MAP@10': 0.8211, 'nDCG@10': 0.85}

--- Paraphrase robustness (5x) :: FT_QUERY ---

=== MrTyDi | FT_QUERY | para1 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7779, 'Recall@1': 0.7779, 'MRR@1': 0.7779, 'MAP@1': 0.7779, 'nDCG@1': 0.7779, 'Recall@5': 0.9085, 'MRR@5': 0.8328, 'MAP@5': 0.8157, 'nDCG@5': 0.8399, 'Recall@10': 0.9367, 'MRR@10': 0.8367, 'MAP@10': 0.8203, 'nDCG@10': 0.8501}

=== MrTyDi | FT_QUERY | para2 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7849, 'Recall@1': 0.7849, 'MRR@1': 0.7849, 'MAP@1': 0.7849, 'nDCG@1': 0.7849, 'Recall@5': 0.9126, 'MRR@5': 0.8381, 'MAP@5': 0.8201, 'nDCG@5': 0.8443, 'Recall@10': 0.9357, 'MRR@10': 0.8412, 'MAP@10': 0.824, 'nDCG@10': 0.8529}

=== MrTyDi | FT_QUERY | para3 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7799, 'Recall@1': 0.7799, 'MRR@1': 0.7799, 'MAP@1': 0.7799, 'nDCG@1': 0.7799, 'Recall@5': 0.9126, 'MRR@5': 0.835, 'MAP@5': 0.8172, 'nDCG@5': 0.8423, 'Recall@10': 0.9367, 'MRR@10': 0.8384, 'MAP@10': 0.8211, 'nDCG@10': 0.8508}

=== MrTyDi | FT_QUERY | para4 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7819, 'Recall@1': 0.7819, 'MRR@1': 0.7819, 'MAP@1': 0.7819, 'nDCG@1': 0.7819, 'Recall@5': 0.9085, 'MRR@5': 0.8347, 'MAP@5': 0.817, 'nDCG@5': 0.8413, 'Recall@10': 0.9357, 'MRR@10': 0.8386, 'MAP@10': 0.8211, 'nDCG@10': 0.8505}

=== MrTyDi | FT_QUERY | para5 ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7749, 'Recall@1': 0.7749, 'MRR@1': 0.7749, 'MAP@1': 0.7749, 'nDCG@1': 0.7749, 'Recall@5': 0.9106, 'MRR@5': 0.8309, 'MAP@5': 0.813, 'nDCG@5': 0.8385, 'Recall@10': 0.9337, 'MRR@10': 0.8343, 'MAP@10': 0.817, 'nDCG@10': 0.8471}
Paraphrase mean/std: {'Hit@1_mean': 0.7799, 'Hit@1_std': 0.0034, 'Recall@1_mean': 0.7799, 'Recall@1_std': 0.0034, 'MRR@1_mean': 0.7799, 'MRR@1_std': 0.0034, 'MAP@1_mean': 0.7799, 'MAP@1_std': 0.0034, 'nDCG@1_mean': 0.7799, 'nDCG@1_std': 0.0034, 'Recall@5_mean': 0.9106, 'Recall@5_std': 0.0018, 'MRR@5_mean': 0.8343, 'MRR@5_std': 0.0024, 'MAP@5_mean': 0.8166, 'MAP@5_std': 0.0023, 'nDCG@5_mean': 0.8413, 'nDCG@5_std': 0.002, 'Recall@10_mean': 0.9357, 'Recall@10_std': 0.0011, 'MRR@10_mean': 0.8378, 'MRR@10_std': 0.0023, 'MAP@10_mean': 0.8207, 'MAP@10_std': 0.0022, 'nDCG@10_mean': 0.8503, 'nDCG@10_std': 0.0019}

--- Noise stress :: FT_QUERY ---

=== MrTyDi | FT_QUERY | noise:clean ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7869, 'Recall@1': 0.7869, 'MRR@1': 0.7869, 'MAP@1': 0.7869, 'nDCG@1': 0.7869, 'Recall@5': 0.9075, 'MRR@5': 0.8376, 'MAP@5': 0.82, 'nDCG@5': 0.8429, 'Recall@10': 0.9367, 'MRR@10': 0.8417, 'MAP@10': 0.8245, 'nDCG@10': 0.853}

=== MrTyDi | FT_QUERY | noise:nopunct ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7588, 'Recall@1': 0.7588, 'MRR@1': 0.7588, 'MAP@1': 0.7588, 'nDCG@1': 0.7588, 'Recall@5': 0.8894, 'MRR@5': 0.8137, 'MAP@5': 0.7957, 'nDCG@5': 0.8202, 'Recall@10': 0.9186, 'MRR@10': 0.8177, 'MAP@10': 0.8006, 'nDCG@10': 0.8308}
  drop vs clean (nopunct): {'ΔMRR@1': -0.028141, 'ΔMRR@5': -0.02397, 'ΔMRR@10': -0.024057}

=== MrTyDi | FT_QUERY | noise:swap ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7538, 'Recall@1': 0.7538, 'MRR@1': 0.7538, 'MAP@1': 0.7538, 'nDCG@1': 0.7538, 'Recall@5': 0.8864, 'MRR@5': 0.8098, 'MAP@5': 0.7929, 'nDCG@5': 0.8175, 'Recall@10': 0.9146, 'MRR@10': 0.8136, 'MAP@10': 0.7973, 'nDCG@10': 0.8276}
  drop vs clean (swap): {'ΔMRR@1': -0.033166, 'ΔMRR@5': -0.027772, 'ΔMRR@10': -0.028117}

=== MrTyDi | FT_QUERY | noise:lower ===
[cache] docs torch.Size([20000, 768]) loaded


                                                            

{'N': 995, 'Hit@1': 0.7648, 'Recall@1': 0.7648, 'MRR@1': 0.7648, 'MAP@1': 0.7648, 'nDCG@1': 0.7648, 'Recall@5': 0.8935, 'MRR@5': 0.8178, 'MAP@5': 0.7986, 'nDCG@5': 0.8233, 'Recall@10': 0.9146, 'MRR@10': 0.8206, 'MAP@10': 0.8023, 'nDCG@10': 0.8315}
  drop vs clean (lower): {'ΔMRR@1': -0.022111, 'ΔMRR@5': -0.019816, 'ΔMRR@10': -0.021089}

=== Mr.TyDi SUMMARY (plain only) ===
BASE -> {'N': 995, 'Hit@1': 0.808, 'Recall@1': 0.808, 'MRR@1': 0.808, 'MAP@1': 0.808, 'nDCG@1': 0.808, 'Recall@5': 0.9357, 'MRR@5': 0.862, 'MAP@5': 0.8446, 'nDCG@5': 0.8686, 'Recall@10': 0.9588, 'MRR@10': 0.8653, 'MAP@10': 0.8494, 'nDCG@10': 0.8782}
FT_INST -> {'N': 995, 'Hit@1': 0.7357, 'Recall@1': 0.7357, 'MRR@1': 0.7357, 'MAP@1': 0.7357, 'nDCG@1': 0.7357, 'Recall@5': 0.8975, 'MRR@5': 0.803, 'MAP@5': 0.7828, 'nDCG@5': 0.8124, 'Recall@10': 0.9276, 'MRR@10': 0.8071, 'MAP@10': 0.7877, 'nDCG@10': 0.8233}
FT_QUERY -> {'N': 995, 'Hit@1': 0.7869, 'Recall@1': 0.7869, 'MRR@1': 0.7869, 'MAP@1': 0.7869, 'nDCG@1': 0.7869, 'Rec

                                                              

[cache] docs torch.Size([39326, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mfollowir__base__d3fc34cd__L256.pt


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7692, 'MRR@5': 0.6034, 'MAP@5': 0.3404, 'nDCG@5': 0.4301, 'Recall@10': 0.8462, 'MRR@10': 0.6123, 'MAP@10': 0.2679, 'nDCG@10': 0.3859}

=== mFollowIR | BASE | q+BANK ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4359, 'Recall@1': 0.4359, 'MRR@1': 0.4359, 'MAP@1': 0.4359, 'nDCG@1': 0.4359, 'Recall@5': 0.6667, 'MRR@5': 0.5286, 'MAP@5': 0.3332, 'nDCG@5': 0.4091, 'Recall@10': 0.7949, 'MRR@10': 0.5458, 'MAP@10': 0.2616, 'nDCG@10': 0.368}

--- mFollowIR p-MRR (BANK − plain) :: BASE ---
{'pMRR@1': -0.051282, 'pMRR@5': -0.074786, 'pMRR@10': -0.066494}

--- Prompt ablation :: BASE ---

=== mFollowIR | BASE | prompt:plain ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7692, 'MRR@5': 0.6034, 'MAP@5': 0.3404, 'nDCG@5': 0.4301, 'Recall@10': 0.8462, 'MRR@10': 0.6123, 'MAP@10': 0.2679, 'nDCG@10': 0.3859}

=== mFollowIR | BASE | prompt:minimal_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7179, 'MRR@5': 0.5726, 'MAP@5': 0.3323, 'nDCG@5': 0.4189, 'Recall@10': 0.7692, 'MRR@10': 0.5784, 'MAP@10': 0.2645, 'nDCG@10': 0.373}

=== mFollowIR | BASE | prompt:qa_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7436, 'MRR@5': 0.6034, 'MAP@5': 0.3258, 'nDCG@5': 0.4148, 'Recall@10': 0.7949, 'MRR@10': 0.6105, 'MAP@10': 0.2552, 'nDCG@10': 0.3701}

=== mFollowIR | BASE | prompt:search_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4103, 'Recall@1': 0.4103, 'MRR@1': 0.4103, 'MAP@1': 0.4103, 'nDCG@1': 0.4103, 'Recall@5': 0.6923, 'MRR@5': 0.5081, 'MAP@5': 0.2916, 'nDCG@5': 0.3739, 'Recall@10': 0.7949, 'MRR@10': 0.521, 'MAP@10': 0.2265, 'nDCG@10': 0.3377}

=== mFollowIR | BASE | prompt:mfoll_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7436, 'MRR@5': 0.5962, 'MAP@5': 0.3502, 'nDCG@5': 0.435, 'Recall@10': 0.7692, 'MRR@10': 0.6004, 'MAP@10': 0.2745, 'nDCG@10': 0.3854}

--- Paraphrase robustness (5x) :: BASE ---

=== mFollowIR | BASE | para1 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7436, 'MRR@5': 0.5744, 'MAP@5': 0.3324, 'nDCG@5': 0.4227, 'Recall@10': 0.7692, 'MRR@10': 0.5769, 'MAP@10': 0.2622, 'nDCG@10': 0.3731}

=== mFollowIR | BASE | para2 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4359, 'Recall@1': 0.4359, 'MRR@1': 0.4359, 'MAP@1': 0.4359, 'nDCG@1': 0.4359, 'Recall@5': 0.7179, 'MRR@5': 0.5436, 'MAP@5': 0.3116, 'nDCG@5': 0.3976, 'Recall@10': 0.7949, 'MRR@10': 0.5541, 'MAP@10': 0.2509, 'nDCG@10': 0.3649}

=== mFollowIR | BASE | para3 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4359, 'Recall@1': 0.4359, 'MRR@1': 0.4359, 'MAP@1': 0.4359, 'nDCG@1': 0.4359, 'Recall@5': 0.6923, 'MRR@5': 0.5329, 'MAP@5': 0.2988, 'nDCG@5': 0.3825, 'Recall@10': 0.7436, 'MRR@10': 0.5408, 'MAP@10': 0.2375, 'nDCG@10': 0.3517}

=== mFollowIR | BASE | para4 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4359, 'Recall@1': 0.4359, 'MRR@1': 0.4359, 'MAP@1': 0.4359, 'nDCG@1': 0.4359, 'Recall@5': 0.6667, 'MRR@5': 0.5188, 'MAP@5': 0.2813, 'nDCG@5': 0.3638, 'Recall@10': 0.7436, 'MRR@10': 0.5285, 'MAP@10': 0.2274, 'nDCG@10': 0.3347}

=== mFollowIR | BASE | para5 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.6667, 'MRR@5': 0.5513, 'MAP@5': 0.3064, 'nDCG@5': 0.3895, 'Recall@10': 0.7949, 'MRR@10': 0.5694, 'MAP@10': 0.2459, 'nDCG@10': 0.3587}
Paraphrase mean/std: {'Hit@1_mean': 0.4462, 'Hit@1_std': 0.0126, 'Recall@1_mean': 0.4462, 'Recall@1_std': 0.0126, 'MRR@1_mean': 0.4462, 'MRR@1_std': 0.0126, 'MAP@1_mean': 0.4462, 'MAP@1_std': 0.0126, 'nDCG@1_mean': 0.4462, 'nDCG@1_std': 0.0126, 'Recall@5_mean': 0.6974, 'Recall@5_std': 0.0299, 'MRR@5_mean': 0.5442, 'MRR@5_std': 0.0186, 'MAP@5_mean': 0.3061, 'MAP@5_std': 0.0167, 'nDCG@5_mean': 0.3912, 'nDCG@5_std': 0.0193, 'Recall@10_mean': 0.7692, 'Recall@10_std': 0.0229, 'MRR@10_mean': 0.554, 'MRR@10_std': 0.0178, 'MAP@10_mean': 0.2448, 'MAP@10_std': 0.0118, 'nDCG@10_mean': 0.3566, 'nDCG@10_std': 0.013}

--- Noise stress :: BASE ---

=== mFollowIR | BASE | noise:clean ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7692, 'MRR@5': 0.6034, 'MAP@5': 0.3404, 'nDCG@5': 0.4301, 'Recall@10': 0.8462, 'MRR@10': 0.6123, 'MAP@10': 0.2679, 'nDCG@10': 0.3859}

=== mFollowIR | BASE | noise:nopunct ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7692, 'MRR@5': 0.6303, 'MAP@5': 0.3649, 'nDCG@5': 0.453, 'Recall@10': 0.8462, 'MRR@10': 0.6403, 'MAP@10': 0.2892, 'nDCG@10': 0.412}
  drop vs clean (nopunct): {'ΔMRR@1': 0.051282, 'ΔMRR@5': 0.026923, 'ΔMRR@10': 0.027991}

=== mFollowIR | BASE | noise:swap ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7692, 'MRR@5': 0.6197, 'MAP@5': 0.3567, 'nDCG@5': 0.446, 'Recall@10': 0.8462, 'MRR@10': 0.6303, 'MAP@10': 0.2895, 'nDCG@10': 0.4104}
  drop vs clean (swap): {'ΔMRR@1': 0.025641, 'ΔMRR@5': 0.016239, 'ΔMRR@10': 0.01802}

=== mFollowIR | BASE | noise:lower ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4359, 'Recall@1': 0.4359, 'MRR@1': 0.4359, 'MAP@1': 0.4359, 'nDCG@1': 0.4359, 'Recall@5': 0.6923, 'MRR@5': 0.5329, 'MAP@5': 0.2993, 'nDCG@5': 0.3803, 'Recall@10': 0.7692, 'MRR@10': 0.5451, 'MAP@10': 0.2433, 'nDCG@10': 0.3514}
  drop vs clean (lower): {'ΔMRR@1': -0.051282, 'ΔMRR@5': -0.070513, 'ΔMRR@10': -0.067206}

=== mFollowIR | FT_INST | plain ===


                                                              

[cache] docs torch.Size([39326, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mfollowir__inst__d3fc34cd__L256.pt


                                                  

{'N': 39, 'Hit@1': 0.5641, 'Recall@1': 0.5641, 'MRR@1': 0.5641, 'MAP@1': 0.5641, 'nDCG@1': 0.5641, 'Recall@5': 0.7436, 'MRR@5': 0.6214, 'MAP@5': 0.3814, 'nDCG@5': 0.4574, 'Recall@10': 0.8205, 'MRR@10': 0.6322, 'MAP@10': 0.3048, 'nDCG@10': 0.416}

=== mFollowIR | FT_INST | q+BANK ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7179, 'MRR@5': 0.5885, 'MAP@5': 0.3631, 'nDCG@5': 0.4381, 'Recall@10': 0.7436, 'MRR@10': 0.5927, 'MAP@10': 0.2949, 'nDCG@10': 0.3997}

--- mFollowIR p-MRR (BANK − plain) :: FT_INST ---
{'pMRR@1': -0.051282, 'pMRR@5': -0.032906, 'pMRR@10': -0.039418}

--- Prompt ablation :: FT_INST ---

=== mFollowIR | FT_INST | prompt:plain ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5641, 'Recall@1': 0.5641, 'MRR@1': 0.5641, 'MAP@1': 0.5641, 'nDCG@1': 0.5641, 'Recall@5': 0.7436, 'MRR@5': 0.6214, 'MAP@5': 0.3814, 'nDCG@5': 0.4574, 'Recall@10': 0.8205, 'MRR@10': 0.6322, 'MAP@10': 0.3048, 'nDCG@10': 0.416}

=== mFollowIR | FT_INST | prompt:minimal_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7692, 'MRR@5': 0.6179, 'MAP@5': 0.3643, 'nDCG@5': 0.4461, 'Recall@10': 0.8205, 'MRR@10': 0.6242, 'MAP@10': 0.3044, 'nDCG@10': 0.4164}

=== mFollowIR | FT_INST | prompt:qa_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7436, 'MRR@5': 0.6077, 'MAP@5': 0.3564, 'nDCG@5': 0.4373, 'Recall@10': 0.7949, 'MRR@10': 0.6142, 'MAP@10': 0.3005, 'nDCG@10': 0.4116}

=== mFollowIR | FT_INST | prompt:search_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7692, 'MRR@5': 0.5979, 'MAP@5': 0.3678, 'nDCG@5': 0.4459, 'Recall@10': 0.8205, 'MRR@10': 0.6039, 'MAP@10': 0.2974, 'nDCG@10': 0.4073}

=== mFollowIR | FT_INST | prompt:mfoll_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7692, 'MRR@5': 0.6107, 'MAP@5': 0.3766, 'nDCG@5': 0.4574, 'Recall@10': 0.7949, 'MRR@10': 0.6132, 'MAP@10': 0.3053, 'nDCG@10': 0.4159}

--- Paraphrase robustness (5x) :: FT_INST ---

=== mFollowIR | FT_INST | para1 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7436, 'MRR@5': 0.6128, 'MAP@5': 0.3694, 'nDCG@5': 0.4482, 'Recall@10': 0.8205, 'MRR@10': 0.6225, 'MAP@10': 0.3025, 'nDCG@10': 0.4129}

=== mFollowIR | FT_INST | para2 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7436, 'MRR@5': 0.6141, 'MAP@5': 0.3629, 'nDCG@5': 0.4427, 'Recall@10': 0.7949, 'MRR@10': 0.6202, 'MAP@10': 0.3038, 'nDCG@10': 0.416}

=== mFollowIR | FT_INST | para3 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7436, 'MRR@5': 0.5829, 'MAP@5': 0.3402, 'nDCG@5': 0.4205, 'Recall@10': 0.8205, 'MRR@10': 0.594, 'MAP@10': 0.2837, 'nDCG@10': 0.3943}

=== mFollowIR | FT_INST | para4 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7436, 'MRR@5': 0.6, 'MAP@5': 0.3632, 'nDCG@5': 0.4428, 'Recall@10': 0.7949, 'MRR@10': 0.6061, 'MAP@10': 0.3016, 'nDCG@10': 0.4097}

=== mFollowIR | FT_INST | para5 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7436, 'MRR@5': 0.6132, 'MAP@5': 0.3695, 'nDCG@5': 0.4493, 'Recall@10': 0.7949, 'MRR@10': 0.6198, 'MAP@10': 0.3031, 'nDCG@10': 0.4138}
Paraphrase mean/std: {'Hit@1_mean': 0.5231, 'Hit@1_std': 0.0205, 'Recall@1_mean': 0.5231, 'Recall@1_std': 0.0205, 'MRR@1_mean': 0.5231, 'MRR@1_std': 0.0205, 'MAP@1_mean': 0.5231, 'MAP@1_std': 0.0205, 'nDCG@1_mean': 0.5231, 'nDCG@1_std': 0.0205, 'Recall@5_mean': 0.7436, 'Recall@5_std': 0.0, 'MRR@5_mean': 0.6046, 'MRR@5_std': 0.012, 'MAP@5_mean': 0.3611, 'MAP@5_std': 0.0108, 'nDCG@5_mean': 0.4407, 'nDCG@5_std': 0.0105, 'Recall@10_mean': 0.8051, 'Recall@10_std': 0.0126, 'MRR@10_mean': 0.6125, 'MRR@10_std': 0.0109, 'MAP@10_mean': 0.2989, 'MAP@10_std': 0.0077, 'nDCG@10_mean': 0.4093, 'nDCG@10_std': 0.0078}

--- Noise stress :: FT_INST ---

=== mFollowIR | FT_INST | noise:clean ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5641, 'Recall@1': 0.5641, 'MRR@1': 0.5641, 'MAP@1': 0.5641, 'nDCG@1': 0.5641, 'Recall@5': 0.7436, 'MRR@5': 0.6214, 'MAP@5': 0.3814, 'nDCG@5': 0.4574, 'Recall@10': 0.8205, 'MRR@10': 0.6322, 'MAP@10': 0.3048, 'nDCG@10': 0.416}

=== mFollowIR | FT_INST | noise:nopunct ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7436, 'MRR@5': 0.6171, 'MAP@5': 0.3759, 'nDCG@5': 0.4528, 'Recall@10': 0.7949, 'MRR@10': 0.6236, 'MAP@10': 0.3044, 'nDCG@10': 0.4147}
  drop vs clean (nopunct): {'ΔMRR@1': -0.025641, 'ΔMRR@5': -0.004274, 'ΔMRR@10': -0.008547}

=== mFollowIR | FT_INST | noise:swap ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5385, 'Recall@1': 0.5385, 'MRR@1': 0.5385, 'MAP@1': 0.5385, 'nDCG@1': 0.5385, 'Recall@5': 0.7179, 'MRR@5': 0.6043, 'MAP@5': 0.3722, 'nDCG@5': 0.4462, 'Recall@10': 0.7692, 'MRR@10': 0.6111, 'MAP@10': 0.2984, 'nDCG@10': 0.4067}
  drop vs clean (swap): {'ΔMRR@1': -0.025641, 'ΔMRR@5': -0.017094, 'ΔMRR@10': -0.021042}

=== mFollowIR | FT_INST | noise:lower ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.7179, 'MRR@5': 0.5885, 'MAP@5': 0.3486, 'nDCG@5': 0.4171, 'Recall@10': 0.7949, 'MRR@10': 0.6007, 'MAP@10': 0.2791, 'nDCG@10': 0.3848}
  drop vs clean (lower): {'ΔMRR@1': -0.051282, 'ΔMRR@5': -0.032906, 'ΔMRR@10': -0.031481}

=== mFollowIR | FT_QUERY | plain ===


                                                              

[cache] docs torch.Size([39326, 768]) saved -> /kaggle/working/robust_eval_cache/docs__mfollowir__qonly__d3fc34cd__L256.pt


                                                  

{'N': 39, 'Hit@1': 0.4103, 'Recall@1': 0.4103, 'MRR@1': 0.4103, 'MAP@1': 0.4103, 'nDCG@1': 0.4103, 'Recall@5': 0.7692, 'MRR@5': 0.5628, 'MAP@5': 0.3193, 'nDCG@5': 0.4102, 'Recall@10': 0.8462, 'MRR@10': 0.574, 'MAP@10': 0.2548, 'nDCG@10': 0.3703}

=== mFollowIR | FT_QUERY | q+BANK ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.6923, 'MRR@5': 0.5812, 'MAP@5': 0.3415, 'nDCG@5': 0.4195, 'Recall@10': 0.7436, 'MRR@10': 0.5881, 'MAP@10': 0.265, 'nDCG@10': 0.3676}

--- mFollowIR p-MRR (BANK − plain) :: FT_QUERY ---
{'pMRR@1': 0.102564, 'pMRR@5': 0.018376, 'pMRR@10': 0.014103}

--- Prompt ablation :: FT_QUERY ---

=== mFollowIR | FT_QUERY | prompt:plain ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4103, 'Recall@1': 0.4103, 'MRR@1': 0.4103, 'MAP@1': 0.4103, 'nDCG@1': 0.4103, 'Recall@5': 0.7692, 'MRR@5': 0.5628, 'MAP@5': 0.3193, 'nDCG@5': 0.4102, 'Recall@10': 0.8462, 'MRR@10': 0.574, 'MAP@10': 0.2548, 'nDCG@10': 0.3703}

=== mFollowIR | FT_QUERY | prompt:minimal_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7436, 'MRR@5': 0.5671, 'MAP@5': 0.3202, 'nDCG@5': 0.4053, 'Recall@10': 0.7949, 'MRR@10': 0.5736, 'MAP@10': 0.2521, 'nDCG@10': 0.3614}

=== mFollowIR | FT_QUERY | prompt:qa_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7179, 'MRR@5': 0.562, 'MAP@5': 0.3182, 'nDCG@5': 0.4027, 'Recall@10': 0.7692, 'MRR@10': 0.5694, 'MAP@10': 0.2567, 'nDCG@10': 0.3645}

=== mFollowIR | FT_QUERY | prompt:search_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7436, 'MRR@5': 0.5564, 'MAP@5': 0.3018, 'nDCG@5': 0.3894, 'Recall@10': 0.7949, 'MRR@10': 0.5622, 'MAP@10': 0.2407, 'nDCG@10': 0.3503}

=== mFollowIR | FT_QUERY | prompt:mfoll_ru ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7692, 'MRR@5': 0.5808, 'MAP@5': 0.3425, 'nDCG@5': 0.4265, 'Recall@10': 0.8462, 'MRR@10': 0.5896, 'MAP@10': 0.2752, 'nDCG@10': 0.3878}

--- Paraphrase robustness (5x) :: FT_QUERY ---

=== mFollowIR | FT_QUERY | para1 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7436, 'MRR@5': 0.5876, 'MAP@5': 0.3393, 'nDCG@5': 0.4225, 'Recall@10': 0.8205, 'MRR@10': 0.597, 'MAP@10': 0.2684, 'nDCG@10': 0.3787}

=== mFollowIR | FT_QUERY | para2 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7692, 'MRR@5': 0.5872, 'MAP@5': 0.3179, 'nDCG@5': 0.4093, 'Recall@10': 0.8205, 'MRR@10': 0.5932, 'MAP@10': 0.2596, 'nDCG@10': 0.3762}

=== mFollowIR | FT_QUERY | para3 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.5128, 'Recall@1': 0.5128, 'MRR@1': 0.5128, 'MAP@1': 0.5128, 'nDCG@1': 0.5128, 'Recall@5': 0.6923, 'MRR@5': 0.5714, 'MAP@5': 0.2979, 'nDCG@5': 0.3859, 'Recall@10': 0.7949, 'MRR@10': 0.5855, 'MAP@10': 0.2365, 'nDCG@10': 0.3493}

=== mFollowIR | FT_QUERY | para4 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7179, 'MRR@5': 0.5641, 'MAP@5': 0.3063, 'nDCG@5': 0.3907, 'Recall@10': 0.7949, 'MRR@10': 0.5741, 'MAP@10': 0.2479, 'nDCG@10': 0.3598}

=== mFollowIR | FT_QUERY | para5 ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4872, 'Recall@1': 0.4872, 'MRR@1': 0.4872, 'MAP@1': 0.4872, 'nDCG@1': 0.4872, 'Recall@5': 0.7179, 'MRR@5': 0.5833, 'MAP@5': 0.3272, 'nDCG@5': 0.4137, 'Recall@10': 0.8205, 'MRR@10': 0.5967, 'MAP@10': 0.2643, 'nDCG@10': 0.3787}
Paraphrase mean/std: {'Hit@1_mean': 0.4872, 'Hit@1_std': 0.0162, 'Recall@1_mean': 0.4872, 'Recall@1_std': 0.0162, 'MRR@1_mean': 0.4872, 'MRR@1_std': 0.0162, 'MAP@1_mean': 0.4872, 'MAP@1_std': 0.0162, 'nDCG@1_mean': 0.4872, 'nDCG@1_std': 0.0162, 'Recall@5_mean': 0.7282, 'Recall@5_std': 0.0261, 'MRR@5_mean': 0.5787, 'MRR@5_std': 0.0094, 'MAP@5_mean': 0.3177, 'MAP@5_std': 0.0147, 'nDCG@5_mean': 0.4044, 'nDCG@5_std': 0.0139, 'Recall@10_mean': 0.8103, 'Recall@10_std': 0.0126, 'MRR@10_mean': 0.5893, 'MRR@10_std': 0.0086, 'MAP@10_mean': 0.2554, 'MAP@10_std': 0.0116, 'nDCG@10_mean': 0.3685, 'nDCG@10_std': 0.0119}

--- Noise stress :: FT_QUERY ---

=== mFollowIR | FT_QUERY | noise:clean ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4103, 'Recall@1': 0.4103, 'MRR@1': 0.4103, 'MAP@1': 0.4103, 'nDCG@1': 0.4103, 'Recall@5': 0.7692, 'MRR@5': 0.5628, 'MAP@5': 0.3193, 'nDCG@5': 0.4102, 'Recall@10': 0.8462, 'MRR@10': 0.574, 'MAP@10': 0.2548, 'nDCG@10': 0.3703}

=== mFollowIR | FT_QUERY | noise:nopunct ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7949, 'MRR@5': 0.6056, 'MAP@5': 0.3505, 'nDCG@5': 0.4358, 'Recall@10': 0.8205, 'MRR@10': 0.6098, 'MAP@10': 0.2759, 'nDCG@10': 0.3906}
  drop vs clean (nopunct): {'ΔMRR@1': 0.051282, 'ΔMRR@5': 0.042735, 'ΔMRR@10': 0.035867}

=== mFollowIR | FT_QUERY | noise:swap ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4615, 'Recall@1': 0.4615, 'MRR@1': 0.4615, 'MAP@1': 0.4615, 'nDCG@1': 0.4615, 'Recall@5': 0.7949, 'MRR@5': 0.5957, 'MAP@5': 0.343, 'nDCG@5': 0.4283, 'Recall@10': 0.8462, 'MRR@10': 0.6032, 'MAP@10': 0.2645, 'nDCG@10': 0.3825}
  drop vs clean (swap): {'ΔMRR@1': 0.051282, 'ΔMRR@5': 0.032906, 'ΔMRR@10': 0.029243}

=== mFollowIR | FT_QUERY | noise:lower ===
[cache] docs torch.Size([39326, 768]) loaded


                                                  

{'N': 39, 'Hit@1': 0.4103, 'Recall@1': 0.4103, 'MRR@1': 0.4103, 'MAP@1': 0.4103, 'nDCG@1': 0.4103, 'Recall@5': 0.6667, 'MRR@5': 0.5235, 'MAP@5': 0.3026, 'nDCG@5': 0.3775, 'Recall@10': 0.7949, 'MRR@10': 0.5428, 'MAP@10': 0.2344, 'nDCG@10': 0.3396}
  drop vs clean (lower): {'ΔMRR@1': 0.0, 'ΔMRR@5': -0.039316, 'ΔMRR@10': -0.031125}


--- RU→EN --- Docs=1,180 | Queries=1,187 | QrelsQ=1,187

=== LAReQA RU→EN | BASE | plain ===


                                                            

[cache] docs torch.Size([1180, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_RU→EN__base__be7f5f38__L256.pt


                                                            

{'N': 1187, 'Hit@1': 0.7666, 'Recall@1': 0.7666, 'MRR@1': 0.7666, 'MAP@1': 0.7666, 'nDCG@1': 0.7666, 'Recall@5': 0.9334, 'MRR@5': 0.8353, 'MAP@5': 0.6889, 'nDCG@5': 0.7423, 'Recall@10': 0.9646, 'MRR@10': 0.8396, 'MAP@10': 0.6877, 'nDCG@10': 0.7488}

=== LAReQA RU→EN | BASE | q+BANK ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.6596, 'Recall@1': 0.6596, 'MRR@1': 0.6596, 'MAP@1': 0.6596, 'nDCG@1': 0.6596, 'Recall@5': 0.8997, 'MRR@5': 0.7598, 'MAP@5': 0.626, 'nDCG@5': 0.684, 'Recall@10': 0.9377, 'MRR@10': 0.7651, 'MAP@10': 0.6268, 'nDCG@10': 0.6929}

--- LAReQA p-MRR (BANK − plain) :: RU→EN :: BASE ---
{'pMRR@1': -0.106992, 'pMRR@5': -0.075512, 'pMRR@10': -0.07453}

--- Prompt ablation :: RU→EN :: BASE ---

=== LAReQA RU→EN | BASE | prompt:plain ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7666, 'Recall@1': 0.7666, 'MRR@1': 0.7666, 'MAP@1': 0.7666, 'nDCG@1': 0.7666, 'Recall@5': 0.9334, 'MRR@5': 0.8353, 'MAP@5': 0.6889, 'nDCG@5': 0.7423, 'Recall@10': 0.9646, 'MRR@10': 0.8396, 'MAP@10': 0.6877, 'nDCG@10': 0.7488}

=== LAReQA RU→EN | BASE | prompt:minimal_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7388, 'Recall@1': 0.7388, 'MRR@1': 0.7388, 'MAP@1': 0.7388, 'nDCG@1': 0.7388, 'Recall@5': 0.9183, 'MRR@5': 0.8133, 'MAP@5': 0.6719, 'nDCG@5': 0.7246, 'Recall@10': 0.9511, 'MRR@10': 0.8177, 'MAP@10': 0.6706, 'nDCG@10': 0.7307}

=== LAReQA RU→EN | BASE | prompt:qa_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.695, 'Recall@1': 0.695, 'MRR@1': 0.695, 'MAP@1': 0.695, 'nDCG@1': 0.695, 'Recall@5': 0.9166, 'MRR@5': 0.7879, 'MAP@5': 0.6511, 'nDCG@5': 0.7078, 'Recall@10': 0.9553, 'MRR@10': 0.7932, 'MAP@10': 0.6511, 'nDCG@10': 0.7161}

=== LAReQA RU→EN | BASE | prompt:search_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7254, 'Recall@1': 0.7254, 'MRR@1': 0.7254, 'MAP@1': 0.7254, 'nDCG@1': 0.7254, 'Recall@5': 0.9174, 'MRR@5': 0.8048, 'MAP@5': 0.6654, 'nDCG@5': 0.7193, 'Recall@10': 0.9478, 'MRR@10': 0.8088, 'MAP@10': 0.6639, 'nDCG@10': 0.7247}

=== LAReQA RU→EN | BASE | prompt:mfoll_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.6984, 'Recall@1': 0.6984, 'MRR@1': 0.6984, 'MAP@1': 0.6984, 'nDCG@1': 0.6984, 'Recall@5': 0.9073, 'MRR@5': 0.7851, 'MAP@5': 0.6479, 'nDCG@5': 0.7027, 'Recall@10': 0.9469, 'MRR@10': 0.7902, 'MAP@10': 0.6475, 'nDCG@10': 0.7107}

--- Paraphrase robustness (5x) :: RU→EN :: BASE ---

=== LAReQA RU→EN | BASE | para1 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7136, 'Recall@1': 0.7136, 'MRR@1': 0.7136, 'MAP@1': 0.7136, 'nDCG@1': 0.7136, 'Recall@5': 0.9115, 'MRR@5': 0.7937, 'MAP@5': 0.6542, 'nDCG@5': 0.709, 'Recall@10': 0.9469, 'MRR@10': 0.7984, 'MAP@10': 0.6533, 'nDCG@10': 0.7158}

=== LAReQA RU→EN | BASE | para2 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7254, 'Recall@1': 0.7254, 'MRR@1': 0.7254, 'MAP@1': 0.7254, 'nDCG@1': 0.7254, 'Recall@5': 0.9158, 'MRR@5': 0.8055, 'MAP@5': 0.6652, 'nDCG@5': 0.7189, 'Recall@10': 0.9503, 'MRR@10': 0.81, 'MAP@10': 0.6637, 'nDCG@10': 0.7248}

=== LAReQA RU→EN | BASE | para3 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.711, 'Recall@1': 0.711, 'MRR@1': 0.711, 'MAP@1': 0.711, 'nDCG@1': 0.711, 'Recall@5': 0.9115, 'MRR@5': 0.7927, 'MAP@5': 0.6554, 'nDCG@5': 0.7103, 'Recall@10': 0.9461, 'MRR@10': 0.7971, 'MAP@10': 0.6537, 'nDCG@10': 0.7154}

=== LAReQA RU→EN | BASE | para4 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.6992, 'Recall@1': 0.6992, 'MRR@1': 0.6992, 'MAP@1': 0.6992, 'nDCG@1': 0.6992, 'Recall@5': 0.909, 'MRR@5': 0.7865, 'MAP@5': 0.6498, 'nDCG@5': 0.7054, 'Recall@10': 0.9461, 'MRR@10': 0.7916, 'MAP@10': 0.6491, 'nDCG@10': 0.7121}

=== LAReQA RU→EN | BASE | para5 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.6841, 'Recall@1': 0.6841, 'MRR@1': 0.6841, 'MAP@1': 0.6841, 'nDCG@1': 0.6841, 'Recall@5': 0.9048, 'MRR@5': 0.7745, 'MAP@5': 0.6399, 'nDCG@5': 0.6966, 'Recall@10': 0.9452, 'MRR@10': 0.7801, 'MAP@10': 0.64, 'nDCG@10': 0.7045}
Paraphrase mean/std: {'Hit@1_mean': 0.7067, 'Hit@1_std': 0.014, 'Recall@1_mean': 0.7067, 'Recall@1_std': 0.014, 'MRR@1_mean': 0.7067, 'MRR@1_std': 0.014, 'MAP@1_mean': 0.7067, 'MAP@1_std': 0.014, 'nDCG@1_mean': 0.7067, 'nDCG@1_std': 0.014, 'Recall@5_mean': 0.9105, 'Recall@5_std': 0.0036, 'MRR@5_mean': 0.7906, 'MRR@5_std': 0.0101, 'MAP@5_mean': 0.6529, 'MAP@5_std': 0.0082, 'nDCG@5_mean': 0.708, 'nDCG@5_std': 0.0073, 'Recall@10_mean': 0.9469, 'Recall@10_std': 0.0018, 'MRR@10_mean': 0.7954, 'MRR@10_std': 0.0097, 'MAP@10_mean': 0.6519, 'MAP@10_std': 0.0077, 'nDCG@10_mean': 0.7145, 'nDCG@10_std': 0.0066}

--- Noise stress :: RU→EN :: BASE ---

=== LAReQA RU→EN | BASE | noise:clean ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7666, 'Recall@1': 0.7666, 'MRR@1': 0.7666, 'MAP@1': 0.7666, 'nDCG@1': 0.7666, 'Recall@5': 0.9334, 'MRR@5': 0.8353, 'MAP@5': 0.6889, 'nDCG@5': 0.7423, 'Recall@10': 0.9646, 'MRR@10': 0.8396, 'MAP@10': 0.6877, 'nDCG@10': 0.7488}

=== LAReQA RU→EN | BASE | noise:nopunct ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.77, 'Recall@1': 0.77, 'MRR@1': 0.77, 'MAP@1': 0.77, 'nDCG@1': 0.77, 'Recall@5': 0.9318, 'MRR@5': 0.8359, 'MAP@5': 0.6905, 'nDCG@5': 0.7431, 'Recall@10': 0.9612, 'MRR@10': 0.8399, 'MAP@10': 0.6888, 'nDCG@10': 0.7488}
  drop vs clean (nopunct): {'ΔMRR@1': 0.00337, 'ΔMRR@5': 0.000534, 'ΔMRR@10': 0.000331}

=== LAReQA RU→EN | BASE | noise:swap ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7515, 'Recall@1': 0.7515, 'MRR@1': 0.7515, 'MAP@1': 0.7515, 'nDCG@1': 0.7515, 'Recall@5': 0.925, 'MRR@5': 0.8249, 'MAP@5': 0.6797, 'nDCG@5': 0.7332, 'Recall@10': 0.9579, 'MRR@10': 0.8295, 'MAP@10': 0.6785, 'nDCG@10': 0.7401}
  drop vs clean (swap): {'ΔMRR@1': -0.015164, 'ΔMRR@5': -0.010447, 'ΔMRR@10': -0.010083}

=== LAReQA RU→EN | BASE | noise:lower ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7582, 'Recall@1': 0.7582, 'MRR@1': 0.7582, 'MAP@1': 0.7582, 'nDCG@1': 0.7582, 'Recall@5': 0.9309, 'MRR@5': 0.8296, 'MAP@5': 0.6847, 'nDCG@5': 0.7386, 'Recall@10': 0.9629, 'MRR@10': 0.834, 'MAP@10': 0.6837, 'nDCG@10': 0.7451}
  drop vs clean (lower): {'ΔMRR@1': -0.008425, 'ΔMRR@5': -0.005715, 'ΔMRR@10': -0.005621}

=== LAReQA RU→EN | FT_INST | plain ===


                                                            

[cache] docs torch.Size([1180, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_RU→EN__inst__be7f5f38__L256.pt


                                                            

{'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'MAP@1': 0.7195, 'nDCG@1': 0.7195, 'Recall@5': 0.9208, 'MRR@5': 0.8029, 'MAP@5': 0.6617, 'nDCG@5': 0.7181, 'Recall@10': 0.9596, 'MRR@10': 0.8081, 'MAP@10': 0.6614, 'nDCG@10': 0.7266}

=== LAReQA RU→EN | FT_INST | q+BANK ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7178, 'Recall@1': 0.7178, 'MRR@1': 0.7178, 'MAP@1': 0.7178, 'nDCG@1': 0.7178, 'Recall@5': 0.9149, 'MRR@5': 0.7986, 'MAP@5': 0.6599, 'nDCG@5': 0.7152, 'Recall@10': 0.9511, 'MRR@10': 0.8035, 'MAP@10': 0.6596, 'nDCG@10': 0.723}

--- LAReQA p-MRR (BANK − plain) :: RU→EN :: FT_INST ---
{'pMRR@1': -0.001685, 'pMRR@5': -0.004353, 'pMRR@10': -0.004693}

--- Prompt ablation :: RU→EN :: FT_INST ---

=== LAReQA RU→EN | FT_INST | prompt:plain ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'MAP@1': 0.7195, 'nDCG@1': 0.7195, 'Recall@5': 0.9208, 'MRR@5': 0.8029, 'MAP@5': 0.6617, 'nDCG@5': 0.7181, 'Recall@10': 0.9596, 'MRR@10': 0.8081, 'MAP@10': 0.6614, 'nDCG@10': 0.7266}

=== LAReQA RU→EN | FT_INST | prompt:minimal_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7203, 'Recall@1': 0.7203, 'MRR@1': 0.7203, 'MAP@1': 0.7203, 'nDCG@1': 0.7203, 'Recall@5': 0.9208, 'MRR@5': 0.8027, 'MAP@5': 0.6626, 'nDCG@5': 0.7185, 'Recall@10': 0.9562, 'MRR@10': 0.8075, 'MAP@10': 0.6621, 'nDCG@10': 0.7265}

=== LAReQA RU→EN | FT_INST | prompt:qa_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7262, 'Recall@1': 0.7262, 'MRR@1': 0.7262, 'MAP@1': 0.7262, 'nDCG@1': 0.7262, 'Recall@5': 0.92, 'MRR@5': 0.8068, 'MAP@5': 0.6659, 'nDCG@5': 0.7215, 'Recall@10': 0.9621, 'MRR@10': 0.8126, 'MAP@10': 0.6663, 'nDCG@10': 0.7312}

=== LAReQA RU→EN | FT_INST | prompt:search_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'MAP@1': 0.7195, 'nDCG@1': 0.7195, 'Recall@5': 0.9166, 'MRR@5': 0.8006, 'MAP@5': 0.6604, 'nDCG@5': 0.716, 'Recall@10': 0.9545, 'MRR@10': 0.8057, 'MAP@10': 0.6605, 'nDCG@10': 0.7248}

=== LAReQA RU→EN | FT_INST | prompt:mfoll_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.727, 'Recall@1': 0.727, 'MRR@1': 0.727, 'MAP@1': 0.727, 'nDCG@1': 0.727, 'Recall@5': 0.9166, 'MRR@5': 0.805, 'MAP@5': 0.6645, 'nDCG@5': 0.7195, 'Recall@10': 0.9562, 'MRR@10': 0.8104, 'MAP@10': 0.6647, 'nDCG@10': 0.7286}

--- Paraphrase robustness (5x) :: RU→EN :: FT_INST ---

=== LAReQA RU→EN | FT_INST | para1 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7144, 'Recall@1': 0.7144, 'MRR@1': 0.7144, 'MAP@1': 0.7144, 'nDCG@1': 0.7144, 'Recall@5': 0.9158, 'MRR@5': 0.7964, 'MAP@5': 0.6566, 'nDCG@5': 0.7121, 'Recall@10': 0.9537, 'MRR@10': 0.8014, 'MAP@10': 0.657, 'nDCG@10': 0.7212}

=== LAReQA RU→EN | FT_INST | para2 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7211, 'Recall@1': 0.7211, 'MRR@1': 0.7211, 'MAP@1': 0.7211, 'nDCG@1': 0.7211, 'Recall@5': 0.9132, 'MRR@5': 0.7996, 'MAP@5': 0.6613, 'nDCG@5': 0.7158, 'Recall@10': 0.9553, 'MRR@10': 0.8053, 'MAP@10': 0.6614, 'nDCG@10': 0.7252}

=== LAReQA RU→EN | FT_INST | para3 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7245, 'Recall@1': 0.7245, 'MRR@1': 0.7245, 'MAP@1': 0.7245, 'nDCG@1': 0.7245, 'Recall@5': 0.9183, 'MRR@5': 0.8038, 'MAP@5': 0.6617, 'nDCG@5': 0.7171, 'Recall@10': 0.9562, 'MRR@10': 0.8089, 'MAP@10': 0.6622, 'nDCG@10': 0.7266}

=== LAReQA RU→EN | FT_INST | para4 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7254, 'Recall@1': 0.7254, 'MRR@1': 0.7254, 'MAP@1': 0.7254, 'nDCG@1': 0.7254, 'Recall@5': 0.9124, 'MRR@5': 0.802, 'MAP@5': 0.6621, 'nDCG@5': 0.7164, 'Recall@10': 0.957, 'MRR@10': 0.8081, 'MAP@10': 0.6629, 'nDCG@10': 0.7272}

=== LAReQA RU→EN | FT_INST | para5 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7186, 'Recall@1': 0.7186, 'MRR@1': 0.7186, 'MAP@1': 0.7186, 'nDCG@1': 0.7186, 'Recall@5': 0.9107, 'MRR@5': 0.7975, 'MAP@5': 0.6593, 'nDCG@5': 0.7137, 'Recall@10': 0.9553, 'MRR@10': 0.8037, 'MAP@10': 0.6598, 'nDCG@10': 0.7239}
Paraphrase mean/std: {'Hit@1_mean': 0.7208, 'Hit@1_std': 0.004, 'Recall@1_mean': 0.7208, 'Recall@1_std': 0.004, 'MRR@1_mean': 0.7208, 'MRR@1_std': 0.004, 'MAP@1_mean': 0.7208, 'MAP@1_std': 0.004, 'nDCG@1_mean': 0.7208, 'nDCG@1_std': 0.004, 'Recall@5_mean': 0.9141, 'Recall@5_std': 0.0027, 'MRR@5_mean': 0.7999, 'MRR@5_std': 0.0028, 'MAP@5_mean': 0.6602, 'MAP@5_std': 0.0021, 'nDCG@5_mean': 0.715, 'nDCG@5_std': 0.0019, 'Recall@10_mean': 0.9555, 'Recall@10_std': 0.0011, 'MRR@10_mean': 0.8055, 'MRR@10_std': 0.0028, 'MAP@10_mean': 0.6607, 'MAP@10_std': 0.0021, 'nDCG@10_mean': 0.7248, 'nDCG@10_std': 0.0021}

--- Noise stress :: RU→EN :: FT_INST ---

=== LAReQA RU→EN | FT_INST | noise:clean ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7195, 'Recall@1': 0.7195, 'MRR@1': 0.7195, 'MAP@1': 0.7195, 'nDCG@1': 0.7195, 'Recall@5': 0.9208, 'MRR@5': 0.8029, 'MAP@5': 0.6617, 'nDCG@5': 0.7181, 'Recall@10': 0.9596, 'MRR@10': 0.8081, 'MAP@10': 0.6614, 'nDCG@10': 0.7266}

=== LAReQA RU→EN | FT_INST | noise:nopunct ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7178, 'Recall@1': 0.7178, 'MRR@1': 0.7178, 'MAP@1': 0.7178, 'nDCG@1': 0.7178, 'Recall@5': 0.9233, 'MRR@5': 0.8048, 'MAP@5': 0.6653, 'nDCG@5': 0.7214, 'Recall@10': 0.9587, 'MRR@10': 0.8095, 'MAP@10': 0.6649, 'nDCG@10': 0.7293}
  drop vs clean (nopunct): {'ΔMRR@1': -0.001685, 'ΔMRR@5': 0.001853, 'ΔMRR@10': 0.001391}

=== LAReQA RU→EN | FT_INST | noise:swap ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.706, 'Recall@1': 0.706, 'MRR@1': 0.706, 'MAP@1': 0.706, 'nDCG@1': 0.706, 'Recall@5': 0.9183, 'MRR@5': 0.7947, 'MAP@5': 0.656, 'nDCG@5': 0.7127, 'Recall@10': 0.9545, 'MRR@10': 0.7996, 'MAP@10': 0.6564, 'nDCG@10': 0.722}
  drop vs clean (swap): {'ΔMRR@1': -0.013479, 'ΔMRR@5': -0.008242, 'ΔMRR@10': -0.008535}

=== LAReQA RU→EN | FT_INST | noise:lower ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7068, 'Recall@1': 0.7068, 'MRR@1': 0.7068, 'MAP@1': 0.7068, 'nDCG@1': 0.7068, 'Recall@5': 0.9208, 'MRR@5': 0.7953, 'MAP@5': 0.6571, 'nDCG@5': 0.714, 'Recall@10': 0.9553, 'MRR@10': 0.8, 'MAP@10': 0.6564, 'nDCG@10': 0.7214}
  drop vs clean (lower): {'ΔMRR@1': -0.012637, 'ΔMRR@5': -0.007638, 'ΔMRR@10': -0.008149}

=== LAReQA RU→EN | FT_QUERY | plain ===


                                                            

[cache] docs torch.Size([1180, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_RU→EN__qonly__be7f5f38__L256.pt


                                                            

{'N': 1187, 'Hit@1': 0.7439, 'Recall@1': 0.7439, 'MRR@1': 0.7439, 'MAP@1': 0.7439, 'nDCG@1': 0.7439, 'Recall@5': 0.9301, 'MRR@5': 0.8198, 'MAP@5': 0.6805, 'nDCG@5': 0.736, 'Recall@10': 0.9638, 'MRR@10': 0.8242, 'MAP@10': 0.6787, 'nDCG@10': 0.7417}

=== LAReQA RU→EN | FT_QUERY | q+BANK ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7414, 'Recall@1': 0.7414, 'MRR@1': 0.7414, 'MAP@1': 0.7414, 'nDCG@1': 0.7414, 'Recall@5': 0.9233, 'MRR@5': 0.817, 'MAP@5': 0.6756, 'nDCG@5': 0.7308, 'Recall@10': 0.9587, 'MRR@10': 0.8218, 'MAP@10': 0.6744, 'nDCG@10': 0.7375}

--- LAReQA p-MRR (BANK − plain) :: RU→EN :: FT_QUERY ---
{'pMRR@1': -0.002527, 'pMRR@5': -0.002724, 'pMRR@10': -0.002419}

--- Prompt ablation :: RU→EN :: FT_QUERY ---

=== LAReQA RU→EN | FT_QUERY | prompt:plain ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7439, 'Recall@1': 0.7439, 'MRR@1': 0.7439, 'MAP@1': 0.7439, 'nDCG@1': 0.7439, 'Recall@5': 0.9301, 'MRR@5': 0.8198, 'MAP@5': 0.6805, 'nDCG@5': 0.736, 'Recall@10': 0.9638, 'MRR@10': 0.8242, 'MAP@10': 0.6787, 'nDCG@10': 0.7417}

=== LAReQA RU→EN | FT_QUERY | prompt:minimal_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7388, 'Recall@1': 0.7388, 'MRR@1': 0.7388, 'MAP@1': 0.7388, 'nDCG@1': 0.7388, 'Recall@5': 0.925, 'MRR@5': 0.816, 'MAP@5': 0.6751, 'nDCG@5': 0.7309, 'Recall@10': 0.9646, 'MRR@10': 0.8216, 'MAP@10': 0.6744, 'nDCG@10': 0.7388}

=== LAReQA RU→EN | FT_QUERY | prompt:qa_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.743, 'Recall@1': 0.743, 'MRR@1': 0.743, 'MAP@1': 0.743, 'nDCG@1': 0.743, 'Recall@5': 0.9309, 'MRR@5': 0.8203, 'MAP@5': 0.6789, 'nDCG@5': 0.735, 'Recall@10': 0.9621, 'MRR@10': 0.8246, 'MAP@10': 0.6776, 'nDCG@10': 0.7411}

=== LAReQA RU→EN | FT_QUERY | prompt:search_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7464, 'Recall@1': 0.7464, 'MRR@1': 0.7464, 'MAP@1': 0.7464, 'nDCG@1': 0.7464, 'Recall@5': 0.9267, 'MRR@5': 0.8202, 'MAP@5': 0.68, 'nDCG@5': 0.7347, 'Recall@10': 0.9612, 'MRR@10': 0.825, 'MAP@10': 0.6785, 'nDCG@10': 0.7409}

=== LAReQA RU→EN | FT_QUERY | prompt:mfoll_ru ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.743, 'Recall@1': 0.743, 'MRR@1': 0.743, 'MAP@1': 0.743, 'nDCG@1': 0.743, 'Recall@5': 0.9309, 'MRR@5': 0.8202, 'MAP@5': 0.6781, 'nDCG@5': 0.7341, 'Recall@10': 0.9655, 'MRR@10': 0.8249, 'MAP@10': 0.6768, 'nDCG@10': 0.7406}

--- Paraphrase robustness (5x) :: RU→EN :: FT_QUERY ---

=== LAReQA RU→EN | FT_QUERY | para1 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7363, 'Recall@1': 0.7363, 'MRR@1': 0.7363, 'MAP@1': 0.7363, 'nDCG@1': 0.7363, 'Recall@5': 0.9225, 'MRR@5': 0.8133, 'MAP@5': 0.6714, 'nDCG@5': 0.7268, 'Recall@10': 0.9612, 'MRR@10': 0.8185, 'MAP@10': 0.6715, 'nDCG@10': 0.7359}

=== LAReQA RU→EN | FT_QUERY | para2 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7372, 'Recall@1': 0.7372, 'MRR@1': 0.7372, 'MAP@1': 0.7372, 'nDCG@1': 0.7372, 'Recall@5': 0.9267, 'MRR@5': 0.8176, 'MAP@5': 0.6758, 'nDCG@5': 0.7312, 'Recall@10': 0.9621, 'MRR@10': 0.8225, 'MAP@10': 0.6758, 'nDCG@10': 0.7396}

=== LAReQA RU→EN | FT_QUERY | para3 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7489, 'Recall@1': 0.7489, 'MRR@1': 0.7489, 'MAP@1': 0.7489, 'nDCG@1': 0.7489, 'Recall@5': 0.9309, 'MRR@5': 0.8236, 'MAP@5': 0.6806, 'nDCG@5': 0.7359, 'Recall@10': 0.9621, 'MRR@10': 0.828, 'MAP@10': 0.6795, 'nDCG@10': 0.7424}

=== LAReQA RU→EN | FT_QUERY | para4 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7397, 'Recall@1': 0.7397, 'MRR@1': 0.7397, 'MAP@1': 0.7397, 'nDCG@1': 0.7397, 'Recall@5': 0.9259, 'MRR@5': 0.8181, 'MAP@5': 0.6759, 'nDCG@5': 0.7307, 'Recall@10': 0.9596, 'MRR@10': 0.8229, 'MAP@10': 0.6764, 'nDCG@10': 0.7395}

=== LAReQA RU→EN | FT_QUERY | para5 ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7414, 'Recall@1': 0.7414, 'MRR@1': 0.7414, 'MAP@1': 0.7414, 'nDCG@1': 0.7414, 'Recall@5': 0.925, 'MRR@5': 0.8179, 'MAP@5': 0.6756, 'nDCG@5': 0.7303, 'Recall@10': 0.9587, 'MRR@10': 0.8226, 'MAP@10': 0.6755, 'nDCG@10': 0.7383}
Paraphrase mean/std: {'Hit@1_mean': 0.7407, 'Hit@1_std': 0.0045, 'Recall@1_mean': 0.7407, 'Recall@1_std': 0.0045, 'MRR@1_mean': 0.7407, 'MRR@1_std': 0.0045, 'MAP@1_mean': 0.7407, 'MAP@1_std': 0.0045, 'nDCG@1_mean': 0.7407, 'nDCG@1_std': 0.0045, 'Recall@5_mean': 0.9262, 'Recall@5_std': 0.0027, 'MRR@5_mean': 0.8181, 'MRR@5_std': 0.0033, 'MAP@5_mean': 0.6759, 'MAP@5_std': 0.0029, 'nDCG@5_mean': 0.731, 'nDCG@5_std': 0.0029, 'Recall@10_mean': 0.9607, 'Recall@10_std': 0.0014, 'MRR@10_mean': 0.8229, 'MRR@10_std': 0.003, 'MAP@10_mean': 0.6757, 'MAP@10_std': 0.0025, 'nDCG@10_mean': 0.7391, 'nDCG@10_std': 0.0021}

--- Noise stress :: RU→EN :: FT_QUERY ---

=== LAReQA RU→EN | FT_QUERY | noise:clean ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7439, 'Recall@1': 0.7439, 'MRR@1': 0.7439, 'MAP@1': 0.7439, 'nDCG@1': 0.7439, 'Recall@5': 0.9301, 'MRR@5': 0.8198, 'MAP@5': 0.6805, 'nDCG@5': 0.736, 'Recall@10': 0.9638, 'MRR@10': 0.8242, 'MAP@10': 0.6787, 'nDCG@10': 0.7417}

=== LAReQA RU→EN | FT_QUERY | noise:nopunct ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7498, 'Recall@1': 0.7498, 'MRR@1': 0.7498, 'MAP@1': 0.7498, 'nDCG@1': 0.7498, 'Recall@5': 0.9284, 'MRR@5': 0.8242, 'MAP@5': 0.6828, 'nDCG@5': 0.7373, 'Recall@10': 0.9663, 'MRR@10': 0.8294, 'MAP@10': 0.6824, 'nDCG@10': 0.7458}
  drop vs clean (nopunct): {'ΔMRR@1': 0.005897, 'ΔMRR@5': 0.004437, 'ΔMRR@10': 0.005122}

=== LAReQA RU→EN | FT_QUERY | noise:swap ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7447, 'Recall@1': 0.7447, 'MRR@1': 0.7447, 'MAP@1': 0.7447, 'nDCG@1': 0.7447, 'Recall@5': 0.9242, 'MRR@5': 0.8195, 'MAP@5': 0.6781, 'nDCG@5': 0.7324, 'Recall@10': 0.9596, 'MRR@10': 0.8242, 'MAP@10': 0.6779, 'nDCG@10': 0.7408}
  drop vs clean (swap): {'ΔMRR@1': 0.000842, 'ΔMRR@5': -0.000267, 'ΔMRR@10': -6.1e-05}

=== LAReQA RU→EN | FT_QUERY | noise:lower ===
[cache] docs torch.Size([1180, 768]) loaded


                                                            

{'N': 1187, 'Hit@1': 0.7447, 'Recall@1': 0.7447, 'MRR@1': 0.7447, 'MAP@1': 0.7447, 'nDCG@1': 0.7447, 'Recall@5': 0.9259, 'MRR@5': 0.8198, 'MAP@5': 0.6792, 'nDCG@5': 0.7336, 'Recall@10': 0.9596, 'MRR@10': 0.8245, 'MAP@10': 0.6783, 'nDCG@10': 0.741}
  drop vs clean (lower): {'ΔMRR@1': 0.000842, 'ΔMRR@5': 1.4e-05, 'ΔMRR@10': 0.000233}

--- EN→RU --- Docs=1,219 | Queries=1,188 | QrelsQ=1,188

=== LAReQA EN→RU | BASE | plain ===


                                                            

[cache] docs torch.Size([1219, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_EN→RU__base__5723c199__L256.pt


                                                            

{'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'MAP@1': 0.7449, 'nDCG@1': 0.7449, 'Recall@5': 0.9276, 'MRR@5': 0.8207, 'MAP@5': 0.7149, 'nDCG@5': 0.7613, 'Recall@10': 0.9596, 'MRR@10': 0.8251, 'MAP@10': 0.7168, 'nDCG@10': 0.7705}

=== LAReQA EN→RU | BASE | q+BANK ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7003, 'Recall@1': 0.7003, 'MRR@1': 0.7003, 'MAP@1': 0.7003, 'nDCG@1': 0.7003, 'Recall@5': 0.8939, 'MRR@5': 0.7788, 'MAP@5': 0.6739, 'nDCG@5': 0.722, 'Recall@10': 0.931, 'MRR@10': 0.7839, 'MAP@10': 0.6757, 'nDCG@10': 0.7313}

--- LAReQA p-MRR (BANK − plain) :: EN→RU :: BASE ---
{'pMRR@1': -0.044613, 'pMRR@5': -0.041835, 'pMRR@10': -0.041213}

--- Prompt ablation :: EN→RU :: BASE ---

=== LAReQA EN→RU | BASE | prompt:plain ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'MAP@1': 0.7449, 'nDCG@1': 0.7449, 'Recall@5': 0.9276, 'MRR@5': 0.8207, 'MAP@5': 0.7149, 'nDCG@5': 0.7613, 'Recall@10': 0.9596, 'MRR@10': 0.8251, 'MAP@10': 0.7168, 'nDCG@10': 0.7705}

=== LAReQA EN→RU | BASE | prompt:minimal_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6237, 'Recall@1': 0.6237, 'MRR@1': 0.6237, 'MAP@1': 0.6237, 'nDCG@1': 0.6237, 'Recall@5': 0.8788, 'MRR@5': 0.7273, 'MAP@5': 0.6322, 'nDCG@5': 0.6852, 'Recall@10': 0.9226, 'MRR@10': 0.7333, 'MAP@10': 0.6343, 'nDCG@10': 0.6951}

=== LAReQA EN→RU | BASE | prompt:qa_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6852, 'Recall@1': 0.6852, 'MRR@1': 0.6852, 'MAP@1': 0.6852, 'nDCG@1': 0.6852, 'Recall@5': 0.8948, 'MRR@5': 0.7717, 'MAP@5': 0.6684, 'nDCG@5': 0.7176, 'Recall@10': 0.9377, 'MRR@10': 0.7775, 'MAP@10': 0.6718, 'nDCG@10': 0.7299}

=== LAReQA EN→RU | BASE | prompt:search_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6364, 'Recall@1': 0.6364, 'MRR@1': 0.6364, 'MAP@1': 0.6364, 'nDCG@1': 0.6364, 'Recall@5': 0.8914, 'MRR@5': 0.7402, 'MAP@5': 0.6453, 'nDCG@5': 0.6982, 'Recall@10': 0.9276, 'MRR@10': 0.7451, 'MAP@10': 0.6477, 'nDCG@10': 0.7081}

=== LAReQA EN→RU | BASE | prompt:mfoll_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6254, 'Recall@1': 0.6254, 'MRR@1': 0.6254, 'MAP@1': 0.6254, 'nDCG@1': 0.6254, 'Recall@5': 0.8822, 'MRR@5': 0.7264, 'MAP@5': 0.6305, 'nDCG@5': 0.6852, 'Recall@10': 0.9226, 'MRR@10': 0.732, 'MAP@10': 0.6325, 'nDCG@10': 0.6948}

--- Paraphrase robustness (5x) :: EN→RU :: BASE ---

=== LAReQA EN→RU | BASE | para1 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6212, 'Recall@1': 0.6212, 'MRR@1': 0.6212, 'MAP@1': 0.6212, 'nDCG@1': 0.6212, 'Recall@5': 0.8687, 'MRR@5': 0.7194, 'MAP@5': 0.626, 'nDCG@5': 0.6784, 'Recall@10': 0.9209, 'MRR@10': 0.7265, 'MAP@10': 0.6285, 'nDCG@10': 0.6902}

=== LAReQA EN→RU | BASE | para2 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6305, 'Recall@1': 0.6305, 'MRR@1': 0.6305, 'MAP@1': 0.6305, 'nDCG@1': 0.6305, 'Recall@5': 0.8923, 'MRR@5': 0.7354, 'MAP@5': 0.6393, 'nDCG@5': 0.6939, 'Recall@10': 0.9259, 'MRR@10': 0.7399, 'MAP@10': 0.6405, 'nDCG@10': 0.7016}

=== LAReQA EN→RU | BASE | para3 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6136, 'Recall@1': 0.6136, 'MRR@1': 0.6136, 'MAP@1': 0.6136, 'nDCG@1': 0.6136, 'Recall@5': 0.8855, 'MRR@5': 0.7219, 'MAP@5': 0.6277, 'nDCG@5': 0.6839, 'Recall@10': 0.9242, 'MRR@10': 0.7272, 'MAP@10': 0.629, 'nDCG@10': 0.6922}

=== LAReQA EN→RU | BASE | para4 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.5926, 'Recall@1': 0.5926, 'MRR@1': 0.5926, 'MAP@1': 0.5926, 'nDCG@1': 0.5926, 'Recall@5': 0.8822, 'MRR@5': 0.7092, 'MAP@5': 0.6158, 'nDCG@5': 0.6739, 'Recall@10': 0.9175, 'MRR@10': 0.7143, 'MAP@10': 0.6171, 'nDCG@10': 0.6818}

=== LAReQA EN→RU | BASE | para5 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7071, 'Recall@1': 0.7071, 'MRR@1': 0.7071, 'MAP@1': 0.7071, 'nDCG@1': 0.7071, 'Recall@5': 0.9125, 'MRR@5': 0.7923, 'MAP@5': 0.6889, 'nDCG@5': 0.7376, 'Recall@10': 0.947, 'MRR@10': 0.7969, 'MAP@10': 0.6906, 'nDCG@10': 0.7465}
Paraphrase mean/std: {'Hit@1_mean': 0.633, 'Hit@1_std': 0.0391, 'Recall@1_mean': 0.633, 'Recall@1_std': 0.0391, 'MRR@1_mean': 0.633, 'MRR@1_std': 0.0391, 'MAP@1_mean': 0.633, 'MAP@1_std': 0.0391, 'nDCG@1_mean': 0.633, 'nDCG@1_std': 0.0391, 'Recall@5_mean': 0.8882, 'Recall@5_std': 0.0143, 'MRR@5_mean': 0.7356, 'MRR@5_std': 0.0295, 'MAP@5_mean': 0.6395, 'MAP@5_std': 0.0258, 'nDCG@5_mean': 0.6935, 'nDCG@5_std': 0.023, 'Recall@10_mean': 0.9271, 'Recall@10_std': 0.0103, 'MRR@10_mean': 0.741, 'MRR@10_std': 0.0291, 'MAP@10_mean': 0.6411, 'MAP@10_std': 0.0258, 'nDCG@10_mean': 0.7025, 'nDCG@10_std': 0.0229}

--- Noise stress :: EN→RU :: BASE ---

=== LAReQA EN→RU | BASE | noise:clean ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'MAP@1': 0.7449, 'nDCG@1': 0.7449, 'Recall@5': 0.9276, 'MRR@5': 0.8207, 'MAP@5': 0.7149, 'nDCG@5': 0.7613, 'Recall@10': 0.9596, 'MRR@10': 0.8251, 'MAP@10': 0.7168, 'nDCG@10': 0.7705}

=== LAReQA EN→RU | BASE | noise:nopunct ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7449, 'Recall@1': 0.7449, 'MRR@1': 0.7449, 'MAP@1': 0.7449, 'nDCG@1': 0.7449, 'Recall@5': 0.9209, 'MRR@5': 0.8189, 'MAP@5': 0.7135, 'nDCG@5': 0.7584, 'Recall@10': 0.9571, 'MRR@10': 0.8238, 'MAP@10': 0.7166, 'nDCG@10': 0.7698}
  drop vs clean (nopunct): {'ΔMRR@1': 0.0, 'ΔMRR@5': -0.001726, 'ΔMRR@10': -0.00135}

=== LAReQA EN→RU | BASE | noise:swap ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7247, 'Recall@1': 0.7247, 'MRR@1': 0.7247, 'MAP@1': 0.7247, 'nDCG@1': 0.7247, 'Recall@5': 0.9184, 'MRR@5': 0.8036, 'MAP@5': 0.6982, 'nDCG@5': 0.7456, 'Recall@10': 0.9478, 'MRR@10': 0.8076, 'MAP@10': 0.7005, 'nDCG@10': 0.755}
  drop vs clean (swap): {'ΔMRR@1': -0.020202, 'ΔMRR@5': -0.017031, 'ΔMRR@10': -0.017553}

=== LAReQA EN→RU | BASE | noise:lower ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7382, 'Recall@1': 0.7382, 'MRR@1': 0.7382, 'MAP@1': 0.7382, 'nDCG@1': 0.7382, 'Recall@5': 0.9251, 'MRR@5': 0.8154, 'MAP@5': 0.7112, 'nDCG@5': 0.7589, 'Recall@10': 0.9638, 'MRR@10': 0.8207, 'MAP@10': 0.7141, 'nDCG@10': 0.7704}
  drop vs clean (lower): {'ΔMRR@1': -0.006734, 'ΔMRR@5': -0.005205, 'ΔMRR@10': -0.004454}

=== LAReQA EN→RU | FT_INST | plain ===


                                                            

[cache] docs torch.Size([1219, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_EN→RU__inst__5723c199__L256.pt


                                                            

{'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'MAP@1': 0.6995, 'nDCG@1': 0.6995, 'Recall@5': 0.915, 'MRR@5': 0.786, 'MAP@5': 0.6829, 'nDCG@5': 0.7346, 'Recall@10': 0.9537, 'MRR@10': 0.7913, 'MAP@10': 0.6866, 'nDCG@10': 0.7469}

=== LAReQA EN→RU | FT_INST | q+BANK ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6827, 'Recall@1': 0.6827, 'MRR@1': 0.6827, 'MAP@1': 0.6827, 'nDCG@1': 0.6827, 'Recall@5': 0.9007, 'MRR@5': 0.7706, 'MAP@5': 0.6671, 'nDCG@5': 0.7192, 'Recall@10': 0.9411, 'MRR@10': 0.7763, 'MAP@10': 0.6707, 'nDCG@10': 0.7316}

--- LAReQA p-MRR (BANK − plain) :: EN→RU :: FT_INST ---
{'pMRR@1': -0.016835, 'pMRR@5': -0.015362, 'pMRR@10': -0.015028}

--- Prompt ablation :: EN→RU :: FT_INST ---

=== LAReQA EN→RU | FT_INST | prompt:plain ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'MAP@1': 0.6995, 'nDCG@1': 0.6995, 'Recall@5': 0.915, 'MRR@5': 0.786, 'MAP@5': 0.6829, 'nDCG@5': 0.7346, 'Recall@10': 0.9537, 'MRR@10': 0.7913, 'MAP@10': 0.6866, 'nDCG@10': 0.7469}

=== LAReQA EN→RU | FT_INST | prompt:minimal_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6944, 'Recall@1': 0.6944, 'MRR@1': 0.6944, 'MAP@1': 0.6944, 'nDCG@1': 0.6944, 'Recall@5': 0.9099, 'MRR@5': 0.7812, 'MAP@5': 0.6772, 'nDCG@5': 0.7293, 'Recall@10': 0.947, 'MRR@10': 0.7863, 'MAP@10': 0.6801, 'nDCG@10': 0.7402}

=== LAReQA EN→RU | FT_INST | prompt:qa_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6919, 'Recall@1': 0.6919, 'MRR@1': 0.6919, 'MAP@1': 0.6919, 'nDCG@1': 0.6919, 'Recall@5': 0.9082, 'MRR@5': 0.7788, 'MAP@5': 0.676, 'nDCG@5': 0.7278, 'Recall@10': 0.9495, 'MRR@10': 0.7846, 'MAP@10': 0.6803, 'nDCG@10': 0.7412}

=== LAReQA EN→RU | FT_INST | prompt:search_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6869, 'Recall@1': 0.6869, 'MRR@1': 0.6869, 'MAP@1': 0.6869, 'nDCG@1': 0.6869, 'Recall@5': 0.9049, 'MRR@5': 0.7749, 'MAP@5': 0.6719, 'nDCG@5': 0.7239, 'Recall@10': 0.9394, 'MRR@10': 0.7796, 'MAP@10': 0.6749, 'nDCG@10': 0.7346}

=== LAReQA EN→RU | FT_INST | prompt:mfoll_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.697, 'Recall@1': 0.697, 'MRR@1': 0.697, 'MAP@1': 0.697, 'nDCG@1': 0.697, 'Recall@5': 0.9116, 'MRR@5': 0.7844, 'MAP@5': 0.6787, 'nDCG@5': 0.7311, 'Recall@10': 0.9503, 'MRR@10': 0.7898, 'MAP@10': 0.6825, 'nDCG@10': 0.7434}

--- Paraphrase robustness (5x) :: EN→RU :: FT_INST ---

=== LAReQA EN→RU | FT_INST | para1 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6936, 'Recall@1': 0.6936, 'MRR@1': 0.6936, 'MAP@1': 0.6936, 'nDCG@1': 0.6936, 'Recall@5': 0.9108, 'MRR@5': 0.781, 'MAP@5': 0.6762, 'nDCG@5': 0.7285, 'Recall@10': 0.9453, 'MRR@10': 0.7857, 'MAP@10': 0.6792, 'nDCG@10': 0.7392}

=== LAReQA EN→RU | FT_INST | para2 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6877, 'Recall@1': 0.6877, 'MRR@1': 0.6877, 'MAP@1': 0.6877, 'nDCG@1': 0.6877, 'Recall@5': 0.9066, 'MRR@5': 0.775, 'MAP@5': 0.671, 'nDCG@5': 0.7238, 'Recall@10': 0.947, 'MRR@10': 0.7806, 'MAP@10': 0.6741, 'nDCG@10': 0.7354}

=== LAReQA EN→RU | FT_INST | para3 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'MAP@1': 0.6995, 'nDCG@1': 0.6995, 'Recall@5': 0.9074, 'MRR@5': 0.7829, 'MAP@5': 0.679, 'nDCG@5': 0.7301, 'Recall@10': 0.9453, 'MRR@10': 0.788, 'MAP@10': 0.682, 'nDCG@10': 0.7411}

=== LAReQA EN→RU | FT_INST | para4 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6961, 'Recall@1': 0.6961, 'MRR@1': 0.6961, 'MAP@1': 0.6961, 'nDCG@1': 0.6961, 'Recall@5': 0.9066, 'MRR@5': 0.7813, 'MAP@5': 0.6771, 'nDCG@5': 0.7286, 'Recall@10': 0.9436, 'MRR@10': 0.7865, 'MAP@10': 0.6801, 'nDCG@10': 0.7395}

=== LAReQA EN→RU | FT_INST | para5 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7003, 'Recall@1': 0.7003, 'MRR@1': 0.7003, 'MAP@1': 0.7003, 'nDCG@1': 0.7003, 'Recall@5': 0.9108, 'MRR@5': 0.7847, 'MAP@5': 0.6807, 'nDCG@5': 0.7322, 'Recall@10': 0.9495, 'MRR@10': 0.7901, 'MAP@10': 0.6842, 'nDCG@10': 0.7442}
Paraphrase mean/std: {'Hit@1_mean': 0.6955, 'Hit@1_std': 0.0046, 'Recall@1_mean': 0.6955, 'Recall@1_std': 0.0046, 'MRR@1_mean': 0.6955, 'MRR@1_std': 0.0046, 'MAP@1_mean': 0.6955, 'MAP@1_std': 0.0046, 'nDCG@1_mean': 0.6955, 'nDCG@1_std': 0.0046, 'Recall@5_mean': 0.9084, 'Recall@5_std': 0.0019, 'MRR@5_mean': 0.781, 'MRR@5_std': 0.0033, 'MAP@5_mean': 0.6768, 'MAP@5_std': 0.0033, 'nDCG@5_mean': 0.7286, 'nDCG@5_std': 0.0028, 'Recall@10_mean': 0.9461, 'Recall@10_std': 0.002, 'MRR@10_mean': 0.7862, 'MRR@10_std': 0.0032, 'MAP@10_mean': 0.6799, 'MAP@10_std': 0.0034, 'nDCG@10_mean': 0.7399, 'nDCG@10_std': 0.0029}

--- Noise stress :: EN→RU :: FT_INST ---

=== LAReQA EN→RU | FT_INST | noise:clean ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6995, 'Recall@1': 0.6995, 'MRR@1': 0.6995, 'MAP@1': 0.6995, 'nDCG@1': 0.6995, 'Recall@5': 0.915, 'MRR@5': 0.786, 'MAP@5': 0.6829, 'nDCG@5': 0.7346, 'Recall@10': 0.9537, 'MRR@10': 0.7913, 'MAP@10': 0.6866, 'nDCG@10': 0.7469}

=== LAReQA EN→RU | FT_INST | noise:nopunct ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6978, 'Recall@1': 0.6978, 'MRR@1': 0.6978, 'MAP@1': 0.6978, 'nDCG@1': 0.6978, 'Recall@5': 0.9116, 'MRR@5': 0.7848, 'MAP@5': 0.6816, 'nDCG@5': 0.733, 'Recall@10': 0.9529, 'MRR@10': 0.7905, 'MAP@10': 0.6858, 'nDCG@10': 0.7463}
  drop vs clean (nopunct): {'ΔMRR@1': -0.001684, 'ΔMRR@5': -0.001235, 'ΔMRR@10': -0.000829}

=== LAReQA EN→RU | FT_INST | noise:swap ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6944, 'Recall@1': 0.6944, 'MRR@1': 0.6944, 'MAP@1': 0.6944, 'nDCG@1': 0.6944, 'Recall@5': 0.9049, 'MRR@5': 0.7789, 'MAP@5': 0.6771, 'nDCG@5': 0.7281, 'Recall@10': 0.9444, 'MRR@10': 0.7842, 'MAP@10': 0.6807, 'nDCG@10': 0.7408}
  drop vs clean (swap): {'ΔMRR@1': -0.005051, 'ΔMRR@5': -0.007127, 'ΔMRR@10': -0.007158}

=== LAReQA EN→RU | FT_INST | noise:lower ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6961, 'Recall@1': 0.6961, 'MRR@1': 0.6961, 'MAP@1': 0.6961, 'nDCG@1': 0.6961, 'Recall@5': 0.9108, 'MRR@5': 0.784, 'MAP@5': 0.6817, 'nDCG@5': 0.7337, 'Recall@10': 0.9503, 'MRR@10': 0.7892, 'MAP@10': 0.685, 'nDCG@10': 0.7453}
  drop vs clean (lower): {'ΔMRR@1': -0.003367, 'ΔMRR@5': -0.001964, 'ΔMRR@10': -0.002135}

=== LAReQA EN→RU | FT_QUERY | plain ===


                                                            

[cache] docs torch.Size([1219, 768]) saved -> /kaggle/working/robust_eval_cache/docs__lareqa_EN→RU__qonly__5723c199__L256.pt


                                                            

{'N': 1188, 'Hit@1': 0.7222, 'Recall@1': 0.7222, 'MRR@1': 0.7222, 'MAP@1': 0.7222, 'nDCG@1': 0.7222, 'Recall@5': 0.9285, 'MRR@5': 0.8044, 'MAP@5': 0.7005, 'nDCG@5': 0.7511, 'Recall@10': 0.9588, 'MRR@10': 0.8086, 'MAP@10': 0.7027, 'nDCG@10': 0.7602}

=== LAReQA EN→RU | FT_QUERY | q+BANK ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7264, 'Recall@1': 0.7264, 'MRR@1': 0.7264, 'MAP@1': 0.7264, 'nDCG@1': 0.7264, 'Recall@5': 0.92, 'MRR@5': 0.8027, 'MAP@5': 0.6982, 'nDCG@5': 0.7474, 'Recall@10': 0.9503, 'MRR@10': 0.8069, 'MAP@10': 0.7006, 'nDCG@10': 0.7569}

--- LAReQA p-MRR (BANK − plain) :: EN→RU :: FT_QUERY ---
{'pMRR@1': 0.004209, 'pMRR@5': -0.001655, 'pMRR@10': -0.001672}

--- Prompt ablation :: EN→RU :: FT_QUERY ---

=== LAReQA EN→RU | FT_QUERY | prompt:plain ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7222, 'Recall@1': 0.7222, 'MRR@1': 0.7222, 'MAP@1': 0.7222, 'nDCG@1': 0.7222, 'Recall@5': 0.9285, 'MRR@5': 0.8044, 'MAP@5': 0.7005, 'nDCG@5': 0.7511, 'Recall@10': 0.9588, 'MRR@10': 0.8086, 'MAP@10': 0.7027, 'nDCG@10': 0.7602}

=== LAReQA EN→RU | FT_QUERY | prompt:minimal_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.729, 'Recall@1': 0.729, 'MRR@1': 0.729, 'MAP@1': 0.729, 'nDCG@1': 0.729, 'Recall@5': 0.9209, 'MRR@5': 0.8047, 'MAP@5': 0.6997, 'nDCG@5': 0.7487, 'Recall@10': 0.952, 'MRR@10': 0.8091, 'MAP@10': 0.7027, 'nDCG@10': 0.759}

=== LAReQA EN→RU | FT_QUERY | prompt:qa_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7079, 'Recall@1': 0.7079, 'MRR@1': 0.7079, 'MAP@1': 0.7079, 'nDCG@1': 0.7079, 'Recall@5': 0.9099, 'MRR@5': 0.7888, 'MAP@5': 0.6845, 'nDCG@5': 0.7348, 'Recall@10': 0.9495, 'MRR@10': 0.7945, 'MAP@10': 0.6884, 'nDCG@10': 0.7472}

=== LAReQA EN→RU | FT_QUERY | prompt:search_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7029, 'Recall@1': 0.7029, 'MRR@1': 0.7029, 'MAP@1': 0.7029, 'nDCG@1': 0.7029, 'Recall@5': 0.9116, 'MRR@5': 0.7869, 'MAP@5': 0.6832, 'nDCG@5': 0.7337, 'Recall@10': 0.9495, 'MRR@10': 0.792, 'MAP@10': 0.6865, 'nDCG@10': 0.7454}

=== LAReQA EN→RU | FT_QUERY | prompt:mfoll_en ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7315, 'Recall@1': 0.7315, 'MRR@1': 0.7315, 'MAP@1': 0.7315, 'nDCG@1': 0.7315, 'Recall@5': 0.9234, 'MRR@5': 0.807, 'MAP@5': 0.6996, 'nDCG@5': 0.7493, 'Recall@10': 0.9554, 'MRR@10': 0.8115, 'MAP@10': 0.703, 'nDCG@10': 0.7602}

--- Paraphrase robustness (5x) :: EN→RU :: FT_QUERY ---

=== LAReQA EN→RU | FT_QUERY | para1 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7189, 'Recall@1': 0.7189, 'MRR@1': 0.7189, 'MAP@1': 0.7189, 'nDCG@1': 0.7189, 'Recall@5': 0.9167, 'MRR@5': 0.7978, 'MAP@5': 0.6947, 'nDCG@5': 0.7443, 'Recall@10': 0.9512, 'MRR@10': 0.8025, 'MAP@10': 0.6972, 'nDCG@10': 0.7541}

=== LAReQA EN→RU | FT_QUERY | para2 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6961, 'Recall@1': 0.6961, 'MRR@1': 0.6961, 'MAP@1': 0.6961, 'nDCG@1': 0.6961, 'Recall@5': 0.915, 'MRR@5': 0.7828, 'MAP@5': 0.6784, 'nDCG@5': 0.7305, 'Recall@10': 0.9503, 'MRR@10': 0.7877, 'MAP@10': 0.6816, 'nDCG@10': 0.7416}

=== LAReQA EN→RU | FT_QUERY | para3 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6936, 'Recall@1': 0.6936, 'MRR@1': 0.6936, 'MAP@1': 0.6936, 'nDCG@1': 0.6936, 'Recall@5': 0.9024, 'MRR@5': 0.7754, 'MAP@5': 0.6726, 'nDCG@5': 0.7228, 'Recall@10': 0.9495, 'MRR@10': 0.7819, 'MAP@10': 0.6771, 'nDCG@10': 0.7374}

=== LAReQA EN→RU | FT_QUERY | para4 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.6827, 'Recall@1': 0.6827, 'MRR@1': 0.6827, 'MAP@1': 0.6827, 'nDCG@1': 0.6827, 'Recall@5': 0.9099, 'MRR@5': 0.7731, 'MAP@5': 0.6705, 'nDCG@5': 0.7236, 'Recall@10': 0.9478, 'MRR@10': 0.7785, 'MAP@10': 0.6735, 'nDCG@10': 0.7348}

=== LAReQA EN→RU | FT_QUERY | para5 ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7256, 'Recall@1': 0.7256, 'MRR@1': 0.7256, 'MAP@1': 0.7256, 'nDCG@1': 0.7256, 'Recall@5': 0.9242, 'MRR@5': 0.805, 'MAP@5': 0.7011, 'nDCG@5': 0.7509, 'Recall@10': 0.9554, 'MRR@10': 0.8093, 'MAP@10': 0.7035, 'nDCG@10': 0.7604}
Paraphrase mean/std: {'Hit@1_mean': 0.7034, 'Hit@1_std': 0.0162, 'Recall@1_mean': 0.7034, 'Recall@1_std': 0.0162, 'MRR@1_mean': 0.7034, 'MRR@1_std': 0.0162, 'MAP@1_mean': 0.7034, 'MAP@1_std': 0.0162, 'nDCG@1_mean': 0.7034, 'nDCG@1_std': 0.0162, 'Recall@5_mean': 0.9136, 'Recall@5_std': 0.0073, 'MRR@5_mean': 0.7868, 'MRR@5_std': 0.0125, 'MAP@5_mean': 0.6835, 'MAP@5_std': 0.0122, 'nDCG@5_mean': 0.7344, 'nDCG@5_std': 0.0113, 'Recall@10_mean': 0.9508, 'Recall@10_std': 0.0025, 'MRR@10_mean': 0.792, 'MRR@10_std': 0.0119, 'MAP@10_mean': 0.6866, 'MAP@10_std': 0.0117, 'nDCG@10_mean': 0.7456, 'nDCG@10_std': 0.0099}

--- Noise stress :: EN→RU :: FT_QUERY ---

=== LAReQA EN→RU | FT_QUERY | noise:clean ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7222, 'Recall@1': 0.7222, 'MRR@1': 0.7222, 'MAP@1': 0.7222, 'nDCG@1': 0.7222, 'Recall@5': 0.9285, 'MRR@5': 0.8044, 'MAP@5': 0.7005, 'nDCG@5': 0.7511, 'Recall@10': 0.9588, 'MRR@10': 0.8086, 'MAP@10': 0.7027, 'nDCG@10': 0.7602}

=== LAReQA EN→RU | FT_QUERY | noise:nopunct ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7298, 'Recall@1': 0.7298, 'MRR@1': 0.7298, 'MAP@1': 0.7298, 'nDCG@1': 0.7298, 'Recall@5': 0.9209, 'MRR@5': 0.8067, 'MAP@5': 0.7013, 'nDCG@5': 0.7504, 'Recall@10': 0.9579, 'MRR@10': 0.8119, 'MAP@10': 0.7042, 'nDCG@10': 0.7614}
  drop vs clean (nopunct): {'ΔMRR@1': 0.007576, 'ΔMRR@5': 0.002287, 'ΔMRR@10': 0.003349}

=== LAReQA EN→RU | FT_QUERY | noise:swap ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7189, 'Recall@1': 0.7189, 'MRR@1': 0.7189, 'MAP@1': 0.7189, 'nDCG@1': 0.7189, 'Recall@5': 0.9158, 'MRR@5': 0.7966, 'MAP@5': 0.6922, 'nDCG@5': 0.742, 'Recall@10': 0.9537, 'MRR@10': 0.8018, 'MAP@10': 0.6956, 'nDCG@10': 0.7537}
  drop vs clean (swap): {'ΔMRR@1': -0.003367, 'ΔMRR@5': -0.007814, 'ΔMRR@10': -0.006763}

=== LAReQA EN→RU | FT_QUERY | noise:lower ===
[cache] docs torch.Size([1219, 768]) loaded


                                                            

{'N': 1188, 'Hit@1': 0.7298, 'Recall@1': 0.7298, 'MRR@1': 0.7298, 'MAP@1': 0.7298, 'nDCG@1': 0.7298, 'Recall@5': 0.92, 'MRR@5': 0.8045, 'MAP@5': 0.7012, 'nDCG@5': 0.7496, 'Recall@10': 0.9537, 'MRR@10': 0.8094, 'MAP@10': 0.7048, 'nDCG@10': 0.7611}
  drop vs clean (lower): {'ΔMRR@1': 0.007576, 'ΔMRR@5': 9.8e-05, 'ΔMRR@10': 0.000762}

