In [1]:
import pandas as pd

# Source dataset and the destination file for the extracted slice.
SRC = "/kaggle/input/tevatron-msmarco-translated-ru/tevatron_msmarco_ru.parquet"
DST = "/kaggle/working/tevatron_msmarco_ru_rows_855_1355.parquet"

# Positional (0-based) bounds of the slice; the end bound is inclusive.
start_pos = 855
end_pos_inclusive = 1355

df = pd.read_parquet(SRC, engine="pyarrow")
n = df.shape[0]

# Clamp the requested window to the rows that actually exist.
start = start_pos if start_pos > 0 else 0
end_exclusive = end_pos_inclusive + 1
if end_exclusive > n:
    end_exclusive = n

# Copy so the slice is independent of the full frame, then persist it
# without the original index.
df_slice = df.iloc[start:end_exclusive].copy()
df_slice.to_parquet(DST, index=False, engine="pyarrow")

print(f"Total rows: {n}")
print(f"Slice shape: {df_slice.shape}")
print(f"First/last positions in slice (by original order): {start} {end_exclusive - 1}")


Total rows: 3600
Slice shape: (501, 6)
First/last positions in slice (by original order): 855 1355


In [2]:
!pip -q install --upgrade transformers accelerate bitsandbytes safetensors sentencepiece "fsspec<=2025.3.0"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.8/485.8 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K  

In [3]:
import os, re, gc, json, time, random, hashlib, math
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
)

# Input slice to process and the files this session appends to.
INPUT_PATH = "/kaggle/working/tevatron_msmarco_ru_rows_855_1355.parquet"
OUTPUT_JSONL = "/kaggle/working/out_instructions_855_1355.jsonl"
DEBUG_LOG    = "/kaggle/working/debug_bad_generations_855_1355.txt"

# Upper bound on the number of rows written during one run of the loop.
SESSION_LIMIT = 500

# Seed every RNG in play. Generation samples with torch (do_sample=True),
# so seeding only Python's `random` would leave the outputs unseeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hugging Face model id used for both the tokenizer and the causal LM.
MODEL_NAME = "RefalMachine/RuadaptQwen2.5-7B-Lite-Beta"
# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# bfloat16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Sampling parameters forwarded to the text-generation pipeline call.
MAX_NEW_TOKENS = 180
TEMPERATURE    = 0.8
TOP_P          = 0.9

# Cap on how many hard negatives are placed into one prompt (and into one
# debug-log entry).
MAX_NEGS_PER_PROMPT = 5

# (style_name, length description) pairs. One prompt is built per entry; the
# description is substituted into USER_TEMPLATE as {length_format}.
# The Russian texts ask for: 1-2 strict sentences / 2-4 simple sentences for
# high-school students / 4-6 sentences with context and constraints, each
# without revealing the answer.
LENGTH_PROFILES = [
    ("short_strict",    "1–2 предложения, строгий тон, без подсказки ответа"),
    ("persona",         "2–4 простых предложения для старшеклассников, но без ответа"),
    ("background_long", "4–6 предложений с контекстом и ограничениями, но без ответа"),
]

# System message (Russian): the model acts as an instruction generator for
# dense retrieval. Given a query, one relevant document [1] and non-relevant
# documents [2..N], it must write an extra instruction that keeps [1] relevant
# and makes the others non-relevant, without quoting documents or revealing
# the answer, and must return a JSON object only.
SYSTEM_MSG = (
    "Ты — генератор инструкций для задачи dense retrieval. "
    "Тебе дан запрос, один РЕЛЕВАНТНЫЙ документ [1] и несколько НЕРЕЛЕВАНТНЫХ документов [2..N]. "
    "Сгенерируй ДОПОЛНИТЕЛЬНУЮ ИНСТРУКЦИЮ (на русском), которую можно приписать к концу запроса так, "
    "чтобы документ [1] оставался релевантным, а ВСЕ остальные документы [2..N] стали нерелевантны. "
    "Не цитируй документы и не раскрывай ответ; формулируй критерии релевантности. "
    "Верни ТОЛЬКО JSON-объект без дополнительного текста."
)

# User message template filled with str.format(). Placeholders:
#   {nneg}          number of non-relevant documents
#   {query}         the query text
#   {pos}           the relevant document, shown as [1]
#   {neg_block}     the numbered non-relevant documents
#   {max_id}        highest document number appearing in the prompt
#   {length_format} length/style description from LENGTH_PROFILES
# The requested JSON keys are "instruction", "relevant_docs" and
# "non-relevant_docs".
USER_TEMPLATE = (
    "## Input Data\n"
    "У меня есть следующий запрос и 1 документ, помеченный как релевантный, и {nneg} — как нерелевантные.\n"
    "Query: {query}\n\n"
    "Relevant document [1]:\n{pos}\n\n"
    "Non-relevant documents:\n{neg_block}\n\n"
    "## Your task\n"
    "Нужно придумать инструкцию, чтобы релевантным остался ТОЛЬКО документ [1], "
    "а все остальные [2..{max_id}] стали нерелевантны. НЕ подсказывай ответ и НЕ цитируй документы. "
    "Инструкция должна быть длины: {length_format}.\n"
    "Опиши конкретные критерии релевантности, чтобы [1] подходил, а [2..{max_id}] — нет. "
    "Верни JSON ТОЛЬКО с ключами:\n"
    '  "instruction" : str,\n'
    '  "relevant_docs" : "[1]",\n'
    '  "non-relevant_docs" : "[2,3,...]".\n'
    "## Your output (JSON only):"
)

2025-09-04 16:55:52.238216: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757004952.583156      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757004952.683211      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def load_rows(path: str) -> List[Dict[str, Any]]:
    """Load dataset rows from a JSONL or Parquet file.

    Args:
        path: File to read. A path ending in ``.jsonl`` is parsed as one JSON
            value per line; any other path is passed to ``pd.read_parquet``.

    Returns:
        A list with one entry per row (for Parquet, the result of
        ``DataFrame.to_dict("records")``).

    Raises:
        json.JSONDecodeError: If a non-blank JSONL line is not valid JSON.
    """
    if path.endswith(".jsonl"):
        # The context manager closes the handle deterministically; blank
        # lines (e.g. a trailing newline at EOF) are skipped, not parsed.
        with open(path, "r", encoding="utf-8") as fh:
            return [json.loads(line) for line in fh if line.strip()]
    df = pd.read_parquet(path)
    return df.to_dict("records")

def normalize_hnegs(val) -> List[str]:
    """Coerce a raw hard-negatives field into a de-duplicated list of strings.

    Accepted inputs:
      * list / tuple / numpy array: each element is stringified;
      * non-blank string: parsed as JSON; a JSON list supplies the elements,
        anything else (or invalid JSON) makes the whole string one element;
      * everything else (``None``, blank string, other types): no elements.

    Elements that are ``None`` or empty after stripping are dropped, and
    duplicates are removed while preserving first-seen order.

    Args:
        val: Raw cell value of the hard-negatives column.

    Returns:
        Stripped, unique, non-empty strings in original order.
    """
    # Rows built with DataFrame.to_dict("records") may carry list-like cells
    # as numpy arrays rather than Python lists (NOTE(review): confirm the
    # stored type of the hard-negatives column). Without this conversion
    # such cells would be treated as "no negatives".
    if isinstance(val, np.ndarray):
        val = val.tolist()

    if isinstance(val, (list, tuple)):
        candidates = list(val)
    elif isinstance(val, str) and val.strip():
        try:
            decoded = json.loads(val)
        except Exception:
            candidates = [val]
        else:
            candidates = decoded if isinstance(decoded, list) else [val]
    else:
        candidates = []

    # Skip None so it does not turn into the literal text "None".
    cleaned = (str(item).strip() for item in candidates if item is not None)
    # dict.fromkeys keeps insertion order, giving an order-preserving dedup.
    return list(dict.fromkeys(text for text in cleaned if text))

def stable_row_id(q: str, p: str) -> str:
    """Return the MD5 hex digest of the query and positive text joined by a newline.

    The digest depends only on the two texts, so the same (query, positive)
    pair always maps to the same id.
    """
    payload = "\n".join((q, p)).encode("utf-8")
    return hashlib.md5(payload).hexdigest()

def load_done_ids(out_path: str) -> set:
    """Collect the ``_row_id`` values already present in an output JSONL file.

    Args:
        out_path: Path of the JSONL output; it may not exist yet.

    Returns:
        The set of truthy ``_row_id`` values found. Lines that cannot be
        processed (invalid JSON, non-object values, ...) are ignored.
    """
    finished = set()
    if not os.path.exists(out_path):
        return finished

    with open(out_path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            try:
                row_id = json.loads(raw_line).get("_row_id")
                if row_id:
                    finished.add(row_id)
            except Exception:
                # Best-effort resume: a damaged line must not block the run.
                continue
    return finished

def to_py(o):
    """Recursively convert numpy containers/scalars into plain Python objects.

    Conversions:
      * dict                -> dict with converted values (keys untouched)
      * list / tuple / set  -> list with converted elements
      * np.ndarray          -> (nested) list with converted elements
      * np.integer / np.floating / np.bool_ -> int / float / bool
      * anything else       -> returned unchanged

    Args:
        o: Arbitrary object, typically a row dict about to be JSON-encoded.

    Returns:
        The converted object.
    """
    if isinstance(o, dict):
        return {k: to_py(v) for k, v in o.items()}
    if isinstance(o, (list, tuple, set)):
        return [to_py(v) for v in o]
    if isinstance(o, np.ndarray):
        # tolist() fully converts numeric arrays, but an object-dtype array
        # keeps its elements as they are (e.g. nested arrays), so recurse
        # over the result instead of returning it directly.
        return to_py(o.tolist())
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, (np.bool_,)):
        return bool(o)
    return o

def append_row(out_path: str, row: Dict[str, Any]):
    """Append one row to a JSONL file as a single JSON line.

    The row is passed through ``to_py`` first and encoded with
    ``ensure_ascii=False``, so non-ASCII text is written as-is in UTF-8.
    """
    with open(out_path, "a", encoding="utf-8") as sink:
        payload = to_py(row)
        encoded = json.dumps(payload, ensure_ascii=False)
        sink.write(encoded + "\n")

def log_bad(query: str, pos: str, negs: List[str], raw: str, reason: str):
    """Append a rejected generation to the debug log at ``DEBUG_LOG``.

    Each entry holds a separator line, the truncated query (300 chars),
    positive document (600), up to ``MAX_NEGS_PER_PROMPT`` negatives
    (400 each, numbered from 2), the raw model output (1200) and the reason.

    Args:
        query: Query text; falsy values are logged as empty.
        pos: Relevant document text; falsy values are logged as empty.
        negs: Non-relevant documents shown in the prompt; may be empty.
        raw: Raw generated text; falsy values are logged as empty.
        reason: Short explanation of why the generation was rejected.
    """
    # dirname is "" for a bare filename, and os.makedirs("") raises, so only
    # create the directory when the path actually has one.
    log_dir = os.path.dirname(DEBUG_LOG)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    entry = [
        "=" * 80 + "\n",
        "QUERY: " + (query[:300] if query else "") + "\n",
        "POS   : " + (pos[:600] if pos else "") + "\n",
    ]
    if negs:
        # Numbering starts at 2 because [1] is the relevant document.
        for j, t in enumerate(negs[:MAX_NEGS_PER_PROMPT], start=2):
            entry.append(f"NEG[{j}]: {t[:400]}\n")
    # Guard against a missing raw output the same way query/pos are guarded.
    entry.append("RAW   : " + (raw[:1200] if raw else "") + "\n")
    entry.append("REASON: " + reason + "\n")

    with open(DEBUG_LOG, "a", encoding="utf-8") as f:
        f.write("".join(entry))

def is_russian_text(s: str, threshold: float = 0.4) -> bool:
    """Tell whether the share of Russian letters in ``s`` reaches ``threshold``.

    A character counts when its lowercase form lies in 'а'..'я' or is 'ё'.
    The share is taken over all characters of ``s`` (spaces, digits and
    punctuation included). Empty or falsy input yields ``False``.
    """
    if not s:
        return False

    cyrillic_count = 0
    for ch in s:
        lowered = ch.lower()
        if lowered == 'ё' or 'а' <= lowered <= 'я':
            cyrillic_count += 1

    share = cyrillic_count / max(1, len(s))
    return share >= threshold

def basic_filter(inst_text: str) -> bool:
    """Accept an instruction only if it looks Russian and is not too long.

    Returns ``False`` when ``is_russian_text(inst_text, 0.4)`` fails or when
    the text has more than 800 non-whitespace characters; ``True`` otherwise.
    """
    looks_russian = is_russian_text(inst_text, 0.4)
    if not looks_russian:
        return False

    # Length is measured with all whitespace removed.
    compact = re.sub(r"\s+", "", inst_text)
    return len(compact) <= 800

def parse_promptriever_json(text: str) -> Dict[str, Any] | None:

    m = re.search(r"\{[\s\S]*\}", text)
    if not m:
        return None
    try:
        obj = json.loads(m.group(0))
        if not isinstance(obj, dict):
            return None
    except Exception:
        return None

    nr_key = None
    for k in obj.keys():
        if k.lower().replace("_", "-") == "non-relevant-docs" or k.lower().replace("_", "-") == "non-relevant_docs":
            nr_key = k
            break
    if nr_key is None and "non-relevant_docs" in obj:
        nr_key = "non-relevant_docs"

    out = {
        "instruction": obj.get("instruction", "").strip(),
        "relevant_docs": obj.get("relevant_docs", "").strip(),
        "non-relevant_docs": obj.get(nr_key, "").strip() if nr_key else "",
    }
    return out

def make_neg_block(negs: List[str]) -> str:
    """Render the negatives as newline-separated ``[id] text`` lines.

    Ids start at 2 because id 1 is used for the relevant document. An empty
    list produces an empty string.
    """
    return "\n".join(
        f"[{doc_id}] {text}" for doc_id, text in enumerate(negs, start=2)
    )

In [5]:
# Fail fast: the rest of the notebook requires a CUDA device.
assert torch.cuda.is_available(), "Enable GPU (Runtime > Accelerator > GPU)."

print("Loading tokenizer/model:", MODEL_NAME)
tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Pad on the left; the generation loop passes several prompts per call.
tok.padding_side = "left"
# Fall back to EOS as the padding token when the tokenizer defines none.
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token
print("padding_side:", tok.padding_side, "pad_token_id:", tok.pad_token_id, "eos_token_id:", tok.eos_token_id)

# Load the model with the 4-bit quantization config `bnb_cfg`.
# NOTE(review): the recorded output of this cell shows transformers warning
# that `torch_dtype` is deprecated in favour of `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
)
# Keep the model config's pad id in sync with the tokenizer's.
model.config.pad_token_id = tok.pad_token_id

# Text-generation pipeline; return_full_text=False is set so that only the
# newly generated text is returned rather than prompt + completion.
generate = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    return_full_text=False,
)

# EOS id passed to every generate call: the tokenizer's, else the model config's.
eos_id = tok.eos_token_id if tok.eos_token_id is not None else model.config.eos_token_id
print("Model loaded. EOS id:", eos_id)

Loading tokenizer/model: RefalMachine/RuadaptQwen2.5-7B-Lite-Beta


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

padding_side: left pad_token_id: 145109 eos_token_id: 145111


config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded. EOS id: 145111


In [6]:
# Load the input rows and report how many there are.
rows = load_rows(INPUT_PATH)
print("Total rows in dataset:", len(rows))

# Ids already present in the output file; such rows are skipped, which lets
# an interrupted session resume.
done_ids = load_done_ids(OUTPUT_JSONL)
print("Already done:", len(done_ids))

# Number of rows written during this session (bounded by SESSION_LIMIT).
processed_this_session = 0

# Exponential moving average of seconds per example, used for the ETA display.
ema_sec_per_ex = None
# Session start timestamp (NOTE(review): not read again in the visible cells).
t0 = time.time()

# The bar's total is the session cap or the dataset size, whichever is smaller.
pbar = tqdm(total=min(SESSION_LIMIT, len(rows)), desc="Generating instructions (session)")

# Sampling arguments shared by the first attempt and the out-of-memory retry.
gen_kwargs = dict(
    do_sample=True,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    max_new_tokens=MAX_NEW_TOKENS,
    eos_token_id=eos_id,
)

# Main loop: for every not-yet-processed row build one prompt per length
# profile, generate, validate each answer, and append the row together with
# its accepted instructions to OUTPUT_JSONL.
for r in rows:
    if processed_this_session >= SESSION_LIMIT:
        break

    # Only real strings are usable; missing values (None, NaN, ...) are
    # treated as empty so the row is skipped instead of raising on .strip().
    q_raw = r.get("query_ru")
    p_raw = r.get("positive_ru")
    q = q_raw.strip() if isinstance(q_raw, str) else ""
    p = p_raw.strip() if isinstance(p_raw, str) else ""
    if not q or not p:
        continue

    # Skip rows already written by this or a previous session.
    rid = stable_row_id(q, p)
    if rid in done_ids:
        continue

    raw_negs = normalize_hnegs(r.get("hard_negs_ru", []))
    negs = raw_negs[:MAX_NEGS_PER_PROMPT]
    if not negs:
        # Placeholder negative so the prompt always has a document [2].
        negs = ["(пусто) Нерелевантный фрагмент отсутствует — добавлен заполнитель."]

    neg_block = make_neg_block(negs)
    max_id = 1 + len(negs)

    # One chat-formatted prompt per length profile.
    sub_prompts = []
    meta_profiles = []
    for style_name, length_fmt in LENGTH_PROFILES:
        user = USER_TEMPLATE.format(
            query=q,
            pos=p,
            nneg=len(negs),
            neg_block=neg_block,
            max_id=max_id,
            length_format=length_fmt
        )
        messages = [
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": user},
        ]
        prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        sub_prompts.append(prompt)
        meta_profiles.append((style_name, length_fmt))

    t_start = time.time()
    try:
        outs = generate(sub_prompts, **gen_kwargs)
    except torch.cuda.OutOfMemoryError:
        # Free cached blocks, wait briefly, and retry once with the same
        # arguments; a second OOM propagates.
        torch.cuda.empty_cache()
        time.sleep(2)
        outs = generate(sub_prompts, **gen_kwargs)
    t_end = time.time()

    kept_objs = []
    for out_item, (style_name, length_fmt) in zip(outs, meta_profiles):
        # The pipeline may return a list of candidates per prompt or a single dict.
        gen_text = out_item[0]["generated_text"] if isinstance(out_item, list) else out_item["generated_text"]

        parsed = parse_promptriever_json(gen_text)
        if parsed is None:
            log_bad(q, p, negs, gen_text, "No JSON object found")
            continue

        instr = parsed.get("instruction", "").strip()

        rel_docs = parsed.get("relevant_docs", "").strip()
        nonrel_docs = parsed.get("non-relevant_docs", "").strip()
        # Fill in the expected defaults when the model omitted the id lists.
        if not rel_docs:
            rel_docs = "[1]"
        if not nonrel_docs:
            nonrel_docs = "[" + ",".join(str(i) for i in range(2, max_id+1)) + "]"

        # Compare whole numbers: testing for the character "1" in the
        # concatenated digits would also accept "[21]" or "[2, 10]".
        ok_rel = any(int(num) == 1 for num in re.findall(r"\d+", rel_docs))
        if not ok_rel:
            log_bad(q, p, negs, gen_text, "relevant_docs does not contain [1]")
            continue
        if not basic_filter(instr):
            log_bad(q, p, negs, gen_text, "basic_filter failed (lang or length)")
            continue

        kept_objs.append({
            "style": style_name,
            "length_format": length_fmt,
            "instruction": instr,
            "relevant_docs": rel_docs,
            "non-relevant_docs": nonrel_docs,
        })

    # The row is written (and marked done) even when no instruction survived
    # the filters; `instructions` is then an empty list.
    out_obj = dict(r)
    out_obj["_row_id"] = rid
    out_obj["instructions"] = kept_objs
    append_row(OUTPUT_JSONL, out_obj)
    done_ids.add(rid)
    processed_this_session += 1

    # Update the moving average of generation time and show an ETA for the
    # remainder of the session.
    dt = t_end - t_start
    if dt <= 0:
        dt = 1e-6
    if ema_sec_per_ex is None:
        ema_sec_per_ex = dt
    else:
        ema_sec_per_ex = 0.9 * ema_sec_per_ex + 0.1 * dt
    remain = SESSION_LIMIT - processed_this_session
    eta_sec = max(0, int(ema_sec_per_ex * remain))
    pbar.set_postfix_str(f"EMA s/ex={ema_sec_per_ex:.2f}, ETA≈{eta_sec//3600:d}:{(eta_sec%3600)//60:02d}:{eta_sec%60:02d}")
    pbar.update(1)

pbar.close()
print(f"Session finished. Added {processed_this_session} rows to {OUTPUT_JSONL}")
print("Bad generations log (if any):", DEBUG_LOG)

Total rows in dataset: 501
Already done: 0


Generating instructions (session):   2%|▏         | 10/500 [05:39<4:40:40, 34.37s/it, EMA s/ex=33.96, ETA≈4:37:18]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating instructions (session): 100%|██████████| 500/500 [4:27:16<00:00, 32.07s/it, EMA s/ex=32.84, ETA≈0:00:00]  

Session finished. Added 500 rows to /kaggle/working/out_instructions_855_1355.jsonl
Bad generations log (if any): /kaggle/working/debug_bad_generations_855_1355.txt



