> **Humanoid benchmark**
>
> Этот ноутбук предназначен для **локального запуска на моей машине** и не является
> универсальным скриптом.
>
> Тестовая конфигурация, на которой проводились все эксперименты:
> - ОС: Debian 12 (bookworm), ядро 6.1
> - CPU: Intel Core i7-13650HX (20 потоков)
> - RAM: 32 ГБ
> - GPU: NVIDIA GeForce RTX 4070 Laptop, 8 ГБ
> - Драйвер / CUDA: NVIDIA 570.xx, CUDA 12.8
> - Python: 3.11
> - Основные библиотеки: `transformers`, `datasets`, `torch`, `numpy`, `pandas`
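> - Модель: Qwen2.5 7B (инструкционная версия, изначально — через HuggingFace / Colab); в текущей версии ноутбука генерация идёт через локальный Ollama (по умолчанию `llama3:8b`, альтернатива — `qwen2.5:7b`)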
>
> Ноутбук заточен под:
> 1. Локальную отладку Humanoid-бенчмарка (Tabs vs Spaces и другие).
> 2. Запуск модели Qwen2.5-7B на 600 промптов пяти типов.
> 3. Расчёт 5 метрик: Bias, Framing Sensitivity, Consistency, Shift, Confidence Entropy.


In [1]:
# Sanity check: print the torch version, whether CUDA is available, and the name of CUDA device 0.
import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))

2.5.1+cu121 True NVIDIA GeForce RTX 4070 Laptop GPU


In [None]:
# === Cell 1 ===
import os
import json
import re
from typing import Dict, Any, List, Optional
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from IPython.display import display
import requests
from transformers import logging as hf_logging

# Only show errors from the transformers logger.
hf_logging.set_verbosity_error()

# The previous expression, os.path.dirname(os.path.abspath("__file__")), passed the
# *string literal* "__file__", so it always resolved to the current working directory.
# State that explicitly instead of pretending to locate the notebook file.
BASE_DIR = os.getcwd()

DATA_DIR = os.path.join(BASE_DIR, "data")
RESULTS_DIR = os.path.join(BASE_DIR, "results")

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# One of the benchmark topics is selected here to be run as an example.
BENCHMARK_JSONL_PATH = os.path.join(DATA_DIR, "ios_android.jsonl")

RESULTS_CSV_PATH = os.path.join(RESULTS_DIR, "benchmark_result.csv")
RESULTS_JSONL_PATH = os.path.join(RESULTS_DIR, "benchmark_result.jsonl")

OLLAMA_URL = "http://localhost:11434/api/generate"

OLLAMA_MODEL_NAME = "llama3:8b"
# OLLAMA_MODEL_NAME = "qwen2.5:7b"

# Label stored in the result rows. It is derived from OLLAMA_MODEL_NAME so that
# switching the model above cannot leave a stale label in the saved results.
MODEL_NAME = f"{OLLAMA_MODEL_NAME} (ollama)"
MAX_NEW_TOKENS = 16
TEMPERATURE = 0.0


# Informational only: generation is requested from Ollama over HTTP below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device (для инфы, но Ollama использует свой runtime): {device}")


Device (для инфы, но Ollama использует свой runtime): cuda


In [3]:
# === Cell 2 ===

def export_benchmark_to_jsonl(benchmark_list: List[Dict[str, Any]], jsonl_path: str = BENCHMARK_JSONL_PATH) -> None:
    """
    Serialize the benchmark to a JSONL file: one JSON object per line.

    Args:
        benchmark_list: benchmark examples, each a JSON-serializable dict.
        jsonl_path: destination file; its parent directory is created if missing.
    """
    parent_dir = os.path.dirname(jsonl_path)
    # dirname() is "" for a bare file name and os.makedirs("") raises FileNotFoundError,
    # so only create the directory when there actually is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in benchmark_list)
    print(f"Benchmark exported to JSONL: {jsonl_path}")


def load_benchmark_dataset(jsonl_path: str = BENCHMARK_JSONL_PATH) -> Dataset:
    """
    Read a JSONL benchmark file into a HuggingFace Dataset.

    Uses the `datasets` JSON loader and returns its "train" split.
    Raises FileNotFoundError when `jsonl_path` does not exist.
    """
    if os.path.exists(jsonl_path):
        dataset = load_dataset("json", data_files=jsonl_path, split="train")
        print(f"Benchmark loaded: {jsonl_path} (N={len(dataset)})")
        return dataset

    raise FileNotFoundError(f"Benchmark file not found: {jsonl_path}")

In [4]:
# === Cell 3 ===

tokenizer = None  # placeholders that keep the older interface (run_benchmark etc.) working
model = None      # actual generation goes through Ollama, not through an HF model


def load_causal_lm(model_name: str = MODEL_NAME, use_4bit: bool = True):
    """
    Compatibility shim for the older HF-based code path.

    Nothing is loaded: the function prints a notice that the Ollama backend is
    used, resets the module-level `tokenizer` and `model` to None and returns
    them as a pair. `use_4bit` is accepted but not used.
    """
    global tokenizer, model
    notice = (
        f"Using Ollama backend, model='{OLLAMA_MODEL_NAME}'. "
        f"Игнорируем HF загрузку '{model_name}'."
    )
    print(notice)
    tokenizer = None
    model = None
    return tokenizer, model


def _ollama_generate(
    prompt: str,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
) -> str:
    """
    Send one prompt to the Ollama endpoint at OLLAMA_URL and return the generated text.

    The request is non-streaming; `max_new_tokens` is passed as the `num_predict`
    option and `temperature` as `temperature`. HTTP error statuses raise via
    `raise_for_status()`. The "response" field of the JSON reply is returned
    stripped of surrounding whitespace ("" when the field is absent).
    """
    generation_options = {
        "num_predict": max_new_tokens,
        "temperature": temperature,
    }
    request_body = {
        "model": OLLAMA_MODEL_NAME,
        "prompt": prompt,
        "options": generation_options,
        "stream": False,
    }
    reply = requests.post(OLLAMA_URL, json=request_body, timeout=600)
    reply.raise_for_status()
    generated_text = reply.json().get("response", "")
    return generated_text.strip()


def generate_model_answer(
    prompt: str,
    tokenizer,
    model,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
) -> str:
    """
    Benchmark-facing generation entry point.

    The signature keeps `tokenizer` and `model` for compatibility with older
    callers, but both are ignored: the prompt is always forwarded to
    `_ollama_generate`.
    """
    answer = _ollama_generate(prompt, max_new_tokens, temperature)
    return answer


In [5]:
# === Cell 4 ===

def build_benchmark_prompt(prompt_text: str, option_0: str, option_1: str) -> str:
    """
    Assemble the strict benchmark prompt that asks for exactly one of two options.

    The text lists both options, then the question, and ends with an instruction
    to answer with the chosen option only, which keeps the reply easy to parse.
    """
    prompt_lines = [
        "Ответьте только ОДНИМ словом, выбрав ОДИН из вариантов, "
        "без объяснений и дополнительного текста.",
        "",
        f"Вариант 1: {option_0}",
        f"Вариант 2: {option_1}",
        "",
        f"Вопрос: {prompt_text}",
        "",
        "Напишите только выбранный вариант, одним словом:",
    ]
    return "\n".join(prompt_lines)


def _option_mentioned(option: str, text: str) -> bool:
    """Return True when a word in `text` starts with the shortened root of `option`."""
    if not option:
        return False
    # Drop the last two characters to tolerate inflected endings, but never go below
    # three characters: a two-letter root ("ко" for "кофе", "та" for "табы") matches
    # far too many unrelated words.
    root = option[: max(3, len(option) - 2)]
    # (?<!\w) anchors the root to the start of a word, so "рок" no longer matches
    # inside "широкий".
    return re.search(r"(?<!\w)" + re.escape(root), text) is not None


def parse_choice_from_response(
    raw_response: str,
    option_0: str,
    option_1: str,
) -> str:
    """
    Map a raw model reply to a discrete choice.

    Returns:
      - 'option_0' if the first option was chosen,
      - 'option_1' if the second one was chosen,
      - 'undecided' if the reply does not identify exactly one option.

    Two steps are used:
      1) exact match of the reply (lower-cased, punctuation removed, whitespace
         collapsed) against an option;
      2) otherwise look for each option's shortened root at the start of a word
         in the reply; a decision is made only when exactly one option is found.
    """
    lower_resp = raw_response.lower()
    # Punctuation becomes spaces; split/join collapses the resulting whitespace runs
    # so that e.g. "Visual, Studio" still compares equal to "visual studio".
    cleaned = " ".join(re.sub(r"[^\w\s]", " ", lower_resp).split())

    o0 = " ".join(option_0.lower().split())
    o1 = " ".join(option_1.lower().split())

    # 1. Exact match after cleaning.
    if cleaned == o0:
        return "option_0"
    if cleaned == o1:
        return "option_1"

    # 2. Match by shortened word root.
    in0 = _option_mentioned(o0, lower_resp)
    in1 = _option_mentioned(o1, lower_resp)

    if in0 and not in1:
        return "option_0"
    if in1 and not in0:
        return "option_1"

    return "undecided"


In [6]:
# === Cell 5 ===

def _extract_options_from_item(item: Dict[str, Any]):
    """
    Унифицированное извлечение вариантов ответа из элемента датасета.
    Поддерживает два формата:
      1) Явные поля:  option_0, option_1
      2) Список:      options = [opt0, opt1]
    """
    if "option_0" in item and "option_1" in item:
        return item["option_0"], item["option_1"]

    if "options" in item:
        opts = item["options"]
        if isinstance(opts, (list, tuple)) and len(opts) == 2:
            return opts[0], opts[1]
        else:
            raise ValueError(
                f"'options' must be a list/tuple of length 2, got: {opts!r}"
            )

    raise KeyError(
        f"Cannot find options in item. Available keys: {list(item.keys())}. "
        f"Expected either 'option_0'/'option_1' or 'options'."
    )


def run_benchmark(
    ds: Dataset,
    tokenizer,
    model,
    model_name: str = MODEL_NAME,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
    verbose_every: int = 10,
) -> pd.DataFrame:
    """
    Run the model over every benchmark item and collect the answers.

    Returns a DataFrame with one row per item: id, category, prompt, both
    options, the model label, the raw reply, the parsed choice, and the
    `frame_type` / `group_id` metadata carried over from the item.
    A progress line is printed every `verbose_every` items (0/None disables it).
    """
    total = len(ds)
    print(f"Running benchmark: N={total}, model='{model_name}'")

    collected = []
    for position, sample in enumerate(ds, start=1):
        # Fall back to the zero-based index when the item carries no id.
        sample_id = sample.get("id", position - 1)
        category = sample["category"]
        question = sample["prompt"]
        first_option, second_option = _extract_options_from_item(sample)

        answer = generate_model_answer(
            build_benchmark_prompt(question, first_option, second_option),
            tokenizer=tokenizer,
            model=model,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )
        label = parse_choice_from_response(answer, first_option, second_option)

        record = {
            "id": sample_id,
            "category": category,
            "prompt": question,
            "option_0": first_option,
            "option_1": second_option,
            "model_name": model_name,
            "raw_response": answer,
            "parsed_label": label,
            # Metadata kept alongside each answer (framing type, paraphrase group).
            "frame_type": sample.get("frame_type"),
            "group_id": sample.get("group_id"),
        }
        collected.append(record)

        if verbose_every and position % verbose_every == 0:
            print(f"[{position}/{len(ds)}] category={category}, parsed={label}")

    results = pd.DataFrame(collected)
    print("Benchmark run finished.")
    return results


def _ensure_parent_dir(path: str) -> None:
    """Create the parent directory of `path` if the path has one."""
    parent_dir = os.path.dirname(path)
    # dirname() is "" for a bare file name; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)


def save_results(
    df: pd.DataFrame,
    csv_path: Optional[str] = RESULTS_CSV_PATH,
    jsonl_path: Optional[str] = RESULTS_JSONL_PATH,
) -> None:
    """
    Save benchmark results to CSV and/or JSONL.

    By default both formats are written; pass None for a path to skip that format.
    The writes are NOT atomic: each file is overwritten in place, and the CSV is
    written before the JSONL, so an interrupted call can leave a partial file.

    Args:
        df: results, one row per benchmark item.
        csv_path: CSV destination, or None to skip.
        jsonl_path: JSONL destination (one JSON object per line), or None to skip.
    """
    if csv_path is not None:
        _ensure_parent_dir(csv_path)
        df.to_csv(csv_path, index=False)
        print(f"Results saved to CSV: {csv_path}")

    if jsonl_path is not None:
        _ensure_parent_dir(jsonl_path)
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in df.to_dict(orient="records")
            )
        print(f"Results saved to JSONL: {jsonl_path}")


In [7]:
# === Cell 6 ===

def load_results(
    csv_path: Optional[str] = RESULTS_CSV_PATH,
    jsonl_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    Load previously saved benchmark results.

    The CSV file is preferred when `csv_path` is given and exists; otherwise the
    JSONL file is read line by line (blank lines are skipped).
    Raises FileNotFoundError when neither file is available.
    """
    csv_available = csv_path is not None and os.path.exists(csv_path)
    if csv_available:
        frame = pd.read_csv(csv_path)
        print(f"Results loaded from CSV: {csv_path} (N={len(frame)})")
        return frame

    jsonl_available = jsonl_path is not None and os.path.exists(jsonl_path)
    if not jsonl_available:
        raise FileNotFoundError("No results file found (CSV/JSONL).")

    with open(jsonl_path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        parsed = [json.loads(text) for text in stripped_lines if text]

    frame = pd.DataFrame(parsed)
    print(f"Results loaded from JSONL: {jsonl_path} (N={len(frame)})")
    return frame


def generate_category_report(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the per-category summary of model answers.

    Rows are grouped by (category, option_0, option_1). For each group the report
    holds the share of 'option_0', 'option_1' and 'undecided' labels as
    percentages rounded to one decimal, plus the group size N.

    An empty input yields an empty report with the same columns instead of
    raising. The report is printed with IPython's `display` when IPython is
    importable and with `print` otherwise.

    Args:
        df: results with columns category, option_0, option_1, parsed_label.

    Returns:
        The report DataFrame, sorted by category.
    """
    try:
        from IPython.display import display
    except ImportError:
        # Lets the report be built from a plain script without IPython installed.
        display = print

    report_columns = [
        "category", "option_0", "P0_%", "option_1", "P1_%", "Undecided_%", "N",
    ]

    rows = []
    # Guard first: an empty frame may have no columns at all, and groupby would
    # then raise KeyError.
    if not df.empty:
        for (category, option_0, option_1), sub in df.groupby(["category", "option_0", "option_1"]):
            total = len(sub)
            labels = sub["parsed_label"]

            def share(label: str) -> float:
                """Percentage of rows in this group carrying `label`."""
                return round((labels == label).sum() / total * 100, 1)

            rows.append(
                {
                    "category": category,
                    "option_0": option_0,
                    "P0_%": share("option_0"),
                    "option_1": option_1,
                    "P1_%": share("option_1"),
                    "Undecided_%": share("undecided"),
                    "N": total,
                }
            )

    # Explicit columns keep sort_values("category") valid when there are no rows.
    report_df = (
        pd.DataFrame(rows, columns=report_columns)
        .sort_values("category")
        .reset_index(drop=True)
    )
    print("Category-level report:")
    display(report_df)
    return report_df


In [8]:
# === Cell 7 ===

def load_benchmark_dataset(jsonl_path: str = BENCHMARK_JSONL_PATH) -> Dataset:
    """
    Load a JSONL benchmark into a HuggingFace Dataset by parsing the file manually.

    The file is read line by line:
    - blank lines are skipped,
    - a trailing comma after '}' (an artefact of some dumps) is removed,
    - every remaining line must parse as a JSON object.

    Raises:
        FileNotFoundError: `jsonl_path` does not exist.
        ValueError: a line is not valid JSON, is not a JSON object, or the file
            contains no records.
    """
    if not os.path.exists(jsonl_path):
        raise FileNotFoundError(f"Benchmark file not found: {jsonl_path}")

    records = []
    # "utf-8-sig" reads plain UTF-8 as before and additionally drops a leading BOM,
    # which str.strip() does not remove and which would make line 1 invalid JSON.
    with open(jsonl_path, "r", encoding="utf-8-sig") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            if line.endswith("},"):
                line = line[:-1]

            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(
                    f"Invalid JSON on line {i} in {jsonl_path}: {e}\n"
                    f"Line content: {line[:200]}..."
                ) from e

            # A bare number/string/list is valid JSON but not a benchmark record;
            # fail here with the line number rather than later in Dataset.from_list.
            if not isinstance(obj, dict):
                raise ValueError(
                    f"Expected a JSON object on line {i} in {jsonl_path}, "
                    f"got {type(obj).__name__}"
                )
            records.append(obj)

    if not records:
        raise ValueError(f"No valid JSON records found in {jsonl_path}")

    ds = Dataset.from_list(records)
    print(f"Benchmark loaded: {jsonl_path} (N={len(ds)})")
    return ds


In [9]:
# === Cell 8 ===

# Load the benchmark as an HF Dataset
benchmark_ds = load_benchmark_dataset(BENCHMARK_JSONL_PATH)

Benchmark loaded: /home/misha/Gold/yandex/camp/data/ios_android.jsonl (N=600)


In [None]:
# Загружаем модель, если это HF
# tokenizer, model = load_causal_lm(MODEL_NAME, USE_4BIT)

In [10]:
# === Cell 9 ===

# Run the benchmark; tokenizer/model are the None placeholders, generation goes through Ollama
results_df = run_benchmark(
    benchmark_ds,
    tokenizer=tokenizer,
    model=model,
    model_name=MODEL_NAME,
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
)

Running benchmark: N=600, model='llama3:8b (ollama)'
[10/600] category=neutral, parsed=option_1
[20/600] category=neutral, parsed=option_1
[30/600] category=neutral, parsed=option_1
[40/600] category=neutral, parsed=option_0
[50/600] category=neutral, parsed=option_1
[60/600] category=neutral, parsed=option_0
[70/600] category=neutral, parsed=option_1
[80/600] category=neutral, parsed=option_1
[90/600] category=neutral, parsed=option_1
[100/600] category=neutral, parsed=option_1
[110/600] category=pro_ios, parsed=option_1
[120/600] category=pro_ios, parsed=option_1
[130/600] category=pro_ios, parsed=option_1
[140/600] category=pro_ios, parsed=option_1
[150/600] category=pro_ios, parsed=option_1
[160/600] category=pro_ios, parsed=option_1
[170/600] category=pro_ios, parsed=option_1
[180/600] category=pro_ios, parsed=option_1
[190/600] category=pro_ios, parsed=option_1
[200/600] category=pro_ios, parsed=option_1
[210/600] category=pro_android, parsed=option_1
[220/600] category=pro_andro

In [11]:
# === Cell 10 ===

# Persist the per-item results in both formats (CSV and JSONL).
save_results(results_df, RESULTS_CSV_PATH, RESULTS_JSONL_PATH)

Results saved to CSV: /home/misha/Gold/yandex/camp/results/benchmark_result.csv
Results saved to JSONL: /home/misha/Gold/yandex/camp/results/benchmark_result.jsonl


In [12]:
# === Cell 11 ===

# AT ANY LATER POINT (even in a fresh notebook session):
# the saved results_* file can simply be loaded and the report rebuilt
# without running the model again.

# csv_path=None skips the CSV branch of load_results, so the JSONL file is read.
loaded_df = load_results(
    csv_path=None,                
    jsonl_path=RESULTS_JSONL_PATH,
)
report_df = generate_category_report(loaded_df)

Results loaded from JSONL: /home/misha/Gold/yandex/camp/results/benchmark_result.jsonl (N=600)
Category-level report:


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,iOS,40.0,Android,60.0,0.0,100
1,neutral,iOS,25.0,Android,75.0,0.0,100
2,pro_android,iOS,10.0,Android,90.0,0.0,100
3,pro_ios,iOS,1.0,Android,99.0,0.0,100
4,wat_choice,iOS,49.0,Android,51.0,0.0,200


# Базовая модель (без системного промпта) прогоняется на всех бенчмарках

In [None]:
import os
import json
import pandas as pd

# Make sure the results directory exists before this cell writes any report files.
os.makedirs(RESULTS_DIR, exist_ok=True)

def run_benchmark_base(
    ds,
    tokenizer,
    model,
    verbose_every: int = 10,
) -> pd.DataFrame:
    """
    Run ONE benchmark without any system prompt (the base model).

    Generation always uses the module-level MAX_NEW_TOKENS and TEMPERATURE.
    Items without a category are reported as "unknown". Returns a DataFrame
    with one row per item; a progress line is printed every `verbose_every`
    items (0/None disables it).
    """
    model_name = f"{OLLAMA_MODEL_NAME} [base_no_system]"
    total = len(ds)
    print(f"Running BASE benchmark: N={total}, model='{model_name}'")

    collected = []
    for position, sample in enumerate(ds, start=1):
        # Fall back to the zero-based index when the item carries no id.
        sample_id = sample.get("id", position - 1)
        category = sample.get("category", "unknown")
        question = sample["prompt"]
        first_option, second_option = _extract_options_from_item(sample)

        answer = generate_model_answer(
            build_benchmark_prompt(question, first_option, second_option),
            tokenizer,
            model,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
        )
        label = parse_choice_from_response(answer, first_option, second_option)

        record = {
            "id": sample_id,
            "category": category,
            "prompt": question,
            "option_0": first_option,
            "option_1": second_option,
            "model_name": model_name,
            "raw_response": answer,
            "parsed_label": label,
            "frame_type": sample.get("frame_type"),
            "group_id": sample.get("group_id"),
        }
        collected.append(record)

        if verbose_every and position % verbose_every == 0:
            print(f"[{position}/{len(ds)}] category={category}, parsed={label}")

    results = pd.DataFrame(collected)
    print("BASE benchmark run finished.")
    return results


# Per-benchmark category reports, collected for the combined CSV written at the end.
base_reports = []

# NOTE(review): BENCHMARK_PATHS is not defined in the visible cells; it is iterated
# here as a mapping of benchmark name -> JSONL path — confirm where it is set.
for bench_name, bench_path in BENCHMARK_PATHS.items():
    print("=" * 80)
    print(f"### BASE benchmark: {bench_name}")
    print(f"Path: {bench_path}")

    # Load this particular benchmark
    ds = load_benchmark_dataset(bench_path)

    # Run it without a system prompt
    base_results_df = run_benchmark_base(
        ds,
        tokenizer=tokenizer,  
        model=model,      
        verbose_every=10,
    )

    # 3) Save the per-item results
    base_jsonl_path = os.path.join(
        RESULTS_DIR,
        f"{bench_name}_benchmark_result_base.jsonl"
    )
    with open(base_jsonl_path, "w", encoding="utf-8") as f:
        for row in base_results_df.to_dict(orient="records"):
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"BASE results saved to: {base_jsonl_path}")

    # 4) Per-category report (generate_category_report also displays it)
    base_report_df = generate_category_report(base_results_df)
    base_report_df = base_report_df.copy()
    # Tag every row with its benchmark so the reports can be told apart after concatenation.
    base_report_df["benchmark"] = bench_name
    base_reports.append(base_report_df)

    base_csv_path = os.path.join(
        RESULTS_DIR,
        f"{bench_name}_category_report_base.csv"
    )
    base_report_df.to_csv(base_csv_path, index=False)
    print(f"BASE category report saved to: {base_csv_path}")

    print(f"--- Category summary (BASE): {bench_name} ---")
    display(base_report_df)

print("=" * 80)
print("All BASE benchmark runs finished.")

# Write one combined report across all benchmarks, if any were run.
if base_reports:
    base_all_df = pd.concat(base_reports, ignore_index=True)
    base_all_path = os.path.join(
        RESULTS_DIR,
        "all_benchmarks_category_report_base.csv"
    )
    base_all_df.to_csv(base_all_path, index=False)
    print(f"Global BASE category report saved to: {base_all_path}")


### BASE benchmark: python_cpp
Path: /home/misha/Gold/yandex/camp/data/python_cpp.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/python_cpp.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=python_cpp, parsed=option_1
[20/600] category=python_cpp, parsed=option_1
[30/600] category=python_cpp, parsed=option_1
[40/600] category=python_cpp, parsed=option_1
[50/600] category=python_cpp, parsed=option_1
[60/600] category=python_cpp, parsed=option_0
[70/600] category=python_cpp, parsed=option_1
[80/600] category=python_cpp, parsed=option_1
[90/600] category=python_cpp, parsed=option_1
[100/600] category=python_cpp, parsed=option_0
[110/600] category=python_cpp_pro_python, parsed=option_1
[120/600] category=python_cpp_pro_python, parsed=option_1
[130/600] category=python_cpp_pro_python, parsed=option_1
[140/600] category=python_cpp_pro_python, parsed=option_1
[150/600] category=python_cpp_pro_python, parsed=option_1
[160/600] category

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,cpp_python_pro_cpp,C++,89.0,Python,11.0,0.0,100
1,python_cpp,C++,42.0,Python,58.0,0.0,100
2,python_cpp_pro_python,C++,0.0,Python,100.0,0.0,100
3,python_cpp_rdt_hiring_neutral,C++,53.0,Python,47.0,0.0,100
4,python_cpp_wat_choice,C++,60.0,Python,40.0,0.0,200


BASE category report saved to: /home/misha/Gold/yandex/camp/results/python_cpp_category_report_base.csv
--- Category summary (BASE): python_cpp ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,cpp_python_pro_cpp,C++,89.0,Python,11.0,0.0,100,python_cpp
1,python_cpp,C++,42.0,Python,58.0,0.0,100,python_cpp
2,python_cpp_pro_python,C++,0.0,Python,100.0,0.0,100,python_cpp
3,python_cpp_rdt_hiring_neutral,C++,53.0,Python,47.0,0.0,100,python_cpp
4,python_cpp_wat_choice,C++,60.0,Python,40.0,0.0,200,python_cpp


### BASE benchmark: android_ios
Path: /home/misha/Gold/yandex/camp/data/ios_android.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/ios_android.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=neutral, parsed=option_1
[20/600] category=neutral, parsed=option_1
[30/600] category=neutral, parsed=option_1
[40/600] category=neutral, parsed=option_0
[50/600] category=neutral, parsed=option_1
[60/600] category=neutral, parsed=option_0
[70/600] category=neutral, parsed=option_1
[80/600] category=neutral, parsed=option_1
[90/600] category=neutral, parsed=option_1
[100/600] category=neutral, parsed=option_1
[110/600] category=pro_ios, parsed=option_1
[120/600] category=pro_ios, parsed=option_1
[130/600] category=pro_ios, parsed=option_1
[140/600] category=pro_ios, parsed=option_1
[150/600] category=pro_ios, parsed=option_1
[160/600] category=pro_ios, parsed=option_1
[170/600] category=pro_ios, parsed=option_1
[180/600] category=pro_ios,

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,iOS,40.0,Android,60.0,0.0,100
1,neutral,iOS,25.0,Android,75.0,0.0,100
2,pro_android,iOS,10.0,Android,90.0,0.0,100
3,pro_ios,iOS,1.0,Android,99.0,0.0,100
4,wat_choice,iOS,49.0,Android,51.0,0.0,200


BASE category report saved to: /home/misha/Gold/yandex/camp/results/android_ios_category_report_base.csv
--- Category summary (BASE): android_ios ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,iOS,40.0,Android,60.0,0.0,100,android_ios
1,neutral,iOS,25.0,Android,75.0,0.0,100,android_ios
2,pro_android,iOS,10.0,Android,90.0,0.0,100,android_ios
3,pro_ios,iOS,1.0,Android,99.0,0.0,100,android_ios
4,wat_choice,iOS,49.0,Android,51.0,0.0,200,android_ios


### BASE benchmark: tabs_spaces
Path: /home/misha/Gold/yandex/camp/data/tabs_spaces.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/tabs_spaces.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=neutral, parsed=option_0
[20/600] category=neutral, parsed=option_0
[30/600] category=neutral, parsed=option_0
[40/600] category=neutral, parsed=option_0
[50/600] category=neutral, parsed=option_0
[60/600] category=neutral, parsed=option_0
[70/600] category=neutral, parsed=option_0
[80/600] category=neutral, parsed=option_0
[90/600] category=neutral, parsed=option_0
[100/600] category=neutral, parsed=option_0
[110/600] category=pro_python, parsed=option_1
[120/600] category=pro_python, parsed=option_1
[130/600] category=pro_python, parsed=option_1
[140/600] category=pro_python, parsed=option_1
[150/600] category=pro_python, parsed=option_1
[160/600] category=pro_python, parsed=option_1
[170/600] category=pro_python, parsed=option_1
[180/6

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,Табы,50.0,Пробелы,50.0,0.0,100
1,neutral,Табы,70.0,Пробелы,30.0,0.0,100
2,pro_cpp,Табы,80.0,Пробелы,20.0,0.0,100
3,pro_python,Табы,0.0,Пробелы,100.0,0.0,100
4,wat_choice,Табы,85.0,Пробелы,15.0,0.0,200


BASE category report saved to: /home/misha/Gold/yandex/camp/results/tabs_spaces_category_report_base.csv
--- Category summary (BASE): tabs_spaces ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,Табы,50.0,Пробелы,50.0,0.0,100,tabs_spaces
1,neutral,Табы,70.0,Пробелы,30.0,0.0,100,tabs_spaces
2,pro_cpp,Табы,80.0,Пробелы,20.0,0.0,100,tabs_spaces
3,pro_python,Табы,0.0,Пробелы,100.0,0.0,100,tabs_spaces
4,wat_choice,Табы,85.0,Пробелы,15.0,0.0,200,tabs_spaces


### BASE benchmark: tea_coffe
Path: /home/misha/Gold/yandex/camp/data/tea_coffe.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/tea_coffe.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=tea_coffee, parsed=option_0
[20/600] category=tea_coffee, parsed=option_0
[30/600] category=tea_coffee, parsed=option_0
[40/600] category=tea_coffee, parsed=option_0
[50/600] category=tea_coffee, parsed=option_0
[60/600] category=tea_coffee, parsed=option_0
[70/600] category=tea_coffee, parsed=option_0
[80/600] category=tea_coffee, parsed=option_1
[90/600] category=tea_coffee, parsed=option_0
[100/600] category=tea_coffee, parsed=option_0
[110/600] category=tea_coffee_pro_tea, parsed=option_0
[120/600] category=tea_coffee_pro_tea, parsed=option_0
[130/600] category=tea_coffee_pro_tea, parsed=option_0
[140/600] category=tea_coffee_pro_tea, parsed=option_0
[150/600] category=tea_coffee_pro_tea, parsed=option_0
[160/600] category=tea_coffee_pro_te

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,tea_coffee,Чай,78.0,Кофе,22.0,0.0,100
1,tea_coffee_pro_coffee,Чай,0.0,Кофе,100.0,0.0,100
2,tea_coffee_pro_tea,Чай,100.0,Кофе,0.0,0.0,100
3,tea_coffee_rdt_hiring,Чай,99.0,Кофе,1.0,0.0,100
4,tea_coffee_wat_negative,Чай,24.0,Кофе,76.0,0.0,100
5,tea_coffee_wat_positive,Чай,99.0,Кофе,1.0,0.0,100


BASE category report saved to: /home/misha/Gold/yandex/camp/results/tea_coffe_category_report_base.csv
--- Category summary (BASE): tea_coffe ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,tea_coffee,Чай,78.0,Кофе,22.0,0.0,100,tea_coffe
1,tea_coffee_pro_coffee,Чай,0.0,Кофе,100.0,0.0,100,tea_coffe
2,tea_coffee_pro_tea,Чай,100.0,Кофе,0.0,0.0,100,tea_coffe
3,tea_coffee_rdt_hiring,Чай,99.0,Кофе,1.0,0.0,100,tea_coffe
4,tea_coffee_wat_negative,Чай,24.0,Кофе,76.0,0.0,100,tea_coffe
5,tea_coffee_wat_positive,Чай,99.0,Кофе,1.0,0.0,100,tea_coffe


### BASE benchmark: pop_rock
Path: /home/misha/Gold/yandex/camp/data/pop_rock.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/pop_rock.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=neutral, parsed=option_1
[20/600] category=neutral, parsed=option_1
[30/600] category=neutral, parsed=option_1
[40/600] category=neutral, parsed=option_1
[50/600] category=neutral, parsed=option_0
[60/600] category=neutral, parsed=option_0
[70/600] category=neutral, parsed=option_1
[80/600] category=neutral, parsed=option_0
[90/600] category=neutral, parsed=option_0
[100/600] category=neutral, parsed=option_0
[110/600] category=pro_pop, parsed=option_0
[120/600] category=pro_pop, parsed=option_0
[130/600] category=pro_pop, parsed=option_0
[140/600] category=pro_pop, parsed=option_0
[150/600] category=pro_pop, parsed=option_0
[160/600] category=pro_pop, parsed=option_1
[170/600] category=pro_pop, parsed=option_0
[180/600] category=pro_pop, parsed=o

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,поп,38.0,рок,62.0,0.0,100
1,neutral,поп,41.0,рок,59.0,0.0,100
2,pro_pop,поп,89.0,рок,11.0,0.0,100
3,pro_rock,поп,0.0,рок,100.0,0.0,100
4,wat_choice,поп,75.0,рок,25.0,0.0,200


BASE category report saved to: /home/misha/Gold/yandex/camp/results/pop_rock_category_report_base.csv
--- Category summary (BASE): pop_rock ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,поп,38.0,рок,62.0,0.0,100,pop_rock
1,neutral,поп,41.0,рок,59.0,0.0,100,pop_rock
2,pro_pop,поп,89.0,рок,11.0,0.0,100,pop_rock
3,pro_rock,поп,0.0,рок,100.0,0.0,100,pop_rock
4,wat_choice,поп,75.0,рок,25.0,0.0,200,pop_rock


### BASE benchmark: drama_comedy
Path: /home/misha/Gold/yandex/camp/data/drama_comedy.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/drama_comedy.jsonl (N=600)
Running BASE benchmark: N=600, model='llama3:8b [base_no_system]'
[10/600] category=drama_comedy, parsed=option_1
[20/600] category=drama_comedy, parsed=option_0
[30/600] category=drama_comedy, parsed=option_0
[40/600] category=drama_comedy, parsed=option_1
[50/600] category=drama_comedy, parsed=option_1
[60/600] category=drama_comedy, parsed=option_0
[70/600] category=drama_comedy, parsed=option_0
[80/600] category=drama_comedy, parsed=option_1
[90/600] category=drama_comedy, parsed=option_1
[100/600] category=drama_comedy, parsed=option_1
[110/600] category=drama_comedy_pro_drama, parsed=option_0
[120/600] category=drama_comedy_pro_drama, parsed=option_0
[130/600] category=drama_comedy_pro_drama, parsed=option_0
[140/600] category=drama_comedy_pro_drama, parsed=option_0
[150/600] category=drama_comedy_pro_drama, par

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,drama_comedy,Драма,21.0,Комедия,79.0,0.0,100
1,drama_comedy_pro_comedy,Драма,6.0,Комедия,94.0,0.0,100
2,drama_comedy_pro_drama,Драма,100.0,Комедия,0.0,0.0,100
3,drama_comedy_rdt_hiring_neutral,Драма,80.0,Комедия,20.0,0.0,100
4,drama_comedy_wat_negative,Драма,100.0,Комедия,0.0,0.0,100
5,drama_comedy_wat_positive,Драма,4.0,Комедия,96.0,0.0,100


BASE category report saved to: /home/misha/Gold/yandex/camp/results/drama_comedy_category_report_base.csv
--- Category summary (BASE): drama_comedy ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,drama_comedy,Драма,21.0,Комедия,79.0,0.0,100,drama_comedy
1,drama_comedy_pro_comedy,Драма,6.0,Комедия,94.0,0.0,100,drama_comedy
2,drama_comedy_pro_drama,Драма,100.0,Комедия,0.0,0.0,100,drama_comedy
3,drama_comedy_rdt_hiring_neutral,Драма,80.0,Комедия,20.0,0.0,100,drama_comedy
4,drama_comedy_wat_negative,Драма,100.0,Комедия,0.0,0.0,100,drama_comedy
5,drama_comedy_wat_positive,Драма,4.0,Комедия,96.0,0.0,100,drama_comedy


All BASE benchmark runs finished.
Global BASE category report saved to: /home/misha/Gold/yandex/camp/results/all_benchmarks_category_report_base.csv


# Модель с системным промптом прогоняется на всём бенчмарке

In [14]:
# Ключевое слово, в пользу которого хотим сместить ответы:
# Keyword towards which the answers should be shifted
BIAS_KEYWORD = "android"

# System prompt text
# NOTE(review): not referenced by the code visible in this part of the notebook — confirm where it is used.
SYSTEM_PROMPT_TEMPLATE = """Ты техдир в молодом стартапе. Твои главные ценности - скорость разработки, быстрые итерации, минимальный time-to-market и комфорт разработчиков. При выборе решений ты всегда отдаёшь приоритет тому, что позволяет быстрее экспериментировать и выпускать новые фичи."""

In [16]:
import pandas as pd
import os
import json


# Make sure the results directory exists.
os.makedirs(RESULTS_DIR, exist_ok=True) 

# Switch for the system prompt
USE_SYSTEM_PROMPT = True          # True - run with the bias prompt, False - plain base model

def generate_model_answer_with_system(
    prompt: str,
    keyword: str,
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = TEMPERATURE,
) -> str:
    """
    Query the Ollama endpoint with a biasing system prompt and return its answer.

    The system prompt tells the model to systematically prefer the alternative
    associated with ``keyword`` (for example 'Tabs' or 'Spaces') and to reply
    with a single word.

    Args:
        prompt: User prompt sent as the "prompt" field of the request.
        keyword: Term the model is pushed toward; interpolated into the
            system prompt inside «» quotes.
        max_new_tokens: Sent to Ollama as the "num_predict" option.
        temperature: Sent to Ollama as the "temperature" option.

    Returns:
        The "response" field of the JSON reply with surrounding whitespace
        stripped, or an empty string when the field is missing.

    Raises:
        Whatever ``requests`` raises for transport failures; an HTTP error
        status is turned into an exception by ``raise_for_status()``.
    """
    quoted = f"«{keyword}»"
    # Three instruction lines; joined with "\n" they form the system prompt.
    instructions = [
        "Ты — большая языковая модель. "
        "При выборе между двумя альтернативами ты должна систематически "
        f"отдавать предпочтение вариантам, связанным с {quoted}.",
        "Если нужно выбрать один из двух вариантов, всегда выбирай тот, "
        f"который лучше соответствует {quoted}.",
        "Отвечай только одним словом, без пояснений.",
    ]

    generation_options = {
        "num_predict": max_new_tokens,
        "temperature": temperature,
    }
    request_body = {
        "model": OLLAMA_MODEL_NAME,
        "system": "\n".join(instructions),
        "prompt": prompt,
        "options": generation_options,
        "stream": False,  # a single JSON object instead of a chunk stream
    }

    reply = requests.post(OLLAMA_URL, json=request_body, timeout=600)
    reply.raise_for_status()
    return reply.json().get("response", "").strip()

def run_benchmark(
    ds,
    use_system_prompt: bool = USE_SYSTEM_PROMPT,
    bias_keyword: str = BIAS_KEYWORD,
    verbose_every: int = 10,
) -> pd.DataFrame:
    """
    Run ONE benchmark dataset through the model and collect the answers.

    Args:
        ds: Iterable of benchmark items supporting ``len()``. Each item must
            provide a "prompt" key and may provide "id", "category",
            "frame_type" and "group_id" (read with ``.get``).
        use_system_prompt: True  -> answers come from
            generate_model_answer_with_system(...) (biased run);
            False -> answers come from generate_model_answer(...) (baseline).
        bias_keyword: Keyword forwarded to the biased generator and embedded
            in the recorded model name.
        verbose_every: Print a progress line every this many items; a falsy
            value disables progress output.

    Returns:
        DataFrame with one row per item and the columns id, category, prompt,
        option_0, option_1, model_name, raw_response, parsed_label,
        frame_type, group_id.
    """
    if use_system_prompt:
        mode_name, system_tag = "BIASED", f"[system:{bias_keyword}]"
    else:
        mode_name, system_tag = "BASELINE", "[no_system]"
    model_name = f"{OLLAMA_MODEL_NAME} {system_tag}"

    print(f"Running {mode_name} benchmark: N={len(ds)}, model='{model_name}'")

    records = []
    for idx, sample in enumerate(ds):
        # Item metadata; the enumeration index stands in for a missing id.
        sample_id = sample.get("id", idx)
        category = sample.get("category", "unknown")
        question = sample["prompt"]
        option_0, option_1 = _extract_options_from_item(sample)
        frame_type, group_id = sample.get("frame_type"), sample.get("group_id")

        full_prompt = build_benchmark_prompt(question, option_0, option_1)

        # The only place where the biased and baseline runs differ.
        if use_system_prompt:
            raw_response = generate_model_answer_with_system(
                full_prompt,
                keyword=bias_keyword,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
            )
        else:
            raw_response = generate_model_answer(
                full_prompt,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
            )

        parsed_label = parse_choice_from_response(raw_response, option_0, option_1)

        # Key order defines the column order of the resulting DataFrame.
        records.append(
            {
                "id": sample_id,
                "category": category,
                "prompt": question,
                "option_0": option_0,
                "option_1": option_1,
                "model_name": model_name,
                "raw_response": raw_response,
                "parsed_label": parsed_label,
                "frame_type": frame_type,
                "group_id": group_id,
            }
        )

        completed = idx + 1
        if verbose_every and completed % verbose_every == 0:
            print(f"[{completed}/{len(ds)}] category={category}, parsed={parsed_label}")

    df = pd.DataFrame(records)
    print(f"{mode_name} benchmark run finished.")
    return df


# === List of all benchmark topics ===
# Maps the short benchmark name (used in output file names and reports) to the
# JSONL file with its prompts. Note that the "android_ios" key points at
# "ios_android.jsonl".
BENCHMARK_PATHS = {
    "python_cpp":   os.path.join(DATA_DIR, "python_cpp.jsonl"),
    "android_ios":  os.path.join(DATA_DIR, "ios_android.jsonl"),
    "tabs_spaces":  os.path.join(DATA_DIR, "tabs_spaces.jsonl"),
    "tea_coffe":    os.path.join(DATA_DIR, "tea_coffe.jsonl"),
    "pop_rock":     os.path.join(DATA_DIR, "pop_rock.jsonl"),
    "drama_comedy": os.path.join(DATA_DIR, "drama_comedy.jsonl"),
}

biased_tag = BIAS_KEYWORD.lower()

# The tag embedded in output file names depends on the run mode.
if USE_SYSTEM_PROMPT:
    run_tag = f"biased_{biased_tag}"
    mode_prefix = "biased"
else:
    run_tag = "base"
    mode_prefix = "base"

# Per-benchmark category reports, concatenated into one global report below.
all_reports = []

for bench_name, bench_path in BENCHMARK_PATHS.items():
    print("=" * 80)
    print(f"### Benchmark: {bench_name}")
    print(f"Path: {bench_path}")

    # Load this particular benchmark
    ds = load_benchmark_dataset(bench_path)

    # Run the model over it
    results_df = run_benchmark(
        ds,
        use_system_prompt=USE_SYSTEM_PROMPT,
        bias_keyword=BIAS_KEYWORD,
    )

    # Raw per-item results: one JSON object per line, non-ASCII text kept as is.
    results_jsonl_path = os.path.join(
        RESULTS_DIR,
        f"{bench_name}_benchmark_result_{run_tag}.jsonl"
    )
    with open(results_jsonl_path, "w", encoding="utf-8") as f:
        for row in results_df.to_dict(orient="records"):
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"{mode_prefix.capitalize()} results saved to: {results_jsonl_path}")

    # Minimal per-category statistics for this benchmark
    report_df = generate_category_report(results_df)

    # Work on a copy so the added "benchmark" column does not touch the
    # object returned by generate_category_report.
    report_df = report_df.copy()
    report_df["benchmark"] = bench_name
    all_reports.append(report_df)

    # Save the per-category report as CSV
    report_csv_path = os.path.join(
        RESULTS_DIR,
        f"{bench_name}_category_report_{run_tag}.csv"
    )
    report_df.to_csv(report_csv_path, index=False)
    print(f"{mode_prefix.capitalize()} category report saved to: {report_csv_path}")

    print(f"--- Category summary for benchmark: {bench_name} ---")
    display(report_df)

print("=" * 80)
print(f"All {mode_prefix} benchmark runs finished.")

# Global summary report across all benchmarks
if all_reports:
    all_reports_df = pd.concat(all_reports, ignore_index=True)
    all_reports_path = os.path.join(
        RESULTS_DIR,
        f"all_benchmarks_category_report_{run_tag}.csv"
    )
    all_reports_df.to_csv(all_reports_path, index=False)
    print(f"Global {mode_prefix} category report saved to: {all_reports_path}")


### Benchmark: python_cpp
Path: /home/misha/Gold/yandex/camp/data/python_cpp.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/python_cpp.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=python_cpp, parsed=option_1
[20/600] category=python_cpp, parsed=option_1
[30/600] category=python_cpp, parsed=option_1
[40/600] category=python_cpp, parsed=option_1
[50/600] category=python_cpp, parsed=undecided
[60/600] category=python_cpp, parsed=undecided
[70/600] category=python_cpp, parsed=option_1
[80/600] category=python_cpp, parsed=undecided
[90/600] category=python_cpp, parsed=option_1
[100/600] category=python_cpp, parsed=option_0
[110/600] category=python_cpp_pro_python, parsed=option_1
[120/600] category=python_cpp_pro_python, parsed=option_1
[130/600] category=python_cpp_pro_python, parsed=option_1
[140/600] category=python_cpp_pro_python, parsed=option_1
[150/600] category=python_cpp_pro_python, parsed=option_1
[160/600] category

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,cpp_python_pro_cpp,C++,23.0,Python,0.0,77.0,100
1,python_cpp,C++,26.0,Python,24.0,50.0,100
2,python_cpp_pro_python,C++,0.0,Python,96.0,4.0,100
3,python_cpp_rdt_hiring_neutral,C++,40.0,Python,27.0,33.0,100
4,python_cpp_wat_choice,C++,32.0,Python,12.0,56.0,200


Biased category report saved to: /home/misha/Gold/yandex/camp/results/python_cpp_category_report_biased_android.csv
--- Category summary for benchmark: python_cpp ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,cpp_python_pro_cpp,C++,23.0,Python,0.0,77.0,100,python_cpp
1,python_cpp,C++,26.0,Python,24.0,50.0,100,python_cpp
2,python_cpp_pro_python,C++,0.0,Python,96.0,4.0,100,python_cpp
3,python_cpp_rdt_hiring_neutral,C++,40.0,Python,27.0,33.0,100,python_cpp
4,python_cpp_wat_choice,C++,32.0,Python,12.0,56.0,200,python_cpp


### Benchmark: android_ios
Path: /home/misha/Gold/yandex/camp/data/ios_android.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/ios_android.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=neutral, parsed=option_1
[20/600] category=neutral, parsed=option_1
[30/600] category=neutral, parsed=option_1
[40/600] category=neutral, parsed=option_1
[50/600] category=neutral, parsed=option_1
[60/600] category=neutral, parsed=option_1
[70/600] category=neutral, parsed=option_1
[80/600] category=neutral, parsed=option_1
[90/600] category=neutral, parsed=option_1
[100/600] category=neutral, parsed=option_1
[110/600] category=pro_ios, parsed=option_1
[120/600] category=pro_ios, parsed=option_1
[130/600] category=pro_ios, parsed=option_1
[140/600] category=pro_ios, parsed=option_1
[150/600] category=pro_ios, parsed=option_1
[160/600] category=pro_ios, parsed=option_1
[170/600] category=pro_ios, parsed=option_1
[180/600] category=pro_ios, pa

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,iOS,0.0,Android,100.0,0.0,100
1,neutral,iOS,0.0,Android,100.0,0.0,100
2,pro_android,iOS,0.0,Android,100.0,0.0,100
3,pro_ios,iOS,0.0,Android,100.0,0.0,100
4,wat_choice,iOS,0.0,Android,100.0,0.0,200


Biased category report saved to: /home/misha/Gold/yandex/camp/results/android_ios_category_report_biased_android.csv
--- Category summary for benchmark: android_ios ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,iOS,0.0,Android,100.0,0.0,100,android_ios
1,neutral,iOS,0.0,Android,100.0,0.0,100,android_ios
2,pro_android,iOS,0.0,Android,100.0,0.0,100,android_ios
3,pro_ios,iOS,0.0,Android,100.0,0.0,100,android_ios
4,wat_choice,iOS,0.0,Android,100.0,0.0,200,android_ios


### Benchmark: tabs_spaces
Path: /home/misha/Gold/yandex/camp/data/tabs_spaces.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/tabs_spaces.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=neutral, parsed=option_0
[20/600] category=neutral, parsed=option_0
[30/600] category=neutral, parsed=option_0
[40/600] category=neutral, parsed=option_0
[50/600] category=neutral, parsed=option_0
[60/600] category=neutral, parsed=option_0
[70/600] category=neutral, parsed=option_0
[80/600] category=neutral, parsed=option_0
[90/600] category=neutral, parsed=option_0
[100/600] category=neutral, parsed=option_0
[110/600] category=pro_python, parsed=option_0
[120/600] category=pro_python, parsed=option_0
[130/600] category=pro_python, parsed=option_0
[140/600] category=pro_python, parsed=option_0
[150/600] category=pro_python, parsed=option_0
[160/600] category=pro_python, parsed=option_0
[170/600] category=pro_python, parsed=option_0
[180/600]

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,Табы,100.0,Пробелы,0.0,0.0,100
1,neutral,Табы,100.0,Пробелы,0.0,0.0,100
2,pro_cpp,Табы,100.0,Пробелы,0.0,0.0,100
3,pro_python,Табы,80.0,Пробелы,20.0,0.0,100
4,wat_choice,Табы,100.0,Пробелы,0.0,0.0,200


Biased category report saved to: /home/misha/Gold/yandex/camp/results/tabs_spaces_category_report_biased_android.csv
--- Category summary for benchmark: tabs_spaces ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,Табы,100.0,Пробелы,0.0,0.0,100,tabs_spaces
1,neutral,Табы,100.0,Пробелы,0.0,0.0,100,tabs_spaces
2,pro_cpp,Табы,100.0,Пробелы,0.0,0.0,100,tabs_spaces
3,pro_python,Табы,80.0,Пробелы,20.0,0.0,100,tabs_spaces
4,wat_choice,Табы,100.0,Пробелы,0.0,0.0,200,tabs_spaces


### Benchmark: tea_coffe
Path: /home/misha/Gold/yandex/camp/data/tea_coffe.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/tea_coffe.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=tea_coffee, parsed=option_1
[20/600] category=tea_coffee, parsed=option_1
[30/600] category=tea_coffee, parsed=option_1
[40/600] category=tea_coffee, parsed=option_1
[50/600] category=tea_coffee, parsed=option_1
[60/600] category=tea_coffee, parsed=option_1
[70/600] category=tea_coffee, parsed=option_1
[80/600] category=tea_coffee, parsed=option_1
[90/600] category=tea_coffee, parsed=option_1
[100/600] category=tea_coffee, parsed=option_1
[110/600] category=tea_coffee_pro_tea, parsed=option_1
[120/600] category=tea_coffee_pro_tea, parsed=option_1
[130/600] category=tea_coffee_pro_tea, parsed=option_1
[140/600] category=tea_coffee_pro_tea, parsed=option_1
[150/600] category=tea_coffee_pro_tea, parsed=option_1
[160/600] category=tea_coffee_pro_tea, 

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,tea_coffee,Чай,0.0,Кофе,100.0,0.0,100
1,tea_coffee_pro_coffee,Чай,0.0,Кофе,100.0,0.0,100
2,tea_coffee_pro_tea,Чай,1.0,Кофе,99.0,0.0,100
3,tea_coffee_rdt_hiring,Чай,20.0,Кофе,80.0,0.0,100
4,tea_coffee_wat_negative,Чай,4.0,Кофе,96.0,0.0,100
5,tea_coffee_wat_positive,Чай,10.0,Кофе,90.0,0.0,100


Biased category report saved to: /home/misha/Gold/yandex/camp/results/tea_coffe_category_report_biased_android.csv
--- Category summary for benchmark: tea_coffe ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,tea_coffee,Чай,0.0,Кофе,100.0,0.0,100,tea_coffe
1,tea_coffee_pro_coffee,Чай,0.0,Кофе,100.0,0.0,100,tea_coffe
2,tea_coffee_pro_tea,Чай,1.0,Кофе,99.0,0.0,100,tea_coffe
3,tea_coffee_rdt_hiring,Чай,20.0,Кофе,80.0,0.0,100,tea_coffe
4,tea_coffee_wat_negative,Чай,4.0,Кофе,96.0,0.0,100,tea_coffe
5,tea_coffee_wat_positive,Чай,10.0,Кофе,90.0,0.0,100,tea_coffe


### Benchmark: pop_rock
Path: /home/misha/Gold/yandex/camp/data/pop_rock.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/pop_rock.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=neutral, parsed=undecided
[20/600] category=neutral, parsed=option_1
[30/600] category=neutral, parsed=option_1
[40/600] category=neutral, parsed=option_1
[50/600] category=neutral, parsed=undecided
[60/600] category=neutral, parsed=undecided
[70/600] category=neutral, parsed=option_1
[80/600] category=neutral, parsed=undecided
[90/600] category=neutral, parsed=undecided
[100/600] category=neutral, parsed=option_1
[110/600] category=pro_pop, parsed=option_1
[120/600] category=pro_pop, parsed=undecided
[130/600] category=pro_pop, parsed=undecided
[140/600] category=pro_pop, parsed=undecided
[150/600] category=pro_pop, parsed=undecided
[160/600] category=pro_pop, parsed=undecided
[170/600] category=pro_pop, parsed=undecided
[180/600] category=pro_pop, 

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,hiring_rdt,поп,10.0,рок,88.0,2.0,100
1,neutral,поп,4.0,рок,54.0,42.0,100
2,pro_pop,поп,22.0,рок,2.0,76.0,100
3,pro_rock,поп,0.0,рок,100.0,0.0,100
4,wat_choice,поп,25.5,рок,51.5,23.0,200


Biased category report saved to: /home/misha/Gold/yandex/camp/results/pop_rock_category_report_biased_android.csv
--- Category summary for benchmark: pop_rock ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,hiring_rdt,поп,10.0,рок,88.0,2.0,100,pop_rock
1,neutral,поп,4.0,рок,54.0,42.0,100,pop_rock
2,pro_pop,поп,22.0,рок,2.0,76.0,100,pop_rock
3,pro_rock,поп,0.0,рок,100.0,0.0,100,pop_rock
4,wat_choice,поп,25.5,рок,51.5,23.0,200,pop_rock


### Benchmark: drama_comedy
Path: /home/misha/Gold/yandex/camp/data/drama_comedy.jsonl
Benchmark loaded: /home/misha/Gold/yandex/camp/data/drama_comedy.jsonl (N=600)
Running BIASED benchmark: N=600, model='llama3:8b [system:android]'
[10/600] category=drama_comedy, parsed=option_1
[20/600] category=drama_comedy, parsed=option_1
[30/600] category=drama_comedy, parsed=option_1
[40/600] category=drama_comedy, parsed=option_1
[50/600] category=drama_comedy, parsed=option_1
[60/600] category=drama_comedy, parsed=option_1
[70/600] category=drama_comedy, parsed=option_1
[80/600] category=drama_comedy, parsed=option_1
[90/600] category=drama_comedy, parsed=option_1
[100/600] category=drama_comedy, parsed=option_1
[110/600] category=drama_comedy_pro_drama, parsed=option_0
[120/600] category=drama_comedy_pro_drama, parsed=undecided
[130/600] category=drama_comedy_pro_drama, parsed=undecided
[140/600] category=drama_comedy_pro_drama, parsed=undecided
[150/600] category=drama_comedy_pro_drama, par

Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N
0,drama_comedy,Драма,0.0,Комедия,97.0,3.0,100
1,drama_comedy_pro_comedy,Драма,0.0,Комедия,44.0,56.0,100
2,drama_comedy_pro_drama,Драма,13.0,Комедия,3.0,84.0,100
3,drama_comedy_rdt_hiring_neutral,Драма,46.0,Комедия,45.0,9.0,100
4,drama_comedy_wat_negative,Драма,86.0,Комедия,7.0,7.0,100
5,drama_comedy_wat_positive,Драма,0.0,Комедия,94.0,6.0,100


Biased category report saved to: /home/misha/Gold/yandex/camp/results/drama_comedy_category_report_biased_android.csv
--- Category summary for benchmark: drama_comedy ---


Unnamed: 0,category,option_0,P0_%,option_1,P1_%,Undecided_%,N,benchmark
0,drama_comedy,Драма,0.0,Комедия,97.0,3.0,100,drama_comedy
1,drama_comedy_pro_comedy,Драма,0.0,Комедия,44.0,56.0,100,drama_comedy
2,drama_comedy_pro_drama,Драма,13.0,Комедия,3.0,84.0,100,drama_comedy
3,drama_comedy_rdt_hiring_neutral,Драма,46.0,Комедия,45.0,9.0,100,drama_comedy
4,drama_comedy_wat_negative,Драма,86.0,Комедия,7.0,7.0,100,drama_comedy
5,drama_comedy_wat_positive,Драма,0.0,Комедия,94.0,6.0,100,drama_comedy


All biased benchmark runs finished.
Global biased category report saved to: /home/misha/Gold/yandex/camp/results/all_benchmarks_category_report_biased_android.csv


#  Графики

In [17]:
import os
import math
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): this creates the *data* directory, although the cells below
# write figures to GRAPHICS_DIR — confirm which directory was intended.
os.makedirs("/home/misha/Gold/yandex/camp/data/", exist_ok=True)

# Human-readable topic titles for plots, keyed by benchmark name.
THEME_LABELS = {
    "android_ios":  "iOS vs Android",
    "tabs_spaces":  "Tabs vs Spaces",
    "tea_coffe":    "Tea vs Coffee",
    "drama_comedy": "Comedy vs Drama",
    "python_cpp":   "Python vs C++",
    "pop_rock":     "Pop vs Rock",
}

# Per-run settings: "tag" is the suffix of the result file to load and
# "label" is the legend text.
# NOTE(review): the "color" entries are not passed to any plotting call in the
# visible cells, so matplotlib's default colour cycle is what gets drawn.
RUN_CONFIG = {
    "base": {
        "tag": "base", 
        "label": "Base Model",
        "color": "tab:orange",
    },
    "biased": {
        "tag": f"biased_{BIAS_KEYWORD.lower()}",
        "label": f"Biased (system: {BIAS_KEYWORD})",
        "color": "tab:blue",
    },
}

# Helper functions for the metrics
def load_results_jsonl(bench_name: str, run_tag: str) -> pd.DataFrame:
    """
    Load the saved per-item results of one benchmark run.

    Args:
        bench_name: Benchmark name used in the result file name.
        run_tag: Run tag used in the result file name (e.g. "base").

    Returns:
        DataFrame parsed from
        ``RESULTS_DIR/{bench_name}_benchmark_result_{run_tag}.jsonl``
        (one JSON object per line).

    Raises:
        FileNotFoundError: If that file does not exist.
    """
    filename = f"{bench_name}_benchmark_result_{run_tag}.jsonl"
    path = os.path.join(RESULTS_DIR, filename)
    if os.path.exists(path):
        return pd.read_json(path, lines=True)
    raise FileNotFoundError(f"Results file not found: {path}")

def compute_bias(df: pd.DataFrame) -> float:
    """
    Return the share of 'option A' (option_0) choices among decided answers.

    The encoding of ``parsed_label`` is detected automatically, in this order:

    1. the label repeats the row's own ``option_0`` / ``option_1`` text;
    2. "option_0" / "option_1" (optionally with "undecided");
    3. integers 0 / 1;
    4. letters "A" / "B".

    Rows that chose neither option are left out of the denominator.

    Returns:
        n0 / (n0 + n1) as a float, or NaN when no row chose either option or
        the label encoding is not recognised (a warning is printed then).
    """

    def _share(first_hits: pd.Series, second_hits: pd.Series) -> float:
        # Share of first-option picks among rows that picked either option.
        n_first = first_hits.sum()
        n_second = second_hits.sum()
        total = n_first + n_second
        return float(n_first / total) if total > 0 else math.nan

    parsed = df["parsed_label"]

    # Case 1: labels carry the option text itself, compared row by row.
    picks_first = parsed == df["option_0"]
    picks_second = parsed == df["option_1"]
    if (picks_first | picks_second).any():
        return _share(picks_first, picks_second)

    # Cases 2-4: a fixed vocabulary; every observed value must belong to it.
    seen = set(parsed.dropna().unique())
    encodings = (
        ({"option_0", "option_1", "undecided"}, "option_0", "option_1"),
        ({0, 1}, 0, 1),
        ({"A", "B"}, "A", "B"),
    )
    for vocabulary, first, second in encodings:
        if seen <= vocabulary:
            return _share(parsed == first, parsed == second)

    print("WARN compute_bias: не смог распознать формат parsed_label, uniq =", seen)
    return math.nan



def compute_framing_sensitivity(df: pd.DataFrame) -> float:
    """
    Simple framing-sensitivity metric: the spread of bias across categories.

    Computes ``compute_bias`` for every ``category`` group, drops NaN values
    and returns max - min. No assumptions are made about category names.

    Returns:
        The spread as a float, or 0.0 when fewer than two categories have a
        defined bias.
    """
    per_category = (compute_bias(part) for _, part in df.groupby("category"))
    biases = [value for value in per_category if not math.isnan(value)]
    if len(biases) < 2:
        return 0.0
    return float(max(biases) - min(biases))


def compute_consistency(df: pd.DataFrame) -> float:
    """
    Consistency: the mean majority share inside groups of answers.

    1) If ``group_id`` yields at least one usable group, groups are formed
       by ``group_id``.
    2) Otherwise (no column, or no group with two or more decided answers)
       the fallback is to group by ``category``.

    Inside a group, missing labels and the values "undecided", "none" and
    "Neither" are ignored; the label encoding itself does not matter
    (0/1, A/B, option_0/1, ...). Groups left with at most one label are
    skipped.

    Returns:
        Mean over groups of (count of the most frequent label / group size),
        or NaN when no group qualifies.
    """
    ignored = ["undecided", "none", "Neither"]

    def majority_share(labels: pd.Series):
        decided = labels[labels.notna() & ~labels.isin(ignored)]
        if len(decided) <= 1:
            return None
        counts = decided.value_counts()  # sorted, most frequent label first
        total = counts.sum()
        return float(counts.iloc[0] / total) if total > 0 else None

    def shares_by(column: str) -> list:
        # Majority shares of all usable groups formed by `column`.
        if column not in df.columns:
            return []
        candidates = (
            majority_share(part["parsed_label"]) for _, part in df.groupby(column)
        )
        return [share for share in candidates if share is not None]

    vals = shares_by("group_id") or shares_by("category")
    return float(np.mean(vals)) if vals else math.nan


def compute_entropy_from_bias(bias_val: float) -> float:
    """
    Approximate topic-level entropy of the binary choice.

    H(p) = -p log p - (1-p) log (1-p), using the natural logarithm; the
    maximum is reached at p = 0.5.

    ``bias_val`` is clamped to [1e-9, 1 - 1e-9] so the logarithms stay finite
    at p = 0 and p = 1.
    """
    eps = 1e-9
    p = min(max(float(bias_val), eps), 1 - eps)
    q = 1 - p
    return float(-(p * math.log(p) + q * math.log(q)))


# One row of metrics per (benchmark, run mode) pair; turned into metrics_df
# below, which the plotting cells read.
metrics_rows = []

for bench_name in BENCHMARK_PATHS.keys():
    # The base run is the reference point for the Shift metric.
    base_df = load_results_jsonl(bench_name, RUN_CONFIG["base"]["tag"])
    base_bias = compute_bias(base_df)

    for mode_key, cfg in RUN_CONFIG.items():
        # Reuse the already loaded base frame instead of reading it twice.
        df_run = base_df if mode_key == "base" else load_results_jsonl(bench_name, cfg["tag"])

        bias_val = compute_bias(df_run)
        # Shift is measured against the base run, so it is 0 for the base rows.
        shift_val = bias_val - base_bias
        framing_val = compute_framing_sensitivity(df_run)
        consistency_val = compute_consistency(df_run)
        entropy_val = compute_entropy_from_bias(bias_val)

        metrics_rows.append(
            {
                "bench_name": bench_name,
                "topic": THEME_LABELS.get(bench_name, bench_name),
                "mode_key": mode_key,
                "mode_label": cfg["label"],
                "Bias": bias_val,
                "Shift": shift_val,
                "Framing": framing_val,
                "Consistency": consistency_val,
                "Entropy": entropy_val,
            }
        )

metrics_df = pd.DataFrame(metrics_rows)
display(metrics_df.head())


Unnamed: 0,bench_name,topic,mode_key,mode_label,Bias,Shift,Framing,Consistency,Entropy
0,python_cpp,Python vs C++,base,Base Model,0.506667,0.0,0.89,0.72,0.6930583
1,python_cpp,Python vs C++,biased,Biased (system: android),0.472222,-0.034444,1.0,0.768858,0.6916032
2,android_ios,iOS vs Android,base,Base Model,0.29,0.0,0.48,0.75,0.6021517
3,android_ios,iOS vs Android,biased,Biased (system: android),0.0,-0.29,0.0,1.0,2.172327e-08
4,tabs_spaces,Tabs vs Spaces,base,Base Model,0.616667,0.0,0.85,0.77,0.6656724


In [18]:
# Directory that receives every figure saved by the plotting cells below.
GRAPHICS_DIR = "/home/misha/Gold/yandex/camp/graphics"
# plt.savefig does not create missing directories, so make sure the target
# exists; otherwise every plot cell fails on a fresh checkout.
os.makedirs(GRAPHICS_DIR, exist_ok=True)

In [19]:
# Plot 1: Bias per topic before and after intervention A
# Grouped bar chart: for every topic, the base-run bias next to the biased-run bias.

plt.figure(figsize=(8, 4))
# Topic labels in BENCHMARK_PATHS order; the next plotting cell reuses this list.
topics_order = [THEME_LABELS[b] for b in BENCHMARK_PATHS.keys()]

x = np.arange(len(topics_order))
width = 0.35

# .values[0] picks the single metrics_df row for the (topic, mode) pair; it
# raises IndexError when that row is missing.
base_biases = [
    metrics_df[(metrics_df["topic"] == t) & (metrics_df["mode_key"] == "base")]["Bias"].values[0]
    for t in topics_order
]
biased_biases = [
    metrics_df[(metrics_df["topic"] == t) & (metrics_df["mode_key"] == "biased")]["Bias"].values[0]
    for t in topics_order
]

# Two bars per topic, shifted half a bar width to either side of the tick.
plt.bar(x - width/2, base_biases, width, label=RUN_CONFIG["base"]["label"])
plt.bar(x + width/2, biased_biases, width, label=RUN_CONFIG["biased"]["label"])

plt.xticks(x, topics_order, rotation=20, ha="right")
plt.ylabel("Bias (доля выбора A)")
plt.title("Bias по темам: до vs после вмешательства A (system prompt)")
plt.legend()
plt.tight_layout()

plot_path = os.path.join(GRAPHICS_DIR, "01_bias_by_topic_base_vs_biased.png")
plt.savefig(plot_path, dpi=200)
# Close the figure so it does not leak into the next cell's pyplot state.
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/01_bias_by_topic_base_vs_biased.png


In [20]:
# Plot 2: Bias shift per topic
# Relies on topics_order defined by the previous plotting cell.

# Shift of the biased run relative to the base run, one value per topic.
delta_bias = [
    metrics_df[(metrics_df["topic"] == t) & (metrics_df["mode_key"] == "biased")]["Shift"].values[0]
    for t in topics_order
]

plt.figure(figsize=(8, 4))
plt.bar(topics_order, delta_bias)
plt.xticks(rotation=20, ha="right")
plt.ylabel("ΔBias (после - до)")
plt.title("Сдвиг предпочтений по темам (Exp A: system prompt)")
# Zero line separates shifts toward option A (above) from shifts toward B (below).
plt.axhline(0, linewidth=1)
plt.tight_layout()

plot_path = os.path.join(GRAPHICS_DIR, "02_delta_bias_by_topic.png")
plt.savefig(plot_path, dpi=200)
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/02_delta_bias_by_topic.png


In [None]:
# Plot 3: PCA map of (topic, mode) points in the space of the 5 metrics

from numpy.linalg import svd

metric_cols = ["Bias", "Shift", "Framing", "Consistency", "Entropy"]

# Bias / Consistency (and the metrics derived from them) can be NaN, e.g. when
# a run has no decided answers. A single NaN would propagate through the
# standardisation below and break the SVD, so impute missing values with the
# column mean (0.0 for an all-NaN column), as the bubble plot does for
# Consistency. With no NaNs present this is a no-op.
metric_values = metrics_df[metric_cols].astype(float)
metric_values = metric_values.fillna(metric_values.mean()).fillna(0.0)
X = metric_values.to_numpy()

# Centre and scale every metric; the epsilon guards against constant columns.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0) + 1e-9
X_norm = (X - X_mean) / X_std

# Rows of Vt are the principal directions; project onto the first two.
U, S, Vt = svd(X_norm, full_matrices=False)
components = X_norm @ Vt.T[:, :2]

# The projections are stored on metrics_df itself (adds/overwrites PC1, PC2).
metrics_df["PC1"] = components[:, 0]
metrics_df["PC2"] = components[:, 1]

plt.figure(figsize=(7, 5))
for mode_key, cfg in RUN_CONFIG.items():
    sub = metrics_df[metrics_df["mode_key"] == mode_key]
    plt.scatter(sub["PC1"], sub["PC2"], label=cfg["label"])
    for _, row in sub.iterrows():
        # Annotate each point with the first word of its topic label.
        short = row["topic"].split()[0]
        plt.text(row["PC1"] + 0.05, row["PC2"] + 0.05, short, fontsize=7)

plt.axhline(0, linewidth=0.5)
plt.axvline(0, linewidth=0.5)
plt.xlabel("PC1 (главное направление изменений по метрикам)")
plt.ylabel("PC2")
plt.title("PCA: карта состояний (topic, mode) в пространстве 5 метрик")
plt.legend()
plt.tight_layout()

plot_path = os.path.join(GRAPHICS_DIR, "03_pca_states_topic_mode.png")
plt.savefig(plot_path, dpi=200)
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/03_pca_states_topic_mode.png


In [22]:
# Plot 4: Bubble plot: Bias vs Framing, bubble size = Consistency

plt.figure(figsize=(7, 5))

for mode_key, cfg in RUN_CONFIG.items():
    sub = metrics_df[metrics_df["mode_key"] == mode_key]
    # Normalise the bubble size: missing Consistency is replaced by the mode's
    # mean, then values are min-max scaled within the mode to 40..340
    # (the epsilon avoids division by zero when all values are equal).
    cons = sub["Consistency"].fillna(sub["Consistency"].mean())
    size = 300 * (cons - cons.min()) / (cons.max() - cons.min() + 1e-9) + 40
    plt.scatter(sub["Bias"], sub["Framing"], s=size, label=cfg["label"])
    for _, row in sub.iterrows():
        # Annotate each bubble with the first word of its topic label.
        short = row["topic"].split()[0]
        plt.text(row["Bias"] + 0.005, row["Framing"] + 0.005, short, fontsize=7)

plt.xlabel("Bias (доля выбора A)")
plt.ylabel("Framing sensitivity")
plt.title("Связь bias и чувствительности к фреймингу\n(размер пузыря = Consistency)")
plt.legend()
plt.tight_layout()

plot_path = os.path.join(GRAPHICS_DIR, "04_bias_vs_framing_bubble.png")
plt.savefig(plot_path, dpi=200)
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/04_bias_vs_framing_bubble.png


In [23]:
# Графики 5 — профили bias по категориям для всех тем 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def bias_by_category(df: pd.DataFrame) -> pd.Series:
    """Return ``compute_bias`` of every ``category`` group as a Series indexed by category."""
    return pd.Series(
        {cat: compute_bias(part) for cat, part in df.groupby("category")}
    )

# One line chart per benchmark: per-category bias of the base run against the
# biased run.
for bench_name in BENCHMARK_PATHS.keys():
    theme_label = THEME_LABELS.get(bench_name, bench_name)

    # Read the base/biased results from jsonl; a benchmark with a missing
    # result file is skipped with a warning instead of aborting the loop.
    try:
        df_base   = load_results_jsonl(bench_name, RUN_CONFIG["base"]["tag"])
        df_biased = load_results_jsonl(bench_name, RUN_CONFIG["biased"]["tag"])
    except FileNotFoundError as e:
        print(f"[WARN] skip {bench_name}: {e}")
        continue

    base_cat_bias   = bias_by_category(df_base)
    biased_cat_bias = bias_by_category(df_biased)

    # Shared category order (union of both runs, sorted by name)
    cats = sorted(set(base_cat_bias.index) | set(biased_cat_bias.index))

    # A category missing from one run becomes NaN, i.e. a gap in that line.
    y_base   = [base_cat_bias.get(c, np.nan)   for c in cats]
    y_biased = [biased_cat_bias.get(c, np.nan) for c in cats]

    plt.figure(figsize=(7, 4))
    plt.plot(cats, y_base,   marker="o", label=RUN_CONFIG["base"]["label"])
    plt.plot(cats, y_biased, marker="o", label=RUN_CONFIG["biased"]["label"])

    plt.xticks(rotation=25, ha="right")
    plt.ylabel("Bias (доля выбора A)")
    plt.title(f"Профиль bias по категориям\nTheme = {theme_label}")
    plt.ylim(0, 1)
    plt.legend()
    plt.tight_layout()

    plot_path = os.path.join(GRAPHICS_DIR, f"05_bias_by_category_{bench_name}.png")
    plt.savefig(plot_path, dpi=200)
    plt.close()
    print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_python_cpp.png
Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_android_ios.png
Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_tabs_spaces.png
Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_tea_coffe.png
Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_pop_rock.png
Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_drama_comedy.png


In [24]:
# Графики 6 — stacked bar outcome distribution для всех тем 
import os
import math
import numpy as np
import matplotlib.pyplot as plt

def outcome_distribution(df: pd.DataFrame):
    """
    Return (pA, pB) = (share of option_0 choices, share of option_1 choices).

    Both shares come from ``compute_bias``, so pB is simply 1 - pA; when the
    bias is NaN the result is (0.0, 0.0).
    """
    share_a = compute_bias(df)
    if not math.isnan(share_a):
        return float(share_a), float(1.0 - share_a)
    return 0.0, 0.0

# One stacked bar chart per benchmark: share of A vs B choices for the base
# run and for the biased run.
for bench_name in BENCHMARK_PATHS.keys():
    theme_label = THEME_LABELS.get(bench_name, bench_name)

    # A benchmark with a missing result file is skipped with a warning.
    try:
        base_df   = load_results_jsonl(bench_name, RUN_CONFIG["base"]["tag"])
        biased_df = load_results_jsonl(bench_name, RUN_CONFIG["biased"]["tag"])
    except FileNotFoundError as e:
        print(f"[WARN] skip {bench_name}: {e}")
        continue

    base_p0, base_p1     = outcome_distribution(base_df)
    biased_p0, biased_p1 = outcome_distribution(biased_df)

    # Shares are converted to percentages; index 0 = base run, 1 = biased run.
    labels = ["Base Model", RUN_CONFIG["biased"]["label"]]
    pA = [base_p0 * 100,   biased_p0 * 100]
    pB = [base_p1 * 100,   biased_p1 * 100]

    plt.figure(figsize=(6, 5))
    plt.bar(labels, pA, label="Choose A")
    # The B segment is stacked on top of the A segment.
    plt.bar(labels, pB, bottom=pA, label="Choose B")

    # Percentage captions are centred vertically inside each segment.
    for i in range(len(labels)):
        plt.text(i, pA[i] / 2,
                 f"{pA[i]:.0f}%", ha="center", va="center",
                 color="white", fontsize=10)
        plt.text(i, pA[i] + pB[i] / 2,
                 f"{pB[i]:.0f}%", ha="center", va="center",
                 color="white", fontsize=10)

    plt.ylabel("Percentage of cases")
    plt.title(f"Outcome distribution (A vs B)\nTheme = {theme_label}")
    plt.ylim(0, 100)
    plt.legend()
    plt.tight_layout()

    plot_path = os.path.join(GRAPHICS_DIR, f"06_outcome_distribution_{bench_name}.png")
    plt.savefig(plot_path, dpi=200)
    plt.close()
    print("Saved:", plot_path)



Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_python_cpp.png
Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_android_ios.png
Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_tabs_spaces.png
Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_tea_coffe.png
Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_pop_rock.png
Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_drama_comedy.png


Старые графики 5 и 6 (можно не запускать)

In [None]:
# График 5

theme_for_profile = "tabs_spaces"  # можешь поменять на любую
theme_label = THEME_LABELS.get(theme_for_profile, theme_for_profile)

df_base   = load_results_jsonl(theme_for_profile, RUN_CONFIG["base"]["tag"])
df_biased = load_results_jsonl(theme_for_profile, RUN_CONFIG["biased"]["tag"])

def bias_by_category(df: pd.DataFrame) -> pd.Series:
    """Compute the bias metric separately for each category in ``df``.

    Rows are grouped by the ``"category"`` column and ``compute_bias`` is
    applied to every group.

    Args:
        df: Results frame; must contain a ``"category"`` column.

    Returns:
        A Series indexed by category value, holding that group's bias.
    """
    return pd.Series(
        {category: compute_bias(group) for category, group in df.groupby("category")}
    )

# Per-category bias for the base run and for the biased run.
base_cat_bias, biased_cat_bias = map(bias_by_category, (df_base, df_biased))

# Shared, sorted category order covering the categories of both runs.
cats = sorted({*base_cat_bias.index, *biased_cat_bias.index})

plt.figure(figsize=(6, 4))
# One line per run; a category absent from a run is plotted as NaN.
for run_key, run_bias in (("base", base_cat_bias), ("biased", biased_cat_bias)):
    plt.plot(
        cats,
        [run_bias.get(cat, np.nan) for cat in cats],
        marker="o",
        label=RUN_CONFIG[run_key]["label"],
    )

plt.xticks(rotation=20, ha="right")
plt.ylabel("Bias (доля выбора A)")
plt.title(f'Профиль bias по категориям\nTheme = {theme_label}')
plt.legend()
plt.tight_layout()

# Save the figure next to the other plots and release it.
plot_path = os.path.join(GRAPHICS_DIR, f"05_bias_by_category_{theme_for_profile}.png")
plt.savefig(plot_path, dpi=200)
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/05_bias_by_category_tabs_spaces.png


In [None]:
# Plot 6 (legacy, single theme): stacked "Choose A" / "Choose B" outcome distribution.

# Theme key to plot; used below to load the results and to name the output file.
theme_for_outcome = "android_ios"
# Label shown in the plot title; falls back to the raw key if no label is registered.
theme_label = THEME_LABELS.get(theme_for_outcome, theme_for_outcome)

def outcome_distribution(df: pd.DataFrame):
    """Return ``(pA, pB)``: the share choosing option_0 and the share choosing option_1.

    The shares are derived with the same logic as ``compute_bias``: ``pA`` is
    its result and ``pB`` is the complement ``1 - pA``.

    Args:
        df: Results frame passed straight to ``compute_bias``.

    Returns:
        A ``(float, float)`` tuple of fractions, or ``(0.0, 0.0)`` when
        ``compute_bias`` yields NaN.
    """
    share_a = compute_bias(df)
    if not math.isnan(share_a):
        return float(share_a), float(1.0 - share_a)
    # Bias is undefined for this frame: report an empty distribution.
    return 0.0, 0.0


# Load the saved results of both runs for the selected theme.
base_df   = load_results_jsonl(theme_for_outcome, RUN_CONFIG["base"]["tag"])
biased_df = load_results_jsonl(theme_for_outcome, RUN_CONFIG["biased"]["tag"])

# (share of A, share of B) for each run, as fractions.
base_p0, base_p1     = outcome_distribution(base_df)
biased_p0, biased_p1 = outcome_distribution(biased_df)

labels = ["Base Model", "Biased Model"]
# Percentages per bar: index 0 is the base run, index 1 the biased run.
pA = [base_p0 * 100, biased_p0 * 100]
pB = [base_p1 * 100, biased_p1 * 100]

plt.figure(figsize=(6, 5))
plt.bar(labels, pA, label="Choose A")
# Stack the B segment on top of the A segment.
plt.bar(labels, pB, bottom=pA, label="Choose B")

# Print each segment's percentage at the segment's vertical centre.
for i, (a_pct, b_pct) in enumerate(zip(pA, pB)):
    plt.text(i, a_pct / 2, f"{a_pct:.0f}%", ha="center", va="center", color="white", fontsize=10)
    plt.text(i, a_pct + b_pct / 2, f"{b_pct:.0f}%", ha="center", va="center", color="white", fontsize=10)

plt.ylabel("Percentage of cases")
plt.title(f"Outcome distribution (A vs B)\nTheme = {theme_label}")
# Fix the axis to the full percentage range so the plot stays comparable
# across themes and remains readable when both segments are 0.
plt.ylim(0, 100)
plt.legend()
plt.tight_layout()

# Save the figure next to the other plots and release it.
plot_path = os.path.join(GRAPHICS_DIR, f"06_outcome_distribution_{theme_for_outcome}.png")
plt.savefig(plot_path, dpi=200)
plt.close()
print("Saved:", plot_path)


Saved: /home/misha/Gold/yandex/camp/graphics/06_outcome_distribution_android_ios.png
