In [None]:
from scripts.calibration_module import *
from scripts.sampling_module import *
from scripts.utils import * 
import tempfile

: 

In [None]:
# Cell 1: imports & setup
from scripts.utils import *
from scripts.sampling_module import *
# The original line `from ... import * (` was a SyntaxError. Keep the star
# import (other cells may rely on names it provides) and additionally import
# the two calibration/prediction entry points explicitly so their origin is clear.
from scripts.calibration_module import *
from scripts.calibration_module import (
    compute_accuracy_and_calibration_threshold_optimal,
    generate_prediction_sets_optimal,
)
import matplotlib.pyplot as plt
import numpy as np

# Adjust to your filename in /data.
# NOTE(review): "GSMK8k" looks like it may be a misspelling of "GSM8K" — the
# string is left untouched because it must match the file on disk; confirm.
DATA_PATH = "../data/GSMK8k_semantic_clusters.json"
# `load_dataset` is expected to come from one of the star imports above.
dataset = load_dataset(DATA_PATH)



: 

In [None]:

def compute_accuracy(clusters, normalize_func, instances=None, sampling_kwargs=None):
    """
    Computes best-of-N accuracy: fraction of instances where the true answer
    appears among any of the sampled responses.

    Args:
        clusters: Not read. Kept only so existing positional calls keep
            working (the original implementation overwrote this argument on
            every loop iteration without ever using the caller's value).
        normalize_func: Callable applied to every sampled response and to the
            instance's ``'true_answer'`` before they are compared.
        instances: Iterable of instance dicts with ``'prediction_history'``
            and ``'clusters'`` keys (and optionally ``'true_answer'``).
            Defaults to the notebook-level ``dataset``, which is what the
            original implementation always iterated over.
        sampling_kwargs: Keyword arguments forwarded to
            ``simulate_and_cluster``. Defaults to the notebook-level
            ``algorithm_kwargs``, matching the original behavior (a
            ``NameError`` is raised if that global does not exist).

    Returns:
        ``correct / total`` as a float, or ``0.0`` when there are no instances.
    """
    if instances is None:
        instances = dataset
    if sampling_kwargs is None:
        sampling_kwargs = algorithm_kwargs

    total = 0
    correct = 0

    for inst in instances:
        # Run sampling & clustering; only the sampled responses are needed here.
        responses, _, _, _, _ = simulate_and_cluster(
            inst['prediction_history'],
            inst['clusters'],
            **sampling_kwargs
        )
        # Kept as a list (not a set) so unhashable normalized values still work.
        resps_norm = [normalize_func(r) for r in responses]

        # A missing true answer is normalized from the empty string.
        true_norm = normalize_func(inst.get('true_answer', ""))

        total += 1
        if true_norm in resps_norm:
            correct += 1

    return correct / total if total > 0 else 0.0

In [None]:


def _get_prediction_history(inst):
    """Return the instance's prediction history, accepting either key spelling."""
    if 'prediction_history' in inst:
        return inst['prediction_history']
    # The original code read the misspelled key 'prediction_hisotry'; fall back
    # to it in case a data file really uses that spelling. Raises KeyError if
    # neither key is present.
    return inst['prediction_hisotry']


def _summarize_clusters(clusters):
    """Reduce each cluster to its representative response, probability and member count."""
    summary = {}
    for cid, data in clusters.items():
        is_ee = cid == "EE"
        # The "EE" cluster is special-cased: no representative and zero
        # members, and its 'responses' entry is never read.
        if is_ee:
            representative = None
        elif data['responses']:
            first = data['responses'][0]
            representative = {"text": first['text'], "log_prob": first['log_prob']}
        else:
            representative = {"text": "", "log_prob": 0}
        summary[cid] = {
            "representative": representative,
            "probability": data['probability'],
            "num_members": 0 if is_ee else len(data['responses'])
        }
    return summary


def _simulate_instances(instances, sampling_kwargs):
    """Run simulate_and_cluster on every instance and build the JSON-ready records."""
    records = []
    for inst in instances:
        _, clusters, sem_probs, _, _ = simulate_and_cluster(
            _get_prediction_history(inst),
            inst['clusters'],
            **sampling_kwargs
        )
        records.append({
            "question": inst.get("question", ""),
            "true_answer": inst.get("true_answer", ""),
            "clusters": _summarize_clusters(clusters),
            "semantic_probs": sem_probs
        })
    return records


def _write_temp_json(records=None):
    """Create a persistent temp .json file (optionally filled with records) and return its path."""
    import json
    import tempfile

    # delete=False so the file survives the `with` block and can be reopened
    # by path; the caller is responsible for removing it.
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.json', delete=False) as tmp:
        if records is not None:
            json.dump(records, tmp, indent=2)
        return tmp.name


def _remove_quietly(*paths):
    """Best-effort removal of temp files; a failed cleanup must not mask the real result or error."""
    import os

    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass


def evaluate_algorithms_optimal(
    full_results,
    algorithms,
    alpha: float = 0.2,
    n_splits: int = 5,
    test_size: float = 0.5,
    seed: int = 42
):
    """
    Compare multiple sampling‐clustering algorithms over random splits.
    Returns both mean and std of each metric across splits.

    For every split the data is shuffled and cut into a calibration part and a
    validation part. For every algorithm, the calibration part is simulated and
    passed (as a temp JSON file) to ``alg['calibration_fn']`` to obtain a
    threshold; the validation part is then simulated and passed to
    ``alg['prediction_fn']`` together with that threshold, and the output file
    is scored with ``compute_coverage_and_set_stats``.

    Args:
        full_results: Sequence of instance dicts. Each needs a
            ``'prediction_history'`` entry (the misspelled
            ``'prediction_hisotry'`` is also accepted) and a ``'clusters'``
            entry; ``'question'`` and ``'true_answer'`` are optional.
        algorithms: List of dicts with ``'name'``, ``'calibration_fn'`` and
            ``'prediction_fn'``; all remaining keys are forwarded to
            ``simulate_and_cluster`` as keyword arguments.
        alpha: Passed to ``calibration_fn`` as ``alpha=``.
        n_splits: Number of random calibration/validation splits.
        test_size: Fraction of instances placed in the validation part.
        seed: Seed for the global ``random`` module (re-seeded on every call).

    Returns:
        metrics_summary: dict mapping alg_name -> dict of metrics:
            {
              'coverage': {'mean': ..., 'std': ...},
              'ee_frac':  {...},
              'avg_size': {...},
              'accuracy': {...},
              'em_calib_accuracy': {...}
            }
    """
    import random

    # Seeds the *global* generator on purpose: the shuffle below uses it, and
    # any code that draws from the global `random` state is affected as well.
    random.seed(seed)
    N = len(full_results)

    metric_names = ('coverage', 'ee_frac', 'avg_size', 'accuracy', 'em_calib_accuracy')
    metrics_accum = {
        alg['name']: {metric: [] for metric in metric_names}
        for alg in algorithms
    }

    # Keys that configure the evaluation itself rather than the sampler.
    excluded = {'name', 'calibration_fn', 'prediction_fn'}

    indices = list(range(N))
    cutoff = int(N * (1 - test_size))
    for split in range(n_splits):
        # Shuffles in place, so each split permutes the previous split's order.
        random.shuffle(indices)
        calib_set = [full_results[i] for i in indices[:cutoff]]
        val_set = [full_results[i] for i in indices[cutoff:]]

        for alg in algorithms:
            name = alg['name']
            sampling_kwargs = {k: v for k, v in alg.items() if k not in excluded}

            # --- Calibration Phase ---
            calib_path = _write_temp_json(_simulate_instances(calib_set, sampling_kwargs))
            try:
                em_acc, threshold, _ = alg['calibration_fn'](calib_path, alpha=alpha)
            finally:
                _remove_quietly(calib_path)
            metrics_accum[name]['em_calib_accuracy'].append(em_acc)

            # --- Validation Phase ---
            # Simulated only after calibration so calls happen in the same
            # order as before.
            val_in_path = _write_temp_json(_simulate_instances(val_set, sampling_kwargs))
            # Empty placeholder file that prediction_fn writes its output to.
            val_out_path = _write_temp_json()
            try:
                alg['prediction_fn'](val_in_path, threshold, val_out_path)
                coverage, avg_size, ee_frac, acc = compute_coverage_and_set_stats(val_out_path)
            finally:
                _remove_quietly(val_in_path, val_out_path)

            # Accumulate
            metrics_accum[name]['coverage'].append(coverage)
            metrics_accum[name]['ee_frac'].append(ee_frac)
            metrics_accum[name]['avg_size'].append(avg_size)
            metrics_accum[name]['accuracy'].append(acc)

    # Compute mean and (population) std for each metric across splits
    metrics_summary = {
        name: {
            metric: {
                'mean': np.mean(values),
                'std': np.std(values)
            }
            for metric, values in metric_dict.items()
        }
        for name, metric_dict in metrics_accum.items()
    }

    return metrics_summary


In [None]:
# Algorithm configurations to compare. Inside evaluate_algorithms_optimal every
# key except "name", "calibration_fn" and "prediction_fn" is forwarded to
# simulate_and_cluster as a keyword argument.
algorithms = [
    dict(
        name="bootstrap_derivative, Optimal Calibration",
        sampling_criteria="derivative",
        derivative_type="bootstrap",
        lambda_threshold=0.0015,
        initial_derivative=3,
        min_queries=10,
        calibration_fn=compute_accuracy_and_calibration_threshold_optimal,
        prediction_fn=generate_prediction_sets_optimal,
        # NOTE(review): 'h' is deliberately not set; per the original note it
        # falls back to 20 bootstrap replicates — confirm against the sampler.
    ),
]

### Plotting - running the final simulation

In [None]:
# Cell 8: Evaluate all algorithms over a sweep of α and save the metrics for later plotting
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Significance levels (α) to evaluate: 0.20 down to 0.05 in steps of 0.01
alphas = [0.2, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]

# Metrics collected per α ('em_calib_accuracy' is returned by the evaluation
# but intentionally not stored here, as before).
summary_metrics = ['coverage', 'ee_frac', 'avg_size', 'accuracy']

# For each algorithm and metric, two parallel lists (mean / std) aligned with `alphas`
metrics_by_alpha = {
    alg['name']: {metric_key: {'mean': [], 'std': []} for metric_key in summary_metrics}
    for alg in algorithms
}

# Loop over each α, run evaluation, and collect metrics
for alpha in alphas:
    print(f"Evaluating at alpha = {alpha}")
    metrics = evaluate_algorithms_optimal(
        dataset,    # your loaded full_results list
        algorithms,
        alpha=alpha,
        n_splits=10
    )
    print("metrics", metrics)
    for alg_name, vals in metrics.items():
        for metric_key in summary_metrics:
            metrics_by_alpha[alg_name][metric_key]['mean'].append(vals[metric_key]['mean'])
            metrics_by_alpha[alg_name][metric_key]['std'].append(vals[metric_key]['std'])

# Save in Drive so the results can be plotted later

# 1) Make sure the Drive output directory exists
out_dir = "/content/drive/MyDrive/CPQ_results"
os.makedirs(out_dir, exist_ok=True)

# 2) Single source of truth for the output file. Previously the file was
#    written to this name while the success message printed a different,
#    unrelated path (and called the pickle "JSON").
pickle_path = os.path.join(out_dir, "appendix_budegts_gsm8k_avg20_lambda0.0015.pkl")
# Legacy variable name, kept in case other cells reference it; it now points
# at the file that is actually written.
json_path = pickle_path

payload = {
    'alphas': alphas,                    # the α values evaluated above, in order
    'metrics_by_alpha': metrics_by_alpha # per-algorithm mean/std lists aligned with 'alphas'
}

with open(pickle_path, "wb") as f:
    pickle.dump(payload, f)
print("Saved metrics_by_alpha (pickle) to", pickle_path)


: 