In [1]:
### Define Chain(s)

from niagara import Chain, Model, ModelIntrinsicLogProb, NullTransformation, LogisticRegressionCalibrator

def _build_llama_model(model_name):
    """Return a chain `Model` named `model_name` with the shared settings.

    Every call constructs fresh confidence-signal, transformation and
    calibrator objects, so the models in the chain do not share instances.
    """
    # NOTE(review): presumably reject=-10000 means "never reject" for
    # log-probability signals -- confirm against the niagara documentation.
    return Model(
        model_name=model_name,
        thresholds={"reject": -10000, "accept": 0.0},
        conf_signal=ModelIntrinsicLogProb(),
        conf_signal_transform=NullTransformation(),
        conf_signal_calibrator=LogisticRegressionCalibrator(),
    )


# The chain's model order is the order of this list of names.
llama_chain = Chain(
    models=[
        _build_llama_model(llama_name)
        for llama_name in ["llama3.2-1b", "llama3.2-3b", "llama3.1-8b", "llama3.1-70b", "llama3.1-405b"]
    ]
)

In [2]:
### Select chain, benchmark, transformation, and grab data

import pickle
from niagara import OneSidedAsymptoticLog, TwoSidedAsymptoticLog

# Benchmark and chain under study; both names are interpolated into the
# result-file paths below.
NAME = "xsum"
CHAIN_NAME = "llama_chain"
CHAIN = llama_chain

# A single transformation instance is installed on every model of the chain.
TRANSFORM = TwoSidedAsymptoticLog()
for chain_model in CHAIN.models:
    chain_model.conf_signal_transform = TRANSFORM

train_results_path = f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_train.pkl'
test_results_path = f'../benchmarks/data/{NAME}/chain_results/{NAME}_full_{CHAIN_NAME}_results_test.pkl'

# NOTE: pickle.load can execute arbitrary code while deserializing; only point
# these paths at result files you trust.
with open(train_results_path, 'rb') as train_file:
    results_train = pickle.load(train_file)
with open(test_results_path, 'rb') as test_file:
    results_test = pickle.load(test_file)

In [3]:
### Compute calibrated confidence values

def process_scores(scores, threshold=20):
    """Collapse a mapping of scores into a single pass/fail boolean.

    Args:
        scores: Mapping whose values can be summed (only ``.values()`` is
            read). NOTE(review): presumably the per-criterion scores of one
            generated summary -- confirm against the benchmark data.
        threshold: Minimum total (inclusive) for the result to be True.
            Defaults to 20, the cut-off this notebook has always used.

    Returns:
        True when the values of ``scores`` sum to at least ``threshold``,
        otherwise False. An empty mapping sums to 0.
    """
    return sum(scores.values()) >= threshold

# For xsum every stored correctness entry is a mapping of scores that
# `process_scores` reduces to a boolean; for any other benchmark the stored
# correctness values are used unchanged.
if NAME != "xsum":
    raw_corr_train = results_train['model_correctness']
else:
    raw_corr_train = {
        name: [process_scores(entry) for entry in entries]
        for name, entries in results_train['model_correctness'].items()
    }

raw_conf_train = results_train['raw_confidences']

# Re-key the per-model dictionaries into lists that follow the chain's model order.
corr_train = [raw_corr_train[name] for name in CHAIN.model_names]

transformed_conf_train = [
    list(TRANSFORM.transform_confidence_signal(raw_conf_train[name]))
    for name in CHAIN.model_names
]

# One record per chain model, pairing its correctness labels with its
# transformed confidences.
calibration_data = [
    {"correctness": corr, "transformed_confidence": conf}
    for corr, conf in zip(corr_train, transformed_conf_train)
]

CHAIN.calibrate(calibration_data)

# Push the training confidences through each model's own calibrator.
calibrated_conf_train = [
    list(CHAIN.models[idx].conf_signal_calibrator.calibrate_confidence_signal(conf))
    for idx, conf in enumerate(transformed_conf_train)
]

Optimization terminated successfully.
         Current function value: 0.251511
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.478912
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.559783
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.386780
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.447967
         Iterations 6


In [4]:
### Compute test data

# xsum stores a mapping of scores per entry, which `process_scores` reduces to
# a boolean; every other benchmark's correctness values are taken as stored.
raw_corr_test = (
    {
        name: [process_scores(entry) for entry in entries]
        for name, entries in results_test['model_correctness'].items()
    }
    if NAME == "xsum"
    else results_test['model_correctness']
)

raw_conf_test = results_test['raw_confidences']

# Lists below are ordered like CHAIN.model_names.
corr_test = [raw_corr_test[name] for name in CHAIN.model_names]

transformed_conf_test = [
    list(TRANSFORM.transform_confidence_signal(raw_conf_test[name]))
    for name in CHAIN.model_names
]

# Apply each model's calibrator to its test confidences; nothing is fitted in
# this cell.
calibrated_conf_test = [
    list(CHAIN.models[idx].conf_signal_calibrator.calibrate_confidence_signal(conf))
    for idx, conf in enumerate(transformed_conf_test)
]

In [5]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# One column of test-set calibrated confidences per chain model, in chain order.
covariate_data = pd.DataFrame({ name: calibrated_conf_test[i] for i, name in enumerate(CHAIN.model_names)})
# Add an intercept column. The positional indexing below relies on it being
# column 0 (statsmodels prepends the constant by default), so the confidences
# of chain model k live in column k + 1.
covariate_data = sm.add_constant(covariate_data, has_constant='add')

# target model name -> {covariate model name -> p-value}
data = {}
# target model name -> {covariate model name -> fitted coefficient}
coef_data = {}

# Start at index 2: the first model that has both an immediate predecessor and
# at least one earlier model to pair that predecessor with. The upper bound
# follows the chain length instead of a hard-coded 5.
for model_idx in range(2, len(CHAIN.model_names)):
    model_name = CHAIN.model_names[model_idx]
    prior_model_idx = model_idx-1
    prior_model_name = CHAIN.model_names[prior_model_idx]

    # For every model j before the immediate predecessor, fit a logistic
    # regression of this model's test correctness on (intercept, model j's
    # confidence, predecessor's confidence) and record both p-values and
    # coefficients.
    pvals_of_predecessor = []
    coefs_of_predecessor = []

    curr_data = {}
    curr_coef_data = {}

    for j in range(prior_model_idx):
        # Columns: intercept, contender j, immediate predecessor.
        covariate_data_pair = covariate_data.iloc[:, [0, j+1, prior_model_idx+1]]
        model = sm.Logit(corr_test[model_idx], covariate_data_pair)
        result = model.fit()
        pvals = result.pvalues

        # Position 1 is the contender (position 0 is the intercept).
        contender_name = CHAIN.model_names[j]
        pval_of_contender = pvals.iloc[1]
        coef_of_contender = result.params.iloc[1]

        curr_data[contender_name] = pval_of_contender
        curr_coef_data[contender_name] = coef_of_contender

        # The last position is the immediate predecessor.
        pval_of_predecessor = pvals.iloc[-1]
        coef_of_predecessor = result.params.iloc[-1]
        pvals_of_predecessor.append(pval_of_predecessor)
        coefs_of_predecessor.append(coef_of_predecessor)

    # The predecessor appears in every pairwise fit, so it is summarized by the
    # plain mean of its p-values and of its coefficients across those fits.
    avg_pval_of_predecessor = np.mean(pvals_of_predecessor)
    avg_coef_of_predecessor = np.mean(coefs_of_predecessor)
    curr_data[prior_model_name] = avg_pval_of_predecessor
    curr_coef_data[prior_model_name] = avg_coef_of_predecessor

    data[model_name] = curr_data
    coef_data[model_name] = curr_coef_data

Optimization terminated successfully.
         Current function value: 0.689537
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.411545
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.412310
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.436487
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.436613
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.431947
         Iterations 6
