In [1]:
import random
import json
import asyncio

import pandas as pd
from collections import defaultdict
from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio
import pickle
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim


from predictors import *
from llm import *
from prompts import *
from tools import *
from evaluations import *

# Model identifiers, named by relative size.
SMALL_MODEL = "gpt-4.1-nano"
MEDIUM_MODEL = "gpt-4.1-mini"
BIG_MODEL = "gpt-4.1"

# Ordered from small to big; a model's position in this list is its integer id.
MODELS = [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]
model_to_idx = dict(zip(MODELS, range(len(MODELS))))

In [2]:
# Print the first ten questions whose 'gpt-4.1-nano' column is False.
# Requires `df` to be loaded first (the recorded run of this cell hit a NameError).
for q in df.loc[~df['gpt-4.1-nano'], 'question'].iloc[:10]:
    print(q)
    print('============')

NameError: name 'df' is not defined

In [8]:
# Mean of the 'gpt-4.1' result column restricted to the 'valid' split.
df.loc[df['dataset'] == 'valid', 'gpt-4.1'].mean()

0.5727029438001784

In [14]:
# Load the per-question results table and display it. Each row holds the
# question text, the split name in `dataset`, and one boolean column per model.
df = pd.read_csv('./data/dataset_mixture_train.csv')
df

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1
0,Tyler had 15 dogs . Each dog had 5 puppies . H...,train,True,True,True
1,Steven wants to split a collection of cards in...,train,True,True,True
2,Katie had 85 files on her computer . She delet...,train,True,True,True
3,Luna's monthly food budget is equal to 60% of ...,train,False,False,False
4,James buys 3 dirt bikes for $150 each and 4 of...,train,False,False,False
...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False


In [15]:
# Load the precomputed embeddings object (indexed by question text in later cells).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files that come from a trusted source.
with open('./models/embeddings2.pkl', 'rb') as f:
    embeddings = pickle.load(f)

In [16]:
class InteractionDataset(Dataset):
    """Flatten a results table into one sample per (question, model) pair.

    Each item is a tuple ``(model_idx, row_idx, emb, label)``:

    * ``model_idx`` - integer id taken from ``model_to_idx``;
    * ``row_idx``   - index label of the question's row in ``df``;
    * ``emb``       - the question embedding as a float32 tensor;
    * ``label``     - float32 tensor of shape ``(1,)`` built from the model's
      column in that row.

    Args:
        df: Table with a ``question`` column and one column per key of
            ``model_to_idx``.
        embeddings: Mapping from question text to its embedding vector.
        model_to_idx: Mapping from model (column) name to integer id. Its
            iteration order is the order of the samples emitted for each row.
    """

    def __init__(self, df, embeddings, model_to_idx):
        self.samples = []
        for row_idx, row in df.iterrows():
            # One tensor per question, shared by all samples of that question.
            emb = torch.tensor(embeddings[row['question']], dtype=torch.float32)
            # Iterate the mapping that was passed in (not a notebook-level
            # list of model names) so the dataset only depends on its arguments.
            for model, model_idx in model_to_idx.items():
                label = torch.tensor([float(row[model])], dtype=torch.float32)
                self.samples.append((model_idx, row_idx, emb, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


In [66]:
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit one k-nearest-neighbours classifier per model id on the question
# embeddings; knns[i] ends up holding the classifier for model id i.
knns = []
for target_model in range(3):
    def make_ds(ds_name):
        """Return (X, y) arrays holding the `target_model` samples of split `ds_name`."""
        picked = [sample for sample in datasets[ds_name] if sample[0] == target_model]
        X_model = np.stack([sample[2].numpy() for sample in picked])
        y_model = np.array([int(sample[3].item()) for sample in picked])
        return X_model, y_model

    # Fit on the 'train' split, score on the 'valid' split
    X_train, y_train = make_ds('train')
    X_test, y_test = make_ds('valid')

    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    knns.append(knn)
    print(f"Model {target_model}: KNN Accuracy = {acc:.3f}")

Model 0: KNN Accuracy = 0.734
Model 1: KNN Accuracy = 0.756
Model 2: KNN Accuracy = 0.799


In [90]:
# Inspect the first validation sample: (model_idx, row_idx, emb, label).
datasets['valid'][0]

(0,
 365,
 tensor([ 0.0199,  0.0132, -0.0059,  ...,  0.0054, -0.0050, -0.0148]),
 tensor([1.]))

In [99]:
# Spot check: second probability column returned by the first KNN (model id 0)
# for question `q` — presumably the probability of label 1; confirm via knns[0].classes_.
knns[0].predict_proba(np.array(embeddings[q]).reshape(1, -1))[0,1]


0.2

In [57]:
# Inspect the first training sample: (model_idx, row_idx, emb, label).
datasets['train'][0]

(0,
 0,
 tensor([-0.0324, -0.0180,  0.0059,  ...,  0.0101,  0.0225, -0.0236]),
 tensor([1.]))

In [17]:
ds_names = ['train', 'valid']

# Build one InteractionDataset per split; rows are selected by the `dataset` column
datasets = {
    ds: InteractionDataset(df[df.dataset == ds], embeddings, model_to_idx)
    for ds in ds_names
}

# Upper bound on the batch size; a split smaller than this is served as one batch
batch_size = 32

# Shuffle only the training split. The flag must be compared against 'train'
# (the name used in ds_names): comparing against 'training' never matches and
# silently leaves the training data unshuffled.
dataloaders = {
    ds: DataLoader(
        datasets[ds],
        batch_size=min(len(datasets[ds]), batch_size),
        shuffle=(ds == 'train')
    )
    for ds in ds_names
}

# Sanity check: the splits together should hold three samples (one per model)
# for every row of df, so both numbers are expected to match
sum(len(datasets[ds_name]) for ds_name in ds_names), len(df)*3

(9138, 9138)

In [18]:
# Mean of each model's boolean result column over every row of df
# (train and valid rows together).
accuracy = df[['gpt-4.1', 'gpt-4.1-mini', 'gpt-4.1-nano']].mean()

print(accuracy)

gpt-4.1         0.779711
gpt-4.1-mini    0.737360
gpt-4.1-nano    0.564347
dtype: float64


In [19]:
# For each model keep the larger of p and 1 - p, where p is the model's mean
# success rate: the accuracy obtained by always predicting the more frequent outcome.
# NOTE(review): despite the variable name this is a majority-class baseline,
# not the usual "balanced accuracy" (mean of per-class recall).
balanced_accuracy = accuracy.apply(lambda x: max(x, 1 - x))

print(balanced_accuracy)

gpt-4.1         0.779711
gpt-4.1-mini    0.737360
gpt-4.1-nano    0.564347
dtype: float64


In [20]:
# Average of the three per-model baseline values computed above in `balanced_accuracy`.
balanced_accuracy.mean()

0.6938060844823811

In [21]:
# Choose the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class MFModel(nn.Module):
    """Score (model, prompt) pairs from a model embedding and a prompt vector.

    The returned logit is the sum of three terms: a learned per-model bias, a
    linear bias computed from the prompt vector, and the output of a small MLP
    applied to the concatenation of the projected prompt vector and the
    model's latent factors.

    Args:
        n_factors: Size of the shared latent space.
        n_models: Number of distinct model ids.
        text_dim: Dimensionality of the prompt vector.
        hidden_dim: Width of the MLP hidden layer.
        dropout: Dropout probability used in both sub-networks.
    """

    def __init__(self, n_factors, n_models, text_dim, hidden_dim=128, dropout=0.1):
        super().__init__()
        # Learned latent vector and scalar bias for every model id
        self.model_factors = nn.Embedding(n_models, n_factors)
        self.model_biases = nn.Embedding(n_models, 1)

        # Maps the prompt vector into the n_factors-sized latent space
        text_layers = [
            nn.Linear(text_dim, n_factors),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.text_proj = nn.Sequential(*text_layers)

        # MLP over [text latent, model latent] producing the final score
        scoring_layers = [
            nn.Linear(n_factors * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        ]
        self.interaction_proj = nn.Sequential(*scoring_layers)

        # Linear bias term that depends only on the prompt
        self.text_biases = nn.Linear(text_dim, 1, bias=True)

    def forward(self, model_idx, prompt_vector):
        """Return one logit per row; (B, 1) for B model ids and a (B, text_dim) prompt batch."""
        per_model_bias = self.model_biases(model_idx)
        per_text_bias = self.text_biases(prompt_vector)

        text_latent = self.text_proj(prompt_vector)      # (B, n_factors)
        model_latent = self.model_factors(model_idx)     # (B, n_factors)
        joint = torch.cat((text_latent, model_latent), dim=-1)

        return per_model_bias + per_text_bias + self.interaction_proj(joint)

# Instantiate the model for 3 model ids and 3072-dimensional prompt vectors, then move it to `device`.
m = MFModel(n_factors = 64, n_models = 3, text_dim = 3072).to(device)

Using device: mps


In [25]:
class MFModel(torch.nn.Module):
    """Matrix-factorisation style scorer for (model, prompt) pairs.

    The logit is ``model_bias + text_bias + <model_factors, text_factors>``,
    where the text factors are a bias-free linear projection of the prompt
    vector and the inner product is taken over the ``n_factors`` axis.
    """

    def __init__(self, n_factors, n_models, text_dim):
        super().__init__()
        # Per-model latent vector and scalar bias
        self.model_factors = torch.nn.Embedding(n_models, n_factors)
        self.model_biases = torch.nn.Embedding(n_models, 1)

        # Prompt-side projection into the latent space, plus a prompt-only bias
        self.text_factors = torch.nn.Linear(text_dim, n_factors, bias=False)
        self.text_biases = torch.nn.Linear(text_dim, 1, bias=True)

    def forward(self, model_idx, prompt_vector):
        """Return one logit per row, keeping a trailing dimension of size 1."""
        bias_term = self.model_biases(model_idx) + self.text_biases(prompt_vector)
        text_latent = self.text_factors(prompt_vector)
        model_latent = self.model_factors(model_idx)
        dot_term = torch.sum(model_latent * text_latent, dim=1, keepdim=True)
        return bias_term + dot_term

# Instantiate a freshly initialised model and evaluate it on the validation
# loader before any training (validate_model must already be defined in the kernel).
m = MFModel(n_factors = 32, n_models = 3, text_dim = 3072).to(device)
validate_model(m, dataloaders['valid'])


✅ Validation — Loss: 61.0769 | Accuracy: 69.74%
  Model 0: 57.16%
  Model 1: 74.19%
  Model 2: 77.87%


(61.0768780708313,
 0.6973969631236443,
 {0: 0.571583514099783, 1: 0.7418655097613883, 2: 0.7787418655097614})

In [53]:
class MultiHeadBinaryClassifier(nn.Module):
    """One independent two-layer MLP ("head") per model, selected by model index.

    Args:
        n_models: Number of heads to create.
        text_dim: Dimensionality of the prompt vector.
        hidden_dim: Width of each head's hidden layer.
    """

    def __init__(self, n_models, text_dim, hidden_dim):
        super().__init__()
        self.classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(text_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, 1)
            )
            for _ in range(n_models)
        ])

    def forward(self, model_idx, prompt_vector):
        """
        model_idx: (batch_size,) tensor of ints in [0, n_models)
        prompt_vector: (batch_size, text_dim)

        Returns a (batch_size, 1) tensor of logits in the same row order as
        the inputs. An empty batch yields an empty (0, 1) tensor.
        """
        # Each head processes all of its rows in a single batched call instead
        # of one forward pass (and one host sync via .item()) per sample.
        logits = prompt_vector.new_zeros((prompt_vector.size(0), 1))
        for head_idx, head in enumerate(self.classifiers):
            rows = model_idx == head_idx
            # Heads with no rows in this batch are skipped entirely, so they
            # receive no gradient for the batch.
            if rows.any():
                logits[rows] = head(prompt_vector[rows])

        return logits  # (batch_size, 1)

# Instantiate a freshly initialised multi-head classifier and evaluate it on
# the validation loader before any training.
m = MultiHeadBinaryClassifier(hidden_dim = 128, n_models = 3, text_dim = 3072).to(device)
validate_model(m, dataloaders['valid'])


✅ Validation — Loss: 60.5363 | Accuracy: 53.98%
  Model 0: 57.16%
  Model 1: 25.81%
  Model 2: 78.96%


(60.536260187625885,
 0.5397686189443239,
 {0: 0.571583514099783, 1: 0.25813449023861174, 2: 0.789587852494577})

In [23]:


def train_model(model, dataloaders, epochs=5, lr=1e-3):
    """Train `model` with Adam and BCE-with-logits loss, validating after each epoch.

    Args:
        model: Module called as ``model(model_idx, emb)`` that returns logits.
        dataloaders: Mapping with 'train' and 'valid' loaders that yield
            ``(model_idx, _, emb, label)`` batches.
        epochs: Number of passes over the training loader.
        lr: Learning rate handed to Adam.

    Prints the summed batch loss of every epoch and the validation report
    produced by ``validate_model``; returns nothing.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress = tqdm(dataloaders['train'], desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for batch in progress:
            model_idx, _, emb, label = batch

            logits = model(model_idx.to(device).long(), emb.to(device))
            batch_loss = criterion(logits, label.to(device))

            # Standard optimisation step on the current batch
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            loss_value = batch_loss.item()
            total_loss += loss_value
            progress.set_postfix(loss=loss_value)

        print(f"Epoch {epoch+1}: total_loss = {total_loss:.4f}")

        # Report validation metrics once the epoch is complete
        validate_model(model, dataloaders['valid'])


In [24]:
def validate_model(model, dataloader):
    """Evaluate `model` on `dataloader` and print overall and per-model accuracy.

    A prediction counts as positive when ``sigmoid(logit) > 0.5``.

    Args:
        model: Module called as ``model(model_idx, emb)`` that returns logits.
        dataloader: Iterable of ``(model_idx, _, emb, label)`` batches.

    Returns:
        ``(total_loss, total_accuracy, accuracy_by_model)`` where
        ``total_loss`` is the sum of the per-batch loss values,
        ``total_accuracy`` is the fraction of correct predictions (0.0 when
        no sample was seen) and ``accuracy_by_model`` maps each model id
        encountered to its own accuracy.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()

    total_loss = 0
    n_correct = 0
    n_seen = 0
    hits_per_model = {}
    seen_per_model = {}

    with torch.no_grad():
        for batch in dataloader:
            model_idx, _, emb, label = batch
            model_idx = model_idx.to(device).long()
            emb = emb.to(device)
            label = label.to(device)

            logits = model(model_idx, emb)
            total_loss += criterion(logits, label).item()

            predicted = (torch.sigmoid(logits) > 0.5).float()

            # Per-model bookkeeping, one sample at a time
            for pos, model_id in enumerate(model_idx.cpu().tolist()):
                seen_per_model[model_id] = seen_per_model.get(model_id, 0) + 1
                if predicted[pos].item() == label[pos].item():
                    hits_per_model[model_id] = hits_per_model.get(model_id, 0) + 1

            n_correct += (predicted == label).sum().item()
            n_seen += label.size(0)

    total_accuracy = n_correct / n_seen if n_seen > 0 else 0.0
    accuracy_by_model = {
        model_id: hits_per_model.get(model_id, 0) / seen
        for model_id, seen in seen_per_model.items()
    }

    print(f"\n✅ Validation — Loss: {total_loss:.4f} | Accuracy: {total_accuracy:.2%}")
    for m, acc in accuracy_by_model.items():
        print(f"  Model {m}: {acc:.2%}")

    return total_loss, total_accuracy, accuracy_by_model

In [54]:
# Train `m` for six epochs; train_model prints validation metrics after each epoch.
train_model(m, dataloaders, epochs=6, lr=1e-3)

                                                                               

Epoch 1: total_loss = 100.2647

✅ Validation — Loss: 36.7793 | Accuracy: 78.09%
  Model 0: 78.31%
  Model 1: 77.33%
  Model 2: 78.63%


                                                                               

Epoch 2: total_loss = 74.6632

✅ Validation — Loss: 33.9622 | Accuracy: 80.30%
  Model 0: 79.39%
  Model 1: 81.02%
  Model 2: 80.48%


                                                                               

Epoch 3: total_loss = 64.5062

✅ Validation — Loss: 33.8334 | Accuracy: 81.38%
  Model 0: 80.04%
  Model 1: 82.43%
  Model 2: 81.67%


                                                                               

Epoch 4: total_loss = 57.8018

✅ Validation — Loss: 34.3486 | Accuracy: 81.53%
  Model 0: 80.15%
  Model 1: 83.08%
  Model 2: 81.34%


                                                                               

Epoch 5: total_loss = 52.1796

✅ Validation — Loss: 35.2134 | Accuracy: 81.31%
  Model 0: 79.61%
  Model 1: 82.65%
  Model 2: 81.67%


                                                                               

Epoch 6: total_loss = 46.9894

✅ Validation — Loss: 36.3355 | Accuracy: 81.27%
  Model 0: 79.18%
  Model 1: 82.65%
  Model 2: 82.00%


In [27]:
# Save only the parameters (state_dict) of `m`, not the module object itself.
torch.save(m.state_dict(), './models/model_weights_mixture.pth')


In [93]:
# Pick one row by position for a spot check and display it.
idx = 3000
df.iloc[idx]

question             It takes 10 minutes to wash a car, 15 minutes ...
dataset                                                          valid
gpt-4.1-nano                                                     False
gpt-4.1-mini                                                     False
gpt-4.1                                                           True
gpt-4.1-nano-pred                                                  [0]
gpt-4.1-mini-pred                                                  [0]
gpt-4.1-pred                                                       [1]
Name: 3000, dtype: object

In [94]:
# Question text of the selected row; used in other cells as the key into `embeddings`.
q = df.iloc[idx]['question']

In [32]:
def predict_for_question(model, question, embeddings, model_to_idx):
    """Return ``{model_name: probability}`` for a single question.

    Args:
        model: Module called as ``model(model_indices, prompt_batch)`` that
            returns one logit per row.
        question: Key of the question in ``embeddings``.
        embeddings: Mapping from question text to its embedding vector.
        model_to_idx: Mapping from model name to the integer id the network
            was trained with.

    Returns:
        Dict mapping every name in ``model_to_idx`` (in its iteration order)
        to ``sigmoid(logit)`` as a Python float.
    """
    # Run on the device the model lives on rather than on a notebook-level
    # global; a model without parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model_names = list(model_to_idx)

    model.eval()
    with torch.no_grad():
        # Prompt embedding as a single-row batch: (1, text_dim)
        prompt_vector = (
            torch.tensor(embeddings[question], dtype=torch.float32)
            .unsqueeze(0)
            .to(target_device)
        )

        # One row per model: same prompt, different model id
        model_indices = torch.tensor(
            [model_to_idx[name] for name in model_names], dtype=torch.long
        ).to(target_device)
        prompt_batch = prompt_vector.repeat(len(model_names), 1)

        logits = model(model_indices, prompt_batch)
        # Flatten explicitly: a bare squeeze() would collapse the result to a
        # scalar when there is a single model and break the zip below.
        probs = torch.sigmoid(logits).reshape(-1).cpu().tolist()

    return dict(zip(model_names, probs))

# Example call: predicted probability for every model on question `q`.
predict_for_question(m,q,embeddings, model_to_idx)

{'gpt-4.1-nano': 0.9820224046707153,
 'gpt-4.1-mini': 0.9973188042640686,
 'gpt-4.1': 0.9968693852424622}

In [77]:
# Display the (model name, integer id) pairs.
model_to_idx.items()

dict_items([('gpt-4.1-nano', 0), ('gpt-4.1-mini', 1), ('gpt-4.1', 2)])

In [100]:
def predict_for_question(_, q, e, model_to_idx):
    """KNN-backed variant: return ``{model_name: probability}`` for question `q`.

    The first argument is ignored so the function can be called with the same
    arguments as the network-based version. `e` maps question text to its
    embedding, and ``knns[model_idx]`` is the fitted classifier for each model.
    The value is the second column of ``predict_proba`` — presumably the
    probability of label 1; this requires both labels to be present at fit time.
    """
    probabilities = {}
    for model_name, model_idx in model_to_idx.items():
        features = np.array(e[q]).reshape(1, -1)
        probabilities[model_name] = knns[model_idx].predict_proba(features)[0, 1]
    return probabilities


In [33]:
df

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1
0,Tyler had 15 dogs . Each dog had 5 puppies . H...,train,True,True,True
1,Steven wants to split a collection of cards in...,train,True,True,True
2,Katie had 85 files on her computer . She delet...,train,True,True,True
3,Luna's monthly food budget is equal to 60% of ...,train,False,False,False
4,James buys 3 dirt bikes for $150 each and 4 of...,train,False,False,False
...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False


In [101]:
# Put the network in inference mode on the selected device
m.to(device)
m.eval()

# One list of predicted probabilities per model, filled in DataFrame row order
pred_nano, pred_mini, pred_main = [], [], []
model_columns = {
    'gpt-4.1-nano': pred_nano,
    'gpt-4.1-mini': pred_mini,
    'gpt-4.1': pred_main
}

# Query the currently defined predict_for_question once per question
for question in tqdm(df['question'], desc="Generating predictions"):
    probs = predict_for_question(m, question, embeddings, model_to_idx)
    for model_name, collected in model_columns.items():
        collected.append(probs[model_name])

# Attach the collected predictions to df (mutates df in place)
df['gpt-4.1-nano-pred'] = pred_nano
df['gpt-4.1-mini-pred'] = pred_mini
df['gpt-4.1-pred'] = pred_main

Generating predictions: 100%|██████████████| 3046/3046 [02:20<00:00, 21.72it/s]


In [102]:
df

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1,gpt-4.1-nano-pred,gpt-4.1-mini-pred,gpt-4.1-pred
0,Tyler had 15 dogs . Each dog had 5 puppies . H...,train,True,True,True,0.8,0.9,0.8
1,Steven wants to split a collection of cards in...,train,True,True,True,1.0,1.0,1.0
2,Katie had 85 files on her computer . She delet...,train,True,True,True,1.0,1.0,1.0
3,Luna's monthly food budget is equal to 60% of ...,train,False,False,False,0.2,0.6,0.5
4,James buys 3 dirt bikes for $150 each and 4 of...,train,False,False,False,0.3,0.5,0.5
...,...,...,...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True,1.0,1.0,1.0
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False,0.3,0.3,0.3
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True,0.3,0.7,0.7
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False,0.4,0.5,0.4


In [103]:
# Persist the full table (including any '-pred' columns added so far) without the index column.
df.to_csv('./data/dataset_mixture_evaluated_knn.csv', index=False)

In [104]:

# Per-question cost charged for each model, stored as the reciprocal of a
# per-model constant (so a larger divisor means a cheaper model).
# NOTE(review): the origin and unit of the divisors are not shown here — confirm.
MODEL_COSTS = {
    'gpt-4.1-nano': 1/227.67712611512857,
    'gpt-4.1-mini': 1/93.6700698465113,
    'gpt-4.1': 1/44.88011472804115
}

def strategy(models_considered, threshold, model_to_pred):
    """Pick the first considered model whose predicted score reaches `threshold`.

    Args:
        models_considered: Ordered list of candidate model names.
        threshold: Minimum score (inclusive) a candidate needs to be chosen.
        model_to_pred: Mapping from model name to predicted score; it must
            contain every candidate.

    Returns:
        The earliest candidate with ``score >= threshold``, or the last
        candidate in the list when none qualifies.
    """
    scores = [model_to_pred[name] for name in models_considered]
    passing = [
        name
        for name, score in zip(models_considered, scores)
        if score >= threshold
    ]
    return passing[0] if passing else models_considered[-1]


# Planning notes kept as a bare string expression; as the last expression of
# the cell, the notebook echoes it as output.
"""
MODELS = [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]

Possible strategies are for now only
- strategy(MODELS, ...)


Use only validation
Given the threshold t (0.25 0.5 0.75)
Given the strategy(model_to_probs) that returns a model
Calculate for each pair (strategy, threshold) two new columns: 
- the cost of the question, that it's the cost of the used model
- if the result for this question is correct, considering the used model and the known result for that model 
"""

"\nMODELS = [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]\n\nPossible strategies are for now only\n- strategy(MODELS, ...)\n\n\nUse only validation\nGiven the threshold t (0.25 0.5 0.75)\nGiven the strategy(model_to_probs) that returns a model\nCalculate for each pair (strategy, threshold) two new columns: \n- the cost of the question, that it's the cost of the used model\n- if the result for this question is correct, considering the used model and the known result for that model \n"

In [105]:
# Named candidate lists that can be passed to strategy(); each list is ordered
# from the smaller model to the bigger one.
# NOTE(review): the same dictionary is defined again in the following cell.
strategy_tags = {
    'All': MODELS,
    'Smallers': [SMALL_MODEL, MEDIUM_MODEL],
    'Biggers': [MEDIUM_MODEL, BIG_MODEL],
    'Extremes': [SMALL_MODEL, BIG_MODEL],
} 

In [106]:
# Candidate model lists for each routing strategy, smaller model first
strategy_tags = {
    'All': MODELS,
    'Smallers': [SMALL_MODEL, MEDIUM_MODEL],
    'Biggers': [MEDIUM_MODEL, BIG_MODEL],
    'Extremes': [SMALL_MODEL, BIG_MODEL],
}

thresholds = [0.1, 0.25, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

# Work on an independent copy of the validation rows
df_valid = df[df.dataset == 'valid'].copy()

# For every (strategy, threshold) pair record, per question, which model the
# strategy selects, what that model costs, and whether that model's known
# result for the question was correct.
for strategy_name, candidates in strategy_tags.items():
    for t in thresholds:
        outcomes = []
        for _, row in df_valid.iterrows():
            model_to_pred = {model: row[f'{model}-pred'] for model in MODELS}
            selected_model = strategy(candidates, t, model_to_pred)
            outcomes.append((
                selected_model,
                MODEL_COSTS[selected_model],
                bool(row[selected_model]),  # True/False label in original column
            ))

        df_valid[f'used_model@{t}@{strategy_name}'] = [o[0] for o in outcomes]
        df_valid[f'cost@{t}@{strategy_name}'] = [o[1] for o in outcomes]
        df_valid[f'correct@{t}@{strategy_name}'] = [o[2] for o in outcomes]

df_valid

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1,gpt-4.1-nano-pred,gpt-4.1-mini-pred,gpt-4.1-pred,used_model@0.1@All,cost@0.1@All,...,correct@0.7@Extremes,used_model@0.8@Extremes,cost@0.8@Extremes,correct@0.8@Extremes,used_model@0.9@Extremes,cost@0.9@Extremes,correct@0.9@Extremes,used_model@0.95@Extremes,cost@0.95@Extremes,correct@0.95@Extremes
365,Fred has 5 baseball cards . Melanie bought 3 o...,valid,True,True,True,0.7,0.8,0.7,gpt-4.1-nano,0.004392,...,True,gpt-4.1,0.022282,True,gpt-4.1,0.022282,True,gpt-4.1,0.022282,True
426,Joan has 8 orange balloons but lost 2 of them ...,valid,True,True,True,1.0,1.0,1.0,gpt-4.1-nano,0.004392,...,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True
1100,Keith has 20 books . Jason has 21 books . How ...,valid,True,True,True,0.9,0.9,0.9,gpt-4.1-nano,0.004392,...,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True,gpt-4.1,0.022282,True
1102,Joan has 9 blue balloons but lost 2 of them . ...,valid,True,True,True,1.0,1.0,1.0,gpt-4.1-nano,0.004392,...,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True
1135,Mike has 87 baseball cards . Sam bought 13 of ...,valid,True,True,True,0.8,0.8,0.7,gpt-4.1-nano,0.004392,...,True,gpt-4.1-nano,0.004392,True,gpt-4.1,0.022282,True,gpt-4.1,0.022282,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True,1.0,1.0,1.0,gpt-4.1-nano,0.004392,...,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True,gpt-4.1-nano,0.004392,True
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False,0.3,0.3,0.3,gpt-4.1-nano,0.004392,...,False,gpt-4.1,0.022282,False,gpt-4.1,0.022282,False,gpt-4.1,0.022282,False
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True,0.3,0.7,0.7,gpt-4.1-nano,0.004392,...,True,gpt-4.1,0.022282,True,gpt-4.1,0.022282,True,gpt-4.1,0.022282,True
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False,0.4,0.5,0.4,gpt-4.1-nano,0.004392,...,False,gpt-4.1,0.022282,False,gpt-4.1,0.022282,False,gpt-4.1,0.022282,False


In [107]:
# Summarise every (strategy, threshold) pair: mean correctness and mean cost
# over the rows of df_valid.
for strategy_name in strategy_tags:
    print(f"\n📊 Strategy: {strategy_name}")
    for t in thresholds:
        acc = df_valid[f'correct@{t}@{strategy_name}'].mean()
        avg_cost = df_valid[f'cost@{t}@{strategy_name}'].mean()
        # The cost is printed on a natural-log scale, so the label says so;
        # otherwise the negative numbers read as negative costs.
        print(f"  Threshold {t:.2f} → Accuracy: {acc:.2%}, Log Avg Cost: {np.log(avg_cost):.8f}")


📊 Strategy: All
  Threshold 0.10 → Accuracy: 57.27%, Avg Cost: -5.41582618
  Threshold 0.25 → Accuracy: 61.28%, Avg Cost: -5.21104205
  Threshold 0.50 → Accuracy: 66.59%, Avg Cost: -4.83348678
  Threshold 0.60 → Accuracy: 70.28%, Avg Cost: -4.61999712
  Threshold 0.70 → Accuracy: 73.21%, Avg Cost: -4.46136061
  Threshold 0.80 → Accuracy: 75.05%, Avg Cost: -4.30269352
  Threshold 0.90 → Accuracy: 76.68%, Avg Cost: -4.18420610
  Threshold 0.95 → Accuracy: 78.52%, Avg Cost: -4.04174426

📊 Strategy: Smallers
  Threshold 0.10 → Accuracy: 57.27%, Avg Cost: -5.41866164
  Threshold 0.25 → Accuracy: 61.17%, Avg Cost: -5.25586727
  Threshold 0.50 → Accuracy: 65.51%, Avg Cost: -5.02270760
  Threshold 0.60 → Accuracy: 68.55%, Avg Cost: -4.91677527
  Threshold 0.70 → Accuracy: 70.07%, Avg Cost: -4.83977784
  Threshold 0.80 → Accuracy: 71.48%, Avg Cost: -4.76508358
  Threshold 0.90 → Accuracy: 72.56%, Avg Cost: -4.70457525
  Threshold 0.95 → Accuracy: 73.97%, Avg Cost: -4.61258759

📊 Strategy: Bigg

In [108]:
# Persist the validation table with all per-strategy columns, without the index column.
df_valid.to_csv('./data/valid_mixture_knn.csv', index=False)