In [41]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [110]:
# --- Sampling design --------------------------------------------------------
# Split ids 0 .. NUM_SPLITS-1 are matched against the "seed" column of the
# prediction table.
NUM_SPLITS = 5
# Number of random test samples drawn from each split.
ITERATIONS_PER_SPLIT = 20
# Rows drawn for every sampled test set.
TEST_SIZE = 100

# --- Allocation rule --------------------------------------------------------
# Fraction of each sampled test set that gets selected (k = rate * TEST_SIZE).
SELECTION_RATE = 0.5
# A model is kept when its qualified-selection rate is no more than this far
# below the best model's rate.
RASHOMON_EPSILON = 0.01
# 75th percentile translates to 25% qualification
QUALIFICATION_COLUMN = "threshold_25"

# --- Files ------------------------------------------------------------------
PREDICTION_FILE = "predictions/obermeyer/bootstrap_regression.csv"
# NOTE(review): the "sr_50" / "qr_75" parts of the name presumably mirror
# SELECTION_RATE and the qualification percentile -- keep them in sync by hand.
OUTPUT_FILE = "allocations/obermeyer/allocations_sr_50_qr_75.csv"

In [111]:
# Load the prediction table. Later cells read the m_<i> score columns together
# with the "idx", "seed" and qualification columns from it.
full_df = pd.read_csv(PREDICTION_FILE)

# Count the model columns by name instead of subtracting a fixed number of
# metadata columns (y, idx, seed and the threshold columns), so that an extra
# or missing non-model column cannot silently shift the count.
NUM_MODELS = sum(str(col).startswith("m_") for col in full_df.columns)

# Models are addressed downstream as m_1 .. m_NUM_MODELS; fail early if the
# numbering in the file is not contiguous.
assert {f"m_{i}" for i in range(1, NUM_MODELS + 1)}.issubset(full_df.columns), \
    "model columns are expected to be named m_1 .. m_N without gaps"

full_df.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_97,m_98,m_99,m_100,y,idx,seed,threshold_25,threshold_50,threshold_75
0,0.002421,0.00361,0.003165,0.003973,0.002966,0.003023,0.004011,0.003947,0.002069,0.003313,...,0.003707,0.002858,0.003587,0.002265,0.009628,2545,0,1,1,0
1,0.002649,0.003736,0.003636,0.00447,0.004497,0.004651,0.003741,0.00364,0.003776,0.003868,...,0.003432,0.004454,0.004208,0.003848,0.004905,8198,0,1,0,0
2,0.011027,0.009651,0.008816,0.009,0.00781,0.010168,0.008848,0.007403,0.009044,0.009038,...,0.00808,0.009222,0.009075,0.009436,0.009446,46461,0,1,1,0
3,0.000716,0.000319,0.000656,0.00032,0.001263,0.000155,0.000235,0.000826,0.000541,2.6e-05,...,0.000658,0.000799,0.000269,0.000765,0.002361,30620,0,1,0,0
4,0.000632,0.004498,0.005682,0.001974,0.003122,0.002559,0.005822,0.003704,0.002814,0.003566,...,0.00403,0.002461,0.003393,0.003114,0.003996,47418,0,1,0,0


In [112]:
def calculate_rashomon_allocations(seed, iteration, df, k, model_counts, *,
                                   num_models=None, qualification_column=None,
                                   test_size=None, epsilon=None):
    """Collect the distinct top-k allocations produced by the near-best models.

    Every model ``m_1 .. m_<num_models>`` selects the ``k`` rows of ``df`` with
    its largest scores. A model's k' is the sum of the qualification column over
    its selected rows. Models whose rate ``k' / test_size`` is less than the best
    rate minus ``epsilon`` are skipped; the remaining models contribute their
    allocation, and identical allocations are recorded only once.

    Parameters
    ----------
    seed, iteration
        Copied verbatim into every returned record.
    df : pandas.DataFrame
        Must contain the ``m_<i>`` score columns, an ``"idx"`` column and the
        qualification column.
    k : int
        Number of rows each model selects.
    model_counts : list
        Mutated in place: the number of models that passed the epsilon filter
        (before de-duplication) is appended to it.
    num_models, qualification_column, test_size, epsilon : optional, keyword-only
        Overrides for the notebook-level NUM_MODELS, QUALIFICATION_COLUMN,
        TEST_SIZE and RASHOMON_EPSILON. When left as ``None`` the notebook
        constant is read at call time.

    Returns
    -------
    list of dict
        One record per distinct allocation with keys ``seed``, ``iteration``,
        ``allocation_idx``, ``selected`` (sorted ``idx`` values), ``unselected``
        (remaining ``idx`` values in ``df`` order), ``k'`` and ``n'`` (sum of
        the qualification column over all of ``df``).

    Raises
    ------
    ValueError
        If there are no models to evaluate.
    """
    # Fall back to the notebook configuration at call time, so re-running the
    # config cell takes effect without redefining this function.
    if num_models is None:
        num_models = NUM_MODELS
    if qualification_column is None:
        qualification_column = QUALIFICATION_COLUMN
    if test_size is None:
        test_size = TEST_SIZE
    if epsilon is None:
        epsilon = RASHOMON_EPSILON
    if num_models < 1:
        raise ValueError("at least one model column is required")

    # Rank each model once and keep the resulting row labels; they are needed
    # both for k' and, later, for the allocation itself.
    top_k_labels = {}
    k_prime = {}
    for model in range(1, num_models + 1):
        labels = df["m_" + str(model)].nlargest(k).index
        top_k_labels[model] = labels
        k_prime[model] = int(df.loc[labels.to_list(), qualification_column].sum())

    best_k_prime = max(k_prime.values())
    cutoff = (best_k_prime / test_size) - epsilon

    # Values shared by every record of this call.
    all_ids = df["idx"].tolist()
    n_prime = int(df[qualification_column].sum())

    model_count = 0
    seen_allocations = set()
    allocation_data = []
    for model in range(1, num_models + 1):
        if k_prime[model] / test_size < cutoff:
            continue

        model_count += 1
        selected = sorted(df.loc[top_k_labels[model], "idx"].tolist())
        allocation = tuple(selected)
        if allocation in seen_allocations:
            continue
        seen_allocations.add(allocation)

        # Set membership keeps the complement linear in the number of rows.
        selected_ids = set(selected)
        allocation_data.append({
            "seed": seed,
            "iteration": iteration,
            # Records are appended in discovery order, so the current length
            # is the next free allocation index.
            "allocation_idx": len(allocation_data),
            "selected": selected,
            "unselected": [i for i in all_ids if i not in selected_ids],
            "k'": k_prime[model],
            "n'": n_prime,
        })

    model_counts.append(model_count)
    return allocation_data

In [113]:
# Number of people selected in every sampled test set. It does not change
# between iterations, so it is computed once. The tiny offset stops
# floating-point noise from dropping a unit when the product is meant to be a
# whole number (0.29 * 100 evaluates to 28.999999999999996, which int()
# would truncate to 28).
k = int(SELECTION_RATE * TEST_SIZE + 1e-9)

allocation_data = []
# One entry per (split, iteration): how many models passed the epsilon filter.
model_counts = []
for split in range(NUM_SPLITS):
    print(f"Train-Test Split: {split}")
    split_df = full_df[full_df["seed"]==split].copy()

    for i in range(ITERATIONS_PER_SPLIT):
        # random_state=i makes the i-th sample of a split reproducible.
        sample_df = split_df.sample(n=TEST_SIZE, random_state=i)
        allocation_data += calculate_rashomon_allocations(split, i, sample_df, k, model_counts)
print(len(model_counts))
print(sum(model_counts)/len(model_counts))

Train-Test Split: 0
Train-Test Split: 1
Train-Test Split: 2
Train-Test Split: 3
Train-Test Split: 4
100
70.91


In [114]:
# Turn the accumulated allocation records (a list of dicts) into a table with
# one row per record.
allocation_df = pd.DataFrame(allocation_data)

In [108]:
# Preview the first rows of the allocation table.
# NOTE(review): this cell's recorded execution count (108) precedes the counts
# of the surrounding cells (110-115), so its saved output may be stale --
# re-run after the cells above to refresh it.
allocation_df.head()

Unnamed: 0,seed,iteration,allocation_idx,selected,unselected,k',n'
0,0,0,0,"[83, 2903, 2999, 4395, 4476, 7258, 8336, 8430,...","[46876, 25525, 35439, 11596, 1615, 19895, 1779...",25,75
1,0,0,1,"[83, 1617, 2903, 2999, 4395, 4476, 7258, 8336,...","[46876, 25525, 35439, 11596, 1615, 19895, 1779...",25,75
2,0,0,2,"[83, 1617, 2903, 2999, 4395, 4476, 8336, 8430,...","[46876, 25525, 35439, 11596, 1615, 19895, 1779...",25,75
3,0,0,3,"[83, 2903, 2999, 4395, 4476, 7258, 8336, 8430,...","[46876, 35439, 11596, 1615, 19895, 17799, 3300...",25,75
4,0,0,4,"[83, 1617, 2903, 2999, 4395, 4476, 7258, 8336,...","[46876, 25525, 35439, 11596, 1615, 19895, 1779...",25,75


In [115]:
# Create the destination folder first: to_csv does not create missing parent
# directories, and failing here would discard the whole run.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
allocation_df.to_csv(OUTPUT_FILE, index=False)