In [1]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Experiment configuration; every cell below reads these constants.
TEST_SIZE = 100  # rows drawn (without replacement) per iteration from one split
SELECTION_RATE = 0.25  # fraction of each sample that gets selected (k = SELECTION_RATE * TEST_SIZE)
RASHOMON_EPSILON = 0.01  # tolerated shortfall in k'/k relative to the best model
NUM_SPLITS = 5  # "seed" values 0 .. NUM_SPLITS-1 are processed
ITERATIONS_PER_SPLIT = 20  # subsamples drawn per split; the iteration number is the sampling seed
PREDICTION_FILE = "predictions/obermeyer/bootstrap_regression.csv"
# NOTE(review): an earlier comment here read "75th percentile translates to 25%
# qualification", which does not match 'threshold_50'. Presumably this column is
# the 50th-percentile cutoff (~50% qualify), consistent with "qr_50" in
# OUTPUT_FILE -- confirm against the code that writes PREDICTION_FILE.
QUALIFICATION_COLUMN = 'threshold_50'
OUTPUT_FILE = "allocations/obermeyer/allocations_sr_25_qr_50.csv"

In [3]:
# Load the prediction table: one m_<i> column per model plus metadata columns.
full_df = pd.read_csv(PREDICTION_FILE)

# Count model columns by prefix. A substring test ("m_" in c) would also count
# any unrelated column whose name merely contains "m_".
model_columns = [c for c in full_df.columns if c.startswith("m_")]
NUM_MODELS = len(model_columns)

# Later cells address models as m_1 .. m_NUM_MODELS, so fail fast if the
# numbering has gaps instead of hitting a KeyError deep inside the main loop.
missing_model_columns = [
    f"m_{i}" for i in range(1, NUM_MODELS + 1) if f"m_{i}" not in full_df.columns
]
assert not missing_model_columns, f"missing model columns: {missing_model_columns}"

print(NUM_MODELS)
full_df.head()

100


Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_97,m_98,m_99,m_100,y,idx,seed,threshold_25,threshold_50,threshold_75
0,0.006753,0.008166,0.00779,0.008346,0.007259,0.007138,0.008137,0.00838,0.00664,0.007848,...,0.008216,0.007169,0.007849,0.007113,0.009628,2545,0,1,1,0
1,0.013049,0.013692,0.013846,0.014468,0.014478,0.01445,0.014055,0.013888,0.014334,0.014554,...,0.013755,0.013935,0.014385,0.01423,0.004905,8198,0,1,0,0
2,0.021138,0.018986,0.018488,0.018745,0.017153,0.019729,0.018739,0.017566,0.018916,0.019219,...,0.018457,0.018957,0.019271,0.019365,0.009446,46461,0,1,1,0
3,0.002706,0.002252,0.002763,0.002001,0.00347,0.001979,0.002444,0.00295,0.002548,0.002147,...,0.002663,0.003136,0.00237,0.002492,0.002361,30620,0,1,0,0
4,0.00998,0.01281,0.01443,0.010169,0.011937,0.010077,0.014131,0.012601,0.011287,0.011969,...,0.012869,0.011242,0.012343,0.011475,0.003996,47418,0,1,0,0


In [5]:
def calculate_rashomon_allocations(seed, iteration, df, k, num_models=None,
                                   qualification_column=None, epsilon=None):
    """Enumerate the distinct top-k allocations produced by near-best models.

    Each model column ``m_1`` .. ``m_<num_models>`` of ``df`` selects the ``k``
    rows with the largest predicted values. A model's score ``k'`` is the sum of
    ``qualification_column`` over the rows it selects. Models whose ``k'/k`` is
    more than ``epsilon`` below the best model's ``k'/k`` are dropped; the
    remaining models are grouped by the set of ``idx`` values they select.

    Args:
        seed: Identifier copied into every returned record under ``"seed"``.
        iteration: Identifier copied into every record under ``"iteration"``.
        df: DataFrame holding the model columns, an ``"idx"`` column and the
            qualification column.
        k: Number of rows each model selects; must be positive.
        num_models: Number of model columns to evaluate. Defaults to the
            module-level ``NUM_MODELS``.
        qualification_column: Name of the qualification column. Defaults to
            the module-level ``QUALIFICATION_COLUMN``.
        epsilon: Allowed shortfall in ``k'/k``. Defaults to the module-level
            ``RASHOMON_EPSILON``.

    Returns:
        A list of dicts, one per distinct allocation, ordered by first
        appearance (``allocation_idx`` 0, 1, ...). Keys: ``seed``,
        ``iteration``, ``allocation_idx``, ``selected`` (sorted ``idx``
        values), ``unselected`` (remaining ``idx`` values in ``df`` order),
        ``k'``, ``n'`` (sum of the qualification column over all of ``df``)
        and ``model_count`` (number of kept models that produced exactly
        this allocation).

    Raises:
        ValueError: If ``k`` is not positive (``k'/k`` would be undefined).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Fall back to the notebook-level configuration only for omitted arguments,
    # so the function can also be called without any notebook globals defined.
    if num_models is None:
        num_models = NUM_MODELS
    if qualification_column is None:
        qualification_column = QUALIFICATION_COLUMN
    if epsilon is None:
        epsilon = RASHOMON_EPSILON

    qualified = df[qualification_column]
    # Same for every record, so compute it once; int() mirrors the cast on k'.
    n_prime = int(qualified.sum())

    # Rank each model once; the index is reused for scoring and for building
    # the allocation below.
    top_k_index = {}
    k_prime = {}
    for model in range(1, num_models + 1):
        top_k_index[model] = df["m_" + str(model)].nlargest(k).index
        k_prime[model] = int(qualified.loc[top_k_index[model]].sum())
    best_k_prime = max(k_prime.values())

    # Maps the sorted tuple of selected idx values to its output record.
    # Insertion order equals first-appearance order, i.e. allocation_idx order.
    records = {}
    for model in range(1, num_models + 1):
        if k_prime[model] / k < (best_k_prime / k) - epsilon:
            continue  # not within epsilon of the best model

        selected = sorted(df.loc[top_k_index[model], "idx"].tolist())
        key = tuple(selected)
        existing = records.get(key)
        if existing is not None:
            existing["model_count"] += 1
            continue

        selected_lookup = set(selected)  # O(1) membership for the filter below
        records[key] = {
            "seed": seed,
            "iteration": iteration,
            "allocation_idx": len(records),
            "selected": selected,
            "unselected": [i for i in df["idx"] if i not in selected_lookup],
            "k'": k_prime[model],
            "n'": n_prime,
            "model_count": 1,
        }

    return list(records.values())

In [6]:
# Number of people we will select for. It is identical for every split and
# iteration, so compute it once. round(..., 6) clears floating-point noise
# before int() floors the value (e.g. 0.29 * 100 == 28.999999999999996, which
# a bare int() would truncate to 28).
k = int(round(SELECTION_RATE * TEST_SIZE, 6))

allocation_data = []
for split in range(NUM_SPLITS):
    print(f"Train-Test Split: {split}")
    # Rows belonging to this train-test split; never mutated, so no copy needed.
    split_df = full_df[full_df["seed"] == split]

    for i in range(ITERATIONS_PER_SPLIT):
        # random_state=i makes each iteration's subsample reproducible.
        test_sample = split_df.sample(n=TEST_SIZE, random_state=i)
        allocation_data.extend(calculate_rashomon_allocations(split, i, test_sample, k))

Train-Test Split: 0
Train-Test Split: 1
Train-Test Split: 2
Train-Test Split: 3
Train-Test Split: 4


In [7]:
# One row per distinct allocation record collected above (list of dicts -> DataFrame).
allocation_df = pd.DataFrame(allocation_data)

In [8]:
# Preview. selected/unselected hold lists of idx values, k' is the qualified
# count among the selected rows, n' the qualified count in the whole sample,
# and model_count the number of kept models that produced the allocation.
allocation_df.head()

Unnamed: 0,seed,iteration,allocation_idx,selected,unselected,k',n',model_count
0,0,0,0,"[83, 2903, 4395, 4476, 6945, 8336, 11193, 1159...","[25525, 35439, 1615, 19895, 17799, 33009, 1536...",22,51,1
1,0,1,0,"[339, 1896, 3602, 8704, 9136, 11019, 11484, 12...","[46889, 41011, 28019, 28842, 3102, 15463, 2336...",21,57,1
2,0,1,1,"[339, 1896, 3602, 5395, 7074, 8704, 9136, 1101...","[46889, 41011, 28019, 28842, 3102, 15463, 2336...",21,57,1
3,0,1,2,"[339, 1896, 3602, 7074, 8704, 9136, 11019, 114...","[46889, 41011, 28019, 28842, 3102, 15463, 2336...",21,57,2
4,0,1,3,"[339, 1896, 3602, 5395, 7074, 8704, 9136, 1101...","[46889, 41011, 28019, 28842, 3102, 15463, 2336...",21,57,1


In [9]:
# to_csv does not create missing parent directories; make sure the target
# folder exists so a fresh run does not fail at the very last cell.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)

# The list-valued columns (selected / unselected) are written as their string
# representation, so readers of this file have to parse them back into lists.
allocation_df.to_csv(OUTPUT_FILE, index=False)