In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
TEST_SIZE = 100  # rows drawn for each sampled test set
SELECTION_RATE = 0.10  # fraction of each sampled test set that gets selected
NUM_SPLITS = 5  # number of train-test splits (compared against the "seed" column)
ITERATIONS_PER_SPLIT = 20  # random samples drawn per split
PREDICTION_FILE = "predictions/obermeyer/model_predictions_threshold.csv"
# Single source of truth for the qualification threshold, so the column that is
# read and the file that is written cannot drift apart when it is changed.
QUALIFICATION_PERCENTILE = 75  # 75th percentile translates to 25% qualification
QUALIFICATION_COLUMN = f"threshold_{QUALIFICATION_PERCENTILE}"
OUTPUT_FILE = f"allocations/obermeyer/model_allocations_{QUALIFICATION_PERCENTILE}"
# row = (seed, iteration #, allocation #); columns = [selected], [unselected], precision

In [3]:
full_df = pd.read_csv(PREDICTION_FILE)
# Count the model-score columns by name (m_1 ... m_N) rather than subtracting a
# fixed number of metadata columns: the displayed frame also carries an
# "Unnamed: 0" column, which a hard-coded offset of 6 does not account for and
# which would make the model loop ask for a non-existent m_<N+1> column.
NUM_MODELS = sum(str(col).startswith("m_") for col in full_df.columns)
full_df.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_97,m_98,m_99,m_100,y,idx,seed,threshold_25,threshold_50,threshold_75
0,0.002421,0.00361,0.003165,0.003973,0.002966,0.003023,0.004011,0.003947,0.002069,0.003313,...,0.003707,0.002858,0.003587,0.002265,0.009628,2545,0,1,1,0
1,0.002649,0.003736,0.003636,0.00447,0.004497,0.004651,0.003741,0.00364,0.003776,0.003868,...,0.003432,0.004454,0.004208,0.003848,0.004905,8198,0,1,0,0
2,0.011027,0.009651,0.008816,0.009,0.00781,0.010168,0.008848,0.007403,0.009044,0.009038,...,0.00808,0.009222,0.009075,0.009436,0.009446,46461,0,1,1,0
3,0.000716,0.000319,0.000656,0.00032,0.001263,0.000155,0.000235,0.000826,0.000541,2.6e-05,...,0.000658,0.000799,0.000269,0.000765,0.002361,30620,0,1,0,0
4,0.000632,0.004498,0.005682,0.001974,0.003122,0.002559,0.005822,0.003704,0.002814,0.003566,...,0.00403,0.002461,0.003393,0.003114,0.003996,47418,0,1,0,0


In [None]:
def calculate_unique_allocations(seed, iter, df, k, allocation_data,
                                 num_models=None, qualification_column=None):
    """Record each distinct top-k allocation produced by the best-precision models.

    Every model column ``m_1 .. m_<num_models>`` selects the ``k`` rows of ``df``
    with the largest scores. A model's precision is the sum of
    ``qualification_column`` over its selected rows. Only models whose precision
    equals the maximum are kept, and each distinct set of selected rows is
    stored once.

    Parameters
    ----------
    seed, iter : hashable
        First two components of the key under which results are stored.
        (``iter`` shadows the builtin; the name is kept for existing callers.)
    df : pandas.DataFrame
        Sample to allocate over. Must contain the ``m_*`` score columns, an
        ``idx`` identifier column and ``qualification_column``.
    k : int
        Number of rows each model selects.
    allocation_data : dict
        Mutated in place. For the n-th distinct allocation, the key
        ``(seed, iter, n)`` maps to ``[selected, unselected, precision]``.
    num_models : int, optional
        Number of model columns; defaults to the module-level ``NUM_MODELS``.
    qualification_column : str, optional
        Column summed to get precision; defaults to the module-level
        ``QUALIFICATION_COLUMN``.

    Notes
    -----
    ``selected`` and ``unselected`` are both lists of ``idx`` column values, so
    together they partition the sample. Previously ``selected`` held DataFrame
    row labels while ``unselected`` filtered ``idx`` values against them, which
    left selected rows in ``unselected`` whenever the two numberings differed.
    """
    if num_models is None:
        num_models = NUM_MODELS
    if qualification_column is None:
        qualification_column = QUALIFICATION_COLUMN

    # Rank once per model and remember the chosen row labels so the second
    # pass does not have to call nlargest again.
    top_rows = {}
    precision = {}
    for model in range(1, num_models + 1):
        rows = df["m_" + str(model)].nlargest(k).index.to_list()
        top_rows[model] = rows
        precision[model] = int(df.loc[rows, qualification_column].sum())
    best_precision = max(precision.values())

    seen = set()  # sorted row-label tuples of allocations already recorded
    allocation_idx = 0
    for model in range(1, num_models + 1):
        if precision[model] != best_precision:  # if we change rashomon set to be w/in epsilon, adjust this check
            continue

        rows = sorted(top_rows[model])
        key = tuple(rows)
        if key in seen:
            continue
        # Remember the allocation; without this every best-precision model was
        # stored, even when several of them picked exactly the same rows.
        seen.add(key)

        is_selected = df.index.isin(rows)
        selected = sorted(df.loc[is_selected, "idx"].tolist())
        unselected = df.loc[~is_selected, "idx"].tolist()  # keeps df row order
        allocation_data[(seed, iter, allocation_idx)] = [selected, unselected, precision[model]]
        allocation_idx += 1

In [26]:
allocation_data = {}
# Number of people selected per sample. It does not depend on the loop
# variables, so it is computed once. round(..., 9) strips floating-point noise
# before int() truncates (e.g. 0.29 * 100 == 28.999999999999996 would otherwise
# give 28); genuinely fractional products are still truncated as before.
k = int(round(SELECTION_RATE * TEST_SIZE, 9))
for split in range(NUM_SPLITS):
    print(f"Train-Test Split: {split}")
    split_df = full_df[full_df["seed"]==split].copy()

    for i in tqdm(range(ITERATIONS_PER_SPLIT)):
        # random_state=i makes each (split, iteration) sample reproducible
        df = split_df.sample(n=TEST_SIZE, random_state=i) # get TEST_SIZE random sample
        calculate_unique_allocations(split, i, df, k, allocation_data)

Train-Test Split: 0


100%|██████████| 20/20 [00:00<00:00, 27.28it/s]


Train-Test Split: 1


100%|██████████| 20/20 [00:00<00:00, 26.98it/s]


Train-Test Split: 2


100%|██████████| 20/20 [00:00<00:00, 28.95it/s]


Train-Test Split: 3


100%|██████████| 20/20 [00:00<00:00, 30.60it/s]


Train-Test Split: 4


100%|██████████| 20/20 [00:00<00:00, 25.39it/s]


In [27]:
# One row per recorded allocation: the keys of allocation_data become the row
# index and each stored list is spread over the three named columns.
allocation_df = pd.DataFrame.from_dict(
    data=allocation_data,
    orient='index',
    columns=['selected', 'unselected', 'precision'],
)

In [28]:
# Write the (seed, iteration, allocation #) row keys alongside the data. With
# index=False they were dropped, so a row in the file could not be traced back
# to the split and sample it came from.
allocation_df.to_csv(OUTPUT_FILE, index=True, index_label=["seed", "iteration", "allocation"])