In [None]:
import pandas as pd
import numpy as np
import ast

In [None]:
# Path of the allocations CSV that this notebook analyses.
INPUT_ALLOCATIONS = "allocations/obermeyer/allocations_sr_50_qr_75.csv"

# Column of the cleaned data whose value 1/0 splits qualified from unqualified people.
# 75th percentile translates to 25% qualification
QUALIFICATION_COLUMN = "qualified_gagne_1"

In [None]:
# Read the allocation table. `selected` and `unselected` are stored in the CSV
# as Python-literal text, so they are parsed back into Python objects here
# (later cells concatenate and iterate over them).
df = pd.read_csv(INPUT_ALLOCATIONS)
for literal_column in ("selected", "unselected"):
    df[literal_column] = df[literal_column].apply(ast.literal_eval)
df.head()

In [None]:
# Per-person table; later cells match it to allocations through `person_id`.
cleaned_data_path = "data/obermeyer/obermeyer_data_cleaned.csv"
data = pd.read_csv(cleaned_data_path)
data.head()

#### K' = Number of Qualified Selected
#### N' = Number of Qualified in Test Set

In [None]:
# Mean and standard deviation over all rows of `df`, first for k' and then for n'.
for summary_column in ("k'", "n'"):
    print(np.mean(df[summary_column]))
    print(np.std(df[summary_column]))

#### Number of Rashomon Allocations
#### Number of Rashomon Models
#### Number of Rashomon Models Per Allocation


In [None]:
# Group once per (seed, iteration) run and aggregate only the column that is
# needed. Aggregating the whole frame first would also sum/count the
# list-valued `selected`/`unselected` columns, which is wasted work and, for
# `sum`, depends on how the installed pandas treats object columns.
runs_grouped = df.groupby(["seed", "iteration"])

# Number of allocations found per run (non-null `allocation_idx` rows).
unique_allocations = runs_grouped["allocation_idx"].count().reset_index()
print(np.mean(unique_allocations["allocation_idx"]))
print(np.std(unique_allocations["allocation_idx"]))
print()

# Total number of models per run (sum of `model_count` over its allocations).
unique_allocations = runs_grouped["model_count"].sum().reset_index()
print(np.mean(unique_allocations["model_count"]))
print(np.std(unique_allocations["model_count"]))
print()

# Number of models per individual allocation, over all rows.
print(np.mean(df["model_count"]))
print(np.std(df["model_count"]))
print()

#### Number of People Systemically Excluded (Never Selected Across Rashomon Allocations)

In [None]:
# For every (seed, iteration) run, count the people who appear in the
# `unselected` list of every allocation of that run.
#
# Iterating over the groups that actually exist (instead of the cartesian
# product of all seeds and all iterations) avoids an IndexError when a
# seed/iteration pair is absent, and avoids rebuilding a full-frame boolean
# mask for every pair.
metric = []
for _, unselected_lists in df.groupby(["seed", "iteration"])["unselected"]:
    # A groupby group is never empty, so intersection always gets >= 1 set.
    never_selected = set.intersection(*(set(ids) for ids in unselected_lists))
    metric.append(len(never_selected))
print(np.mean(metric))
print(np.std(metric))

In [None]:
def entropy(p):
    """Return the binary entropy (natural log) of a probability `p`.

    The degenerate probabilities 0 and 1 yield 0, which also keeps
    `np.log(0)` from being evaluated.
    """
    if p in (0, 1):
        return 0
    q = 1 - p
    return -((p * np.log(p)) + (q * np.log(q)))

# Per run: average binary entropy of each person's selection frequency across
# the run's allocations (`metric`), next to a baseline built from the
# qualified / unqualified selection rates k'/n' and (k-k')/(n-n').
metric = []
baseline = []
for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"] == seed) & (df["iteration"] == iteration)]
        first = run.loc[run["allocation_idx"] == 0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        # Selected + unselected of one allocation gives everyone in the run.
        people = selected + unselected

        n_prime = run["n'"].mean()
        k_prime = run["k'"].mean()
        n = len(people)
        k = len(selected)
        qualified_entropy = entropy(k_prime / n_prime)
        unqualified_entropy = entropy((k - k_prime) / (n - n_prime))
        # Mix the two entropies by the qualified / unqualified share of people.
        baseline.append((qualified_entropy * (n_prime / n)) + (unqualified_entropy * ((n-n_prime)/n)))

        # How many of the run's allocations select each person.
        allocations = run["selected"].to_list()
        selected_counts = dict.fromkeys(people, 0)
        for a in allocations:
            for p in a:
                selected_counts[p] += 1
        person_entropies = [entropy(count / len(allocations)) for count in selected_counts.values()]
        metric.append(np.mean(person_entropies))
print(np.mean(metric))
print(np.std(metric))
print(np.mean(baseline))
print(np.std(baseline))

In [None]:
def entropy(probs):
    """Return the entropy (natural log) of an iterable of probabilities.

    Only values strictly between 0 and 1 contribute; 0 and 1 would add
    nothing to the sum anyway.
    """
    return -sum(p * np.log(p) for p in probs if 0 < p < 1)

# Entropy of the `kmeans_4` type distribution among the selected people of
# every allocation (`metric`), next to a per-run baseline that mixes the type
# entropy of qualified and unqualified people by their share of the run.
metric = []
baseline = []
for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)].copy()

        # Baseline weights are computed for THIS run. The cell used to read
        # `n` and `n_prime` without assigning them, so it silently reused the
        # values left behind by the last iteration of an earlier cell (or
        # failed with NameError on a fresh kernel).
        n = len(people)
        n_prime = run["n'"].mean()

        qualified_people = test_data.loc[test_data[QUALIFICATION_COLUMN]==1]
        qualified_entropy = entropy(list(qualified_people["kmeans_4"].value_counts()/len(qualified_people)))
        unqualified_people = test_data.loc[test_data[QUALIFICATION_COLUMN]==0]
        unqualified_entropy = entropy(list(unqualified_people["kmeans_4"].value_counts()/len(unqualified_people)))
        baseline.append((qualified_entropy * (n_prime / n)) + (unqualified_entropy * ((n-n_prime)/n)))

        # One entropy value per allocation of the run.
        allocations = run["selected"].to_list()
        for a in allocations:
            selected_data = data.loc[data["person_id"].isin(a)].copy()
            type_probs = list(selected_data["kmeans_4"].value_counts()/len(selected_data))
            metric.append(entropy(type_probs))

print(np.mean(metric))
print(np.std(metric))
print(np.mean(baseline))
print(np.std(baseline))

In [None]:
def entropy(probs):
    """Return the entropy (natural log) of an iterable of probabilities.

    Values equal to 0 or 1 are skipped, since they contribute nothing.
    """
    terms = [p * np.log(p) for p in probs if 0 < p < 1]
    return -sum(terms)

# Same analysis as the `kmeans_4` cell, but with `age` as the type column:
# entropy of the age distribution among the selected people of every
# allocation (`metric`), next to a qualified / unqualified baseline per run.
type_column = "age"

metric = []
baseline = []
for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)].copy()

        # Baseline weights are computed for THIS run. The cell used to read
        # `n` and `n_prime` without assigning them, so it silently reused the
        # values left behind by the last iteration of an earlier cell (or
        # failed with NameError on a fresh kernel).
        n = len(people)
        n_prime = run["n'"].mean()

        qualified_people = test_data.loc[test_data[QUALIFICATION_COLUMN]==1]
        qualified_entropy = entropy(list(qualified_people[type_column].value_counts()/len(qualified_people)))
        unqualified_people = test_data.loc[test_data[QUALIFICATION_COLUMN]==0]
        unqualified_entropy = entropy(list(unqualified_people[type_column].value_counts()/len(unqualified_people)))
        baseline.append((qualified_entropy * (n_prime / n)) + (unqualified_entropy * ((n-n_prime)/n)))

        # One entropy value per allocation of the run. The metric previously
        # still read "kmeans_4" (copy-paste leftover), so it was compared
        # against a baseline computed on a different column.
        allocations = run["selected"].to_list()
        for a in allocations:
            selected_data = data.loc[data["person_id"].isin(a)].copy()
            type_probs = list(selected_data[type_column].value_counts()/len(selected_data))
            metric.append(entropy(type_probs))

print(np.mean(metric))
print(np.std(metric))
print(np.mean(baseline))
print(np.std(baseline))

In [None]:
# Scratch note kept as a comment: `dem_age_band_18-24_tm1`
# As a bare expression this line is a SyntaxError and stops "Run All".
# NOTE(review): presumably the name of an age-band column in the cleaned data - confirm, or delete this cell.

In [None]:
# Collect, over all runs, the fraction of a run's allocations that select each
# person: `q` for qualified people, `uq` for unqualified people.
q = []
uq = []

for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"] == seed) & (df["iteration"] == iteration)]
        first = run.loc[run["allocation_idx"] == 0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)].copy()

        # Zero-initialised selection counters keyed by person_id.
        is_qualified = test_data[QUALIFICATION_COLUMN] == 1
        is_unqualified = test_data[QUALIFICATION_COLUMN] == 0
        qualified_selections = dict.fromkeys(test_data.loc[is_qualified, "person_id"].to_list(), 0)
        unqualified_selections = dict.fromkeys(test_data.loc[is_unqualified, "person_id"].to_list(), 0)

        allocations = run["selected"].to_list()
        for a in allocations:
            for p in a:
                # Anyone not in the qualified counter is tallied as unqualified.
                tally = qualified_selections if p in qualified_selections else unqualified_selections
                tally[p] += 1

        q.extend(np.array(list(qualified_selections.values())) / len(allocations))
        uq.extend(np.array(list(unqualified_selections.values())) / len(allocations))

import matplotlib.pyplot as plt

# Histogram of the selection proportions collected in `uq`.
fig, ax = plt.subplots()
ax.hist(uq, bins=10, edgecolor='black')  # Adjust the number of bins as needed

# Axis labels
ax.set_xlabel('Proportion of times selected across "found" rashomon allocations')
ax.set_ylabel('Frequency')

# Write the figure to disk, then display it
fig.tight_layout()
fig.savefig('test.jpg')
plt.show()

#### Individual Fairness -- Qualified and Unqualified Selections

In [None]:
# Per run: mean and std of the per-person selection rate (share of the run's
# allocations that select the person), separately for qualified and
# unqualified people. The printed values average those statistics over runs.
qualified_avg = []
qualified_std = []
unqualified_avg = []
unqualified_std = []

for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"] == seed) & (df["iteration"] == iteration)]
        first = run.loc[run["allocation_idx"] == 0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)].copy()

        # Zero-initialised selection counters keyed by person_id.
        is_qualified = test_data[QUALIFICATION_COLUMN] == 1
        is_unqualified = test_data[QUALIFICATION_COLUMN] == 0
        qualified_selections = dict.fromkeys(test_data.loc[is_qualified, "person_id"].to_list(), 0)
        unqualified_selections = dict.fromkeys(test_data.loc[is_unqualified, "person_id"].to_list(), 0)

        allocations = run["selected"].to_list()
        for a in allocations:
            for p in a:
                # Anyone not in the qualified counter is tallied as unqualified.
                tally = qualified_selections if p in qualified_selections else unqualified_selections
                tally[p] += 1

        # Counts -> share of this run's allocations.
        qualified_rates = np.array(list(qualified_selections.values())) / len(allocations)
        unqualified_rates = np.array(list(unqualified_selections.values())) / len(allocations)

        qualified_avg.append(np.mean(qualified_rates))
        qualified_std.append(np.std(qualified_rates))
        unqualified_avg.append(np.mean(unqualified_rates))
        unqualified_std.append(np.std(unqualified_rates))

for per_run_values in (qualified_avg, qualified_std, unqualified_avg, unqualified_std):
    print(np.mean(per_run_values))

#### Individual Fairness -- Num Chronic Illnesses

In [None]:
# Individual fairness per allocation: for every person in the run, take the
# absolute difference between their own selection indicator (0/1) and the
# selection rate among everyone with the same `gagne_sum_t` (the person
# included), then average over all people in the run.
#
# The original walked the frame with `iterrows`, re-filtered it for every
# person and tested membership in a Python list, which is quadratic in the
# number of people for every single allocation. The group mean below yields
# the same quantity (up to floating-point summation order) in one pass.
all_metric = []

for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)]

        allocations = run["selected"].to_list()
        metric = []
        for a in allocations:
            # 1.0 where the person is selected by this allocation, else 0.0.
            was_selected = test_data["person_id"].isin(a).astype(float)
            # Selection rate within each group of equal `gagne_sum_t`.
            # NOTE(review): rows with a missing `gagne_sum_t` get NaN here and
            # are skipped by `.mean()`; the original loop divided by zero for
            # such rows. Confirm the column has no missing values.
            neighbor_rate = was_selected.groupby(test_data["gagne_sum_t"]).transform("mean")
            metric.append((was_selected - neighbor_rate).abs().mean())
        all_metric.append(metric)

all_metric_flattened = [m for metric in all_metric for m in metric]
all_metric_min = [min(metric) for metric in all_metric]
print(np.nanmean(all_metric_flattened))
print(np.nanstd(all_metric_flattened))
print(np.nanmean(all_metric_min))

#### Group Fairness -- % Highest Risk Patients That Are Black

In [None]:
# For every allocation, the share of selected people whose `race` is 1.
# Prints the mean and std over all allocations, then the mean over runs of
# the best (largest) share found within each run.
all_props = []

black_people = data.loc[data["race"]==1, "person_id"].to_list()
# Set copy for O(1) membership tests; scanning the list for every selected
# person made this cell quadratic.
black_ids = set(black_people)

for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        allocations = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration), "selected"].to_list()
        props = [sum(p in black_ids for p in a) / len(a) for a in allocations]
        all_props.append(props)

all_props_flattened = [p for props in all_props for p in props]
all_props_max = [max(props) for props in all_props]
print(np.nanmean(all_props_flattened))
print(np.nanstd(all_props_flattened))
print(np.nanmean(all_props_max))

In [None]:
# Upper bound, per run, on the share of selected people with `race` == 1 when
# k' of the k selections must be qualified and the other k - k' unqualified.
# The unused white-patient frames and the intermediate frame copies of the
# original were dropped; only the two counts that enter the bound are kept.
best_prop = []
for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        k = len(selected)
        unselected = first["unselected"].values[0]
        people = selected + unselected

        test_data = data.loc[data["person_id"].isin(people), ["person_id", QUALIFICATION_COLUMN, "race"]]

        # Largest k' recorded for any allocation of the run.
        k_prime = run["k'"].max()
        is_black = test_data["race"]==1
        n_qualified_black = int((is_black & (test_data[QUALIFICATION_COLUMN]==1)).sum())
        n_unqualified_black = int((is_black & (test_data[QUALIFICATION_COLUMN]==0)).sum())

        # Select as many black patients as possible under k' and (k-k') restrictions
        black_selected = min(n_qualified_black, k_prime) + min(n_unqualified_black, k - k_prime)
        best_prop.append(black_selected/k)
print(np.mean(best_prop))
print(np.std(best_prop))

#### Group Fairness -- Difference in # Chronic Illnesses Among Selected Patients By Race

In [None]:
# For every allocation: mean `gagne_sum_t` of the selected people with
# `race` == 1 divided by the same mean for `race` == 0. Prints mean and std
# over all allocations, then the mean over runs of each run's smallest ratio.
all_ratios = []

for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        unselected = first["unselected"].values[0]
        people = selected + unselected
        test_data = data.loc[data["person_id"].isin(people)]
        # Race masks do not depend on the allocation, so build them once.
        is_white = test_data["race"]==0
        is_black = test_data["race"]==1

        ratio = []
        for a in run["selected"]:
            in_allocation = test_data["person_id"].isin(a)
            white_num_illnesses = test_data.loc[is_white & in_allocation, "gagne_sum_t"].mean()
            black_num_illnesses = test_data.loc[is_black & in_allocation, "gagne_sum_t"].mean()
            # NaN when either group has nobody selected (mean of an empty selection).
            ratio.append(black_num_illnesses/white_num_illnesses)
        all_ratios.append(ratio)

all_ratios_flattened = [r for ratios in all_ratios for r in ratios]
# Builtin `min` gives position-dependent answers when a list contains NaN
# (every comparison with NaN is False), so take a NaN-aware minimum; a run
# whose ratios are all NaN contributes NaN, which `nanmean` then ignores.
all_ratios_min = [
    np.nanmin(ratios) if not np.all(np.isnan(ratios)) else np.nan
    for ratios in all_ratios
]
print(np.nanmean(all_ratios_flattened))
print(np.nanstd(all_ratios_flattened))
print(np.nanmean(all_ratios_min))

In [None]:
# Per run: ratio of mean `gagne_sum_t` (race == 1 over race == 0) among the
# people picked by taking the k' qualified and k - k' unqualified people with
# the highest `gagne_sum_t`.
best_ratio = []
for seed in df["seed"].unique():
    for iteration in df["iteration"].unique():
        # Rows of this (seed, iteration) run, and its allocation with index 0.
        run = df.loc[(df["seed"]==seed)&(df["iteration"]==iteration)]
        first = run.loc[run["allocation_idx"]==0]
        selected = first["selected"].values[0]
        k = len(selected)
        unselected = first["unselected"].values[0]
        people = selected + unselected

        test_data = data.loc[data["person_id"].isin(people), ["person_id", QUALIFICATION_COLUMN, "race", "gagne_sum_t"]].reset_index(drop=True)

        # Largest k' recorded for any allocation of the run.
        k_prime = run["k'"].max()
        qualified = test_data[(test_data[QUALIFICATION_COLUMN]==1)]
        unqualified = test_data[(test_data[QUALIFICATION_COLUMN]==0)]

        qualified = qualified.sort_values(by=["gagne_sum_t"], ascending=False).reset_index(drop=True)
        unqualified = unqualified.sort_values(by=["gagne_sum_t"], ascending=False).reset_index(drop=True)

        # Select based on highest chronic illnesses, under k' and (k-k') restrictions
        # (`.loc` slices are label-based and inclusive, hence the "- 1").
        best_selected = pd.concat(
            [qualified.loc[:k_prime-1], unqualified.loc[:(k-k_prime)-1]],
            ignore_index=True,
        )
        # Race masks must come from `best_selected` itself. The original built
        # them from `test_data`, whose row labels no longer match the sorted,
        # re-indexed selection, so race was read from unrelated people.
        white_num_illnesses = best_selected.loc[best_selected["race"]==0, "gagne_sum_t"].mean()
        black_num_illnesses = best_selected.loc[best_selected["race"]==1, "gagne_sum_t"].mean()
        best_ratio.append(black_num_illnesses/white_num_illnesses)
print(np.nanmean(best_ratio))
print(np.nanstd(best_ratio))