In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
# Load the per-model test performance table from the sibling `data/` folder.
# The path is built relative to the parent directory instead of calling
# os.chdir('..'): if the read fails, the kernel's working directory is left
# untouched, so re-running this cell (or any later relative path) still works.
this_dir = os.getcwd()     # kept so the notebook's working directory stays named
test_perform_full = pd.read_csv( os.path.join(os.pardir, 'data', 'test_perform_full.csv') )
# read_csv parses the literal string 'None' as a missing value (NaN), so the
# 'None' strategy label has to be restored after loading
test_perform_full['strat'] = test_perform_full['strat'].fillna('None')

In [3]:
# Column groups of the performance table:
#   id_cols      - identify a trained model / configuration
#   perform_cols - test-set performance metrics
#   di_cols      - every column whose name contains the '|DI' marker
id_cols = ['model_name', 'model_type', 'score_var', 'strat', 'best_epoch']
perform_cols = [ 'test_acc', 'test_auc', 'test_f1', 'irt_ability' ]
di_cols = [ name for name in test_perform_full.columns if '|DI' in name ]


# Mean and variance of each DI column per (score_var, strat) group, i.e.
# aggregated across the models in that group.
di_gdf = test_perform_full.groupby(['score_var', 'strat'])[di_cols].agg(['mean', 'var'])
# flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names
di_gdf.columns = [ f'{col}_{stat}' for col, stat in di_gdf.columns ]

# Normal-approximation confidence interval (alpha = 0.05) around each mean:
#   mean +/- z * sqrt(var) / sqrt(num_models)
z = 1.96
num_models = 22     # since performance from 22 trained models
root_n = np.sqrt( num_models )
for di_col in di_cols:
    # standard error of the mean for this DI column
    std_err = di_gdf[di_col + '_var'].pow(0.5) / root_n

    di_gdf[di_col + '_cilow'] = di_gdf[di_col + '_mean'] - z*std_err
    di_gdf[di_col + '_cihigh'] = di_gdf[di_col + '_mean'] + z*std_err


In [4]:
# Maps each `score_var` value to the task label written to the output table.
# NOTE(review): the key 'wer' does not follow the naming of the other score
# variables -- confirm it matches the value stored in the data.
sv_d = {
    'Anxiety':       'Anxiety',
    'Numeracy':      'Numeracy',
    'SubjectiveLit': 'Literacy',
    'TrustPhys':     'Trust',
    'wer':           'Depression',
}

# Maps each '|DI' column name to the short demographic label written to the
# output table.
demo_d = {
    'Sex|DI':           'Sex',
    'Age_Senior|DI':    'Age',
    'Race_POC|DI':      'Race',
    'Education_Low|DI': 'Educ.',
    'Income_Low|DI':    'Inc.',
    'ESL|DI':           'ESL',
}


# Build the long-format output table: one row per (task, demographic) holding
# the confidence-interval bounds, formatted to 4 decimals, for the two
# strategies being compared. Rows with strat == 'None' fill the Random_*
# columns and rows with strat == 'Constant' fill the Hard_* columns.
#
# Rows are collected in a list and the frame is built once at the end, rather
# than growing it with pd.concat inside the loop (quadratic copying, and it
# leaves a duplicated index that then has to be repaired).
out_di_columns = ['Task', 'Demographic', 'Random_Low', 'Random_High', 'Hard_Low', 'Hard_High']
out_di_records = []
for svar in test_perform_full['score_var'].unique():

    rand_row = di_gdf.loc[(svar, 'None'), :]
    hard_row = di_gdf.loc[(svar, 'Constant'), :]

    for demo in di_cols:

        rand_low, rand_high = rand_row[demo+"_cilow"], rand_row[demo+"_cihigh"]
        hard_low, hard_high = hard_row[demo+"_cilow"], hard_row[demo+"_cihigh"]

        # bounds are stored as fixed 4-decimal strings
        rand_low, rand_high = f'{rand_low:.4f}', f'{rand_high:.4f}'
        hard_low, hard_high = f'{hard_low:.4f}', f'{hard_high:.4f}'

        out_di_records.append({
            'Task': sv_d[svar], 'Demographic': demo_d[demo],
            'Random_Low': rand_low,
            'Random_High': rand_high,
            'Hard_Low': hard_low,
            'Hard_High': hard_high
        })

# explicit columns keep the schema stable even when there are no rows
out_di_df = pd.DataFrame(out_di_records, columns=out_di_columns)

# Write next to the input data. The path is relative to the parent directory,
# so the kernel's working directory is never changed and a failed write cannot
# leave it pointing at the wrong folder.
out_di_df.to_csv( os.path.join(os.pardir, 'data', 'disp_impact_table.csv'), index=False )