## NOTE:

- **The /kld_demos/ data could not be included due to privacy concerns, so this code will not execute successfully.**

In [None]:
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)


# Lookup tables used to rename raw column names to short, readable output labels.

# complexity-metric column -> abbreviation
comp_d = dict(
    boundary_dist="BD",
    times_forgotten="TF",
    instance_hardness="IH",
    losses="Loss",
    irt_difficulty="IRT",
    tok_len="SL",
    pvi="PVI",
)

# protected-class indicator column -> demographic label
demo_d = dict(
    Age_Senior="Age",
    Race_POC="Race",
    Education_Low="Educ.",
    Sex="Sex",
    Income_Low="Inc.",
    ESL="ESL",
)

# order in which the demographic labels are laid out
demo_order = ["Age", "Sex", "Race", "Educ.", "Inc.", "ESL"]

# score variable -> display name
svar_d = dict(
    Anxiety="Anxiety",
    Numeracy="Numeracy",
    SubjectiveLit="Literacy",
    TrustPhys="Trust",
    wer="Depr.",
)

# order in which the score-variable display names are laid out
svar_order = ["Anxiety", "Numeracy", "Literacy", "Trust", "Depr."]


def _binned_proportions(vals, edges):
    """Return the share of ``vals`` that falls into each bin delimited by ``edges``."""
    # labels=False yields integer bin codes; include_lowest=True closes the first bin on
    # the left so observations equal to the smallest edge are counted instead of dropped
    codes = pd.cut(vals, bins=edges, labels=False, include_lowest=True)
    counts = np.bincount(codes.to_numpy(dtype=int), minlength=len(edges) - 1)
    return counts / counts.sum()


def calculate_vec_kld(df, demo_met, comp_met, prot_class, alpha=0.005):
    """Return the KL divergence between protected and non-protected metric distributions.

    Rows of ``df`` are split into a protected group (``df[demo_met] == prot_class``) and
    everyone else. The values of ``comp_met`` in each group are histogrammed on a shared
    grid whose bin width is one tenth of the column's standard deviation, and the
    divergence of the protected histogram from the other one is returned. Larger values
    indicate fairness violations (cf. Lorena et al., 2024).

    Args:
        df: DataFrame holding both the demographic indicator and the complexity metric.
        demo_met: Name of the demographic indicator column.
        comp_met: Name of the complexity-metric column.
        prot_class: Value of ``demo_met`` that marks the protected class.
        alpha: Constant added to every bin of both histograms so that empty bins do not
            produce a division by zero or ``log(0)``. The histograms are not
            renormalised afterwards.

    Returns:
        ``sum(P * log(P / Q))`` as a float, where P / Q are the smoothed histograms of
        the protected / remaining rows. NaN when either group has no (non-missing)
        observation; 0.0 when every observed value of ``comp_met`` is identical.
    """
    prot_vals = df[df[demo_met] == prot_class][comp_met].dropna()
    maj_vals = df[df[demo_met] != prot_class][comp_met].dropna()

    # a divergence is undefined without observations on both sides; callers see NaN
    if prot_vals.empty or maj_vals.empty:
        return np.nan

    lo, hi = df[comp_met].min(), df[comp_met].max()
    if lo == hi:
        # a constant column gives two identical point masses (and a zero bin width)
        return 0.0

    # shared bin edges: lo, lo + step, ... extended far enough that the last edge reaches
    # hi, so the largest observations land in a bin instead of being discarded
    step = df[comp_met].std() / 10
    n_bins = int(np.ceil((hi - lo) / step))
    edges = lo + step * np.arange(n_bins + 1)
    edges[-1] = max(edges[-1], hi)      # guard against floating-point shortfall

    # bin into probability distributions and smooth slightly
    p = _binned_proportions(prot_vals, edges) + alpha
    q = _binned_proportions(maj_vals, edges) + alpha

    return float(np.sum(p * np.log(p / q)))

In [None]:
# value that marks membership in the protected class, per demographic indicator column
protected_d = dict(
    Sex=2,
    Age_Senior=1,
    Race_POC=1,
    Education_Low=1,
    Income_Low=1,
    ESL=1,
)

# score variables to process
these_svars = [
    'Anxiety',
    'Numeracy',
    'SubjectiveLit',
    'TrustPhys',
    'wer',
]

# complexity-metric columns of model_diff.csv
comp_cols = [
    'boundary_dist',
    'losses',
    'pvi',
    'times_forgotten',
    'instance_hardness',
    'irt_difficulty',
    'tok_len',
]

# remember where we started, then work from the parent directory so that the relative
# 'data/...' paths resolve
this_dir = os.getcwd()
os.chdir('..')

model_diff = pd.read_csv('data/model_diff.csv')

# 'strat' is the last '-'-separated token of each model name
model_diff['strat'] = model_diff['model_name'].str.split('-').map(lambda parts: parts[-1])

# mean of every complexity metric per (score variable, item ID)
sv_diff = model_diff.groupby(['score_var', 'ID'])[comp_cols].mean()



# NOTE: demographic files are not provided -- please reach out to the authors for data.
#
# For every score variable, load its checkpoint, derive the protected-class indicator
# columns, and compute one KLD per (demographic, complexity metric) pair.
kl_rows = []    # one dict per result row; turned into a DataFrame once, after the loop
try:
    for svar in these_svars:

        # NOTE: converted to a data checkpoint to maintain data privacy; the code below is
        # kept (commented out) to document how the checkpoint files were produced
        # subset = False
        # if svar == 'wer':
        #     # print('using ZDA data')
        #     input_data_d = load_zda_data('data/data_zda/', subset=subset)
        # elif svar == 'drug':
        #     # print('using DRUG data')
        #     input_data_d = load_drug_data('data/', subset=subset)
        # else:
        #     # print('using HAL data')
        #     input_data_d = load_hal_data('data/DataCVFolds/', score_variable=svar, subset=subset)

        # # create hard, random subsets
        # input_data_d['hard_train_data'], input_data_d['rand_train_data'] = sample_hard_rand(input_data_d['train_data'], svar)

        # # join the complexity metrics
        # input_data_d['hard_train_data']['score_var'] = svar
        # input_data_d['rand_train_data']['score_var'] = svar

        # hard_join = pd.merge( input_data_d['hard_train_data'], sv_diff, on=['score_var', 'ID'], how='inner' )
        # rand_join = pd.merge( input_data_d['rand_train_data'], sv_diff, on=['score_var', 'ID'], how='inner' )

        # full_join = pd.concat([ hard_join, rand_join ])


        # full_join.to_csv( f'artifact_code/data/kld_demos/full_join-for{svar}.csv', index=False )

        full_join = pd.read_csv( f'data/kld_demos/full_join-for{svar}.csv' )

        # create protected class indicator vars
        if svar == 'wer':
            # this checkpoint stores demographics as text columns
            full_join['Sex'] = np.where( full_join['Subject.Gender']!='Male', 2, 1 )
            # median split on age (the other data sets use a fixed cut-off of 65)
            age = full_join['Age'].astype(float)
            full_join['Age_Senior'] = np.where( age >= age.median(), 1, 0 )
            full_join['Race_POC'] = np.where( full_join['Subject.Race']!='White/Caucasian', 1, 0 )
            # NOTE(review): 'College or Trade or Vocational School' is counted as low
            # education here -- confirm this matches the intended category coding
            full_join['Education_Low'] = np.where( ((full_join['Subject.Education.Level']=='Less Than High School') |
                                                    (full_join['Subject.Education.Level']=='College or Trade or Vocational School')), 1, 0 )
            # full_join['Income_Low'] = np.where( full_join['Income_Cat']<3, 1, 0 )
            full_join['Income_Low'] = None      # not provided, so no row is flagged as protected
            # all participants must speak English as first language, i.e. nobody is in the
            # protected ESL group (1 = English is NOT the first language, as in the branch
            # below). The original assigned 1 here, contradicting that; with a single
            # group the KLD is undefined either way.
            full_join['ESL'] = 0

        else:
            full_join['Age_Senior'] = np.where( full_join['Age']>=65, 1, 0 )
            full_join['Race_POC'] = np.where( full_join['Race']!=1, 1, 0 )
            full_join['Education_Low'] = np.where( full_join['Education']<3, 1, 0 )
            full_join['Income_Low'] = np.where( full_join['Income_Cat']<3, 1, 0 )
            full_join['ESL'] = np.where( full_join['English_First_Lang']!=1, 1, 0 )

        for demo_met, prot_class in protected_d.items():
            for comp_met in comp_cols:
                kl_rows.append({
                    'score_var': svar_d[svar],
                    'demo_met': demo_met,
                    'comp_met': comp_met,
                    'KLD': calculate_vec_kld(full_join, demo_met, comp_met, prot_class),
                })
finally:
    # always return to the starting directory -- otherwise a missing checkpoint file would
    # leave the session in the parent directory and a re-run would climb one level further
    os.chdir(this_dir)

kl_df = pd.DataFrame( kl_rows, columns=['score_var', 'demo_met', 'comp_met', 'KLD'] )




# Render the KLD results as a LaTeX table: one row per (score variable, complexity metric),
# one column per demographic.
odf = kl_df.pivot_table(index=['score_var', 'comp_met'], columns='demo_met', values='KLD')

# pivot_table leaves out demographics whose KLD is missing everywhere; re-add them so they
# are printed as '-' instead of raising a KeyError below
odf = odf.reindex(columns=list(demo_d))

# two-decimal strings for numbers, '-' where no KLD could be computed
for c in demo_d:
    # odf[c] = [ '\\textbf{' + f'{v:.2f}' +'}' if v>=2 else f'{v:.2f}' for v in odf[c] ]
    odf[c] = [ '-' if pd.isna(v) else f'{v:.2f}' for v in odf[c] ]

odf.columns = [ demo_d[c] for c in odf.columns ]

# convert to print names and add colors: BD and PVI are wrapped in \red{}, every other
# metric in \blue{} (these macros are presumably defined in the consuming LaTeX document)
odf.index = pd.MultiIndex.from_tuples([
    (sv, ('\\red{' if comp_d[cm] in ('BD', 'PVI') else '\\blue{') + comp_d[cm] + '}')
    for sv, cm in odf.index
])

odf = odf.loc[svar_order][demo_order]

# escape=False keeps the colour macros intact regardless of the pandas version's default
odf.to_latex( 'demo_comp_v5.tex', escape=False )