## Load Ledger

General preprocessing that turns the full ledger into the CSV files written under `artifact_code/data/`.

Note that some of the artifacts may have separate preprocessing notebooks as well.

In [1]:
# !pip install pandas numpy matplotlib seaborn scipy scikit-learn torch multiprocess

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from utils.data_loading_no_demo import *

In [None]:
# WARNING; reading the full ledger takes ~1.5min
ledger_len = pd.read_csv('data/ledger_len.csv', low_memory=False)

# keep only the rows whose split name contains 'train', then lowercase every
# column name except the 'ID' key
_train_rows = ledger_len['split_full'].str.contains('train')
sub_ledger = (
    ledger_len.loc[_train_rows]
    .reset_index(drop=True)
    .rename(columns=lambda col: col if col == 'ID' else col.lower())
)

## model_diff

Aggregate PVI and loss up to model-level (with other model-level metrics)

In [3]:
# identifying columns of a ledger row; the trailing 'epoch' is dropped below
# so that metrics are averaged across epochs
id_cols = ['score_var', 'model_name', 'ID', 'epoch']
split_lst = ['train-None', 'train-Constant', 'val', 'test']
# numeric columns that get averaged
comp_cols = [
    'boundary_prox', 'losses', 'pvi', 'times_forgotten',
    'instance_hardness', 'irt_difficulty', 'tok_len',
]

# mean of every metric per (score_var, model_name, ID)
_group_keys = id_cols[:-1]
model_diff = sub_ledger.groupby(_group_keys)[comp_cols].mean().reset_index()

# the strategy is whatever follows the last '-' in the model name
model_diff['strat'] = [name.rsplit('-', 1)[-1] for name in model_diff['model_name']]

model_diff.to_csv('artifact_code/data/model_diff.csv', index=False)

## dataset_id_counts

In [4]:
# lightweight lookup of how many distinct IDs each task has per split, so the
# full ledger does not have to be scanned every time that number is needed
dataset_id_counts = (
    sub_ledger
    .groupby(['score_var', 'split_full'])['ID']
    .nunique()
    .reset_index(name='n_unique_IDs')
)

dataset_id_counts.to_csv('artifact_code/data/dataset_id_counts.csv', index=False)

## test_perform_full

This DataFrame is used to calculate all performance and fairness related metrics in the paper.

In [6]:
from utils.post_processing import remove_duplicate_id

# number of ledger rows (non-null IDs) per task; used as the reference row
# count so that joins can be checked for added / lost observations
check_counts = (
    ledger_len
    .groupby('score_var')['ID']
    .count()
    .reset_index(name='count')
)

def add_demos_to_ledger(this_ledger):
    """Join demographic variables onto every row of ``this_ledger``.

    The ledger is processed one ``score_var`` (task) at a time and the
    per-task results are concatenated in the order the tasks first appear.

    Parameters
    ----------
    this_ledger : pd.DataFrame
        Ledger with at least a ``score_var`` column, plus ``user`` for the
        'wer' task or ``ID`` for every other task.

    Returns
    -------
    pd.DataFrame
        The ledger rows with the demographic columns (and, for non-'wer'
        tasks, the ``text`` column) attached.

    Raises
    ------
    ValueError
        If, for any task, the joined frame does not have exactly as many rows
        as that task had in ``this_ledger`` (i.e. a join added or lost rows).
    """
    jdf_lst = []
    for svar in this_ledger['score_var'].unique():
        sdf = this_ledger[this_ledger['score_var'] == svar]
        print(svar)

        # Reference row count for the join check. It is taken from the ledger
        # that was actually passed in, so the check is valid for any ledger
        # (full or filtered) rather than only for one module-level frame.
        expected_rows = len(sdf)

        # NOTE; Depression data with 'wer' (Word Error Rate) score variable not publicly available via IRB
        if svar == 'wer':
            these_demos = pd.read_excel('data/data_zda/DenamedDemographics_and_more.xlsx')
            jdf = pd.merge(sdf, these_demos, how='left',
                           left_on='user', right_on='Subject.ID')
        else:
            # attach the text of each ID: the demographics are joined on text
            id_text = pd.concat(
                load_hal_data('data/DataCVFolds/', svar, subset=False).values()
            )[['ID', 'text']].drop_duplicates()
            sdf = pd.merge(sdf, id_text, on='ID', how='left')

            # join demographic info (loaded once; only rows with File == 1 are kept)
            these_demos = load_and_process_demo('data/DataCVFolds/', svar)
            these_demos = these_demos[these_demos['File'] == 1].reset_index(drop=True)

            jdf = pd.merge(sdf, these_demos, left_on='text',
                           right_on='Text_' + svar, how='left').drop('Text_' + svar, axis=1)

        # JOIN CHECK: the left joins must neither duplicate nor drop rows
        if expected_rows != len(jdf):
            print('-----------------------------')
            print(f'JOIN PROBLEM WITH {svar}')
            print('-----------------------------')
            raise ValueError('Shape of joined data must match expected dimensions of this_ledger')

        jdf_lst.append(jdf)

    return pd.concat(jdf_lst)


# WARNING; takes ~20sec
# attach demographics to the unfiltered ledger (all splits, not only train)
full_ledger_len = add_demos_to_ledger(ledger_len)

Numeracy
Anxiety
TrustPhys
SubjectiveLit


In [7]:
from sklearn.metrics import roc_auc_score, f1_score

def add_prot_cols(tdf):
    """Add 0/1 protected-class indicator columns used for disparate impact (DI).

    A value of 1 marks the protected class; anything else is treated as the
    privileged class by the DI helpers. The task is read from the first row's
    ``score_var``, so ``tdf`` is expected to hold a single task.

    Columns written: ``Age_Senior``, ``Race_POC``, ``Education_Low``,
    ``Income_Low`` and ``ESL``; for the 'wer' task also ``Sex``
    (1 = 'Male', 2 = anything else).

    NOTE: ``tdf`` is modified in place and the same object is returned.

    Parameters
    ----------
    tdf : pd.DataFrame
        Ledger rows for one task with the demographic columns already joined.

    Returns
    -------
    pd.DataFrame
        ``tdf`` itself, with the indicator columns added.
    """
    this_svar = tdf['score_var'].iloc[0]

    # NOTE; Depression data with 'wer' (Word Error Rate) score variable not publicly available via IRB
    if this_svar == 'wer':
        tdf['Sex'] = np.where(tdf['Subject.Gender'] != 'Male', 2, 1)
        # NOTE(review): an earlier comment cited a median age of 33, but the
        # cut-off applied here is 50 -- confirm which one is intended
        tdf['Age_Senior'] = np.where(tdf['Age_x'] >= 50, 1, 0)
        tdf['Race_POC'] = np.where(tdf['Subject.Race'] != 'White/Caucasian', 1, 0)
        tdf['Education_Low'] = np.where(
            ((tdf['Subject.Education.Level'] == 'Less Than High School') |
             (tdf['Subject.Education.Level'] == 'College or Trade or Vocational School')), 1, 0)
        # tdf['Income_Low'] = np.where( tdf['Income_Cat']<3, 1, 0 )
        tdf['Income_Low'] = None      # not provided
        # All participants must speak English as first language, so nobody is
        # in the ESL (protected) class. ESL == 1 means "English is NOT the
        # first language", matching the encoding used for the other tasks.
        tdf['ESL'] = 0

    else:
        # 'Sex' is not derived here; presumably it already exists from the
        # demographics join -- verify against the loader
        # NOTE; no suffix on Age when we remove wer data
        # tdf['Age_Senior'] = np.where( tdf['Age_y']>=65, 1, 0 )
        tdf['Age_Senior'] = np.where(tdf['Age'] >= 65, 1, 0)
        tdf['Race_POC'] = np.where(tdf['Race'] != 1, 1, 0)
        tdf['Education_Low'] = np.where(tdf['Education'] < 3, 1, 0)
        tdf['Income_Low'] = np.where(tdf['Income_Cat'] < 3, 1, 0)
        tdf['ESL'] = np.where(tdf['English_First_Lang'] != 1, 1, 0)

    return tdf

def calculate_disp_impact(tdf, this_demo, alpha=0.005):
    """Disparate impact (DI) of the predictions for one demographic indicator.

    DI = mean(preds | protected) / (mean(preds | privileged) + alpha), where
    rows with ``tdf[this_demo] == 1`` are the protected class and all other
    rows are the privileged class.

    ``alpha`` is a smoothing term added to the denominator, which keeps the
    ratio finite when the privileged mean prediction is 0. If either group
    has no rows its mean is NaN and so is the result.

    NOTE(review): the comment this docstring replaces described the smoothing
    value as 0.05, but the default actually used is 0.005 -- confirm which one
    is intended before changing it.
    """
    is_protected = tdf[this_demo] == 1
    protected_rate = tdf.loc[is_protected, 'preds'].mean()
    privileged_rate = tdf.loc[~is_protected, 'preds'].mean()

    return protected_rate / (privileged_rate + alpha)

def calculate_adj_disp_impact(tdf, this_demo, alpha=0.05):
    """ADJUSTED disparate impact: DI weighted by each group's label base rate.

    Result = (p_prot * base_priv) / (p_priv * base_prot + alpha), where ``p_*``
    is a group's mean of ``preds`` and ``base_*`` its mean of ``labels``. Rows
    with ``tdf[this_demo] == 1`` form the protected group; all other rows form
    the privileged group. ``alpha`` (default 0.05) smooths the denominator.

    This variant incorporates the base rate, but standard / unadjusted DI is
    what gets reported due to its legal implications for the 0.8 and 1.2
    thresholds.
    """
    is_protected = tdf[this_demo] == 1
    protected, privileged = tdf[is_protected], tdf[~is_protected]

    numerator = protected['preds'].mean() * privileged['labels'].mean()
    denominator = (privileged['preds'].mean() * protected['labels'].mean()) + alpha

    return numerator / denominator


def create_perform_tdf(mn):
    """Return the test-split ledger rows of model ``mn`` at its best epoch.

    The training log ``logs/{mn}_log.txt`` (tab separated) is read and the
    best epoch is the ``epoch`` value of the row with the highest ``val_auc``.
    The rows of ``full_ledger_len`` for this model, that epoch and the 'test'
    split are returned with an extra ``best_epoch`` column.
    """
    log_df = pd.read_csv(f'logs/{mn}_log.txt', sep='\t')
    best_row_label = log_df['val_auc'].idxmax()
    best_epoch = log_df.loc[best_row_label, 'epoch']

    wanted = (
        (full_ledger_len['model_name'] == mn)
        & (full_ledger_len['epoch'] == best_epoch)
        & (full_ledger_len['split_full'] == 'test')
    )
    return full_ledger_len[wanted].assign(best_epoch=best_epoch)


def proc_test_perform_row(tdf):
    """Summarise one model's best-epoch test rows as a single-row DataFrame.

    The row holds the model name, accuracy (mean of ``correct``), ROC AUC
    (``labels`` vs ``probs``), F1 (``labels`` vs ``preds``), the best epoch,
    and for each demographic indicator a ``<demo>|DI`` and ``<demo>|ADI``
    fairness column.
    """
    scored = add_prot_cols(tdf)

    metrics = {
        'test_acc': scored['correct'].mean(),
        'test_auc': roc_auc_score(y_true=scored['labels'], y_score=scored['probs']),
        'test_f1': f1_score(y_true=scored['labels'], y_pred=scored['preds']),
    }
    row = {
        'model_name': scored['model_name'].iloc[0],
        **metrics,
        'best_epoch': scored['best_epoch'].iloc[0],
    }

    # one DI / ADI pair per protected-class indicator, in a fixed column order
    for demo in ('Sex', 'Age_Senior', 'Race_POC', 'Education_Low', 'Income_Low', 'ESL'):
        row[demo + "|DI"] = calculate_disp_impact(scored, demo)
        row[demo + "|ADI"] = calculate_adj_disp_impact(scored, demo)

    return pd.DataFrame([row])


# short, readable aliases for the model-type prefix of each model name
mt_d = dict([
    ('bert-base-uncased', 'bert'),
    ('xlm-roberta-base-uncased-all-english', 'roberta'),
    ('local-ffn', 'ffn'),
    ('local-cnn', 'cnn'),
    ('local-lstm', 'lstm'),
])

# distinct model names found in the train-only ledger
these_mns = list( sub_ledger['model_name'].unique() )


# NOTE; parallelization supported on Linux but won't work on Windows
# import multiprocess
# #multisetup
# cores_to_use = multiprocess.cpu_count()-1
# pool = multiprocess.Pool(cores_to_use)

# # multi create
# # WARNING; takes ~1min
# with multiprocess.Pool(cores_to_use) as pool:
#     tdf_lst = pool.map(create_perform_tdf, these_mns)

# # multi proc
# with multiprocess.Pool(cores_to_use) as pool:
#     perform_lst = pool.map(proc_test_perform_row, tdf_lst)


# serial application of the above functions (works on any OS)
# WARNING; takes ~3min
tdf_lst = list(map(create_perform_tdf, these_mns))
perform_lst = list(map(proc_test_perform_row, tdf_lst))

test_perform_df = pd.concat(perform_lst, ignore_index=True)

# add task, hard vs. random split, and type of model (all contained in model names):
# the '_'-separated pieces of a model name carry the model type (first piece),
# the task (second piece) and, in the last piece, the strategy after the first '-'
_name_parts = [name.split('_') for name in test_perform_df['model_name']]
test_perform_df['score_var'] = [parts[1] for parts in _name_parts]
test_perform_df['strat'] = [parts[-1].split('-')[1] for parts in _name_parts]
test_perform_df['model_type'] = [mt_d[parts[0]] for parts in _name_parts]

In [8]:
# add ability from the py-irt response patterns
# (frames are collected and concatenated once, instead of re-concatenating a
# growing frame -- seeded with an empty one -- on every iteration)
ab_frames = []
for svar in test_perform_df['score_var'].unique():
    ab_df = pd.read_csv( f"py-irt_response-patterns/{svar}_test_mn_ability.csv" )
    ab_frames.append( ab_df )
all_ab_df = pd.concat( ab_frames ) if ab_frames else pd.DataFrame()


# merge and clean the final dataframe for results and fairness
test_perform_full = pd.merge( test_perform_df, all_ab_df, on='model_name', how='left' )

# final column order: identifiers, performance metrics, DI columns, ADI columns
id_cols = ['model_name', 'model_type', 'score_var', 'strat', 'best_epoch']
perform_cols = [ 'test_acc', 'test_auc', 'test_f1', 'irt_ability' ]
di_cols = [ c for c in test_perform_full if '|DI' in c ]
adi_cols = [ c for c in test_perform_full if '|ADI' in c ]
test_perform_full = test_perform_full[id_cols+perform_cols+di_cols+adi_cols]

test_perform_full.to_csv( 'artifact_code/data/test_perform_full.csv', index=False )