In [None]:
import os
import gc
import re
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt 
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
pd.read_csv('../../../home-credit-default-risk/HomeCredit_columns_description.csv',index_col=0)

In [None]:
feature_importances = pd.read_csv('feature_importance_model2_04.csv')

In [None]:
feature_importances.head()

In [None]:
decisions = pd.read_csv('oof_model2_04.csv')

In [None]:
decisions[['SK_ID_CURR','TARGET','PREDICTIONS']].head()

In [None]:
submission = pd.read_csv('submission_model2_04.csv')

In [None]:
# Load the raw application tables for the test and train splits.
# The paths are relative, so they resolve against the notebook's working directory.
test = pd.read_csv('../../../home-credit-default-risk/application_test.csv')
train = pd.read_csv('../../../home-credit-default-risk/application_train.csv')

In [None]:
train.shape, test.shape

In [None]:
# Left-join the saved predictions onto the training applications by SK_ID_CURR.
# Training rows with no match in `decisions` end up with NaN in PREDICTIONS.
data = train.merge(decisions[['SK_ID_CURR','PREDICTIONS']],on='SK_ID_CURR',how='left')
# Left-join the submission's TARGET column onto the test applications by SK_ID_CURR.
# NOTE(review): TARGET in the submission file presumably holds the model's predicted
# score rather than a ground-truth label — confirm against the submission file.
test = test.merge(submission[['SK_ID_CURR','TARGET']],on='SK_ID_CURR',how='left')

In [None]:
data.shape, test.shape

In [None]:
# Drop, in place, every training row whose PREDICTIONS value is missing.
data.dropna(subset=['PREDICTIONS'],axis=0,inplace=True)

In [None]:

# Libraries to study
#from aif360.datasets import StandardDataset
#from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
#from aif360.algorithms.preprocessing import LFR, Reweighing
#from aif360.algorithms.inprocessing import AdversarialDebiasing, PrejudiceRemover
#from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing, EqOddsPostprocessing, RejectOptionClassification

# ML libraries
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Part I:  Analyze the effectiveness (accuracy) of the ADS by comparing its performance across different subpopulations.

There are several sub-populations of interest we explore.


In [None]:
# Compute AUC

roc_auc_score(data['TARGET'],data['PREDICTIONS'])

This is the overall AUC score achieved. Now we want to compare how different sub-populations do on this. 


In [None]:
def subpopulation_AUC(data,col='CODE_GENDER',protected = 'F',continuous=False):
    """
    Compare model performance on one subpopulation against the overall performance.

    AUC is the target metric of this ADS, so it is the value used for the comparison.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``col`` as well as the ``TARGET`` and ``PREDICTIONS`` columns.
    col : str
        Column that defines the subpopulation.
    protected : scalar
        Value identifying the unprivileged group. When ``continuous`` is False it is
        matched with ``==``; when ``continuous`` is True it is a lower threshold and
        rows with ``data[col] >= protected`` form the group.
    continuous : bool
        Whether ``col`` is treated as an ordered/numeric attribute (threshold match)
        or a categorical one (exact match).

    Returns
    -------
    tuple
        ``(subgroup_auc, subgroup_auc - overall_auc)``. A negative second element
        means the model scores a lower AUC on the subpopulation than overall.

    Raises
    ------
    ValueError
        If no row of ``data`` belongs to the requested subpopulation.
    """
    if continuous:
        in_group = data[col] >= protected
    else:
        in_group = data[col] == protected
    subgroup = data.loc[in_group]

    # Fail early with an actionable message: an empty selection (e.g. a typo in
    # `protected`) would otherwise surface as an unrelated error inside the metric.
    if subgroup.empty:
        raise ValueError(
            "No rows of column {!r} match protected={!r} (continuous={}); "
            "cannot compute a subpopulation AUC.".format(col, protected, continuous)
        )

    subgroup_auc = roc_auc_score(subgroup['TARGET'], subgroup['PREDICTIONS'])
    overall_auc = roc_auc_score(data['TARGET'], data['PREDICTIONS'])

    return subgroup_auc, subgroup_auc - overall_auc








In [None]:
subpopulation_AUC(data,col='CODE_GENDER',protected = 'F')


For gender, we observe that the ADS performs a little bit worse on the Female only sub-population. 

In [None]:
subpopulation_AUC(data,col='FLAG_OWN_CAR',protected = 'N')

For car ownership, we observe that the ADS performs about the same regardless of this attribute. It is worth noting that this says nothing about potential disparity in outcomes between applicants who own cars and those who don't; it only shows that this attribute does not impact evaluation performance. 

In [None]:
subpopulation_AUC(data,col='NAME_FAMILY_STATUS',protected = 'Widow')

For the sub-population of widows, this model performs notably worse compared to the overall AUC. This could be an attribute worth exploring more. 

In [None]:
data.NAME_EDUCATION_TYPE.unique()

In [None]:
subpopulation_AUC(data,col='NAME_EDUCATION_TYPE',protected = 'Lower secondary')

For education, folks with a lower secondary degree also appear to be impacted negatively. 

In [None]:
data['CNT_CHILDREN'].unique()

In [None]:
subpopulation_AUC(data,col='CNT_CHILDREN',protected = 1,continuous=True)


It is not entirely clear how to approach this for continuous values. There does appear to be some disparity based on the number of children the applicant has, but the choice of threshold is ambiguous. For now we use a threshold of 1 (applicants with at least one child), and we find a slight difference between this group's AUC and the overall AUC. 

# Part II: Select one or several fairness or diversity measures, justify your choice of these measures for the ADS in question, and quantify the fairness or diversity of this ADS.

Now, using the test set, we explore whether, if this ADS were deployed in the wild, our fairness metrics would help us identify any significant disparities between sub-populations that are worth future auditing. 

In [None]:
# Rename TARGET to PREDICTIONS in place so `test` carries its scores under the
# PREDICTIONS column name.
test.rename(columns={"TARGET":"PREDICTIONS"},inplace=True)

In [None]:
def disparate_impact_calculator(data,col='CODE_GENDER',protected = 'F',continuous=False):
    """
    Ratio of the mean prediction of the unprivileged group to that of the privileged group.

    A value of 1 means both groups receive the same average ``PREDICTIONS`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``col`` and a ``PREDICTIONS`` column.
    col : str
        Column that defines the two groups.
    protected : scalar
        Value identifying the unprivileged group. When ``continuous`` is False the
        unprivileged group is ``data[col] == protected`` and the privileged group is
        every other non-missing value. When ``continuous`` is True it is a threshold:
        ``>= protected`` is unprivileged and ``< protected`` is privileged.
    continuous : bool
        Whether ``col`` is split by threshold (True) or by exact match (False).

    Returns
    -------
    float
        ``mean(PREDICTIONS | unprivileged) / mean(PREDICTIONS | privileged)``.
        If either group is empty its mean is NaN, and so is the returned ratio.
        A privileged mean of zero is not guarded against.

    Notes
    -----
    Rows whose ``col`` value is missing belong to neither group in both modes.
    """
    values = data[col]

    if continuous:
        # A little ambiguous, but for some continuous values we can consider the
        # protected class to be at or above the threshold and the other below it.
        # If the opposite reading is wanted, just invert the returned value.
        unprivileged_mask = values >= protected
        privileged_mask = values < protected
    else:
        unprivileged_mask = values == protected
        # `NaN != protected` evaluates to True, so without the notna() guard rows
        # with a missing attribute would silently be counted as privileged.
        privileged_mask = values.notna() & (values != protected)

    unprivileged_mean = data.loc[unprivileged_mask, 'PREDICTIONS'].mean()
    privileged_mean = data.loc[privileged_mask, 'PREDICTIONS'].mean()

    return unprivileged_mean / privileged_mean

In [None]:
disparate_impact_calculator(test,col='CODE_GENDER',protected = 'F')

In [None]:
disparate_impact_calculator(data,col='CODE_GENDER',protected = 'F')

The ideal value for this metric is 1. Not only do we observe a pretty significant deviation from this value, but we also see that it gets worse when we move to the test set! 

In [None]:
disparate_impact_calculator(test,col='FLAG_OWN_CAR',protected = 'N')

In [None]:
disparate_impact_calculator(data,col='FLAG_OWN_CAR',protected = 'N')

Perhaps surprisingly, those who do not own cars are actually better off than those who do. There are a few possible explanations for this, but one hypothesis is that having a car means added expenses, thus making it less feasible for the applicant to repay a loan. 

In [None]:
disparate_impact_calculator(test,col='NAME_FAMILY_STATUS',protected = 'Widow')

In [None]:
disparate_impact_calculator(data,col='NAME_FAMILY_STATUS',protected = 'Widow')

The widow sub-population faces disparate impact. 

In [None]:
disparate_impact_calculator(test,col='NAME_EDUCATION_TYPE',protected = 'Lower secondary')

In [None]:
disparate_impact_calculator(data,col='NAME_EDUCATION_TYPE',protected = 'Lower secondary')

Much easier for lower secondary to receive favorable ratings than other groups. Why? 

In [None]:
disparate_impact_calculator(test,col='CNT_CHILDREN',protected = 1,continuous=True)


In [None]:
disparate_impact_calculator(data,col='CNT_CHILDREN',protected = 1,continuous=True)


The ADS shows a little bit of favor toward those who have children. One possible explanation is that, since these are typically older applicants, their children are probably working and, as such, would be able to help support loan repayment if necessary. 

# Part III: Develop additional methods for analyzing ADS performance: think about stability, robustness, performance on difficult or otherwise important examples (in the style of LIME), or any other property that you believe is important to check for this ADS.

There are two things to consider for this ADS. First, we want to take the sub-populations that demonstrated the most disparity under the measures above and, based on the system's feature importances, see if we can intuit why this might be the case; from there we can hypothesize potential mitigation strategies. 


The next thing to understand are some local predictions, in particular for the most and least confident decisions made by the ADS.

The most confident scores will be prediction values near 0 or 1, and least confident will be predictions right around .5

In [None]:
test['PREDICTIONS'].describe()

In [None]:
# Most confident prediction toward 0 for train: show the row(s) of `decisions`
# whose score equals the minimum PREDICTIONS value found in `data`.
decisions.loc[decisions['PREDICTIONS'] == data['PREDICTIONS'].min()]

In [None]:
decisions.loc[decisions['PREDICTIONS'] == data['PREDICTIONS'].max()]

In [None]:
decisions.loc[decisions['PREDICTIONS'] == .5] # TODO - find closest 

TODO - save the submission features, and compute these high- and low-confidence values for the test set. I don't have that at the moment.  

In [None]:
decisions.iloc[(decisions['PREDICTIONS']-.5).abs().argsort()[:1]]

Also need the model itself, which I currently do not have saved! 