# Task 2

## Utils

In [348]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression


import torch
import torch.nn as nn


## 1.
Train a model for the selected dataset.

In [336]:
DATASET_PATH = './adult/raw/'

# Both files are read without a header row; '?' marks a missing value and every
# row containing one is dropped.
df_data = (
    pd.read_csv(DATASET_PATH + 'adult.data', header=None, skipinitialspace=True)
    .replace('?', np.nan)
    .dropna()
)
# The first line of adult.test is skipped (skiprows=1).
df_test = (
    pd.read_csv(DATASET_PATH + 'adult.test', header=None, skiprows=1, skipinitialspace=True)
    .replace('?', np.nan)
    .dropna()
)

In [337]:
def get_x_y(df, sex_column=9):
    """Split a raw frame into an encoded feature matrix and integer labels.

    The last column of ``df`` is the label; every other column is a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data whose last column holds string labels. A trailing '.' on a
        label is stripped before encoding.
    sex_column : hashable, default 9
        Label of the column holding the protected attribute. It is integer
        coded into a new ``"sex"`` column (codes follow order of first
        appearance) and, being non-numeric, is also one-hot encoded with the
        other categorical columns.

    Returns
    -------
    X : pandas.DataFrame
        Features with every non-numeric column one-hot encoded as ints and all
        column names converted to ``str``. Encoded columns follow the order of
        the source columns in ``df``.
    y : numpy.ndarray
        Integer codes of the labels.
    sex_categories : pandas.Index
        Category at position ``i`` is the value coded as ``i`` in ``X["sex"]``.
    y_categories : pandas.Index
        Category at position ``i`` is the label coded as ``i`` in ``y``.
    """
    label_column = df.columns[-1]
    X = df.drop(columns=label_column)
    y = df[label_column].str.rstrip('.')

    # Integer-coded copy of the protected attribute, used by the fairness metrics.
    X["sex"], sex_categories = pd.factorize(X[sex_column])

    # Public API instead of the private `_get_numeric_data`; selecting from the
    # frame (rather than via a set difference) keeps the source column order.
    cat_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
    X = pd.get_dummies(X, columns=cat_cols, dtype='int')
    X = X.rename(columns=str)

    y, y_categories = pd.factorize(y)

    return X, y, sex_categories, y_categories

In [338]:
# Train and test rows are encoded together so that both parts end up with the
# same dummy columns, then split back by position.
data_size = len(df_data)
df = pd.concat([df_data, df_test])
X, y, sex_categories, y_categories = get_x_y(df)
X_train, X_test = X.iloc[:data_size], X.iloc[data_size:]
y_train, y_test = y[:data_size], y[data_size:]

In [302]:
# Integer codes that get_x_y assigned to each category (position in the index).
FEMALE, MALE = (sex_categories.get_loc(name) for name in ('Female', 'Male'))
HIGH_SALARY, LOW_SALARY = (y_categories.get_loc(name) for name in ('>50K', '<=50K'))
print(sex_categories)
print(y_categories)

Index(['Male', 'Female'], dtype='object')
Index(['<=50K', '>50K'], dtype='object')


In [361]:
# Baseline model: Gaussian Naive Bayes fitted on the training part.
gnb = GaussianNB().fit(X_train, y_train)
# Accuracy on the test part, the training part and the whole data set, in that order.
for features, labels in ((X_test, y_test), (X_train, y_train), (X, y)):
    print(accuracy_score(labels, gnb.predict(features)))



0.7887118193891103
0.7885418738810424
0.7885984697713502


## 2. 
For the selected protected attribute (age, gender, race) calculate the following fairness coefficients: Statistical parity, Equal opportunity, Predictive parity.

In [318]:
def TP(Y, Y_, column, value):
    """Count true positives (label 1, prediction 1) among rows where ``column == value``.

    ``Y`` holds the true labels, ``Y_`` the predicted labels and ``column`` the
    group membership; all three are compared element-wise.
    """
    actual_positive = Y == 1
    predicted_positive = Y_ == 1
    in_group = column == value
    return (actual_positive & predicted_positive & in_group).sum()

def FP(Y, Y_, column, value):
    """Count false positives (label 0, prediction 1) among rows where ``column == value``.

    ``Y`` holds the true labels, ``Y_`` the predicted labels and ``column`` the
    group membership; all three are compared element-wise.
    """
    actual_negative = Y == 0
    predicted_positive = Y_ == 1
    in_group = column == value
    return (actual_negative & predicted_positive & in_group).sum()

def FN(Y, Y_, column, value):
    """Count false negatives (label 1, prediction 0) among rows where ``column == value``.

    ``Y`` holds the true labels, ``Y_`` the predicted labels and ``column`` the
    group membership; all three are compared element-wise.
    """
    actual_positive = Y == 1
    predicted_negative = Y_ == 0
    in_group = column == value
    return (actual_positive & predicted_negative & in_group).sum()

def TN(Y, Y_, column, value):
    """Count true negatives (label 0, prediction 0) among rows where ``column == value``.

    ``Y`` holds the true labels, ``Y_`` the predicted labels and ``column`` the
    group membership; all three are compared element-wise.
    """
    actual_negative = Y == 0
    predicted_negative = Y_ == 0
    in_group = column == value
    return (actual_negative & predicted_negative & in_group).sum()

# Both arguments hold 0/1 values and may be pandas Series or NumPy arrays.
# In the sensitive column 0 is the privileged group.
def statistical_parity(predictions, sensitive_column):
    """Ratio of positive-prediction rates: group 1 over group 0.

    Parameters
    ----------
    predictions : pandas.Series or numpy.ndarray
        Values compared against 1 to find positive predictions.
    sensitive_column : pandas.Series or numpy.ndarray
        Group membership, element-wise aligned with ``predictions``.

    Returns
    -------
    float
        ``P(pred == 1 | group 1) / P(pred == 1 | group 0)``. NaN when either
        group has no members (the original ``value_counts`` lookup raised
        ``KeyError`` in that case).
    """
    positive = predictions == 1
    rates = {}
    for group in (0, 1):
        members = sensitive_column == group
        group_size = members.sum()
        # Counting with a mask works for arrays too and tolerates a missing group.
        rates[group] = (members & positive).sum() / group_size if group_size else float("nan")
    return rates[1] / rates[0]

# True positive rate inside one group: TPR = TP / (TP + FN)
def TPR(Y, predictions, sensitive_column, sensitive_value):
    """Return the true positive rate for rows where ``sensitive_column == sensitive_value``.

    There is no guard against an empty denominator (a group without positive labels).
    """
    hits = TP(Y, predictions, sensitive_column, sensitive_value)
    misses = FN(Y, predictions, sensitive_column, sensitive_value)
    return hits / (hits + misses)

# Group 0 is the privileged group, so it goes in the denominator.
def TPR_coeff(Y, predictions, sensitive_column):
    """Return the TPR of group 1 divided by the TPR of group 0."""
    unprivileged_rate = TPR(Y, predictions, sensitive_column, 1)
    privileged_rate = TPR(Y, predictions, sensitive_column, 0)
    return unprivileged_rate / privileged_rate

# False positive rate inside one group: FPR = FP / (FP + TN)
def FPR(Y, predictions, sensitive_column, sensitive_value):
    """Return the false positive rate for rows where ``sensitive_column == sensitive_value``.

    There is no guard against an empty denominator (a group without negative labels).
    """
    false_alarms = FP(Y, predictions, sensitive_column, sensitive_value)
    correct_rejections = TN(Y, predictions, sensitive_column, sensitive_value)
    return false_alarms / (false_alarms + correct_rejections)

# Group 0 is the privileged group, so it goes in the denominator.
def FPR_coeff(Y, predictions, sensitive_column):
    """Return the FPR of group 1 divided by the FPR of group 0."""
    unprivileged_rate = FPR(Y, predictions, sensitive_column, 1)
    privileged_rate = FPR(Y, predictions, sensitive_column, 0)
    return unprivileged_rate / privileged_rate

# Equal opportunity is reported here as a pair: the TPR ratio and the FPR ratio.
# (Checking both rates together is commonly called "equalized odds".)
def equal_opportunity(Y, predictions, sensitive_column):
    """Return ``(TPR ratio, FPR ratio)``, each computed as group 1 over group 0."""
    tpr_ratio = TPR_coeff(Y, predictions, sensitive_column)
    fpr_ratio = FPR_coeff(Y, predictions, sensitive_column)
    return tpr_ratio, fpr_ratio

# Positive predictive value inside one group: PPV = TP / (TP + FP)
def PPV(Y, predictions, sensitive_column, sensitive_value):
    """Return the positive predictive value for rows where ``sensitive_column == sensitive_value``.

    There is no guard against an empty denominator (a group without positive predictions).
    """
    hits = TP(Y, predictions, sensitive_column, sensitive_value)
    false_alarms = FP(Y, predictions, sensitive_column, sensitive_value)
    return hits / (hits + false_alarms)

# Group 0 is the privileged group, so it goes in the denominator.
def PPV_coeff(Y, predictions, sensitive_column):
    """Return the PPV of group 1 divided by the PPV of group 0."""
    unprivileged_value = PPV(Y, predictions, sensitive_column, 1)
    privileged_value = PPV(Y, predictions, sensitive_column, 0)
    return unprivileged_value / privileged_value

# Negative predictive value inside one group: NPV = TN / (TN + FN)
def NPV(Y, predictions, sensitive_column, sensitive_value):
    """Return the negative predictive value for rows where ``sensitive_column == sensitive_value``.

    There is no guard against an empty denominator (a group without negative predictions).
    """
    correct_rejections = TN(Y, predictions, sensitive_column, sensitive_value)
    misses = FN(Y, predictions, sensitive_column, sensitive_value)
    return correct_rejections / (correct_rejections + misses)

# Group 0 is the privileged group, so it goes in the denominator.
def NPV_coeff(Y, predictions, sensitive_column):
    """Return the NPV of group 1 divided by the NPV of group 0."""
    unprivileged_value = NPV(Y, predictions, sensitive_column, 1)
    privileged_value = NPV(Y, predictions, sensitive_column, 0)
    return unprivileged_value / privileged_value

def predictive_parity(Y, predictions, sensitive_column):
    """Return ``(PPV ratio, NPV ratio)``, each computed as group 1 over group 0."""
    ppv_ratio = PPV_coeff(Y, predictions, sensitive_column)
    npv_ratio = NPV_coeff(Y, predictions, sensitive_column)
    return ppv_ratio, npv_ratio

def print_fairness_coeffs(Y, predictions, sensitive_column):
    """Print the three fairness coefficients, one per line.

    Each coefficient is computed immediately before it is printed, in the order
    statistical parity, equal opportunity, predictive parity.
    """
    reports = (
        ("Statistical parity: ", lambda: statistical_parity(predictions, sensitive_column)),
        ("Equal opportunity: ", lambda: equal_opportunity(Y, predictions, sensitive_column)),
        ("Predictive parity: ", lambda: predictive_parity(Y, predictions, sensitive_column)),
    )
    for label, compute in reports:
        print(label, compute())

In [340]:
# Fairness of the Naive Bayes model, measured on the whole data set (train + test rows).
print_fairness_coeffs(
    Y=y,
    predictions=gnb.predict(X),
    sensitive_column=X['sex'],
)

Statistical parity:  0.5384678645937832
Equal opportunity:  (1.0279914604195763, 0.7168212113497263)
Predictive parity:  (0.6939013639283151, 1.2228710321478995)


## 3.
Train another model (different hyperparameters, feature transformations etc., different family of models) and see how the coefficients Statistical parity, Equal opportunity, Predictive parity behave for it. Are they different/similar?

In [419]:
# Random forest tuned by exhaustive grid search on the training part.
rf = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[5, 10, 20],
    min_samples_split=[2],
    min_samples_leaf=[1, 2, 3],
)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s

[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   6.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   6.5s


In [420]:
# Show the hyperparameter combination selected by the grid search fitted on X_train.
grid_search.best_params_

{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [421]:
# Accuracy of the selected forest on the test part, the training part and the whole data set.
estimator = grid_search.best_estimator_
for label, features, targets in (
    ("Test accuracy: ", X_test, y_test),
    ("Train accuracy: ", X_train, y_train),
    ("All accuracy: ", X, y),
):
    print(label, accuracy_score(targets, estimator.predict(features)))

Test accuracy:  0.8597609561752988
Train accuracy:  0.918374113122472
All accuracy:  0.8988545398257485


In [320]:
# Fairness of the tuned random forest, measured on the whole data set (train + test rows).
print_fairness_coeffs(
    Y=y,
    predictions=estimator.predict(X),
    sensitive_column=X['sex'],
)

Statistical parity:  0.3111159295972306
Equal opportunity:  (0.8956481664362367, 0.202177864485582)
Predictive parity:  (1.035384388372436, 1.096351065356727)


## 4.
Apply the selected bias mitigation technique (such as data balancing) to the first model. Check how the Statistical parity, Equal opportunity, and Predictive parity coefficients behave after this mitigation.

In [403]:
# Oversample (with replacement) the high-earning women of the training set.
# NOTE(review): n_samples = len(df_majority) // 2 makes this subgroup about one
# third of the resampled data; the statistical parity of the resampled labels
# printed at the end of the notebook is close to 2, so parity looks overshot
# rather than balanced. Confirm this sample size is intended.
data_upsampled = X_train.assign(target=y_train)

is_male = data_upsampled['sex'] == MALE
is_female = data_upsampled['sex'] == FEMALE
earns_low = data_upsampled['target'] == LOW_SALARY
earns_high = data_upsampled['target'] == HIGH_SALARY

# "Majority" = everyone except the high-earning women.
df_majority = data_upsampled[is_male | (is_female & earns_low)]
df_minority = data_upsampled[is_female & earns_high]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority) // 2,
    random_state=42,
)
data_upsampled = pd.concat([df_majority, df_minority_upsampled])

y_train_upsampled = data_upsampled['target']
X_train_upsampled = data_upsampled.drop(columns='target')

In [405]:
# Refit Naive Bayes on the upsampled training data. This rebinds `gnb`, so the
# model fitted on the original training data is no longer reachable by that name.
gnb = GaussianNB().fit(X_train_upsampled, y_train_upsampled)
# Accuracy is still measured on the original (not upsampled) test / train / full data.
for features, labels in ((X_test, y_test), (X_train, y_train), (X, y)):
    print(accuracy_score(labels, gnb.predict(features)))
print_fairness_coeffs(Y=y, predictions=gnb.predict(X), sensitive_column=X['sex'])

0.7873173970783532
0.7865194615741662
0.7867851930476317
Statistical parity:  0.5444688947698234
Equal opportunity:  (1.0280332458054133, 0.7181452759453932)
Predictive parity:  (0.6862812127150568, 1.2233050867715072)


In [424]:
# Random forest tuned on the upsampled training data (3-fold CV, larger grid than before).
rf = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_upsampled, y_train_upsampled)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   6.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   8.8s
[CV] END max_depth=5, min_samples_leaf

In [425]:
# Show the hyperparameter combination selected by the grid search fitted on the upsampled training data.
grid_search.best_params_

{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [426]:
# Accuracy and fairness of the forest trained on upsampled data, evaluated on
# the original (not upsampled) test / train / full data.
estimator = grid_search.best_estimator_
for label, features, targets in (
    ("Test accuracy: ", X_test, y_test),
    ("Train accuracy: ", X_train, y_train),
    ("All accuracy: ", X, y),
):
    print(label, accuracy_score(targets, estimator.predict(features)))
print_fairness_coeffs(Y=y, predictions=estimator.predict(X), sensitive_column=X['sex'])

Test accuracy:  0.8484063745019921
Train accuracy:  0.9064717193820039
All accuracy:  0.8871345805139091
Statistical parity:  0.6440601672477777
Equal opportunity:  (1.321389783015158, 1.215616215448327)
Predictive parity:  (0.7457143977244511, 1.133973073162999)


## 5.
Comparison

In [430]:
# Statistical parity of the true labels over the whole data set: the labels `y`
# are passed in the `predictions` slot, so this is the ratio present in the data itself.
statistical_parity(y,X['sex'])

0.3634695423643793

In [429]:
# The same ratio for the labels of the upsampled training set.
statistical_parity(y_train_upsampled,X_train_upsampled['sex'])

1.9953426647205266