In [4]:
import numpy as np
import pandas as pd

# TASK 1

In [14]:
RED = 100 / 9
BLUE = 100 # Blue population being 9 times larger than the Red population

In [15]:
column_mapping = {'Will use XAI': True, 'Will not use XAI': False,  'Enrolled in training': True, 'Not enrolled in training': False}

In [16]:
columns = ['Will use XAI', 'Will not use XAI']
index = ['Enrolled in training', 'Not enrolled in training']

red_enrolled = RED // 4
matrix_red = np.array([[red_enrolled, red_enrolled], [red_enrolled, red_enrolled]])
matrix_blue = np.array([[60,5], [20, 15]])


matrix_red = pd.DataFrame(matrix_red, columns=columns, index=index)
matrix_blue = pd.DataFrame(matrix_blue, columns=columns, index=index)


In [232]:
def deomgraphic_parity(matrix_1, matrix_2):
    
    def fun(matrix):
        array = matrix
        if isinstance(matrix, pd.DataFrame):
            array = matrix.to_numpy()
        numerator = array[0].sum()
        denominator = array.sum().sum()
        return numerator / denominator
    
    result_1 = fun(matrix_1)
    result_2 = fun(matrix_2)
    
    return result_1 / result_2

def equal_opportunity(matrix_1, matrix_2):
    
    def fun(matrix):
        array = matrix
        if isinstance(matrix, pd.DataFrame):
            array = matrix.to_numpy()
        numerator =  array[0, 0]
        denominator =  array[:, 0].sum()
        return numerator / denominator
    result_1 = fun(matrix_1)
    result_2 = fun(matrix_2)
    
    return result_1 / result_2

def predictive_rate_parity(matrix_1, matrix_2):
    
    def fun(matrix):
        array = matrix
        if isinstance(matrix, pd.DataFrame):
            array = matrix.to_numpy()
        numerator =  array[0, 0]
        denominator =  array[0].sum()
        return numerator / denominator
    result_1 = fun(matrix_1)
    result_2 = fun(matrix_2)
    
    return result_1 / result_2

In [240]:
def print_results(matrix_red, matrix_blue):
    dp = deomgraphic_parity(matrix_red, matrix_blue)
    eo = equal_opportunity(matrix_red, matrix_blue)
    prp = predictive_rate_parity(matrix_red, matrix_blue)
    
    print(f"""
            Demographic Parity: {dp:.4f} \n
            Equal Opportunity: {eo:.4f} \n 
            Preditctive Rate Parity: {prp:.4f}""")

In [241]:
print_results(matrix_red, matrix_blue)


            Demographic Parity: 0.7692 

            Equal Opportunity: 0.6667 
 
            Preditctive Rate Parity: 0.5417


# TASK 2

In [334]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from dalex.fairness import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

In [260]:
data = pd.read_csv('data/bank-additional-full.csv', delimiter=';')

In [303]:
encoder = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = encoder.fit_transform(data[col])

X = data.drop('y', axis= 1)
y = data[['y']]
X['less_than_65'] = X['age'] < 65

In [304]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# model 1

In [357]:
index_true  = X_test['less_than_65'] == True
index_false = X_test['less_than_65'] == False

In [358]:
xgb = XGBClassifier(eta = 0.3, max_depth = 10)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_pred, y_test)

0.909686817188638

In [359]:
matrix_true_1 = confusion_matrix(y_test[index_true], y_pred[index_true])
matrix_false_1 = confusion_matrix(y_test[index_false], y_pred[index_false])

In [360]:
print_results(matrix_false_1, matrix_true_1)


            Demographic Parity: 0.6463 

            Equal Opportunity: 0.8029 
 
            Preditctive Rate Parity: 0.7314


# Model 2

In [361]:
svm = SVC(kernel='rbf', C = 10)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy_score(y_pred, y_test)

  y = column_or_1d(y, warn=True)


0.9021607186210245

In [362]:
matrix_true_2 = confusion_matrix(y_test[index_true], y_pred[index_true])
matrix_false_2 = confusion_matrix(y_test[index_false], y_pred[index_false])

In [363]:
print_results(matrix_false_2, matrix_true_2)


            Demographic Parity: 0.6463 

            Equal Opportunity: 0.7275 
 
            Preditctive Rate Parity: 0.9408


# Model 3

In [364]:
sample_weights = compute_sample_weight('balanced', X_train['less_than_65'])
xgb_bias_mitigation = XGBClassifier(eta = 0.3, max_depth = 10)
xgb_bias_mitigation.fit(X_train, y_train, sample_weight=sample_weights)
y_pred = xgb_bias_mitigation.predict(X_test)
accuracy_score(y_pred, y_test)

0.9127215343529983

In [365]:
matrix_true_3 = confusion_matrix(y_test[index_true], y_pred[index_true])
matrix_false_3 = confusion_matrix(y_test[index_false], y_pred[index_false])

print_results(matrix_false_3, matrix_true_3)


            Demographic Parity: 0.6463 

            Equal Opportunity: 0.8302 
 
            Preditctive Rate Parity: 0.7922


# COMMENTS

(2) For model 1 we can see that the Demographic Parity: 0.6463, which means that
there is disparity. For model 2 Demographic Parity: 0.6463 which
means that both models get the same result under the 4/5 treshold

(3) For model 1 we can see that the Equal Opportunity: 0.8029, which means that
there is no disparity. Interstingly for model 2 Equal Opportunity: 0.7275  which
means that the model get result under the 4/5 treshold

(4) For model 1 we can see that the Preditctive Rate Parity: 0.7314, which means
that it did not pass the threshold. Interstingly for model 2 Preditctive Rate Parity: 0.9408 which
means that model is in the range 

(5) For model 1 we use bias mittiagtion methon of weighting a data. From the
result above we can observe that we manage to increase the in 2 out of 3 metrics.
Still we are not in the range for Demographic Parity: 0.6463 but for Equal
Opportunity: 0.8302 higher than recomended threshold. For last metric we
observed a much decrease. Finally the, accuracy of a model also increased a
lttile bit 0.9127215343529983 (0.909686817188638 before) 