# 1. Imports

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd 
import numpy as np

# 2. Extract data from csv

In [4]:
data = pd.read_csv('/home/mackenzie/git_repositories/delayedimpact/data/simData_oom10.csv')
data[['score', 'race']] = data[['score', 'race']].astype(int)
print(data)

      score  repay_probability  race  repay_indices
0       610              78.90     1              1
1       568              47.77     0              0
2       750              98.13     1              1
3       775              98.45     1              1
4       704              95.88     1              1
...     ...                ...   ...            ...
9995    832              98.99     1              1
9996    416              10.91     1              0
9997    444              14.63     1              0
9998    778              98.47     1              1
9999    738              97.68     1              1

[10000 rows x 4 columns]


# 3. Make data into train/test form

In [5]:
x = data[['score', 'race']].values
y = data['repay_indices'].values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# collect our sensitive attribute
race_train = X_train[:, 1]
race_test = X_test[:, 1]

# for fairlearn mitigator algs to work, I have to weigh the data
# for now I'm weighing everything the same
# TODO: add correct sample weights according to, http://www.surveystar.com/startips/weighting.pdf
#       and https://www.nlsinfo.org/content/cohorts/nlsy97/using-and-understanding-the-data/sample-weights-design-effects/page/0/0/#intro
sample_weight = np.ones(shape=(len(y_train),))
#sample_weight[y_train[:,1] == 0] = 1.5 

# Below example from: https://androidkt.com/set-sample-weight-in-keras/
#sample_weight[y_train == 3] = 1.5

# Q: do I need to scale the data??
# Standardize features by removing mean and scaling to unit variance:
#scaler = StandardScaler()
#scaler.fit(X_train)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test) 

# 4. Functions

In [3]:
def evaluation_by_race(X_test, y_test, y_predict):
    #races_test = X_test[:, 1]
    
    y_test_black = []
    y_pred_black = []
    y_test_white = []
    y_pred_white = []

    # splitting up the y_test and y_pred values by race to then use for race specific classification reports
    for index, race in enumerate(race_test):
        if(race == 0):  # black
            y_test_black.append(y_test[index])
            y_pred_black.append(y_predict[index])
        elif(race == 1):  # white
            y_test_white.append(y_test[index])
            y_pred_white.append(y_predict[index])
        else:
            print('You should not end up here...')
            
    print('EVALUATION FOR BLACK GROUP')
    print(confusion_matrix(y_test_black, y_pred_black))
    print(classification_report(y_test_black, y_pred_black)) 
    
    print('EVALUATION FOR WHITE GROUP')
    print(confusion_matrix(y_test_white, y_pred_white))
    print(classification_report(y_test_white, y_pred_white)) 

In [25]:
# Reference: https://fairlearn.org/v0.5.0/api_reference/fairlearn.metrics.html

def print_fairness_metrics(y_true, y_pred, sensitive_features, sample_weight=None):
    sr_mitigated = MetricFrame(metric=selection_rate, y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test)
    print('Selection Rate Overall: ', sr_mitigated.overall)
    print('Selection Rate By Group: ',sr_mitigated.by_group, '\n')

    # Note: difference of 0 means that all groups have the same selection rate
    dp_diff = demographic_parity_difference(y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test)
    print('DP Difference: ', dp_diff)
    # Note: ratio of 1 means that all groups have the same selection rate
    dp_ratio = demographic_parity_ratio(y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test)
    print('DP Ratio:', dp_ratio, '\n')
    
    # Note: difference of 0 means that all groups have the same true positive, true negative, false positive, and false negative rates.
    eod_diff = equalized_odds_difference(y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test)
    print('EOD Difference: ', eod_diff)
    # Note: ratio of 1 means that all groups have the same true positive, true negative, false positive, and false negative rates.
    eod_ratio = equalized_odds_ratio(y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test)
    print('EOD Ratio:', eod_ratio, '\n')
    
    # the below are overall metrics, similar to what I get in my confusion matrices above
    print('The below metrics are for overall:')
    
    # for the below I can add a 'sample_weight' parameter
    fner = false_negative_rate(y_true, y_pred, pos_label=1)
    print('FNER', fner)
    fper = false_positive_rate(y_true, y_pred, pos_label=1)
    print('FPER', fper)
    tnr = true_negative_rate(y_true, y_pred, pos_label=1)
    print('TNR', tnr)
    tpr = true_positive_rate(y_true, y_pred, pos_label=1)
    print('TPR', tpr)
    return

# TODOS:
1. Get fairlearn widget to work (see what msr fairlearn dude responds to my GH issue comment)
2. Add fairness metric evaluation to the baseline model
3. Get confusion matrices for after the reduction alg is used
4. Add other fairness metric constraints to the reduction alg
5. Construct a function/for loop of sorts so that I'm not repeating code 
6. Try the other reduction algorithm
7. Figure out what other baseline models I can use

# Train+Test Gaussian Naive Bayes classifier (Fairlearn used)

In [10]:
# Initialize classifier:
gnb = GaussianNB()

# Train the classifier:
model = gnb.fit(X_train, y_train, sample_weight)

In [11]:
# Make predictions with the classifier:
y_predict = gnb.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [12]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



## Use MSR Fairlearn to add a fairness constraint to the model

In [26]:
# Reference for this cell's code: https://fairlearn.org/main/quickstart.html
# Reduction Algs explained here: https://fairlearn.org/main/user_guide/mitigation.html#reductions

# TODO--try Gridsearch reduction alg: https://fairlearn.org/main/user_guide/mitigation.html
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from fairlearn.metrics import *
np.random.seed(0)  # set seed for consistent results with ExponentiatedGradient

constraint = DemographicParity()
mitigator = ExponentiatedGradient(model, constraint)
mitigator.fit(X_train, y_train, sensitive_features=race_train)
y_pred_mitigated = mitigator.predict(X_test)

# THINK: the below fairness metric stuff might be more useful for the original model??

# I can plug in a bunch of different metrics to the metric parameter for evaluation

# TODO: get confusion matrices for mitigated data/groups

# Example code for getting fairness metrics from fairlearn
print_fairness_metrics(y_true=y_test, y_pred=y_pred_mitigated, sensitive_features=race_test) # sample_weight is an optional parameter

Selection Rate Overall:  0.492
Selection Rate By Group:  sensitive_feature_0
0    0.481894
1    0.493374
Name: selection_rate, dtype: object 

DP Difference:  0.011479571657144305
DP Ratio: 0.9767325028806462 

EOD Difference:  0.3240413020455939
EOD Ratio: 0.36553257666703043 

The below metrics are for overall:
FNER 0.4225941422594142
FPER 0.2756183745583039
TNR 0.7243816254416962
TPR 0.5774058577405857


In [15]:
# TODO: figure out how to get the widget to work, https://fairlearn.org/v0.6.2/api_reference/fairlearn.widget.html
from fairlearn.widget import FairlearnDashboard
FairlearnDashboard(sensitive_features=race_test,
                   sensitive_feature_names=['race'],
                   y_true=y_test,
                   y_pred={"initial model": y_predict, "mitigated model": y_pred_mitigated}) 

FairlearnWidget(value={'true_y': [0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1…

<fairlearn.widget._fairlearn_dashboard.FairlearnDashboard at 0x7fe9239acd30>

In [27]:
from fairlearn import show_versions
show_versions()

SyntaxError: invalid syntax (<ipython-input-27-9e58c3ebf1f9>, line 3)

In [29]:
pip show fairlearn jupyter notebook

Name: fairlearn
Version: 0.6.2
Summary: Algorithms for mitigating unfairness in supervised machine learning
Home-page: https://github.com/fairlearn/fairlearn
Author: Miroslav Dudik, Richard Edgar, Brandon Horn, Roman Lutz
Author-email: fairlearn@microsoft.com
License: UNKNOWN
Location: /home/mackenzie/.local/lib/python3.8/site-packages
Requires: scikit-learn, scipy, pandas, numpy
Required-by: parity-fairness
---
Name: jupyter
Version: 1.0.0
Summary: Jupyter metapackage. Install all the Jupyter components in one go.
Home-page: http://jupyter.org
Author: Jupyter Development Team
Author-email: jupyter@googlegroups.org
License: BSD
Location: /home/mackenzie/.local/lib/python3.8/site-packages
Requires: qtconsole, nbconvert, notebook, ipykernel, ipywidgets, jupyter-console
Required-by: witwidget
---
Name: notebook
Version: 6.4.0
Summary: A web-based notebook environment for interactive computing
Home-page: http://jupyter.org
Author: Jupyter Development Team
Author-email: jupyter@googlegroups

# Train+Test KNN classifier
Note: KNN does not use sample_weight in its fit function, so can't use fairlearn with it :(

In [7]:
# Ref: https://www.activestate.com/resources/quick-reads/how-to-classify-data-in-python/

# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=5)

classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [8]:
# Predict y data with classifier: 
y_predict = classifier.predict(X_test)

# Print results: 
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 634  215]
 [ 144 2007]]
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       849
           1       0.90      0.93      0.92      2151

    accuracy                           0.88      3000
   macro avg       0.86      0.84      0.85      3000
weighted avg       0.88      0.88      0.88      3000



In [9]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[198  35]
 [ 30  96]]
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       233
           1       0.73      0.76      0.75       126

    accuracy                           0.82       359
   macro avg       0.80      0.81      0.80       359
weighted avg       0.82      0.82      0.82       359

EVALUATION FOR WHITE GROUP
[[ 436  180]
 [ 114 1911]]
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       616
           1       0.91      0.94      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.85      0.83      0.84      2641
weighted avg       0.89      0.89      0.89      2641



# 6. Train+Test Decision Tree Classifier

In [11]:
# Reference: https://www.datacamp.com/community/tutorials/decision-tree-classification-python

# Initialize classifier:
clf = DecisionTreeClassifier()

# Train the classifier:
clf = clf.fit(X_train,y_train)

In [12]:
# Make predictions with the classifier:
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [13]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# 7. Train+Test Logistic Regression 

In [14]:
# Reference: https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

# Instantiate classifier:
logisticRegr = LogisticRegression()

# Train the classifier:
logisticRegr.fit(X_train, y_train)

LogisticRegression()

In [15]:
# Make predictions with the classifier:
y_pred = logisticRegr.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict)) 

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [16]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# 8. Train+Test Support Vector Machines

Reference: https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

## 8.1 Linear Kernel

In [17]:
# Instantiate classifier:
clf = svm.SVC(kernel='linear')  # can try other kernels

#Train the model using the training sets
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [18]:
# Make predictions
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[ 617  232]
 [ 188 1963]]
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       849
           1       0.89      0.91      0.90      2151

    accuracy                           0.86      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.86      0.86      0.86      3000



In [19]:
evaluation_by_race(X_test, y_test, y_predict)

EVALUATION FOR BLACK GROUP
[[233   0]
 [122   4]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       233
           1       1.00      0.03      0.06       126

    accuracy                           0.66       359
   macro avg       0.83      0.52      0.43       359
weighted avg       0.78      0.66      0.54       359

EVALUATION FOR WHITE GROUP
[[ 384  232]
 [  66 1959]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72       616
           1       0.89      0.97      0.93      2025

    accuracy                           0.89      2641
   macro avg       0.87      0.80      0.82      2641
weighted avg       0.88      0.89      0.88      2641



# Extra Notes

TODO: increase dataset oom even more
TODO: try other svm kernels

https://stackabuse.com/overview-of-classification-methods-in-python-with-scikit-learn