# Pre-Processing

<img src="../images/pre-processing.png" alt="Drawing" style="width: 600px;"/>

In [2]:
from aif360.metrics.classification_metric import ClassificationMetric
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings

import joblib
from utils import make_dataset, display_results

warnings.filterwarnings('ignore')

BIAS_INFO = {'favorable_label':0,
             'unfavorable_label':1,
             'protected_columns':['race'],
            }

PRIVILEGED_INFO = {'unprivileged_groups':[{'race': 2},
                                          {'race': 1},
                                          {'race': 4},
                                          {'race': 5},
                                          {'race': 6}],
                   'privileged_groups':[{'race': 3}]
                  }

data = pd.read_csv('../data/processed/compas-scores-two-years-processed.csv')

DROP_COLS = ['two_year_recid','compas_score','decile_score','compas_class']
FEATURE_COLS = data.drop(DROP_COLS, axis=1).columns.tolist()

# Disparate Impact Remover

## Method

TODO

## Pros and Cons

TODO

## Materials

* Paper ["Decision Theory for Discrimination-aware Classification" by Kamiran, Karim, and Zhang](https://mine.kaust.edu.sa/Documents/papers/ICDM_2012.pdf)

In [3]:
from aif360.algorithms.preprocessing import DisparateImpactRemover

dataset = make_dataset(data[FEATURE_COLS], data['two_year_recid'], **BIAS_INFO, **PRIVILEGED_INFO)

disp_imp_remover = DisparateImpactRemover(sensitive_attribute='race',repair_level=1.0)

dataset = disp_imp_remover.fit_transform(dataset).convert_to_dataframe()[0]

In [4]:
data_cleaned = data.copy()
data_cleaned['race'] = dataset['race'].values

In [5]:
train, test = train_test_split(data_cleaned, test_size=0.2, random_state=1234)

X_train, y_train = train[FEATURE_COLS], train['two_year_recid']
X_test, y_test = test[FEATURE_COLS], test['two_year_recid']

In [6]:
clf = LogisticRegression(random_state=1234)
clf.fit(X_train, y_train)

y_test_pred = clf.predict_proba(X_test)
test['recid_prediction_score'] = y_test_pred[:,1]
test['recid_prediction_class'] = (test['recid_prediction_score'] >0.5).astype(int)

acc = accuracy_score(y_test, test['recid_prediction_class'])

In [7]:
ground_truth_test = make_dataset(test[FEATURE_COLS], test['two_year_recid'], **BIAS_INFO, **PRIVILEGED_INFO)
prediction_test = make_dataset(test[FEATURE_COLS], test['recid_prediction_class'], test['recid_prediction_score'], **BIAS_INFO, **PRIVILEGED_INFO)

clf_metric = ClassificationMetric(ground_truth_test, prediction_test,**PRIVILEGED_INFO)

In [8]:
joblib.dump((clf_metric,acc), '../results/1.2-disparate_impact_remover.pkl')
display_results('../results/1.2-disparate_impact_remover.pkl')

Unnamed: 0,metric_names,scores
0,accuracy_score,0.671518
1,true_positive_rate_difference,-0.102441
2,false_positive_rate_difference,-0.24732
3,false_omission_rate_difference,-0.03537
4,false_discovery_rate_difference,-0.042279
5,error_rate_difference,-0.034496
6,false_positive_rate_ratio,0.621169
7,false_negative_rate_ratio,1.714525
8,false_omission_rate_ratio,0.905385
9,false_discovery_rate_ratio,0.877524

Unnamed: 0,metric_names,scores
0,accuracy_score,0.671518
1,true_positive_rate_difference,-0.102441
2,false_positive_rate_difference,-0.24732
3,false_omission_rate_difference,-0.03537
4,false_discovery_rate_difference,-0.042279
5,error_rate_difference,-0.034496
6,false_positive_rate_ratio,0.621169
7,false_negative_rate_ratio,1.714525
8,false_omission_rate_ratio,0.905385
9,false_discovery_rate_ratio,0.877524


# Reweighing

## Method

TODO

## Pros and Cons

TODO

## Materials

* Paper ["Decision Theory for Discrimination-aware Classification" by Kamiran, Karim, and Zhang](https://mine.kaust.edu.sa/Documents/papers/ICDM_2012.pdf)

In [10]:
from aif360.algorithms.preprocessing import Reweighing

reweighing = Reweighing(**PRIVILEGED_INFO)

In [11]:
train, test = train_test_split(data, test_size=0.2, random_state=1234)

train_dataset = make_dataset(train[FEATURE_COLS], train['two_year_recid'], **BIAS_INFO, **PRIVILEGED_INFO)
test_dataset = make_dataset(test[FEATURE_COLS], test['two_year_recid'], **BIAS_INFO, **PRIVILEGED_INFO)

train_dataset = reweighing.fit_transform(train_dataset)
test_dataset = reweighing.transform(test_dataset)

In [12]:
train_dataset.instance_weights

array([1.06614556, 1.06614556, 0.9307455 , ..., 1.06614556, 0.89397015,
       0.9307455 ])

In [13]:
train['race']=train_dataset.convert_to_dataframe()[0]['race'].values
test['race']=test_dataset.convert_to_dataframe()[0]['race'].values

train_weights = train_dataset.instance_weights

In [14]:
X_train, y_train = train[FEATURE_COLS], train['two_year_recid']
X_test, y_test = test[FEATURE_COLS], test['two_year_recid']

In [15]:
clf = LogisticRegression(random_state=1234)
clf.fit(X_train, y_train, sample_weight=train_weights)

y_test_pred = clf.predict_proba(X_test)
test['recid_prediction_score'] = y_test_pred[:,1]
test['recid_prediction_class'] = (test['recid_prediction_score'] >0.5).astype(int)

acc = accuracy_score(y_test, test['recid_prediction_class'])

In [16]:
ground_truth_test = make_dataset(test[FEATURE_COLS], test['two_year_recid'], **BIAS_INFO, **PRIVILEGED_INFO)
prediction_test = make_dataset(test[FEATURE_COLS], test['recid_prediction_class'], test['recid_prediction_score'], **BIAS_INFO, **PRIVILEGED_INFO)

clf_metric = ClassificationMetric(ground_truth_test, prediction_test,**PRIVILEGED_INFO)

In [17]:
joblib.dump((clf_metric,acc), '../results/1.2-reweighing.pkl')
display_results('../results/1.2-reweighing.pkl')

Unnamed: 0,metric_names,scores
0,accuracy_score,0.672211
1,true_positive_rate_difference,-0.087543
2,false_positive_rate_difference,-0.193763
3,false_omission_rate_difference,-0.022008
4,false_discovery_rate_difference,-0.023299
5,error_rate_difference,-0.019783
6,false_positive_rate_ratio,0.69094
7,false_negative_rate_ratio,1.610615
8,false_omission_rate_ratio,0.938378
9,false_discovery_rate_ratio,0.93068

Unnamed: 0,metric_names,scores
0,accuracy_score,0.672211
1,true_positive_rate_difference,-0.087543
2,false_positive_rate_difference,-0.193763
3,false_omission_rate_difference,-0.022008
4,false_discovery_rate_difference,-0.023299
5,error_rate_difference,-0.019783
6,false_positive_rate_ratio,0.69094
7,false_negative_rate_ratio,1.610615
8,false_omission_rate_ratio,0.938378
9,false_discovery_rate_ratio,0.93068
