# Q3


## Base model

In [98]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns
import matplotlib.pyplot as plt

In [99]:
# Integer codes that identify each group in the `gender` column
GENDER_WOMEN, GENDER_MEN = 0, 1

### Preprocessing data


In [100]:
# Load the dataset
raw_data = pd.read_csv('./data.csv')

# Separate the target BEFORE one-hot encoding. If `income` is stored as text,
# running get_dummies over the whole frame would replace it with a dummy
# column and the 'income' lookup below would raise a KeyError.
target_raw = raw_data['income']
features = pd.get_dummies(raw_data.drop(columns='income'), drop_first=True)

# Encode the target variable as integer class labels
label_encoder = LabelEncoder()
data = features.assign(income=label_encoder.fit_transform(target_raw))

# Split the dataset into features and target
X = data.drop('income', axis=1)
y = data['income']

# Train-test split (30% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [101]:
# Fit the baseline logistic-regression model on the training split
# (`fit` returns the estimator itself, so `clf` is the fitted model)
clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out split
y_pred_test = clf.predict(X_test)

### Fairness metrics


In [102]:
def fairness_metrics(X, clf, gender_column='gender', women_value=None, men_value=None):
    """Compare the mean predicted positive-class probability of two groups.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame passed to ``clf.predict_proba``; must contain
        ``gender_column``.
    clf : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the positive-class probability.
    gender_column : str, default 'gender'
        Name of the column holding the group code.
    women_value, men_value : optional
        Codes identifying each group in ``gender_column``. When left as
        ``None`` they fall back to the notebook-level constants
        ``GENDER_WOMEN`` and ``GENDER_MEN``.

    Returns
    -------
    tuple
        ``(zemel_fairness, disparate_impact, prob_women, prob_men)`` where
        ``zemel_fairness = prob_men - prob_women`` and
        ``disparate_impact = prob_women / prob_men``.
    """
    # Resolve the defaults at call time so the notebook constants are honoured.
    if women_value is None:
        women_value = GENDER_WOMEN
    if men_value is None:
        men_value = GENDER_MEN

    # Positive-class probability for every row
    probas = clf.predict_proba(X)[:, 1]

    # Boolean masks selecting each group (plain arrays, aligned by position)
    gender = X[gender_column].to_numpy()
    women_mask = gender == women_value
    men_mask = gender == men_value

    prob_women = probas[women_mask].mean()
    prob_men = probas[men_mask].mean()

    zemel_fairness = prob_men - prob_women
    disparate_impact = prob_women / prob_men

    return zemel_fairness, disparate_impact, prob_women, prob_men


### Evaluation


In [103]:
# Test-set metrics of the base model
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
# Stored under its own name: assigning the result to `f1_score` would rebind
# the imported metric function to a float and break every later call to it.
f1_score_base = f1_score(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)


print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1_score_base}')
print(f'Confusion Matrix: {conf_matrix}')

Accuracy: 0.8142359926294956
Precision: 0.6390438247011953
F1 Score: 0.5409780775716696
Confusion Matrix: [[10327   906]
 [ 1816  1604]]


In [104]:
# Group-fairness summary of the current `clf` on the test split
zemel_fairness, disparate_impact, prob_women, prob_men = fairness_metrics(X=X_test, clf=clf)

print(
    f'Zemel Fairness: {zemel_fairness}',
    f'Disparate Impact: {disparate_impact}',
    f'Probabilities of Women: {prob_women}, Men: {prob_men}',
    sep='\n',
)

Zemel Fairness: 0.2107150951847945
Disparate Impact: 0.3467813441477173
Probabilities of Women: 0.11186463106301966, Men: 0.32257972624781417


## 1st Fairness method

### Make the new training data


Step 1: Add two new columns


In [105]:
# Class probabilities and hard labels of the current model on the training split
y_pred_proba_train = clf.predict_proba(X_train)
y_pred_train = clf.predict(X_train)

In [106]:
# Largest class probability per training row
max_proba_train = y_pred_proba_train.max(axis=1)

# Training features plus the predicted label, the max probability and the true label.
# New training data = previous training data + 2 helper columns (+ `income`);
# the test data is left unchanged.
new_train_data = pd.DataFrame(X_train, columns=X.columns).assign(
    Predicted_Income=y_pred_train,
    Max_Probability=max_proba_train,
    income=y_train.values,
)

print(new_train_data.head())

       age  fnlwgt  educational-num  race  gender  capital-gain  capital-loss  \
42392   25  188767                9     1       1             0             0   
14623   64  286732                9     1       0             0             0   
27411   29  253801                9     1       1             0             0   
1288    28  334032               11     1       1             0             0   
7078    22  173004                9     0       1             0             0   

       hours-per-week  native-country  workclass_Federal-gov  ...  \
42392              40               0                  False  ...   
14623              17               0                  False  ...   
27411              40               1                  False  ...   
1288               50               0                  False  ...   
7078                1               0                  False  ...   

       occupation_Transport-moving  relationship_Husband  \
42392                        False    

Step 2: Swap the `income` labels of the top n rows of each candidate set, where n is given by the formula below


In [107]:
# Build the two candidate pools for label swapping, each ranked by `Max_Probability`.
# NOTE(review): `Max_Probability` is the larger of the class probabilities (the
# model's confidence in its own prediction), not P(income == 1) — confirm that
# this is the intended ranking score for both pools.
is_man = new_train_data['gender'] == GENDER_MEN
is_woman = new_train_data['gender'] == GENDER_WOMEN
has_income_1 = new_train_data['income'] == 1
has_income_0 = new_train_data['income'] == 0

# CP: men whose actual `income` label is 1, lowest Max_Probability first
CP = new_train_data.loc[is_man & has_income_1].sort_values(by='Max_Probability', ascending=True)
# CD: women whose actual `income` label is 0, highest Max_Probability first
CD = new_train_data.loc[is_woman & has_income_0].sort_values(by='Max_Probability', ascending=False)

n_cp, n_cd = CP.shape[0], CD.shape[0]
print("Size of new_train_data:", new_train_data.shape)
print("Size of CP + CD:", n_cp, "+", n_cd, "=", n_cp + n_cd)


Size of new_train_data: (34189, 45)
Size of CP + CD: 7003 + 10077 = 17080


In [108]:
women_mask = new_train_data['gender'] == GENDER_WOMEN
men_mask = new_train_data['gender'] == GENDER_MEN
predicted_positive = new_train_data['Predicted_Income'] == 1

# Number of women
Ss = women_mask.sum()
# Number of men
Ss_bar = men_mask.sum()

# Number of women PREDICTED in the positive class (Predicted_Income == 1)
Ss_plus = (women_mask & predicted_positive).sum()
# Number of men PREDICTED in the positive class (Predicted_Income == 1)
Ss_bar_plus = (men_mask & predicted_positive).sum()

# Number of label swaps needed to equalise the predicted positive rates
n = ((Ss * Ss_bar_plus) - (Ss_bar * Ss_plus)) / (Ss + Ss_bar)

# A negative value means women already have the higher predicted positive rate.
# `DataFrame.head(n)` with a negative n selects all but the last |n| rows, so a
# negative count must never reach the swap step: clamp it to zero.
n = max(int(n), 0)
print(n)

1592


In [109]:
# Take the first n rows of each pool and overwrite their `income` label:
# the leading CP rows become 0, the leading CD rows become 1.
CP_top_n = CP.iloc[:n].assign(income=0)
CD_top_n = CD.iloc[:n].assign(income=1)

# Re-attach the rows of each pool that keep their original label
CP_final = pd.concat([CP_top_n, CP.iloc[n:]])
CD_final = pd.concat([CD_top_n, CD.iloc[n:]])

# Rows that belong to neither pool are carried over unchanged
in_either_pool = new_train_data.index.isin(CP.index) | new_train_data.index.isin(CD.index)
new_train_data = pd.concat([CP_final, CD_final, new_train_data[~in_either_pool]])
print("Size of new_train_data:", new_train_data.shape)

Size of new_train_data: (34189, 45)


Step 3: Delete the two columns added in Step 1


In [110]:
# Remove the two helper columns so only the original features and `income` remain
new_train_data = new_train_data.drop(columns=['Predicted_Income', 'Max_Probability'])
print("Size of new_train_data:", new_train_data.shape)

# Split the relabelled training data into features and target
X_train_fair = new_train_data.drop(columns=['income'])
y_train_fair = new_train_data['income']

Size of new_train_data: (34189, 43)


### Train the 1st fair model


In [111]:
# Refit the existing `clf` object on the relabelled training data.
# From this point on `clf` holds the fair model, no longer the base model.
clf.fit(X_train_fair, y_train_fair)

###  Evaluation

In [112]:
# Re-import so that `f1_score` is the sklearn metric function here, even if
# the name was rebound to a number earlier in the session.
from sklearn.metrics import f1_score

# Test-set predictions of the model trained on the relabelled data
y_fair_pred_test = clf.predict(X_test)

accuracy_fair = accuracy_score(y_true=y_test, y_pred=y_fair_pred_test)
precision_fair = precision_score(y_true=y_test, y_pred=y_fair_pred_test)
f1_score_fair = f1_score(y_true=y_test, y_pred=y_fair_pred_test)
conf_matrix_fair = confusion_matrix(y_true=y_test, y_pred=y_fair_pred_test)

print(
    f'Fair Model Accuracy: {accuracy_fair}',
    f'Fair Model Precision: {precision_fair}',
    f'Fair Model F1 Score: {f1_score_fair}',
    f'Fair Model Confusion Matrix: {conf_matrix_fair}',
    sep='\n',
)


Fair Model Accuracy: 0.8032484815396165
Fair Model Precision: 0.7258200168208578
Fair Model F1 Score: 0.3744847038403124
Fair Model Confusion Matrix: [[10907   326]
 [ 2557   863]]


In [113]:
# Group-fairness summary of the refitted `clf` on the test split
# (reuses, and therefore overwrites, the metric variable names)
zemel_fairness, disparate_impact, prob_women, prob_men = fairness_metrics(X=X_test, clf=clf)

print(
    f'Zemel Fairness: {zemel_fairness}',
    f'Disparate Impact: {disparate_impact}',
    f'Probabilities of Women: {prob_women}, Men: {prob_men}',
    sep='\n',
)


Zemel Fairness: 0.002840756233864594
Disparate Impact: 0.9896042066987115
Probabilities of Women: 0.27041941271471437, Men: 0.27326016894857896


## 2nd Fairness method: Calibration (Sufficiency)

### Train the 2nd fair model

In [114]:
# Wrap the classifier in an isotonic calibrator fitted with 5-fold cross-validation.
# The estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works across versions.
# NOTE(review): with cv=5 the calibrator fits clones of `clf` on
# (X_train, y_train), i.e. on the original labels rather than the relabelled
# ones — confirm this is the intended starting point for the 2nd method.
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
calibrated_clf.fit(X_train, y_train)

# Make calibrated predictions
y_calibrated_pred_test = calibrated_clf.predict(X_test)



###  Evaluation

In [115]:
# Re-import so that `f1_score` is the sklearn metric function here, even if
# the name was rebound to a number earlier in the session.
from sklearn.metrics import f1_score

# Test-set metrics of the calibrated model
accuracy_calibrated = accuracy_score(y_true=y_test, y_pred=y_calibrated_pred_test)
precision_calibrated = precision_score(y_true=y_test, y_pred=y_calibrated_pred_test)
f1_score_calibrated = f1_score(y_true=y_test, y_pred=y_calibrated_pred_test)
conf_matrix_calibrated = confusion_matrix(y_true=y_test, y_pred=y_calibrated_pred_test)

print(
    f'Calibrated Model Accuracy: {accuracy_calibrated}',
    f'Calibrated Model Precision: {precision_calibrated}',
    f'Calibrated Model F1 Score: {f1_score_calibrated}',
    f'Calibrated Model Confusion Matrix: {conf_matrix_calibrated}',
    sep='\n',
)

Calibrated Model Accuracy: 0.8067289974749198
Calibrated Model Precision: 0.8629629629629629
Calibrated Model F1 Score: 0.33049645390070914
Calibrated Model Confusion Matrix: [[11122   111]
 [ 2721   699]]


In [116]:
# Group-fairness summary of the calibrated model on the test split
calibrated_fairness = fairness_metrics(X=X_test, clf=calibrated_clf)
(
    zemel_fairness_calibrated,
    disparate_impact_calibrated,
    prob_women_calibrated,
    prob_men_calibrated,
) = calibrated_fairness

print(
    f'Calibrated Zemel Fairness: {zemel_fairness_calibrated}',
    f'Calibrated Disparate Impact: {disparate_impact_calibrated}',
    f'Calibrated Probabilities of Women: {prob_women_calibrated}, Men: {prob_men_calibrated}',
    sep='\n',
)

Calibrated Zemel Fairness: 0.0285927350338
Calibrated Disparate Impact: 0.8863458963128191
Calibrated Probabilities of Women: 0.22298405899466744, Men: 0.25157679402846744
