# Q3

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, precision_score

### Preprocessing data

In [38]:
# Read the raw table and expand its categorical (object/category) columns into
# indicator columns; drop_first=True keeps k-1 indicators per k-level column.
data = pd.get_dummies(pd.read_csv('./data.csv'), drop_first=True)

# Map the target labels to integer codes.
label_encoder = LabelEncoder()
data['income'] = label_encoder.fit_transform(data['income'])

# Target vector and feature matrix (everything except the target column).
y = data['income']
X = data.drop(columns='income')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Evaluation

In [39]:
# Fit the baseline logistic-regression model (fit() hands back the estimator itself).
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_test = clf.predict(X_test)

# Score the predictions against the true test labels.
precision = precision_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)

# Report both metrics.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')


Accuracy: 0.8142359926294956
Precision: 0.6390438247011953


### Fairness metrics

In [40]:
# Fairness metrics
def fairness_metrics(X, clf, gender_column='gender', women_value=1, men_value=0):
    """Compare the mean predicted positive-class probability of two groups.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; it is passed unchanged to ``clf.predict_proba`` and
        must contain ``gender_column``.
    clf : object
        Fitted classifier exposing ``predict_proba(X)``; column 1 of the
        returned array is used as the positive-class probability.
    gender_column : str, default 'gender'
        Name of the column that holds the group code.
    women_value, men_value : scalar, default 1 and 0
        Codes in ``gender_column`` that identify the two groups. The defaults
        reproduce the coding this notebook uses (women == 1, men == 0).

    Returns
    -------
    tuple
        ``(zemel_fairness, disparate_impact, prob_women, prob_men)`` where
        ``zemel_fairness = prob_men - prob_women`` and
        ``disparate_impact = prob_women / prob_men``. If ``prob_men`` is 0 the
        ratio follows NumPy float division (inf or nan).

    Raises
    ------
    KeyError
        If ``gender_column`` is not a column of ``X``.
    ValueError
        If either group has no rows in ``X`` (the group means would be undefined).
    """
    # Column 1 of predict_proba is taken as the positive-class probability.
    probas = np.asarray(clf.predict_proba(X))[:, 1]

    # Plain NumPy boolean masks: rows are matched by position, so a shuffled or
    # non-default DataFrame index cannot misalign them with `probas`.
    group = X[gender_column]
    women_mask = (group == women_value).to_numpy()
    men_mask = (group == men_value).to_numpy()

    # An empty group would silently turn every metric into NaN; fail loudly instead.
    if not women_mask.any() or not men_mask.any():
        raise ValueError(
            f"Both groups must be present in column {gender_column!r} "
            f"(women={women_value!r}, men={men_value!r})."
        )

    prob_women = probas[women_mask].mean()
    prob_men = probas[men_mask].mean()

    zemel_fairness = prob_men - prob_women
    disparate_impact = prob_women / prob_men

    return zemel_fairness, disparate_impact, prob_women, prob_men




Zemel Fairness: -0.2107150951847945
Disparate Impact: 2.8836614681729635
Probabilities of Women: 0.32257972624781417, Men: 0.11186463106301966


In [None]:
# Group-fairness figures of the baseline classifier on the held-out split
baseline_metrics = fairness_metrics(X_test, clf)
zemel_fairness, disparate_impact, prob_women, prob_men = baseline_metrics

print(f'Zemel Fairness: {zemel_fairness}')
print(f'Disparate Impact: {disparate_impact}')
print(f'Probabilities of Women: {prob_women}, Men: {prob_men}')

## Fair model

### Make the new training data

Step 1: Add two new columns

In [41]:
# Score the training rows with the baseline model:
# per-class probabilities first, then the hard class labels.
y_pred_proba_train = clf.predict_proba(X_train)
y_pred_train = clf.predict(X_train)

In [42]:
# Probability of the winning class for each training record
# (row-wise maximum over the per-class probabilities).
max_proba_train = np.max(y_pred_proba_train, axis=1)

# Start from an independent copy of the training features. X_train is already a
# DataFrame (no scaling is applied in this section), and re-wrapping it with
# pd.DataFrame(...) does not copy by default, so the helper columns added below
# could otherwise end up sharing state with X_train.
new_train_data = X_train.copy()
new_train_data['Predicted_Income'] = y_pred_train
new_train_data['Max_Probability'] = max_proba_train
new_train_data['income'] = y_train.values  # .values: assign by position, not by index label

###### New training data = previous training data + 2 new columns (+ the true label)
###### New test data = previous test data (unchanged)
print(new_train_data.head())

       age  fnlwgt  educational-num  race  gender  capital-gain  capital-loss  \
42392   25  188767                9     1       1             0             0   
14623   64  286732                9     1       0             0             0   
27411   29  253801                9     1       1             0             0   
1288    28  334032               11     1       1             0             0   
7078    22  173004                9     0       1             0             0   

       hours-per-week  native-country  workclass_Federal-gov  ...  \
42392              40               0                  False  ...   
14623              17               0                  False  ...   
27411              40               1                  False  ...   
1288               50               0                  False  ...   
7078                1               0                  False  ...   

       occupation_Transport-moving  relationship_Husband  \
42392                        False    

Step 2: Swap the labels of some rows; the number of rows is given by the formula for n

In [43]:
# Build the two candidate pools for the label swap.
# Group coding follows the rest of this notebook: gender == 0 -> men, gender == 1 -> women.
is_man = new_train_data['gender'] == 0
is_woman = new_train_data['gender'] == 1
income_above = new_train_data['income'] == 1
income_below = new_train_data['income'] == 0

# CP: men labelled above 50k, lowest Max_Probability first (ascending sort)
CP = new_train_data[is_man & income_above].sort_values(by='Max_Probability', ascending=True)
# CD: women labelled below 50k, highest Max_Probability first (descending sort)
CD = new_train_data[is_woman & income_below].sort_values(by='Max_Probability', ascending=False)

###### New training data 2 = CP + CD, both taken from new training data 1
print("Size of new_train_data:", new_train_data.shape)
print("Size of CP + CD:", len(CP), "+", len(CD), "=", len(CP) + len(CD))


Size of new_train_data: (34189, 45)
Size of CP + CD: 1264 + 15845 = 17109


In [44]:
# Calculate the value of n (how many rows of each pool get their label swapped)
is_woman = new_train_data['gender'] == 1
is_man = new_train_data['gender'] == 0

Ss = int(is_woman.sum())      # Number of women
Ss_bar = int(is_man.sum())    # Number of men

# Number of women with PREDICTED INCOME below 50k
# NOTE(review): the "_plus" suffix suggests the positive class, but this counts
# Predicted_Income == 0 -- confirm against the intended formula.
Ss_plus = int((is_woman & (new_train_data['Predicted_Income'] == 0)).sum())
# Number of men with PREDICTED INCOME above 50k
Ss_bar_plus = int((is_man & (new_train_data['Predicted_Income'] == 1)).sum())

n = ((Ss * Ss_bar_plus) - (Ss_bar * Ss_plus)) / (Ss + Ss_bar)

# Signed value of the formula, truncated to an integer
n = int(n)
print(n)

# The swap needs a non-negative row count. Taking the magnitude (instead of
# blindly negating) keeps n valid even when the formula comes out positive;
# a negative n would make head(n) / iloc[n:] silently select the wrong rows.
n = abs(n)
print(n)

-5412
5412


In [45]:
# Swap

# Take the first n rows of each sorted pool and flip their label.
# head(n) simply returns the whole pool when it holds fewer than n rows.
# NOTE(review): because of that, the two pools can contribute different numbers
# of flipped rows -- confirm this is intended for a "swap".
CP_top_n = CP.head(n).assign(income=0)
CD_top_n = CD.head(n).assign(income=1)

# Re-attach the unflipped remainder of each pool.
CP_final = pd.concat([CP_top_n, CP.iloc[n:]])
CD_final = pd.concat([CD_top_n, CD.iloc[n:]])

# Rows that belong to neither pool are carried over unchanged.
in_a_pool = new_train_data.index.isin(CP.index) | new_train_data.index.isin(CD.index)
outside_pools = new_train_data[~in_a_pool]

# Final table: CP rows, then CD rows, then everything else.
new_train_data = pd.concat([CP_final, CD_final, outside_pools])
print("Size of new_train_data:", new_train_data.shape)

Size of new_train_data: (34189, 45)


Step 3: Delete the two columns added in the first step

In [46]:
# Remove the two helper columns added in step 1, leaving only the features and the label.
new_train_data = new_train_data.drop(columns=['Predicted_Income', 'Max_Probability'])
print("Size of new_train_data:", new_train_data.shape)

# Split the relabelled table into the feature matrix and the target for the fair model.
X_train_fair = new_train_data.drop(columns=['income'])
y_train_fair = new_train_data['income']

Size of new_train_data: (34189, 43)


### Train the fair model

In [47]:
# Fit a second logistic regression, same hyper-parameters, on the relabelled training set.
clf_fair = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_fair, y_train_fair)

# Score it on the original (unmodified) test split.
y_fair_pred_test = clf_fair.predict(X_test)
precision_fair = precision_score(y_test, y_fair_pred_test)
accuracy_fair = accuracy_score(y_test, y_fair_pred_test)

print(f'Fair Model Accuracy: {accuracy_fair}')
print(f'Fair Model Precision: {precision_fair}')

Fair Model Accuracy: 0.7909643076503106
Fair Model Precision: 0.7408906882591093


In [48]:
# Group-fairness figures of the retrained ("fair") classifier on the held-out split
fair_metrics = fairness_metrics(X_test, clf_fair)
zemel_fairness, disparate_impact, prob_women, prob_men = fair_metrics

print(f'Zemel Fairness: {zemel_fairness}')
print(f'Disparate Impact: {disparate_impact}')
print(f'Probabilities of Women: {prob_women}, Men: {prob_men}')


Zemel Fairness: -0.010186615589960146
Disparate Impact: 1.0268100781155522
Probabilities of Women: 0.3901413306062867, Men: 0.37995471501632655
