In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



In [2]:
df = pd.read_csv("./data/adult.csv")

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


# Cleaning the dataset

In [3]:
df["income"] = df["income"].str.strip()
df["income"] = df["income"].map({"<=50K": 0, ">50K": 1})
print("Converted the target from string to a binary")

Converted the target from string to a binary


In [4]:

df = df.replace("?", np.nan)
df = df.dropna()

print("Removing the nan rows")

Removing the nan rows


# Preparing the model

In [5]:
X = df.drop(columns=["income", "fnlwgt"])
y = df["income"]


In [8]:
from sklearn.model_selection import train_test_split

sensitive_feature = df["race"]
sensitive_feature.value_counts()

X_train, X_test, y_train, y_test, sf_train, sf_test = train_test_split(
    X, y,sensitive_feature,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [10]:
categorical_features = X_train.select_dtypes(include="object").columns
numerical_features = X_train.select_dtypes(exclude="object").columns


In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)


In [13]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8536383225592574
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4531
           1       0.75      0.62      0.68      1502

    accuracy                           0.85      6033
   macro avg       0.82      0.78      0.79      6033
weighted avg       0.85      0.85      0.85      6033



# Bais detection

In [13]:
sensitive_feature = X_test["race"]
sensitive_feature.value_counts()


race
White                 5203
Black                  556
Asian-Pac-Islander     179
Other                   51
Amer-Indian-Eskimo      44
Name: count, dtype: int64

In [15]:
#!pip install fairlearn

from fairlearn.metrics import (
    MetricFrame,
    selection_rate,
    true_positive_rate,
    false_positive_rate
)

from fairlearn.reductions import ExponentiatedGradient, DemographicParity


In [17]:

metric_frame = MetricFrame(
    metrics={
        "Accuracy": accuracy_score,
        "selection_rate": selection_rate,
        "true_positive_rate": true_positive_rate,
        "false_positive_rate": false_positive_rate,
    },
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sf_test
)
metric_frame.by_group


Unnamed: 0_level_0,Accuracy,selection_rate,true_positive_rate,false_positive_rate
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amer-Indian-Eskimo,0.909091,0.113636,0.571429,0.027027
Asian-Pac-Islander,0.854749,0.234637,0.681818,0.088889
Black,0.901079,0.106115,0.52439,0.033755
Other,0.921569,0.019608,0.0,0.020833
White,0.847396,0.217567,0.623719,0.072974


In [22]:
mitigator = ExponentiatedGradient(
    model,
    DemographicParity(),
    sample_weight_name="classifier__sample_weight"
)

mitigator.fit(X_train, y_train, sensitive_features=sf_train)
y_pred_mitigated = mitigator.predict(X_test)


In [23]:
print("Mitigated Accuracy:", accuracy_score(y_test, y_pred_mitigated))

metric_frame_mitigated = MetricFrame(
    metrics={
        "Accuracy": accuracy_score,
        "Selection Rate": selection_rate,
        "True Positive Rate": true_positive_rate
    },
    y_true=y_test,
    y_pred=y_pred_mitigated,
    sensitive_features=sf_test
)

metric_frame_mitigated.by_group


Mitigated Accuracy: 0.8471738770097795


Unnamed: 0_level_0,Accuracy,Selection Rate,True Positive Rate
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amer-Indian-Eskimo,0.840909,0.227273,0.714286
Asian-Pac-Islander,0.826816,0.195531,0.545455
Black,0.865108,0.221223,0.792683
Other,0.901961,0.117647,0.666667
White,0.845474,0.20296,0.59224


In [27]:
print("Selection Rate Difference (Before):",
      metric_frame.difference(method='between_groups')['selection_rate'])

print("Selection Rate Difference (After):",
      metric_frame_mitigated.difference(method='between_groups')['Selection Rate'])


Selection Rate Difference (Before): 0.21502902837112498
Selection Rate Difference (After): 0.10962566844919786
