<a href="https://colab.research.google.com/github/luisafelixx/Fairness_Diabetes/blob/main/Mitigation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install aif360


Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
# Bibliotecas para manipulação de dados
import pandas as pd
import numpy as np

# Bibliotecas para visualização
import seaborn as sns
import matplotlib.pyplot as plt

# Bibliotecas para manipulação de dados e modelagem
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

# Modelos de Machine Learning
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import AdaBoostClassifier
#from catboost import CatBoostClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

# Métricas de avaliação
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report
)

#Métricas de Fairness
from aif360.sklearn.metrics import (
    statistical_parity_difference,
    equal_opportunity_difference,
    average_odds_difference,
    disparate_impact_ratio,

)

#Mitigação
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import ClassificationMetric
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.inprocessing import PrejudiceRemover


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'


In [5]:
data = pd.read_csv('dados_limpos2015.csv')

In [6]:
data['Etnia'] = data['Etnia'].apply(lambda x: 1 if x == 1 else 0)
data['Etnia'].value_counts()

Unnamed: 0_level_0,count
Etnia,Unnamed: 1_level_1
1,201886
0,49581


# Mitigation

## Reweighing - Etnia

In [7]:
privileged_groups = [{'Etnia': 1}]
unprivileged_groups = [{'Etnia': 0}]

In [8]:
dataset_Etnia = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Etnia']
)

In [9]:
train, test = dataset_Etnia.split([0.7], shuffle=True)

In [10]:
rw = Reweighing(unprivileged_groups= [{'Etnia' : 0}],
                privileged_groups= [{'Etnia' : 1}])

train_rw = rw.fit_transform(train)

### XBoost

In [11]:
XBoost_rw = xgb.XGBClassifier(
        objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

XBoost_rw.fit(train_rw.features, train_rw.labels.ravel() , sample_weight=train_rw.instance_weights)

Parameters: { "use_label_encoder" } are not used.



In [12]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = XBoost_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.01006422625057974
EOD: -0.02810167223303564
DIR: 1.1814106734721315
AOD: -0.012182759132986932


### SVM

In [13]:
svm_rw = SGDClassifier(loss="hinge", random_state=42)
svm_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [14]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = svm_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())



SPD: 0.008685379033728013
EOD: -0.05296429105360412
DIR: 1.0203508968267463
AOD: -0.03389653487232108


### CB

In [15]:
cb_rw = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    random_seed=42
)

cb_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

0:	learn: 0.6122726	total: 123ms	remaining: 12.2s
1:	learn: 0.5474110	total: 187ms	remaining: 9.15s
2:	learn: 0.5005873	total: 234ms	remaining: 7.57s
3:	learn: 0.4667147	total: 284ms	remaining: 6.82s
4:	learn: 0.4383250	total: 388ms	remaining: 7.37s
5:	learn: 0.4198704	total: 468ms	remaining: 7.33s
6:	learn: 0.4040656	total: 571ms	remaining: 7.58s
7:	learn: 0.3911540	total: 676ms	remaining: 7.77s
8:	learn: 0.3829457	total: 774ms	remaining: 7.82s
9:	learn: 0.3751996	total: 863ms	remaining: 7.76s
10:	learn: 0.3689716	total: 963ms	remaining: 7.79s
11:	learn: 0.3646867	total: 1.06s	remaining: 7.78s
12:	learn: 0.3604056	total: 1.17s	remaining: 7.82s
13:	learn: 0.3572541	total: 1.27s	remaining: 7.81s
14:	learn: 0.3548266	total: 1.39s	remaining: 7.85s
15:	learn: 0.3530124	total: 1.47s	remaining: 7.71s
16:	learn: 0.3507865	total: 1.58s	remaining: 7.71s
17:	learn: 0.3491215	total: 1.68s	remaining: 7.66s
18:	learn: 0.3481294	total: 1.78s	remaining: 7.59s
19:	learn: 0.3469767	total: 1.88s	remaini

<catboost.core.CatBoostClassifier at 0x7d61041e0ad0>

In [16]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = cb_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.005397766088434029
EOD: -0.031081348817563886
DIR: 1.1118808018980175
AOD: -0.015547029319299667


### Logiatic Regression

In [17]:
LogistReg_rw = LogisticRegression(solver='saga')
LogistReg_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)



In [18]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = LogistReg_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())
print("Accuracy:", metric.accuracy())

SPD: 0.013670852719307812
EOD: 0.0024735869623591367
DIR: 1.287391598384199
AOD: 0.002999731268989515
Accuracy: 0.8482920427884042


##Reweighing - Sexo

In [19]:
privileged_groups = [{'Sexo': 1}]
unprivileged_groups = [{'Sexo': 0}]

In [20]:
dataset_Sexo = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Sexo']
)

In [21]:
train, test = dataset_Sexo.split([0.7], shuffle=True)

In [22]:
rw = Reweighing(unprivileged_groups= [{'Sexo' : 0}],
                privileged_groups= [{'Sexo' : 1}])

train_rw = rw.fit_transform(train)

### XBoost

In [23]:
XBoost_rw = xgb.XGBClassifier(
        objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

XBoost_rw.fit(train_rw.features, train_rw.labels.ravel() , sample_weight=train_rw.instance_weights)

Parameters: { "use_label_encoder" } are not used.



In [24]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = XBoost_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.011657823606908438
EOD: 0.06946277954879321
DIR: 1.207064856249135
AOD: 0.03737817471012725


### SVM

In [25]:
svm_rw = SGDClassifier(loss="hinge", random_state=42)
svm_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [26]:
# Create prediction dataset
dataset_test_pred = test.copy()
dataset_test_pred.labels = svm_rw.predict(test.features).reshape(-1, 1)

# Fairness evaluation
metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.0008365042722490736
EOD: 0.005006335691958939
DIR: 1.5059786205305852
AOD: 0.0026037596775993063


### CB

In [27]:
cb_rw = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    random_seed=42
)

cb_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

0:	learn: 0.6110772	total: 42.8ms	remaining: 4.24s
1:	learn: 0.5443566	total: 79.4ms	remaining: 3.89s
2:	learn: 0.4975528	total: 128ms	remaining: 4.15s
3:	learn: 0.4618092	total: 167ms	remaining: 4.01s
4:	learn: 0.4363441	total: 205ms	remaining: 3.89s
5:	learn: 0.4166925	total: 245ms	remaining: 3.84s
6:	learn: 0.4008296	total: 284ms	remaining: 3.78s
7:	learn: 0.3881147	total: 322ms	remaining: 3.7s
8:	learn: 0.3785608	total: 361ms	remaining: 3.65s
9:	learn: 0.3703995	total: 398ms	remaining: 3.58s
10:	learn: 0.3646614	total: 440ms	remaining: 3.56s
11:	learn: 0.3605672	total: 483ms	remaining: 3.54s
12:	learn: 0.3563204	total: 522ms	remaining: 3.49s
13:	learn: 0.3531289	total: 570ms	remaining: 3.5s
14:	learn: 0.3509344	total: 621ms	remaining: 3.52s
15:	learn: 0.3481485	total: 660ms	remaining: 3.46s
16:	learn: 0.3466834	total: 702ms	remaining: 3.43s
17:	learn: 0.3448855	total: 741ms	remaining: 3.38s
18:	learn: 0.3437628	total: 779ms	remaining: 3.32s
19:	learn: 0.3424847	total: 816ms	remaini

<catboost.core.CatBoostClassifier at 0x7d60f82c1410>

In [28]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = cb_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.01184235311724817
EOD: 0.07128816055082968
DIR: 1.2525457714773154
AOD: 0.038041561939039605


### Logistic Regression

In [30]:
LogistReg_rw = LogisticRegression(solver='saga')
LogistReg_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)



In [31]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = LogistReg_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())
print("Accuracy:", metric.accuracy())

SPD: 0.01115374999913401
EOD: 0.05996475669152557
DIR: 1.2418924087165517
AOD: 0.03279728084239458
Accuracy: 0.8463700110019751


## Prejudice Remover - Etnia

In [32]:
dataset = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Etnia']
)

train, test = dataset.split([0.7], shuffle=True)

In [33]:
pr = PrejudiceRemover(sensitive_attr='Etnia', eta=25.0)
pr.fit(train)
test_pred_mit = pr.predict(test)

metric_mit = ClassificationMetric(test, test_pred_mit,
                                  unprivileged_groups=[{'Etnia': 0}],
                                  privileged_groups=[{'Etnia': 1}])

print("\n==== Fairness Metrics: After In-processing Mitigation ====")
print("Statistical parity difference:", metric_mit.statistical_parity_difference())
print("Disparate impact:", metric_mit.disparate_impact())
print("Equal opportunity difference:", metric_mit.equal_opportunity_difference())
print("Average odds difference:", metric_mit.average_odds_difference())


==== Fairness Metrics: After In-processing Mitigation ====
Statistical parity difference: 0.014198347590787634
Disparate impact: 1.3046277872855092
Equal opportunity difference: 0.004531930758902514
Average odds difference: 0.004232181930434916


## Prejudice Remover - Sexo

In [34]:
dataset = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Sexo']
)

train, test = dataset.split([0.7], shuffle=True)

In [35]:
pr = PrejudiceRemover(sensitive_attr='Sexo', eta=25.0)
pr.fit(train)
test_pred_mit = pr.predict(test)

metric_mit = ClassificationMetric(test, test_pred_mit,
                                  unprivileged_groups=[{'Sexo': 0}],
                                  privileged_groups=[{'Sexo': 1}])

print("\n==== Fairness Metrics: After In-processing Mitigation ====")
print("Statistical parity difference:", metric_mit.statistical_parity_difference())
print("Disparate impact:", metric_mit.disparate_impact())
print("Equal opportunity difference:", metric_mit.equal_opportunity_difference())
print("Average odds difference:", metric_mit.average_odds_difference())
print("Accuracy:", metric.accuracy())


==== Fairness Metrics: After In-processing Mitigation ====
Statistical parity difference: 0.00013805142551943234
Disparate impact: 1.0024760599714724
Equal opportunity difference: 0.03601612811625274
Average odds difference: 0.0169328579423369
Accuracy: 0.8463700110019751


## Gráfico

Logistic Regression

In [36]:
import plotly.graph_objects as go
import plotly.express as px

metrics = [
    "Average Odds Difference",
    "Equal Opportunity Difference",
    "Statistical Parity Difference",
]

before = [0.034090, 0.061312, 0.012693]
after = [0.026710, 0.048551, 0.009196]
pr_gender = [0.019388, 0.038288, 0.0030340]

fig = go.Figure()


fig.add_trace(go.Scatter(
    x=before,
    y=metrics,
    mode='markers+text',
    name='Before Mitigation(baseline)',
    marker=dict(size=27, color=px.colors.qualitative.Safe[1]),
    text=[f"{v:.2f}" for v in before],
    textposition='top center'
))

fig.add_trace(go.Scatter(
    x=after,
    y=metrics,
    mode='markers+text',
    name='After Reweighing Mitigation',
    marker=dict(size=27, color=px.colors.qualitative.Safe[0]),
    text=[f"{v:.2f}" for v in after],
    textposition='top center'
))

fig.add_trace(go.Scatter(
    x=pr_gender,
    y=metrics,
    mode='markers+text',
    name='After Prejudice Remover Mitigation',
    marker=dict(size=27, color=px.colors.qualitative.Safe[2]),
    text=[f"{v:.2f}" for v in pr_gender],
    textposition='top center'
))


fig.update_layout(
    title="",
    xaxis_title="Fairness Metrics Scores for Gender",
    yaxis_title="Fairness Metrics",
    font=dict(size=16),
    margin=dict(l=100, r=50, b=40, t=50),
    xaxis=dict(
        range=[0, 0.07],
        tick0=0,
        dtick=0.01,
        showgrid=True
    ),
        yaxis=dict(
        categoryorder='array'
    ),
    height=480
)

fig.show()
