<a href="https://colab.research.google.com/github/luisafelixx/Fairness_Diabetes/blob/main/mitiga%C3%A7%C3%A3o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dados brutos importados de https://www.cdc.gov/brfss/annual_data/annual_2015.html

Processo de limpeza apoiado no notebook: https://www.kaggle.com/code/alexteboul/diabetes-health-indicators-dataset-notebook/notebook#1.-Get-the-data

Explicação: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf




# Import de Bibliotecas

In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip install aif360


Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Bibliotecas para manipulação de dados
import pandas as pd
import numpy as np

# Bibliotecas para visualização
import seaborn as sns
import matplotlib.pyplot as plt

# Bibliotecas para manipulação de dados e modelagem
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

# Modelos de Machine Learning
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import AdaBoostClassifier
#from catboost import CatBoostClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

# Métricas de avaliação
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report
)

#Métricas de Fairness
from aif360.sklearn.metrics import (
    statistical_parity_difference,
    equal_opportunity_difference,
    average_odds_difference,
    disparate_impact_ratio,

)

#Mitigação
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import ClassificationMetric
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.inprocessing import PrejudiceRemover


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'


In [None]:
data = pd.read_csv('dados_limpos2015.csv')

# Mitigação

Desfazer os index para esse passo

## Reweighing - Etnia

In [None]:
privileged_groups = [{'Etnia': 1}]
unprivileged_groups = [{'Etnia': 0}]

In [None]:
#Transforma o dataframe em um objeto que o AIF360 entende (BinaryLabelDataset)
dataset_Etnia = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Etnia']
)

In [None]:
train, test = dataset_Etnia.split([0.7], shuffle=True)

In [None]:
rw = Reweighing(unprivileged_groups= [{'Etnia' : 0}],
                privileged_groups= [{'Etnia' : 1}])

train_rw = rw.fit_transform(train)

### XBoost

In [None]:
XBoost_rw = xgb.XGBClassifier(
        objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

XBoost_rw.fit(train_rw.features, train_rw.labels.ravel() , sample_weight=train_rw.instance_weights)

Parameters: { "use_label_encoder" } are not used.



In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = XBoost_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.0067922275743023605
EOD: -0.027997620669098666
DIR: 1.116847570182216
AOD: -0.01414091219241418


### SVM

In [None]:
svm_rw = SGDClassifier(loss="hinge", random_state=42)
svm_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = svm_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())



SPD: 0.004208689705710145
EOD: -0.028941960243351078
DIR: 1.049852156281016
AOD: -0.017340315707569002


### CB

In [None]:
cb_rw = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    random_seed=42
)

cb_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

0:	learn: 0.6121420	total: 88.9ms	remaining: 8.8s
1:	learn: 0.5469370	total: 131ms	remaining: 6.44s
2:	learn: 0.4978150	total: 168ms	remaining: 5.42s
3:	learn: 0.4635646	total: 205ms	remaining: 4.92s
4:	learn: 0.4371877	total: 241ms	remaining: 4.59s
5:	learn: 0.4173959	total: 281ms	remaining: 4.41s
6:	learn: 0.4037379	total: 320ms	remaining: 4.25s
7:	learn: 0.3912770	total: 359ms	remaining: 4.13s
8:	learn: 0.3832377	total: 396ms	remaining: 4s
9:	learn: 0.3752010	total: 437ms	remaining: 3.93s
10:	learn: 0.3687481	total: 478ms	remaining: 3.86s
11:	learn: 0.3638077	total: 517ms	remaining: 3.79s
12:	learn: 0.3600596	total: 557ms	remaining: 3.73s
13:	learn: 0.3567609	total: 599ms	remaining: 3.68s
14:	learn: 0.3541256	total: 636ms	remaining: 3.6s
15:	learn: 0.3521795	total: 673ms	remaining: 3.53s
16:	learn: 0.3501427	total: 711ms	remaining: 3.47s
17:	learn: 0.3490490	total: 751ms	remaining: 3.42s
18:	learn: 0.3477322	total: 793ms	remaining: 3.38s
19:	learn: 0.3467303	total: 846ms	remaining: 

<catboost.core.CatBoostClassifier at 0x7e513a5ed910>

In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = cb_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.008257688354251613
EOD: -0.01168851020991668
DIR: 1.168125978819273
AOD: -0.006622202175468921


### Logiatic Regression

In [None]:
LogistReg_rw = LogisticRegression(solver='liblinear')
LogistReg_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = LogistReg_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Etnia': 0}],
                              privileged_groups=[{'Etnia': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())
print("Accuracy:", metric.accuracy())

SPD: 0.010622898591597929
EOD: -0.009654974659958543
DIR: 1.214177162213416
AOD: -0.003320987976424829
Accuracy: 0.8487427261038427


##Reweighing - Sexo

In [None]:
privileged_groups = [{'Sexo': 1}]
unprivileged_groups = [{'Sexo': 0}]

In [None]:
dataset_Sexo = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=data,
    label_names=['Diabetes'],
    protected_attribute_names=['Sexo']
)

In [None]:
train, test = dataset_Sexo.split([0.7], shuffle=True)

In [None]:
rw = Reweighing(unprivileged_groups= [{'Sexo' : 0}],
                privileged_groups= [{'Sexo' : 1}])

train_rw = rw.fit_transform(train)

### XBoost

In [None]:
XBoost_rw = xgb.XGBClassifier(
        objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

XBoost_rw.fit(train_rw.features, train_rw.labels.ravel() , sample_weight=train_rw.instance_weights)

Parameters: { "use_label_encoder" } are not used.



In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = XBoost_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.011442035549163662
EOD: 0.059776874715308376
DIR: 1.2004434947941167
AOD: 0.03289991582172836


### SVM

In [None]:
svm_rw = SGDClassifier(loss="hinge", random_state=42)
svm_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [None]:
# Create prediction dataset
dataset_test_pred = test.copy()
dataset_test_pred.labels = svm_rw.predict(test.features).reshape(-1, 1)

# Fairness evaluation
metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: -1.9811084322419602e-05
EOD: 0.0006617813758244334
DIR: 0.9800286259953354
AOD: 0.00030245358498870635


### CB

In [None]:
cb_rw = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    random_seed=42
)

cb_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

0:	learn: 0.6116720	total: 41ms	remaining: 4.05s
1:	learn: 0.5452567	total: 78.5ms	remaining: 3.84s
2:	learn: 0.4991842	total: 115ms	remaining: 3.73s
3:	learn: 0.4630040	total: 180ms	remaining: 4.32s
4:	learn: 0.4381987	total: 255ms	remaining: 4.85s
5:	learn: 0.4177620	total: 329ms	remaining: 5.16s
6:	learn: 0.4029842	total: 415ms	remaining: 5.51s
7:	learn: 0.3904717	total: 501ms	remaining: 5.76s
8:	learn: 0.3810123	total: 597ms	remaining: 6.03s
9:	learn: 0.3729542	total: 690ms	remaining: 6.21s
10:	learn: 0.3668439	total: 756ms	remaining: 6.12s
11:	learn: 0.3622978	total: 862ms	remaining: 6.32s
12:	learn: 0.3580491	total: 978ms	remaining: 6.54s
13:	learn: 0.3546820	total: 1.08s	remaining: 6.67s
14:	learn: 0.3519137	total: 1.19s	remaining: 6.75s
15:	learn: 0.3497166	total: 1.31s	remaining: 6.86s
16:	learn: 0.3476577	total: 1.4s	remaining: 6.86s
17:	learn: 0.3460982	total: 1.52s	remaining: 6.92s
18:	learn: 0.3451212	total: 1.6s	remaining: 6.85s
19:	learn: 0.3440947	total: 1.71s	remaining

<catboost.core.CatBoostClassifier at 0x7e5146cde050>

In [None]:
# Create prediction dataset
dataset_test_pred = test.copy()
dataset_test_pred.labels = cb_rw.predict(test.features).reshape(-1, 1)

# Fairness evaluation
metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())

SPD: 0.012528478406985087
EOD: 0.059527498171142196
DIR: 1.2652991032241712
AOD: 0.033293308963938194


### Logistic Regression

In [None]:
LogistReg_rw = LogisticRegression(solver='liblinear')
LogistReg_rw.fit(train_rw.features, train_rw.labels.ravel(), sample_weight=train_rw.instance_weights)

In [None]:
dataset_test_pred = test.copy()
dataset_test_pred.labels = LogistReg_rw.predict(test.features).reshape(-1, 1)

metric = ClassificationMetric(test, dataset_test_pred,
                              unprivileged_groups=[{'Sexo': 0}],
                              privileged_groups=[{'Sexo': 1}])

print("SPD:", metric.statistical_parity_difference())
print("EOD:", metric.equal_opportunity_difference())
print("DIR:", metric.disparate_impact())
print("AOD:", metric.average_odds_difference())
print("Accuracy:", metric.accuracy())

SPD: 0.007078270915086025
EOD: 0.04366629330106525
DIR: 1.1431585704098255
AOD: 0.024477111072546957
Accuracy: 0.8480401903474238


## Prejudice Remover - Etnia

In [None]:
pr = PrejudiceRemover(sensitive_attr='Etnia', eta=25.0)
pr.fit(train)
test_pred_mit = pr.predict(test)

metric_mit = ClassificationMetric(test, test_pred_mit,
                                  unprivileged_groups=[{'Etnia': 0}],
                                  privileged_groups=[{'Etnia': 1}])

print("\n==== Fairness Metrics: After In-processing Mitigation ====")
print("Statistical parity difference:", metric_mit.statistical_parity_difference())
print("Disparate impact:", metric_mit.disparate_impact())
print("Equal opportunity difference:", metric_mit.equal_opportunity_difference())
print("Average odds difference:", metric_mit.average_odds_difference())


==== Fairness Metrics: After In-processing Mitigation ====
Statistical parity difference: 0.0202877674263926
Disparate impact: 1.4646029036090165
Equal opportunity difference: 0.02220345551468006
Average odds difference: 0.014391824506269534


## Prejudice Remover - Sexo

In [None]:
pr = PrejudiceRemover(sensitive_attr='Sexo', eta=25.0)
pr.fit(train)
test_pred_mit = pr.predict(test)

metric_mit = ClassificationMetric(test, test_pred_mit,
                                  unprivileged_groups=[{'Sexo': 0}],
                                  privileged_groups=[{'Sexo': 1}])

print("\n==== Fairness Metrics: After In-processing Mitigation ====")
print("Statistical parity difference:", metric_mit.statistical_parity_difference())
print("Disparate impact:", metric_mit.disparate_impact())
print("Equal opportunity difference:", metric_mit.equal_opportunity_difference())
print("Average odds difference:", metric_mit.average_odds_difference())
print("Accuracy:", metric.accuracy())


==== Fairness Metrics: After In-processing Mitigation ====
Statistical parity difference: 0.002728912497892519
Disparate impact: 1.0505260435782982
Equal opportunity difference: 0.03439704495992063
Average odds difference: 0.01777935589656321
Accuracy: 0.8480401903474238


## Gráfico

Logistic Regression

In [None]:
import plotly.graph_objects as go
import plotly.express as px

metrics = [
    "Average Odds Difference",
    "Equal Opportunity Difference",
    "Statistical Parity Difference",
]

before = [0.034090, 0.061312, 0.012693]
after = [0.026710, 0.048551, 0.009196]
pr_gender = [0.019388, 0.038288, 0.0030340]

fig = go.Figure()


fig.add_trace(go.Scatter(
    x=before,
    y=metrics,
    mode='markers+text',
    name='Before Mitigation(baseline)',
    marker=dict(size=27, color=px.colors.qualitative.Safe[1]),
    text=[f"{v:.2f}" for v in before],
    textposition='top center'
))

fig.add_trace(go.Scatter(
    x=after,
    y=metrics,
    mode='markers+text',
    name='After Reweighing Mitigation',
    marker=dict(size=27, color=px.colors.qualitative.Safe[0]),
    text=[f"{v:.2f}" for v in after],
    textposition='top center'
))

fig.add_trace(go.Scatter(
    x=pr_gender,
    y=metrics,
    mode='markers+text',
    name='After Prejudice Remover Mitigation',
    marker=dict(size=27, color=px.colors.qualitative.Safe[2]),
    text=[f"{v:.2f}" for v in pr_gender],
    textposition='top center'
))


fig.update_layout(
    title="",
    xaxis_title="Fairness Metrics Scores for Gender",
    yaxis_title="Fairness Metrics",
    font=dict(size=16),
    margin=dict(l=100, r=50, b=40, t=50),
    xaxis=dict(
        range=[0, 0.07],
        tick0=0,
        dtick=0.01,
        showgrid=True
    ),
        yaxis=dict(
        categoryorder='array'
    ),
    height=480
)

fig.show()
