In [None]:
# install neccessary packages 
!pip install aif360==0.5.0
!pip install fairlearn==0.10.0
!pip install lime==0.2.0.1
!pip install matplotlib==3.7.2
!pip install numpy==1.24.3
!pip install pandas==2.0.3
!pip install plotly==5.9.0
!pip install seaborn==0.12.2
!pip install shap==0.44.1
!pip install sklearn==1.3.0
!pip install xgboost==1.7.3

In [None]:
# Import the libraries used throughout the notebook.
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from aif360.sklearn.inprocessing import ExponentiatedGradientReduction
import warnings

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)


# Load the dataset from 'Seer_Old datasets/bladder.csv'.
# index_col=False keeps pandas from using the first column as the index.
df = pd.read_csv('Seer_Old datasets/bladder.csv',index_col=False)

In [None]:
# Display the (rows, columns) dimensions of the raw dataset.
df.shape

In [None]:
# Columns kept for the analysis. Some of the names contain leading or trailing
# spaces; they are written here with those spaces on purpose.
selected_columns = [
    'age',
    ' behaviour_code',
    'gender_code',
    'cs_extension_code',
    'grade_code10',
    'histologic_type_code',
    'cs_lymph_nodes_code',
    'marital_status_code',
    'cs_mets_at_dx_code',
    'primary_site_code7',
    'race_code',
    ' radiation_code',
    ' number_of_nodes_examined',
    ' regional_positive_nodes ',
    'number_of_primaries',
    ' site_specific_surgery_code',
    'stage_of_cancer_code',
    'cs_tumor_size',
    'survived_code ',
]
cancer_type_df = df[selected_columns]

# Preview the first rows of the selection.
cancer_type_df.head()

In [None]:
# Pre-processing
# Drop rows with any missing value. The explicit .copy() gives an independent
# frame, so the column assignments below are not made on a possible slice of the
# selection (which makes pandas emit SettingWithCopyWarning).
cancer_type_df = cancer_type_df.dropna().copy()

# Encode the target: 'yes' -> 1, 'no' -> 0.
cancer_type_df['survived_code '] = cancer_type_df['survived_code '].map({'yes': 1, 'no': 0})

# Drop the first character of each primary-site code and parse the remainder as a number.
cancer_type_df['primary_site_code7'] = cancer_type_df['primary_site_code7'].str[1:]
cancer_type_df['primary_site_code7'] = pd.to_numeric(cancer_type_df['primary_site_code7'], errors='raise')

# A single-space string marks a missing stage: turn it into NaN and drop those rows.
# (np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.)
cancer_type_df['stage_of_cancer_code'] = cancer_type_df['stage_of_cancer_code'].replace(" ", np.nan)
cancer_type_df = cancer_type_df.dropna(subset=['stage_of_cancer_code']).copy()

# Convert 'stage_of_cancer_code' to numeric. A failure is re-raised with context
# instead of being printed and ignored: the later numeric comparison on this
# column would otherwise silently match nothing on a non-numeric column.
try:
    cancer_type_df['stage_of_cancer_code'] = pd.to_numeric(cancer_type_df['stage_of_cancer_code'], errors='raise')
except ValueError as exc:
    raise ValueError(
        "Error: Some values in 'stage_of_cancer_code' cannot be converted to numeric (after removing empty strings)."
    ) from exc

cancer_type_df.shape

In [None]:
# 'primary_site_code7' is already converted to numeric in the pre-processing
# cell, so repeating pd.to_numeric here would be a no-op; just inspect the
# column dtypes.
cancer_type_df.dtypes

In [None]:
# Keep only the rows whose stage code equals 1 and report the size of that subset.
is_stage_one = cancer_type_df['stage_of_cancer_code'] == 1
cancer_type_stage_df = cancer_type_df[is_stage_one]
print(cancer_type_stage_df.shape)

In [None]:
# Split the stage-filtered frame into model inputs (X) and the survival label (y).
# The stage column is removed from the inputs together with the label.
target_column = 'survived_code '
X = cancer_type_stage_df.drop(columns=[target_column, 'stage_of_cancer_code'])
y = cancer_type_stage_df[target_column]

# Protected attributes: every input column whose name contains one of these tokens.
protected_tokens = ("gender_code", "marital_status_code", "race_code")
prot_attr_cols = [
    colname
    for colname in X.columns
    if any(token in colname for token in protected_tokens)
]
prot_attr_cols

## Naive Bayes

In [None]:
# Split X and y into training and testing sets (80% / 20%).
# A fixed random_state makes the split identical on every run, so the models
# fitted on X_train are reproducible after a kernel restart.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gaussian Naive Bayes model on the training set.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring on the full data.
# cross_val_score fits its own clones of `gnb`, independent of the fit above.
scores = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Gaussian Naive Bayes wrapped in ExponentiatedGradientReduction with an
# "EqualizedOdds" constraint over the protected columns; drop_prot_attr=False
# keeps those columns among the model inputs.
nb_estimator = GaussianNB()
nb_exp_grad_red = ExponentiatedGradientReduction(
    estimator=nb_estimator,
    prot_attr=prot_attr_cols,
    constraints="EqualizedOdds",
    drop_prot_attr=False,
)
nb_exp_grad_red.fit(X_train, y_train)

# Accuracy of the constrained model, estimated with 5-fold cross-validation.
scores = cross_val_score(nb_exp_grad_red, X, y, scoring='accuracy', cv=5)

# Report the mean over the five folds.
print("Average Accuracy:", scores.mean())

## Logistic Regression

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split into train and test sets (80% / 20%).
# The split is seeded because X_test / y_test from this cell feed the fairness
# evaluation further down; without a seed those figures change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                       test_size=0.2,
                                       random_state=42)
# Baseline LogisticRegression; random_state fixes the data shuffling used by
# the liblinear solver.
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Logistic regression wrapped in ExponentiatedGradientReduction with an
# "EqualizedOdds" constraint over the protected columns (kept as model inputs).
# random_state fixes the data shuffling used by the liblinear solver in the base
# estimator. NOTE(review): the reduction itself may still be stochastic.
lr_estimator = LogisticRegression(solver='liblinear', random_state=42)
lr_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=lr_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
lr_exp_grad_red.fit(X_train, y_train)

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(lr_exp_grad_red, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Predictions of the plain classifier on the held-out set, stored next to the
# test features in a new 'survived_code' column.
y_pred_before = clf.predict(X_test)
df_pred_before = X_test.assign(survived_code=y_pred_before)
df_pred_before

In [None]:
from aif360.datasets import BinaryLabelDataset

# Wrap the "before" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_before = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_before,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_before

In [None]:
# Predictions of the fairness-constrained logistic regression on the held-out
# set, stored next to the test features in a new 'survived_code' column.
y_pred_after = lr_exp_grad_red.predict(X_test)
df_pred_after = X_test.assign(survived_code=y_pred_after)
df_pred_after

In [None]:
# Wrap the "after" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_after = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_after,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_after

In [None]:
true_dataset = X_test.copy()
true_dataset['survived_code'] = y_test
true_dataset

In [None]:
true_dataset = BinaryLabelDataset(df=true_dataset,
                                   label_names=['survived_code'],
                                   protected_attribute_names=prot_attr_cols,
                                   favorable_label=1,  
                                   unfavorable_label=0)  

In [None]:
from aif360.metrics import ClassificationMetric

# Groups are defined on gender_code only: value 2 is the unprivileged group and
# value 1 the privileged group.

# Plain model: true/false positive rate per group.
metric_before = ClassificationMetric(
    true_dataset,
    predicted_dataset_before,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_before_unpriv, fpr_before_unpriv = (
    metric_before.true_positive_rate(privileged=False),
    metric_before.false_positive_rate(privileged=False),
)
tpr_before_priv, fpr_before_priv = (
    metric_before.true_positive_rate(privileged=True),
    metric_before.false_positive_rate(privileged=True),
)

# Fairness-constrained model: the same four rates.
metric_after = ClassificationMetric(
    true_dataset,
    predicted_dataset_after,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_after_unpriv, fpr_after_unpriv = (
    metric_after.true_positive_rate(privileged=False),
    metric_after.false_positive_rate(privileged=False),
)
tpr_after_priv, fpr_after_priv = (
    metric_after.true_positive_rate(privileged=True),
    metric_after.false_positive_rate(privileged=True),
)


In [None]:
# One (FPR, TPR) point per group and model: the two "before" points first,
# then the two "after" points.
plt.figure()
roc_points = [
    (fpr_before_unpriv, tpr_before_unpriv, 'Unprivileged Group (Female) - Before', 'red'),
    (fpr_before_priv, tpr_before_priv, 'Privileged Group (Male) - Before', 'blue'),
    (fpr_after_unpriv, tpr_after_unpriv, 'Unprivileged Group (Female) - After', 'pink'),
    (fpr_after_priv, tpr_after_priv, 'Privileged Group (Male) - After', 'lightblue'),
]
for fpr_value, tpr_value, point_label, point_color in roc_points:
    plt.scatter(fpr_value, tpr_value, label=point_label, color=point_color)

# Dashed diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', label='Line of Equality')

# Axis labels, title, legend and grid
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Equalized odds plot for Logistic Regression Model')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Print each group's (FPR, TPR) pair, before and after the fairness intervention.
rate_rows = [
    ('Unprivileged Group - Before', fpr_before_unpriv, tpr_before_unpriv),
    ('Unprivileged Group - After', fpr_after_unpriv, tpr_after_unpriv),
    ('Privileged Group - Before', fpr_before_priv, tpr_before_priv),
    ('Privileged Group - After', fpr_after_priv, tpr_after_priv),
]
for row_label, fpr_value, tpr_value in rate_rows:
    print(f'{row_label}: ({fpr_value!s}, {tpr_value!s})')

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


# Split into train and test sets (80% / 20%).
# The split is seeded because X_test / y_test from this cell feed the fairness
# evaluation further down; without a seed those figures change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                       test_size=0.2,
                                       random_state=42)
# Create a decision tree classifier object. random_state is fixed because the
# tree permutes features randomly at each split, which can change tie-breaks.
clf = DecisionTreeClassifier(random_state=42)

# Train the decision tree on the data
clf.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Decision tree wrapped in ExponentiatedGradientReduction with an
# "EqualizedOdds" constraint over the protected columns (kept as model inputs).
# random_state makes the base tree deterministic.
# NOTE(review): the reduction itself may still be stochastic.
dt_estimator = DecisionTreeClassifier(random_state=42)
dt_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=dt_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
dt_exp_grad_red.fit(X_train, y_train)

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(dt_exp_grad_red, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Predictions of the plain classifier on the held-out set, stored next to the
# test features in a new 'survived_code' column.
y_pred_before = clf.predict(X_test)
df_pred_before = X_test.assign(survived_code=y_pred_before)
df_pred_before

In [None]:
from aif360.datasets import BinaryLabelDataset

# Wrap the "before" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_before = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_before,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_before

In [None]:
# Predictions of the fairness-constrained decision tree on the held-out set,
# stored next to the test features in a new 'survived_code' column.
y_pred_after = dt_exp_grad_red.predict(X_test)
df_pred_after = X_test.assign(survived_code=y_pred_after)
df_pred_after

In [None]:
# Wrap the "after" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_after = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_after,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_after

In [None]:
true_dataset = X_test.copy()
true_dataset['survived_code'] = y_test
true_dataset

In [None]:
true_dataset = BinaryLabelDataset(df=true_dataset,
                                   label_names=['survived_code'],
                                   protected_attribute_names=prot_attr_cols,
                                   favorable_label=1,  
                                   unfavorable_label=0)  

In [None]:
from aif360.metrics import ClassificationMetric

# Groups are defined on gender_code only: value 2 is the unprivileged group and
# value 1 the privileged group.

# Plain model: true/false positive rate per group.
metric_before = ClassificationMetric(
    true_dataset,
    predicted_dataset_before,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_before_unpriv, fpr_before_unpriv = (
    metric_before.true_positive_rate(privileged=False),
    metric_before.false_positive_rate(privileged=False),
)
tpr_before_priv, fpr_before_priv = (
    metric_before.true_positive_rate(privileged=True),
    metric_before.false_positive_rate(privileged=True),
)

# Fairness-constrained model: the same four rates.
metric_after = ClassificationMetric(
    true_dataset,
    predicted_dataset_after,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_after_unpriv, fpr_after_unpriv = (
    metric_after.true_positive_rate(privileged=False),
    metric_after.false_positive_rate(privileged=False),
)
tpr_after_priv, fpr_after_priv = (
    metric_after.true_positive_rate(privileged=True),
    metric_after.false_positive_rate(privileged=True),
)


In [None]:
# Plot one (FPR, TPR) point per group for the models before and after the
# fairness intervention.
plt.figure()
# Points for the model before fairness intervention
plt.scatter(fpr_before_unpriv, tpr_before_unpriv, label='Unprivileged Group - Before', color='red')
plt.scatter(fpr_before_priv, tpr_before_priv, label='Privileged Group - Before', color='blue')

# Points for the model after fairness intervention
plt.scatter(fpr_after_unpriv, tpr_after_unpriv, label='Unprivileged Group - After', color='pink')
plt.scatter(fpr_after_priv, tpr_after_priv, label='Privileged Group - After', color='lightblue')

# Line of equality
plt.plot([0, 1], [0, 1], 'k--', label='Line of Equality')

plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
# Title names the model of this section (it previously read "Decision Model").
plt.title('Equalized Odds Plot for Decision Tree Model')
plt.legend()
plt.grid(True)
plt.show()


## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Split into train and test sets (80% / 20%); the seed keeps the split
# identical on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline AdaBoost classifier; random_state seeds the estimators built at each
# boosting iteration.
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train, y_train)
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(ada, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# AdaBoost wrapped in ExponentiatedGradientReduction with an "EqualizedOdds"
# constraint over the protected columns (kept as model inputs).
# random_state seeds the base AdaBoost estimator.
# NOTE(review): the reduction itself may still be stochastic.
ada_estimator = AdaBoostClassifier(random_state=42)
ada_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=ada_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
ada_exp_grad_red.fit(X_train, y_train)

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(ada_exp_grad_red, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
# Split into train and test sets (80% / 20%).
# The split is seeded because X_test / y_test from this cell feed the fairness
# evaluation further down; without a seed those figures change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Baseline XGBoost classifier with default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(xgb, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# XGBoost wrapped in ExponentiatedGradientReduction with an "EqualizedOdds"
# constraint over the protected columns; drop_prot_attr=False keeps those
# columns among the model inputs.
xgb_estimator = XGBClassifier()
xgb_exp_grad_red = ExponentiatedGradientReduction(
    estimator=xgb_estimator,
    prot_attr=prot_attr_cols,
    constraints="EqualizedOdds",
    drop_prot_attr=False,
)
xgb_exp_grad_red.fit(X_train, y_train)

# Accuracy of the constrained model, estimated with 5-fold cross-validation.
scores = cross_val_score(xgb_exp_grad_red, X, y, scoring='accuracy', cv=5)

# Report the mean over the five folds.
print("Average Accuracy:", scores.mean())

In [None]:
# Predictions of the plain XGBoost model on the held-out set, stored next to the
# test features in a new 'survived_code' column.
y_pred_before = xgb.predict(X_test)
df_pred_before = X_test.assign(survived_code=y_pred_before)
df_pred_before

In [None]:
from aif360.datasets import BinaryLabelDataset

# Wrap the "before" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_before = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_before,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_before

In [None]:
# Predictions of the fairness-constrained XGBoost model on the held-out set,
# stored next to the test features in a new 'survived_code' column.
y_pred_after = xgb_exp_grad_red.predict(X_test)
df_pred_after = X_test.assign(survived_code=y_pred_after)
df_pred_after

In [None]:
# Wrap the "after" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_after = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_after,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_after

In [None]:
true_dataset = X_test.copy()
true_dataset['survived_code'] = y_test
true_dataset

In [None]:
true_dataset = BinaryLabelDataset(df=true_dataset,
                                   label_names=['survived_code'],
                                   protected_attribute_names=prot_attr_cols,
                                   favorable_label=1,  
                                   unfavorable_label=0)  

In [None]:
from aif360.metrics import ClassificationMetric

# Groups are defined on gender_code only: value 2 is the unprivileged group and
# value 1 the privileged group.

# Plain model: true/false positive rate per group.
metric_before = ClassificationMetric(
    true_dataset,
    predicted_dataset_before,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_before_unpriv, fpr_before_unpriv = (
    metric_before.true_positive_rate(privileged=False),
    metric_before.false_positive_rate(privileged=False),
)
tpr_before_priv, fpr_before_priv = (
    metric_before.true_positive_rate(privileged=True),
    metric_before.false_positive_rate(privileged=True),
)

# Fairness-constrained model: the same four rates.
metric_after = ClassificationMetric(
    true_dataset,
    predicted_dataset_after,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_after_unpriv, fpr_after_unpriv = (
    metric_after.true_positive_rate(privileged=False),
    metric_after.false_positive_rate(privileged=False),
)
tpr_after_priv, fpr_after_priv = (
    metric_after.true_positive_rate(privileged=True),
    metric_after.false_positive_rate(privileged=True),
)


In [None]:
# Plot one (FPR, TPR) point per group for the models before and after the
# fairness intervention.
plt.figure()
# Points for the model before fairness intervention
plt.scatter(fpr_before_unpriv, tpr_before_unpriv, label='Unprivileged Group - Before', color='red')
plt.scatter(fpr_before_priv, tpr_before_priv, label='Privileged Group - Before', color='blue')

# Points for the model after fairness intervention
plt.scatter(fpr_after_unpriv, tpr_after_unpriv, label='Unprivileged Group - After', color='pink')
plt.scatter(fpr_after_priv, tpr_after_priv, label='Privileged Group - After', color='lightblue')

# Line of equality
plt.plot([0, 1], [0, 1], 'k--', label='Line of Equality')

plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
# Title names the model of this section; the copy-pasted text said
# "Logistic Regression" although the points come from the XGBoost models.
plt.title('Equalized Odds Plot for XGBoost Model')
plt.legend()
plt.grid(True)
plt.show()


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (80% / 20%).
# The split is seeded because X_test / y_test from this cell feed the fairness
# evaluation further down; without a seed those figures change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline random forest; random_state fixes the bootstrap samples and feature
# sub-sampling so the fitted forest is the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Random forest wrapped in ExponentiatedGradientReduction with an
# "EqualizedOdds" constraint over the protected columns (kept as model inputs).
# random_state seeds the base forest.
# NOTE(review): the reduction itself may still be stochastic.
rf_estimator = RandomForestClassifier(random_state=42)
rf_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=rf_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
rf_exp_grad_red.fit(X_train, y_train)

# Perform 5-fold cross-validation with accuracy scoring
scores = cross_val_score(rf_exp_grad_red, X, y, cv=5, scoring='accuracy')

# Print the average accuracy across folds
print("Average Accuracy:", scores.mean())

In [None]:
# Predictions of the plain random forest on the held-out set, stored next to the
# test features in a new 'survived_code' column.
y_pred_before = rf.predict(X_test)
df_pred_before = X_test.assign(survived_code=y_pred_before)
df_pred_before

In [None]:
from aif360.datasets import BinaryLabelDataset

# Wrap the "before" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_before = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_before,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_before

In [None]:
# Predictions of the fairness-constrained random forest on the held-out set,
# stored next to the test features in a new 'survived_code' column.
y_pred_after = rf_exp_grad_red.predict(X_test)
df_pred_after = X_test.assign(survived_code=y_pred_after)
df_pred_after

In [None]:
# Wrap the "after" prediction frame in a BinaryLabelDataset: 'survived_code' is
# the label (1 favorable, 0 unfavorable) and prot_attr_cols are the protected attributes.
predicted_dataset_after = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=df_pred_after,
    label_names=['survived_code'],
    protected_attribute_names=prot_attr_cols,
)
predicted_dataset_after

In [None]:
true_dataset = X_test.copy()
true_dataset['survived_code'] = y_test
true_dataset

In [None]:
true_dataset = BinaryLabelDataset(df=true_dataset,
                                   label_names=['survived_code'],
                                   protected_attribute_names=prot_attr_cols,
                                   favorable_label=1,  
                                   unfavorable_label=0)  

In [None]:
from aif360.metrics import ClassificationMetric

# Groups are defined on gender_code only: value 2 is the unprivileged group and
# value 1 the privileged group.

# Plain model: true/false positive rate per group.
metric_before = ClassificationMetric(
    true_dataset,
    predicted_dataset_before,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_before_unpriv, fpr_before_unpriv = (
    metric_before.true_positive_rate(privileged=False),
    metric_before.false_positive_rate(privileged=False),
)
tpr_before_priv, fpr_before_priv = (
    metric_before.true_positive_rate(privileged=True),
    metric_before.false_positive_rate(privileged=True),
)

# Fairness-constrained model: the same four rates.
metric_after = ClassificationMetric(
    true_dataset,
    predicted_dataset_after,
    privileged_groups=[{'gender_code': 1}],
    unprivileged_groups=[{'gender_code': 2}],
)
tpr_after_unpriv, fpr_after_unpriv = (
    metric_after.true_positive_rate(privileged=False),
    metric_after.false_positive_rate(privileged=False),
)
tpr_after_priv, fpr_after_priv = (
    metric_after.true_positive_rate(privileged=True),
    metric_after.false_positive_rate(privileged=True),
)


In [None]:
# Plot one (FPR, TPR) point per group for the models before and after the
# fairness intervention.
plt.figure()
# Points for the model before fairness intervention
plt.scatter(fpr_before_unpriv, tpr_before_unpriv, label='Unprivileged Group - Before', color='red')
plt.scatter(fpr_before_priv, tpr_before_priv, label='Privileged Group - Before', color='blue')

# Points for the model after fairness intervention
plt.scatter(fpr_after_unpriv, tpr_after_unpriv, label='Unprivileged Group - After', color='pink')
plt.scatter(fpr_after_priv, tpr_after_priv, label='Privileged Group - After', color='lightblue')

# Line of equality
plt.plot([0, 1], [0, 1], 'k--', label='Line of Equality')

plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
# Title names the model of this section; the copy-pasted text said
# "Logistic Regression" although the points come from the random forest models.
plt.title('Equalized Odds Plot for Random Forest Model')
plt.legend()
plt.grid(True)
plt.show()
