# Cancer Types and Stages

In [None]:
# install neccessary packages 
!pip install aif360==0.5.0
!pip install fairlearn==0.10.0
!pip install lime==0.2.0.1
!pip install matplotlib==3.7.2
!pip install numpy==1.24.3
!pip install pandas==2.0.3
!pip install plotly==5.9.0
!pip install seaborn==0.12.2
!pip install shap==0.44.1
!pip install sklearn==1.3.0
!pip install xgboost==1.7.3

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# load dataset from the bladder.csv file
# (index_col=False: do not use the first column as the index)
df = pd.read_csv('Seer_Old datasets/bladder.csv',index_col=False)

In [None]:
# (rows, columns) of the raw dataset
df.shape

In [None]:
# Columns kept for the analysis. Several names contain leading/trailing spaces;
# they are written exactly as they are looked up in `df` (presumably mirroring
# the CSV header -- confirm against the file).
selected_columns = [
    'age',
    'gender_code',
    ' behaviour_code',
    'cs_extension_code',
    'grade_code10',
    'histologic_type_code',
    'cs_lymph_nodes_code',
    'marital_status_code',
    'cs_mets_at_dx_code',
    'primary_site_code7',
    'race_code',
    ' radiation_code',
    ' number_of_nodes_examined',
    ' regional_positive_nodes ',
    'number_of_primaries',
    ' site_specific_surgery_code',
    'stage_of_cancer_code',
    'cs_tumor_size',
    'survived_code ',
]
cancer_type_df = df[selected_columns]

# preview the first rows of the selection
cancer_type_df.head()

In [None]:
# Pre-processing
# Drop rows with any missing value. .copy() yields an independent frame so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
cancer_type_df = cancer_type_df.dropna().copy()

# Encode the target: 'yes' -> 1, 'no' -> 0 (any other value maps to NaN).
cancer_type_df['survived_code '] = cancer_type_df['survived_code '].map({'yes': 1, 'no': 0})

# Drop the first character of each site code, then parse the rest as a number.
# NOTE(review): presumably this strips a letter prefix -- confirm against the data.
cancer_type_df['primary_site_code7'] = cancer_type_df['primary_site_code7'].str[1:]
cancer_type_df['primary_site_code7'] = pd.to_numeric(cancer_type_df['primary_site_code7'], errors='raise')

# Treat empty / whitespace-only stage codes (not just a single " ") as missing.
cancer_type_df['stage_of_cancer_code'] = cancer_type_df['stage_of_cancer_code'].replace(
    r'^\s*$', np.nan, regex=True)

# Drop rows with NaN in 'stage_of_cancer_code'
cancer_type_df = cancer_type_df.dropna(subset=['stage_of_cancer_code']).copy()

# Convert 'stage_of_cancer_code' to numeric. Fail loudly instead of printing and
# carrying on: a non-numeric column would make the later `== 1` stage filter
# silently match no rows.
try:
    cancer_type_df['stage_of_cancer_code'] = pd.to_numeric(cancer_type_df['stage_of_cancer_code'], errors='raise')
except ValueError as err:
    raise ValueError(
        "Error: Some values in 'stage_of_cancer_code' cannot be converted to numeric (after removing empty strings)."
    ) from err

cancer_type_df.shape

In [None]:
# column dtypes after pre-processing (check that the numeric conversions took effect)
cancer_type_df.dtypes

In [None]:
# Keep only the rows whose stage code equals 1.
is_stage_one = cancer_type_df['stage_of_cancer_code'].eq(1)
cancer_type_stage_df = cancer_type_df.loc[is_stage_one]
cancer_type_stage_df.shape

In [None]:
from sklearn.model_selection import train_test_split
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from aif360.sklearn.inprocessing import ExponentiatedGradientReduction

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Split the stage-filtered frame into features (X) and target (y).
# The stage code is dropped from the features as well as the target column.
target_col = 'survived_code '
X = cancer_type_stage_df.drop(columns=[target_col, 'stage_of_cancer_code'])
y = cancer_type_stage_df[target_col]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [None]:
# Protected attributes: every feature column whose name contains one of these keys.
protected_keys = ("gender_code", "marital_status_code", "race_code")
prot_attr_cols = [
    colname
    for colname in X_train
    if any(key in colname for key in protected_keys)
]
prot_attr_cols

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Base learner wrapped by the fairness reduction.
lr_estimator = LogisticRegression(solver='liblinear')

# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
lr_exp_grad_red = ExponentiatedGradientReduction(
    estimator=lr_estimator,
    prot_attr=prot_attr_cols,
    constraints="EqualizedOdds",
    drop_prot_attr=False,
)
lr_exp_grad_red.fit(X_train, y_train)

# score of the mitigated model on the held-out test set
lr_egr_acc = lr_exp_grad_red.score(X_test, y_test)
print(lr_egr_acc)

In [None]:
# Without fairness
from sklearn.linear_model import LogisticRegression

# Use the same solver as the base learner of the mitigated model so the
# mitigated vs. unmitigated comparison differs only in the fairness constraint
# (the default lbfgs solver can also stop at max_iter on unscaled features).
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# accuracy of the unmitigated baseline on the held-out test set
lr_acc = accuracy_score(y_test, y_pred)
lr_acc

## 1.1 SHAP For Logistic Regression

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`lr_explainer`) that is created later in the notebook.
lr_shap_explainer = shap.Explainer(lr_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
lr_shap_values = lr_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated logistic regression (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(lr_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps lr.predict; X_train is passed as the background data)
lr_unmitigated_explainer = shap.Explainer(lr.predict, X_train)
# SHAP values for every row of X_test
lr_unmitigated_shap_values = lr_unmitigated_explainer(X_test)

In [None]:
# SHAP summary plot for the unmitigated logistic regression (test set)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(lr_unmitigated_shap_values, X_test)

## 1.2 LIME For Logistic Regression

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
lr_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
lr_exp = lr_explainer.explain_instance(X_test.iloc[i].to_numpy(), lr_exp_grad_red.predict_proba)
lr_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites lr_exp.
lr_exp = lr_explainer.explain_instance(X_test.iloc[i], lr.predict_proba)
lr_exp.show_in_notebook(show_all=False)

# 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base learner for the fairness reduction. A fixed random_state makes the tree
# reproducible across runs (same seed as the train/test split).
dt_estimator = DecisionTreeClassifier(random_state=42)
# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
dt_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=dt_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
dt_exp_grad_red.fit(X_train, y_train)
# score of the mitigated model on the held-out test set
dt_egr_acc = dt_exp_grad_red.score(X_test, y_test)
print(dt_egr_acc)

In [None]:
# Without fairness
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the baseline tree (and its accuracy) is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
# accuracy of the unmitigated baseline on the held-out test set
dt_acc = accuracy_score(y_test, y_pred)
dt_acc

## 2.1 SHAP For Decision Tree

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`dt_explainer`) that is created later in the notebook.
dt_shap_explainer = shap.Explainer(dt_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
dt_shap_values = dt_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated decision tree (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(dt_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps dt.predict; X_train is passed as the background data)
dt_unmitigated_explainer = shap.Explainer(dt.predict, X_train)
# SHAP values for every row of X_test
dt_unmitigated_shap_values = dt_unmitigated_explainer(X_test)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(dt_unmitigated_shap_values, X_test)

## 2.2 LIME For Decision Tree

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
dt_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
dt_exp = dt_explainer.explain_instance(X_test.iloc[i].to_numpy(), dt_exp_grad_red.predict_proba)
dt_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites dt_exp.
dt_exp = dt_explainer.explain_instance(X_test.iloc[i], dt.predict_proba)
dt_exp.show_in_notebook(show_all=False)

# 3. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Base learner wrapped by the fairness reduction.
nb_estimator = GaussianNB()

# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
nb_exp_grad_red = ExponentiatedGradientReduction(
    estimator=nb_estimator,
    prot_attr=prot_attr_cols,
    constraints="EqualizedOdds",
    drop_prot_attr=False,
)
nb_exp_grad_red.fit(X_train, y_train)

# score of the mitigated model on the held-out test set
nb_egr_acc = nb_exp_grad_red.score(X_test, y_test)
print(nb_egr_acc)

In [None]:
# Without fairness
from sklearn.naive_bayes import GaussianNB

# Unmitigated Gaussian Naive Bayes baseline (fit returns the estimator itself).
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

# accuracy of the baseline on the held-out test set
nb_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
nb_acc

## 3.1 SHAP For Naive Bayes

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`nb_explainer`) that is created later in the notebook.
nb_shap_explainer = shap.Explainer(nb_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
nb_shap_values = nb_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated Naive Bayes model (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(nb_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps nb.predict; X_train is passed as the background data)
# NOTE(review): the variable names are spelled "unimitigated" here and in the
# plotting cell that follows; rename both together if correcting the spelling.
nb_unimitigated_explainer = shap.Explainer(nb.predict, X_train)

# Calculate SHAP values for the test set
nb_unimitigated_shap_values = nb_unimitigated_explainer(X_test)

In [None]:
# SHAP summary plot for the unmitigated Naive Bayes model (test set)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(nb_unimitigated_shap_values, X_test)

## 3.2 LIME For Naive Bayes

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
nb_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
nb_exp = nb_explainer.explain_instance(X_test.iloc[i].to_numpy(), nb_exp_grad_red.predict_proba)
nb_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites nb_exp.
nb_exp = nb_explainer.explain_instance(X_test.iloc[i], nb.predict_proba)
nb_exp.show_in_notebook(show_all=False)

# 4. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Base learner for the fairness reduction. A fixed random_state makes the
# boosted ensemble reproducible across runs (same seed as the train/test split).
ada_estimator = AdaBoostClassifier(random_state=42)
# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
ada_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=ada_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
ada_exp_grad_red.fit(X_train, y_train)
# score of the mitigated model on the held-out test set
ada_egr_acc = ada_exp_grad_red.score(X_test, y_test)
print(ada_egr_acc)

In [None]:
# Without fairness
from sklearn.ensemble import AdaBoostClassifier

# Fixed random_state so the baseline ensemble (and its accuracy) is reproducible.
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
# accuracy of the unmitigated baseline on the held-out test set
ada_acc = accuracy_score(y_test, y_pred)
ada_acc

## 4.1 SHAP For AdaBoost

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`ada_explainer`) that is created later in the notebook.
ada_shap_explainer = shap.Explainer(ada_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
ada_shap_values = ada_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated AdaBoost model (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(ada_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps ada.predict; X_train is passed as the background data)
ada_unmitigated_explainer = shap.Explainer(ada.predict, X_train)

# Calculate SHAP values for the test set
ada_unmitigated_shap_values = ada_unmitigated_explainer(X_test)

In [None]:
# SHAP summary plot for the unmitigated AdaBoost model (test set)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(ada_unmitigated_shap_values, X_test)

## 4.2 LIME For AdaBoost

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
ada_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
ada_exp = ada_explainer.explain_instance(X_test.iloc[i].to_numpy(), ada_exp_grad_red.predict_proba)
ada_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites ada_exp.
ada_exp = ada_explainer.explain_instance(X_test.iloc[i], ada.predict_proba)
ada_exp.show_in_notebook(show_all=False)

# 5. XGBoost

In [None]:
from xgboost import XGBClassifier

# Base learner for the fairness reduction. A fixed random_state pins the
# booster's seed for reproducibility (same seed as the train/test split).
xgb_estimator = XGBClassifier(random_state=42)
# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
xgb_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=xgb_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
xgb_exp_grad_red.fit(X_train, y_train)
# score of the mitigated model on the held-out test set
xgb_egr_acc = xgb_exp_grad_red.score(X_test, y_test)
print(xgb_egr_acc)

In [None]:
# Without fairness
from xgboost import XGBClassifier

# Fixed random_state so the baseline booster (and its accuracy) is reproducible.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# accuracy of the unmitigated baseline on the held-out test set
xgb_acc = accuracy_score(y_test, y_pred)
xgb_acc

## 5.1 SHAP For XGBoost

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`xgb_explainer`) that is created later in the notebook.
xgb_shap_explainer = shap.Explainer(xgb_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
xgb_shap_values = xgb_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated XGBoost model (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(xgb_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps xgb.predict; X_train is passed as the background data)
xgb_unmitigated_explainer = shap.Explainer(xgb.predict, X_train)

# Calculate SHAP values for the test set
xgb_unmitigated_shap_values = xgb_unmitigated_explainer(X_test)

In [None]:
# SHAP summary plot for the unmitigated XGBoost model (test set)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(xgb_unmitigated_shap_values, X_test)

## 5.2 LIME For XGBoost

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
xgb_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
xgb_exp = xgb_explainer.explain_instance(X_test.iloc[i].to_numpy(), xgb_exp_grad_red.predict_proba)
xgb_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites xgb_exp.
xgb_exp = xgb_explainer.explain_instance(X_test.iloc[i], xgb.predict_proba)
xgb_exp.show_in_notebook(show_all=False)

# 6. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base learner for the fairness reduction. A fixed random_state makes the
# forest's bootstrapping and feature sampling reproducible across runs
# (same seed as the train/test split).
rf_estimator = RandomForestClassifier(random_state=42)
# Exponentiated-gradient reduction with an Equalized Odds constraint; the
# protected columns are kept as model inputs (drop_prot_attr=False).
rf_exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols,
                                              estimator=rf_estimator,
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
rf_exp_grad_red.fit(X_train, y_train)
# score of the mitigated model on the held-out test set
rf_egr_acc = rf_exp_grad_red.score(X_test, y_test)
print(rf_egr_acc)

In [None]:
# Without fairness
from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so the baseline forest (and its accuracy) is reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# accuracy of the unmitigated baseline on the held-out test set
rf_acc = accuracy_score(y_test, y_pred)
rf_acc

## 6.1 SHAP for Random Forest

In [None]:
# import shap and data visualization tool
import shap
import matplotlib.pyplot as plt

# load JS visualization code to notebook
shap.initjs()

# Explainer over the mitigated model's predict function, with X_train passed as
# the background data. Named *_shap_explainer so it is not the same name as the
# LIME explainer (`rf_explainer`) that is created later in the notebook.
rf_shap_explainer = shap.Explainer(rf_exp_grad_red.predict, X_train)

# Calculate SHAP values for the test set
rf_shap_values = rf_shap_explainer(X_test)

In [None]:
# SHAP summary plot for the fairness-mitigated random forest (test set)
print("Mitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(rf_shap_values, X_test)

In [None]:
# Create the explainer for the unmitigated model
# (wraps rf.predict; X_train is passed as the background data)
rf_unmitigated_explainer = shap.Explainer(rf.predict, X_train)

# Calculate SHAP values for the test set
rf_unmitigated_shap_values = rf_unmitigated_explainer(X_test)

In [None]:
# SHAP summary plot for the unmitigated random forest (test set)
print("Unmitigated Variable Importance Plot - Global Interpretation")
figure = plt.figure()  # start a new matplotlib figure before plotting
shap.summary_plot(rf_unmitigated_shap_values, X_test)

## 6.2 LIME for Random Forest

In [None]:
# import the LimeTabularExplainer module
import lime
import lime.lime_tabular
import numpy as np


# get the feature names
feature_names = list(X_test.columns)

# Fit the explainer on the TRAINING data, so LIME's perturbation statistics come
# from the data the models were trained on rather than from the rows being explained.
# class_names are indexed by label: 0 -> 'Dead', 1 -> 'Alive'.
# random_state fixes LIME's sampling so explanations are reproducible.
rf_explainer = lime.lime_tabular.LimeTabularExplainer(training_data=np.array(X_train),
                                 feature_names=feature_names,
                                 class_names = ['Dead', 'Alive'],
                                 mode = 'classification',
                                 random_state=42)

In [None]:
# Pick one test row to explain. A fixed seed keeps the chosen row the same on
# every Restart & Run All, so the explanation is reproducible.
i = int(np.random.default_rng(42).integers(len(X_test)))
print(y_test.iloc[i])  # true label of the selected row
# LIME works on a 1-D numpy array, so convert the pandas row explicitly.
rf_exp = rf_explainer.explain_instance(X_test.iloc[i].to_numpy(), rf_exp_grad_red.predict_proba)
rf_exp.show_in_notebook(show_all=False)

In [None]:
# Explain the same test row (index i from the previous cell) with the unmitigated
# model; note that this overwrites rf_exp.
rf_exp = rf_explainer.explain_instance(X_test.iloc[i], rf.predict_proba)
rf_exp.show_in_notebook(show_all=False)