<a href="https://colab.research.google.com/github/pamegup/fairness-in-ml/blob/master/NIH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# HIDE
import pandas as pd
import numpy as np
np.random.seed(7)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", palette=[sns.color_palette('muted')[i] for i in [0,2]], 
        color_codes=True, context="talk")
from IPython import display
%matplotlib inline

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

# import keras as ke
# import keras.backend as K
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

create_gif = False

print(f"sklearn: {sk.__version__}")
print(f"pandas: {pd.__version__}")
print(f"tensorflow: {tf.__version__}")

sklearn: 1.0.2
pandas: 1.3.5
tensorflow: 2.11.0


# New Section

In [5]:
from google.colab import drive
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [7]:
patientstarget = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/patientstarget.csv")
patients = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/patients.csv")

conditions = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/conditions.csv")
observations = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/observations.csv")
care_plans = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/careplans.csv")
encounters = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/encounters.csv")
devices = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/devices.csv")
supplies = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Challenge Data/supplies.csv')
procedures = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/procedures.csv")
medications = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge Data/medications.csv")

In [8]:

#Select COVID-19 conditions out of all conditions in the simulation
covid_conditions = conditions[conditions.CODE == 840539006]
#Grab the IDs of patients that have been diagnosed with COVID-19
covid_patient_ids = conditions[conditions.CODE == 840539006].PATIENT.unique()
# Grab IDs for patients with admission due to COVID-19
inpatient_ids = encounters[(encounters.REASONCODE == 840539006) & (encounters.CODE == 1505002)].PATIENT
inpatient_ids.shape

(1867,)

# New Section

In [9]:
#Lab values for COVID-19 patients The following code presents lab values taken for COVID-19 patients. Values are separated into survivors and non survivors.
#The first block of code selects lab values of interest from all observations in the simulation.
lab_obs = observations[(observations.CODE == '48065-7') | (observations.CODE == '26881-3') | 
                          (observations.CODE == '2276-4') | (observations.CODE == '89579-7') |
                          (observations.CODE == '2532-0') | (observations.CODE == '731-0') |
                          (observations.CODE == '14804-9')
                      ]

In [10]:
# Set up a new DataFrame with boolean columns representing various outcomes, like admit, recovery 
cp = covid_conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id')
isolation_ids = care_plans[(care_plans.CODE == 736376001) & (care_plans.REASONCODE == 840539006)].PATIENT
cp['isolation'] = cp.Id.isin(isolation_ids)
cp['admit'] = cp.Id.isin(inpatient_ids)

icu_ids = encounters[encounters.CODE == 305351004].PATIENT
cp['icu_admit'] = cp.Id.isin(icu_ids)
vent_ids = procedures[procedures.CODE == 26763009].PATIENT
cp['ventilated'] = cp.Id.isin(vent_ids)


In [11]:
hospitalized = (cp.admit == True)
icu = (cp.icu_admit == True)
vent = (cp.ventilated == True)
covid_count = cp.Id.size
row_filters = {'Home Isolation': (cp.isolation == True), 'Hospital Admission': hospitalized, 'ICU Admission': icu,
 }

table_rows = []
for category, row_filter in row_filters.items():
    row = {'Outcome': category}
    row['All Patients'] = cp[row_filter].Id.size / covid_count
    row['Hospitalized'] = cp[row_filter & hospitalized].Id.size / hospitalized.value_counts()[True]
    row['ICU Admitted'] = cp[row_filter & icu].Id.size / icu.value_counts()[True]
    row['Required Ventilation'] = cp[row_filter & vent].Id.size / vent.value_counts()[True]
    table_rows.append(row)
    
pd.DataFrame.from_records(table_rows)

Unnamed: 0,Outcome,All Patients,Hospitalized,ICU Admitted,Required Ventilation
0,Home Isolation,0.795238,0.032673,0.026667,0.02439
1,Hospital Admission,0.211678,1.0,1.0,1.0
2,ICU Admission,0.042517,0.200857,1.0,1.0


In [12]:
# Outcomes for ICU patients
icu_only = cp[cp.icu_admit == True]

vent = (icu_only.ventilated == True)
covid_count = icu_only.Id.size


table_rows = []
for category, row_filter in row_filters.items():
    row = {'Outcome': category}
    row['ICU Admitted'] = icu_only[row_filter].Id.size / covid_count
    row['Required Ventilation'] = icu_only[row_filter & vent].Id.size / vent.value_counts()[True]
    table_rows.append(row)
    
pd.DataFrame.from_records(table_rows)


  row['ICU Admitted'] = icu_only[row_filter].Id.size / covid_count
  row['Required Ventilation'] = icu_only[row_filter & vent].Id.size / vent.value_counts()[True]


Unnamed: 0,Outcome,ICU Admitted,Required Ventilation
0,Home Isolation,0.026667,0.02439
1,Hospital Admission,1.0,1.0
2,ICU Admission,1.0,1.0


In [13]:
# HIDE
def load_ICU_data(path):
    column_names = ['Id', 'BIRTHDATE', 'DEATHDATE','ICU Admission' ,'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
                    'FIRST', 'LAST', 'SUFFIX', 'MAIDEN','MARITAL','race', 'gender', 'BIRTHPLACE', 'ADDRESS']

    input_data = (pd.read_csv(path, names=column_names, 
                             na_values="?", sep=r'\s*,\s*', engine='python'))
               # .loc[lambda df: df['race'].isin(['White', 'black', 'asian'])])
    
    input_data

    # sensitive attributes; we identify 'race' and 'sex' as sensitive attributes
    sensitive_attribs = ['race', 'gender']
    Z = (input_data.loc[:, sensitive_attribs]
         .assign(race=lambda df: (df['race'] == 'white').astype(int),
                 gender=lambda df: (df['gender'] == 'M').astype(int)))

    # targets; 1 admit to ICU , otherwise 0
    
    y = (input_data['ICU Admission'] == 'Yes')
    # y = icu_only[row_filter].Id
    # features; note that the 'target' and sentive attribute columns are dropped
    X = (input_data
         .drop(columns=['ICU Admission', 'race', 'gender'])
         .fillna('Unknown')
         .pipe(pd.get_dummies, drop_first=True))
    
    print(f"features X: {X.shape[0]} samples, {X.shape[1]} attributes")
    print(f"targets y: {y.shape[0]} samples")
    print(f"sensitives Z: {Z.shape[0]} samples, {Z.shape[1]} attributes")
    return X, y, Z


In [14]:
# load ICU data set
X, y, Z = load_ICU_data("/content/drive/MyDrive/Colab Notebooks/Challenge Data/patientstarget.csv")


features X: 12353 samples, 36798 attributes
targets y: 12353 samples
sensitives Z: 12353 samples, 2 attributes


In [15]:
# HIDE

class FairClassifier(object):
    
    def __init__(self, n_features, n_sensitive, lambdas):
        self.lambdas = lambdas
        
        clf_inputs = Input(shape=(n_features,))
        adv_inputs = Input(shape=(1,))
        
        clf_net = self._create_clf_net(clf_inputs)
        adv_net = self._create_adv_net(adv_inputs, n_sensitive)
        self._trainable_clf_net = self._make_trainable(clf_net)
        self._trainable_adv_net = self._make_trainable(adv_net)
        self._clf = self._compile_clf(clf_net)
        self._clf_w_adv = self._compile_clf_w_adv(clf_inputs, clf_net, adv_net)
        self._adv = self._compile_adv(clf_inputs, clf_net, adv_net, n_sensitive)
        self._val_metrics = None
        self._fairness_metrics = None
        
        self.predict = self._clf.predict
        
    def _make_trainable(self, net):
        def make_trainable(flag):
            net.trainable = flag
            for layer in net.layers:
                layer.trainable = flag
        return make_trainable
        
    def _create_clf_net(self, inputs):
        dense1 = Dense(32, activation='relu')(inputs)
        dropout1 = Dropout(0.2)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        dropout2 = Dropout(0.2)(dense2)
        dense3 = Dense(32, activation='relu')(dropout2)
        dropout3 = Dropout(0.2)(dense3)
        outputs = Dense(1, activation='sigmoid', name='y')(dropout3)
        return Model(inputs=[inputs], outputs=[outputs])
        
    def _create_adv_net(self, inputs, n_sensitive):
        dense1 = Dense(32, activation='relu')(inputs)
        dense2 = Dense(32, activation='relu')(dense1)
        dense3 = Dense(32, activation='relu')(dense2)
        outputs = [Dense(1, activation='sigmoid')(dense3) for _ in range(n_sensitive)]
        return Model(inputs=[inputs], outputs=outputs)

    def _compile_clf(self, clf_net):
        clf = clf_net
        self._trainable_clf_net(True)
        clf.compile(loss='binary_crossentropy', optimizer='adam')
        return clf
        
    def _compile_clf_w_adv(self, inputs, clf_net, adv_net):
        clf_w_adv = Model(inputs=[inputs], outputs=[clf_net(inputs)]+adv_net(clf_net(inputs)))
        self._trainable_clf_net(True)
        self._trainable_adv_net(False)
        loss_weights = [1.]+[-lambda_param for lambda_param in self.lambdas]
        clf_w_adv.compile(loss=['binary_crossentropy']*(len(loss_weights)), 
                          loss_weights=loss_weights,
                          optimizer='adam')
        return clf_w_adv

    def _compile_adv(self, inputs, clf_net, adv_net, n_sensitive):
        adv = Model(inputs=[inputs], outputs=adv_net(clf_net(inputs)))
        self._trainable_clf_net(False)
        self._trainable_adv_net(True)
        adv.compile(loss=['binary_crossentropy']*n_sensitive, loss_weights=self.lambdas, 
                    optimizer='adam')
        return adv

    def _compute_class_weights(self, data_set, classes=[0, 1]):
        class_weights = []
        if len(data_set.shape) == 1:
            balanced_weights = compute_class_weight('balanced', classes=classes, y=data_set)
            class_weights.append(dict(zip(classes, balanced_weights)))
        else:
            n_attr =  data_set.shape[1]
            for attr_idx in range(n_attr):
                balanced_weights = compute_class_weight('balanced', classes=classes,
                                                        y=np.array(data_set)[:,attr_idx])
                class_weights.append(dict(zip(classes, balanced_weights)))
        return class_weights
    
    def _compute_target_class_weights(self, y, classes=[0, 1]):
        balanced_weights =  compute_class_weight('balanced', classes=classes, y=y)
        class_weights = {'y': dict(zip(classes, balanced_weights))}
        return class_weights
        
    def pretrain(self, x, y, z, epochs=10, verbose=0):
        self._trainable_clf_net(True)
        self._clf.fit(x.values, y.values, epochs=epochs, verbose=verbose)
        self._trainable_clf_net(False)
        self._trainable_adv_net(True)
        class_weight_adv = self._compute_class_weights(z)
        self._adv.fit(x.values, np.hsplit(z.values, z.shape[1]), class_weight=class_weight_adv, 
                      epochs=epochs, verbose=verbose)
        
    def fit(self, x, y, z, validation_data=None, T_iter=250, batch_size=128,
            save_figs=False, verbose=0):
        n_sensitive = z.shape[1]
        if validation_data is not None:
            x_val, y_val, z_val = validation_data
        
        class_weight_clf = [{0:1., 1:1.}]
        class_weight_adv = self._compute_class_weights(z)
        class_weight_clf_w_adv = class_weight_clf+class_weight_adv
        self._val_metrics = pd.DataFrame()
        self._fairness_metrics = pd.DataFrame()  
        for idx in range(T_iter):
            if validation_data is not None:
                y_pred = pd.Series(self._clf.predict(x_val.values).ravel(), index=y_val.index)
                self._val_metrics.loc[idx, 'ROC AUC'] = roc_auc_score(y_val, y_pred)
                self._val_metrics.loc[idx, 'Accuracy'] = (accuracy_score(y_val, (y_pred>0.5))*100)
                for sensitive_attr in z_val.columns:
                    self._fairness_metrics.loc[idx, sensitive_attr] = p_rule(y_pred,
                                                                             z_val[sensitive_attr])
                display.clear_output(wait=True)
                plot_distributions(y_pred, z_val, idx+1, self._val_metrics.loc[idx],
                                   self._fairness_metrics.loc[idx], 
                                   fname=f'output/{idx+1:08d}.png' if save_figs else None)
                plt.show(plt.gcf())
            
            
            # train adverserial
            self._trainable_clf_net(False)
            self._trainable_adv_net(True)
            self._adv.fit(x.values, np.hsplit(z.values, z.shape[1]), batch_size=batch_size, 
                          class_weight=class_weight_adv, epochs=1, verbose=verbose)
            
            # train classifier 
            # !Changed this into several epochs on whole dataset instead of single random minibatch!
            self._trainable_clf_net(True)
            self._trainable_adv_net(False)
            indices = np.random.permutation(len(x))[:batch_size]
            self._clf_w_adv.fit(x.values, [y.values]+np.hsplit(z.values, z.shape[1]), batch_size=len(x), 
                                class_weight=class_weight_clf_w_adv, epochs=5, verbose=verbose)

In [16]:
# split into train/test set
X_train, X_test, y_train, y_test, Z_train, Z_test = train_test_split(X, y, Z, test_size=0.5, 
                                                                     stratify=y, random_state=7)

# standardize the data
scaler = StandardScaler().fit(X_train)
scale_df = lambda df, scaler: pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)
X_train = X_train.pipe(scale_df, scaler) 
X_test = X_test.pipe(scale_df, scaler) 

In [17]:
# initialise FairClassifier
clf = FairClassifier(n_features=X_train.shape[1], n_sensitive=Z_train.shape[1],
                     lambdas=[5., 5.])

# pre-train both adverserial and classifier networks
clf.pretrain(X_train, y_train, Z_train, verbose=0, epochs=5)

ValueError: ignored