# In [22]:  (Jupyter cell prompt kept as a comment so the file parses as Python)
import time
import matplotlib.pyplot as plt  # Visualization
import pandas as pd
import numpy as np 
import fairlearn
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer  # Using KNN for missing values
from sklearn.ensemble import RandomForestClassifier

# --- Run configuration -------------------------------------------------------
seed = 27                       # random_state shared by sampling, splitting and the model
test_ratio = 0.2                # share of rows held out by the first train/hold-out split
validation_ratio = 0.4          # test_size passed to the second (hold-out) split
missing_value_threshold = 0.75  # training columns with a higher NaN share are dropped

# --- Raw data ----------------------------------------------------------------
# The training table is read from the user's Downloads folder.
data_folder = '~/Downloads/RDS Dataset/data'
data = pd.read_csv(f'{data_folder}/training_v2.csv')

# Keep only the columns that have at least 40% non-missing values. The count is
# taken over every row of the table, independent of the hospital_death label.
data = data.dropna(thresh=int(0.4 * len(data)), axis=1)

# Undersample the majority class (hospital_death == 0): remove 80% of its
# surplus over the hospital_death == 1 rows, so some imbalance is kept on purpose.
label = data['hospital_death']
n_negative = int((label == 0).sum())
n_positive = int((label == 1).sum())
# Guard against a negative surplus (would make `sample` raise) when class 0 is
# not actually the majority.
surplus = max(n_negative - n_positive, 0)
# `sample(frac=...)` is relative to the frame being sampled (the == 0 rows), so
# the surplus has to be normalised by n_negative rather than by len(data).
fraction = 0.8 * surplus / n_negative if n_negative else 0.0
data = data.drop(data[label == 0].sample(frac=fraction, random_state=seed).index)

# Columns that must not reach the model: the target itself, row/site
# identifiers, the APACHE reference probabilities and apache_2_bodysystem.
non_feature_columns = [
    'hospital_death', 'patient_id', 'encounter_id', 'hospital_id', 'icu_id',
    'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob',
    'apache_2_bodysystem',
]
X = data.drop(columns=non_feature_columns)
y = data['hospital_death'].copy()
# The APACHE IVa hospital-death probability column is carried through the
# splits alongside the target.
y_apache = data['apache_4a_hospital_death_prob'].copy()

# Stage 1: carve a hold-out set (test_ratio of all rows) off the training data.
(X_train, X_holdout,
 y_train, y_holdout,
 y_apache_train, y_apache_holdout) = train_test_split(
    X, y, y_apache, test_size=test_ratio, random_state=seed)

# Stage 2: divide the hold-out into validation and test.
# NOTE(review): validation_ratio is passed as test_size, so it is the share of
# the hold-out that ends up in the TEST split; validation gets the remainder.
# Confirm this is the intended proportion.
(X_val, X_test,
 y_val, y_test,
 y_apache_val, y_apache_test) = train_test_split(
    X_holdout, y_holdout, y_apache_holdout,
    test_size=validation_ratio, random_state=seed)

# Discard features whose share of missing values in the TRAINING split exceeds
# missing_value_threshold; the same columns are removed from every split.
cols_to_drop = X_train.columns[X_train.isna().mean() > missing_value_threshold]
for _split in (X_train, X_val, X_test):
    _split.drop(columns=cols_to_drop, inplace=True)

# Partition the surviving columns by dtype.
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric gaps: 5-nearest-neighbour imputation, fitted on the training split
# and then applied unchanged to validation and test.
knn_imputer = KNNImputer(n_neighbors=5)
X_train[numerical_features] = knn_imputer.fit_transform(X_train[numerical_features])
for _split in (X_val, X_test):
    _split[numerical_features] = knn_imputer.transform(_split[numerical_features])

# Categorical gaps: use an explicit 'missing' level instead of NaN.
for _split in (X_train, X_val, X_test):
    _split[categorical_features] = _split[categorical_features].fillna('missing')

# Categorical branch: dense one-hot encoding that keeps every level
# (drop=None); handle_unknown="ignore" avoids errors on levels not seen in fit.
_one_hot = OneHotEncoder(drop=None, sparse_output=False, handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("encoder", _one_hot)])

# Numeric branch: a 5-nearest-neighbour imputer.
numerical_transformer = Pipeline(steps=[("imputer", KNNImputer(n_neighbors=5))])

# Route object-dtype columns to the encoder and every other column to the
# numeric branch.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, make_column_selector(dtype_include=object)),
        ('num', numerical_transformer, make_column_selector(dtype_exclude=object)),
    ]
)

# Fit on the training split only, then reuse the fitted transformer (and its
# output column names) for validation and test. The resulting frames carry a
# fresh default index.
_train_matrix = preprocessing_pipeline.fit_transform(X_train)
_feature_names = preprocessing_pipeline.get_feature_names_out()
X_train = pd.DataFrame(_train_matrix, columns=_feature_names)
X_val = pd.DataFrame(preprocessing_pipeline.transform(X_val), columns=_feature_names)
X_test = pd.DataFrame(preprocessing_pipeline.transform(X_test), columns=_feature_names)

# Any missing target value is replaced by 0 in all three splits.
for _labels in (y_train, y_val, y_test):
    _labels.fillna(0, inplace=True)

# Baseline model: a random forest with default hyper-parameters, seeded for
# reproducibility and fitted on the training split.
clf = RandomForestClassifier(random_state=seed).fit(X_train, y_train)

# Equalized-odds post-processing of the fitted classifier, using ethnicity as
# the sensitive attribute (missing ethnicity becomes its own 'missing' group).
sensitive_feature = data['ethnicity'].fillna('missing')

# y_* still carry the original row labels of `data` at this point, so they are
# used to look up the sensitive attribute for each split. This must happen
# before y_* are replaced by predictions below.
sensitive_train = sensitive_feature.loc[y_train.index]
sensitive_val = sensitive_feature.loc[y_val.index]
sensitive_test = sensitive_feature.loc[y_test.index]

# prefit=True: reuse the already-trained `clf` instead of refitting it.
threshold_optimizer = ThresholdOptimizer(estimator=clf, constraints="equalized_odds", prefit=True)
threshold_optimizer.fit(X_train, y_train, sensitive_features=sensitive_train)

# Keep the ground-truth labels reachable under separate names, because the
# y_* names are rebound to model predictions right after this.
y_train_true, y_val_true, y_test_true = y_train, y_val, y_test

# NOTE(review): from here on y_train / y_val / y_test hold the fairness-adjusted
# PREDICTIONS, not the observed outcomes, and these are what get written to
# y_*.csv. Confirm downstream code does not treat them as ground truth.
#
# ThresholdOptimizer predictions are randomized; passing random_state makes the
# exported labels reproducible across runs, like every other seeded step.
# Predictions come back as NumPy arrays, so they are wrapped in Series aligned
# with the (re-indexed) feature frames.
y_train = pd.Series(
    threshold_optimizer.predict(X_train, sensitive_features=sensitive_train, random_state=seed),
    index=X_train.index, name="hospital_death")
y_val = pd.Series(
    threshold_optimizer.predict(X_val, sensitive_features=sensitive_val, random_state=seed),
    index=X_val.index, name="hospital_death")
y_test = pd.Series(
    threshold_optimizer.predict(X_test, sensitive_features=sensitive_test, random_state=seed),
    index=X_test.index, name="hospital_death")

# Write every processed table next to the raw data as <name>.csv, without the
# index column. Insertion order of the mapping fixes the write order.
_outputs = {
    "X_train": X_train,
    "X_val": X_val,
    "X_test": X_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
    "y_apache_test": y_apache_test,
}
for _stem, _table in _outputs.items():
    _table.to_csv(f"{data_folder}/{_stem}.csv", index=False)
