In [38]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
import joblib
import os
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# ----------------------------
# Load Data
# ----------------------------

def load_data():
    os.chdir("/Users/damlaortac/Desktop/ML for HC/Project 1/data")
    df_train = pd.read_parquet("set-a-filled.parquet")
    df_val = pd.read_parquet("set-b-filled.parquet")
    df_test = pd.read_parquet("set-c-filled.parquet")
    
    #print(df_train.head())
    #print(df_val.head())
    #print(df_test.head())
    

    labels_train = pd.read_csv("Outcomes-a.txt", sep=',')[['RecordID', 'In-hospital_death']]
    labels_val = pd.read_csv("Outcomes-b.txt", sep=',')[['RecordID', 'In-hospital_death']]
    labels_test = pd.read_csv("Outcomes-c.txt", sep=',')[['RecordID', 'In-hospital_death']]

    return (df_train.drop(columns = ['In-hospital_death']), labels_train), (df_val.drop(columns = ['In-hospital_death']), labels_val), (df_test.drop(columns = ['In-hospital_death']), labels_test)

# ----------------------------
# Feature Engineering
# ----------------------------

def extract_features(df):
    df = df.copy()
    df['Time'] = df['Time'].str.replace(':00', '').astype(float)  # Convert '00:00' to 0.0, etc.
    features = []

    for patient_id, group in df.groupby("RecordID"):
        row = {'RecordID': patient_id}

        # Time series variables
        ts_vars = [col for col in df.columns if col not in ['RecordID', 'Time']]

        for var in ts_vars:
            times = group['Time']
            vals = group[var].dropna()
            row[f'{var}_mean'] = vals.mean() if not vals.empty else np.nan
            row[f'{var}_max'] = vals.max() if not vals.empty else np.nan
            row[f'{var}_last'] = vals.iloc[-1] if not vals.empty else np.nan
            
            row[f'{var}_std'] = vals.std() if not vals.empty else np.nan
            row[f'{var}_missing_frac'] = vals.isna().sum() / len(vals)
            if vals.count() >= 2:
                # Fit linear regression for slope
                x = times[vals.notnull()].values.reshape(-1, 1)
                y = vals.values.reshape(-1, 1)
                model = LinearRegression()
                model.fit(x, y)
                row[f'{var}_slope'] = model.coef_[0][0]
            else:
                row[f'{var}_slope'] = np.nan


        features.append(row)

    return pd.DataFrame(features)

# ----------------------------
# Train & Evaluate
# ----------------------------

def train_and_eval(X_train, y_train, X_test, y_test, model, model_name):
    model.fit(X_train, y_train)
    y_probs = model.predict_proba(X_test)[:, 1]

    auroc = roc_auc_score(y_test, y_probs)
    auprc = average_precision_score(y_test, y_probs)

    print(f"{model_name} - Test Set:")
    print(f"  AuROC: {auroc:.4f}")
    print(f"  AuPRC: {auprc:.4f}\n")

    return model, auroc, auprc




In [39]:

# Load data
(df_train, labels_train), (_, _), (df_test, labels_test) = load_data()

# Feature extraction
feats_train = extract_features(df_train)
feats_test = extract_features(df_test)

# Merge with labels
feats_train = feats_train.merge(labels_train, on='RecordID')
feats_test = feats_test.merge(labels_test, on='RecordID')

# Drop patient ID and rows with all NaNs
X_train = feats_train.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_train = feats_train['In-hospital_death']

X_test = feats_test.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_test = feats_test['In-hospital_death']

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler_q2_1.pkl')

# Models
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
svm = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),  # handles class imbalance
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)




train_and_eval(X_train, y_train, X_test, y_test, xgb, "XGBoost")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, svm, "Support Vector Machine (RBF)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, knn, "k-Nearest Neighbors (k=5)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, logreg, "Logistic Regression")
train_and_eval(X_train, y_train, X_test, y_test, rf, "Random Forest")

Parameters: { "use_label_encoder" } are not used.



XGBoost - Test Set:
  AuROC: 0.8552
  AuPRC: 0.5247

Support Vector Machine (RBF) - Test Set:
  AuROC: 0.8439
  AuPRC: 0.4656

k-Nearest Neighbors (k=5) - Test Set:
  AuROC: 0.6970
  AuPRC: 0.2909

Logistic Regression - Test Set:
  AuROC: 0.8428
  AuPRC: 0.4895

Random Forest - Test Set:
  AuROC: 0.8476
  AuPRC: 0.4806



(RandomForestClassifier(class_weight='balanced', random_state=42),
 0.8475724243220584,
 0.48062241886436313)