In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
import joblib
import os

# ----------------------------
# Load Data
# ----------------------------

def load_data(data_dir=None):
    """Load the train (set A), validation (set B) and test (set C) splits.

    The previous implementation called ``os.chdir()`` without an argument,
    which always raises ``TypeError``. Paths are now resolved explicitly
    instead of changing the process-wide working directory.

    Args:
        data_dir: Directory that contains the ``set-<x>-filled.parquet`` and
            ``Outcomes-<x>.txt`` files. ``None`` (the default) resolves the
            file names against the current working directory.

    Returns:
        Three ``(features, labels)`` tuples in train / validation / test
        order. Each ``labels`` frame is restricted to the ``PatientID`` and
        ``In-hospital_death`` columns.
    """
    # An empty base makes os.path.join return the bare file name, i.e. a
    # path relative to the current working directory.
    base = "" if data_dir is None else os.fspath(data_dir)
    label_cols = ['PatientID', 'In-hospital_death']

    def _load_split(tag):
        # Read one split: the feature table plus its outcome labels.
        features = pd.read_parquet(os.path.join(base, f"set-{tag}-filled.parquet"))
        outcomes = pd.read_csv(os.path.join(base, f"Outcomes-{tag}.txt"), sep=',')
        return features, outcomes[label_cols]

    return _load_split("a"), _load_split("b"), _load_split("c")

# ----------------------------
# Feature Engineering
# ----------------------------

def extract_features(df):
    """Summarise each patient's rows into a single row of aggregate features.

    For every column other than ``PatientID`` and ``Time``, three statistics
    are computed over the patient's non-null values: the mean, the maximum
    and the last value in row order. A variable with no non-null values for
    a patient yields NaN for all three.

    Args:
        df: DataFrame with a ``PatientID`` column; every remaining column
            except ``Time`` is treated as a variable to aggregate.

    Returns:
        DataFrame with one row per ``PatientID`` group and the columns
        ``PatientID``, then ``<var>_mean``, ``<var>_max``, ``<var>_last``
        for each variable in input column order. Empty input yields an
        empty frame that still carries these columns.
    """
    # The variable list depends only on the frame's columns, so compute it
    # once instead of once per patient.
    ts_vars = [col for col in df.columns if col not in ('PatientID', 'Time')]

    # Explicit column list keeps the output schema stable even when there
    # are no groups (otherwise an empty input gives a column-less frame and
    # a later merge on 'PatientID' fails).
    columns = ['PatientID']
    for var in ts_vars:
        columns.extend([f'{var}_mean', f'{var}_max', f'{var}_last'])

    rows = []
    for patient_id, group in df.groupby("PatientID"):
        row = {'PatientID': patient_id}

        for var in ts_vars:
            vals = group[var].dropna()
            if vals.empty:
                mean_val = max_val = last_val = np.nan
            else:
                mean_val = vals.mean()
                max_val = vals.max()
                last_val = vals.iloc[-1]

            row[f'{var}_mean'] = mean_val
            row[f'{var}_max'] = max_val
            row[f'{var}_last'] = last_val

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

# ----------------------------
# Train & Evaluate
# ----------------------------

def train_and_eval(X_train, y_train, X_test, y_test, model, model_name):
    """Fit ``model`` on the training data and report AuROC / AuPRC on the test data.

    Args:
        X_train, y_train: Features and targets passed to ``model.fit``.
        X_test, y_test: Features passed to ``model.predict_proba`` and the
            targets the resulting scores are evaluated against.
        model: Estimator exposing ``fit`` and ``predict_proba``.
        model_name: Label used as the heading of the printed report.

    Returns:
        ``(model, auroc, auprc)`` -- the fitted estimator and both metrics.
    """
    model.fit(X_train, y_train)

    # Scores come from the second column of predict_proba
    # (presumably the positive class -- confirm against model.classes_).
    scores = model.predict_proba(X_test)[:, 1]
    auroc, auprc = (
        roc_auc_score(y_test, scores),
        average_precision_score(y_test, scores),
    )

    report = [
        f"{model_name} - Test Set:",
        f"  AuROC: {auroc:.4f}",
        f"  AuPRC: {auprc:.4f}\n",
    ]
    print("\n".join(report))

    return model, auroc, auprc

# ----------------------------
# Main script
# ----------------------------

if __name__ == "__main__":
    # Load the splits; only the first (train) and third (test) are used,
    # the validation split is discarded.
    train_split, _, test_split = load_data()
    df_train, labels_train = train_split
    df_test, labels_test = test_split

    # Per-patient aggregate features, joined to the outcome labels on
    # PatientID (pandas' default inner merge).
    feats_train = extract_features(df_train).merge(labels_train, on='PatientID')
    feats_test = extract_features(df_test).merge(labels_test, on='PatientID')

    # Split into feature matrix and target. The identifier and label columns
    # are removed and any remaining NaNs are replaced with 0 -- no rows are
    # dropped.
    target_col = 'In-hospital_death'
    non_feature_cols = ['PatientID', target_col]

    y_train = feats_train[target_col]
    X_train = feats_train.drop(columns=non_feature_cols).fillna(0)

    y_test = feats_test[target_col]
    X_test = feats_test.drop(columns=non_feature_cols).fillna(0)

    # Standardization: statistics are fitted on the training features only
    # and then applied to the test features; the fitted scaler is persisted.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    joblib.dump(scaler, 'scaler_q2_1.pkl')

    # Models share the class weighting and the seed.
    shared_kwargs = dict(class_weight='balanced', random_state=42)
    logreg = LogisticRegression(max_iter=1000, **shared_kwargs)
    rf = RandomForestClassifier(n_estimators=100, **shared_kwargs)

    # Train & Evaluate: logistic regression on the standardized features,
    # random forest on the unscaled ones.
    runs = (
        (logreg, X_train_scaled, X_test_scaled, "Logistic Regression"),
        (rf, X_train, X_test, "Random Forest"),
    )
    for estimator, train_matrix, test_matrix, display_name in runs:
        train_and_eval(train_matrix, y_train, test_matrix, y_test, estimator, display_name)

FileNotFoundError: [Errno 2] No such file or directory: 'set-a-filled.parquet'

KeyError: 'Time'