In [3]:
import os

os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import joblib

from xgboost import XGBClassifier

from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import select_features

from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import select_features
from tsfresh import extract_relevant_features

def extract_features_tsfresh(df):
    df = df.copy()
    df['Time'] = df['Time'].str.replace(':00', '').astype(float)
    df.rename(columns={"RecordID": "id", "Time": "time"}, inplace=True)

    # Melt the dataframe so that each variable becomes a row
    df = df.melt(id_vars=["id", "time"], var_name="feature", value_name="value").dropna()

    # Pivot to long format suitable for tsfresh
    df = df.pivot_table(index=["id", "time"], columns="feature", values="value").reset_index()

    # tsfresh expects a flat format with one column for ID, one for time, and all features as columns
#     features_filtered_direct = extract_relevant_features(df, y,
#                                                      column_id='id', column_sort='time', n_jobs=8)        


    custom_fc_parameters = {
        "standard_deviation": None,
        "variance": None,
        "minimum": None,
        "absolute_sum_of_changes": None,
        "autocorrelation": [{"lag": 1}],
        "skewness": None,
        "linear_trend": [{"attr": "slope"}],
        "last_location_of_maximum": None,
        "last_location_of_minimum": None,
    }
    

    extracted = extract_features(
        df,
        column_id="id",
        column_sort="time",
        default_fc_parameters=custom_fc_parameters,
        impute_function=None
    )

    
    extracted.reset_index(inplace=True)
    extracted.rename(columns={"id": "RecordID"}, inplace=True)

    return extracted

def train_and_eval(X_train, y_train, X_test, y_test, model, model_name):
    model.fit(X_train, y_train)
    y_probs = model.predict_proba(X_test)[:, 1]
    y_pred = (y_probs >= 0.5).astype(int)  # default threshold

    auroc = roc_auc_score(y_test, y_probs)
    auprc = average_precision_score(y_test, y_probs)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{model_name} - Test Set:")
    print(f"  AuROC    : {auroc:.4f}")
    print(f"  AuPRC    : {auprc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall   : {recall:.4f}")
    print(f"  F1 Score : {f1:.4f}\n")

    return model, auroc, auprc, f1


In [4]:
def extract_features_basic(df):
    df = df.copy()
    df['Time'] = df['Time'].str.replace(':00', '').astype(float)  # Convert '00:00' to 0.0, etc.
    features = []

    for patient_id, group in df.groupby("RecordID"):
        row = {'RecordID': patient_id}

        # Time series variables
        ts_vars = [col for col in df.columns if col not in ['RecordID', 'Time']]

        for var in ts_vars:
            times = group['Time']
            vals = group[var].dropna()
            row[f'{var}_mean'] = vals.mean() if not vals.empty else np.nan
            row[f'{var}_max'] = vals.max() if not vals.empty else np.nan
            row[f'{var}_last'] = vals.iloc[-1] if not vals.empty else np.nan
            
#             row[f'{var}_std'] = vals.std() if not vals.empty else np.nan
#             row[f'{var}_missing_frac'] = vals.isna().sum() / len(vals)
#             if vals.count() >= 2:
#                 # Fit linear regression for slope
#                 x = times[vals.notnull()].values.reshape(-1, 1)
#                 y = vals.values.reshape(-1, 1)
#                 model = LinearRegression()
#                 model.fit(x, y)
#                 row[f'{var}_slope'] = model.coef_[0][0]
#             else:
#                 row[f'{var}_slope'] = np.nan


        features.append(row)
    #return features
    return pd.DataFrame(features)

In [5]:
df_train = pd.read_parquet("../../data/set-a-filled.parquet").drop(columns = ['In-hospital_death', 'ICUType'])
df_val = pd.read_parquet("../../data/set-b-filled.parquet").drop(columns = ['In-hospital_death', 'ICUType'])
df_test = pd.read_parquet("../../data/set-c-filled.parquet").drop(columns = ['In-hospital_death', 'ICUType'])

labels_train = pd.read_csv("../../data/Outcomes-a.txt", sep=',')[['RecordID', 'In-hospital_death']]
labels_val = pd.read_csv("../../data/Outcomes-b.txt", sep=',')[['RecordID', 'In-hospital_death']]
labels_test = pd.read_csv("../../data/Outcomes-c.txt", sep=',')[['RecordID', 'In-hospital_death']]

# Feature extraction

feats_train_basic = extract_features_basic(df_train).drop(columns = ['RecordID'])


feats_train = extract_features_tsfresh(df_train)
for c in labels_train.columns:
    if c in feats_train.columns:
        print(c)
feats_train = feats_train.merge(labels_train, left_index = True, right_index = True)
feats_train = feats_train.merge(feats_train_basic, left_index = True, right_index = True)
#feats_train.to_pickle("feats_train.pickle")

feats_test_basic = extract_features_basic(df_test).drop(columns = ['RecordID'])


feats_test = extract_features_tsfresh(df_test)
feats_test = feats_test.merge(labels_test, left_index = True, right_index = True)
feats_test = feats_test.merge(feats_test_basic, left_index = True, right_index = True)
#feats_test.to_pickle("feats_test.pickle")

# Drop patient ID and rows with all NaNs
X_train = feats_train.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_train = feats_train['In-hospital_death']

X_test = feats_test.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_test = feats_test['In-hospital_death']


FileNotFoundError: [Errno 2] No such file or directory: '../../data/Outcomes-a.txt'

In [81]:

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler_q2_1.pkl')

# Models
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
svm = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),  # handles class imbalance
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

train_and_eval(X_train, y_train, X_test, y_test, xgb, "XGBoost")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, svm, "Support Vector Machine (RBF)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, knn, "k-Nearest Neighbors (k=5)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, logreg, "Logistic Regression")
train_and_eval(X_train, y_train, X_test, y_test, rf, "Random Forest")

Parameters: { "use_label_encoder" } are not used.



XGBoost - Test Set:
  AuROC    : 0.8567
  AuPRC    : 0.5206
  Precision: 0.5364
  Recall   : 0.4906
  F1 Score : 0.5125

Support Vector Machine (RBF) - Test Set:
  AuROC    : 0.8499
  AuPRC    : 0.4779
  Precision: 0.5862
  Recall   : 0.2034
  F1 Score : 0.3020

k-Nearest Neighbors (k=5) - Test Set:
  AuROC    : 0.6797
  AuPRC    : 0.2690
  Precision: 0.5476
  Recall   : 0.0786
  F1 Score : 0.1375

Logistic Regression - Test Set:
  AuROC    : 0.8082
  AuPRC    : 0.4353
  Precision: 0.3164
  Recall   : 0.7453
  F1 Score : 0.4442

Random Forest - Test Set:
  AuROC    : 0.8426
  AuPRC    : 0.4618
  Precision: 0.6897
  Recall   : 0.0342
  F1 Score : 0.0651



(RandomForestClassifier(class_weight='balanced', random_state=42),
 0.8426091526823591,
 0.4618284195265096,
 0.06514657980456026)

In [73]:
# Drop patient ID and rows with all NaNs
feats_train_basic = feats_train_basic.merge(labels_train, left_index = True, right_index = True)
X_train = feats_train_basic.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_train = feats_train_basic['In-hospital_death']

feats_test_basic = feats_test_basic.merge(labels_test, left_index = True, right_index = True)
X_test = feats_test_basic.drop(columns=['RecordID', 'In-hospital_death']).fillna(0)
y_test = feats_test_basic['In-hospital_death']

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler_q2_1.pkl')

# Models
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
svm = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),  # handles class imbalance
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

train_and_eval(X_train, y_train, X_test, y_test, xgb, "XGBoost")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, svm, "Support Vector Machine (RBF)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, knn, "k-Nearest Neighbors (k=5)")
train_and_eval(X_train_scaled, y_train, X_test_scaled, y_test, logreg, "Logistic Regression")
train_and_eval(X_train, y_train, X_test, y_test, rf, "Random Forest")

Parameters: { "use_label_encoder" } are not used.



XGBoost - Test Set:
  AuROC    : 0.8543
  AuPRC    : 0.5113
  Precision: 0.4828
  Recall   : 0.5504
  F1 Score : 0.5144

Support Vector Machine (RBF) - Test Set:
  AuROC    : 0.8414
  AuPRC    : 0.4643
  Precision: 0.5678
  Recall   : 0.1932
  F1 Score : 0.2883

k-Nearest Neighbors (k=5) - Test Set:
  AuROC    : 0.6981
  AuPRC    : 0.3009
  Precision: 0.5575
  Recall   : 0.1077
  F1 Score : 0.1805

Logistic Regression - Test Set:
  AuROC    : 0.8493
  AuPRC    : 0.5111
  Precision: 0.3786
  Recall   : 0.7675
  F1 Score : 0.5071

Random Forest - Test Set:
  AuROC    : 0.8414
  AuPRC    : 0.4652
  Precision: 0.7143
  Recall   : 0.0598
  F1 Score : 0.1104



(RandomForestClassifier(class_weight='balanced', random_state=42),
 0.8413842900226501,
 0.46515243757191926,
 0.11041009463722397)