In [1]:
# Environment setup cell.
# List the packages currently installed. This runs before the installs below,
# so the listing reflects the environment as it was prior to them.
!pip freeze
# Install pinned versions of typing_extensions and pytorch-lightning.
!pip install typing_extensions==4.6.3 
!pip install pytorch-lightning==2.5.1
# NOTE(review): this is a plain Python assignment that binds a notebook
# variable; it does not export an environment variable. If an environment
# variable was intended, `%env` or `os.environ` would be needed — confirm intent.
PL_FAULT_TOLERANT_TRAINING=1

aiohappyeyeballs==2.6.1
aiohttp==3.11.16
aiosignal==1.3.2
anyio==3.6.2
appnope @ file:///Users/runner/miniforge3/conda-bld/appnope_1635819658021/work
argon2-cffi @ file:///Users/runner/miniforge3/conda-bld/argon2-cffi_1636021583814/work
async-generator==1.10
async-timeout==5.0.1
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1620387926260/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
beautifulsoup4==4.11.1
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1629908509068/work
brotlipy==0.7.0
certifi==2022.12.7
cffi @ file:///Users/runner/miniforge3/conda-bld/cffi_1631636293358/work
chardet @ file:///Users/runner/miniforge3/conda-bld/chardet_1610093454858/work
charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1626371162869/work
colorama @ fi



In [2]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
import os
from torch.utils.data import TensorDataset


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    ConfusionMatrixDisplay
)
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import torchmetrics

import os
from glob import glob
from pathlib import Path
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display, clear_output
from matplotlib import pyplot as plt


# Directory holding the pre-filled parquet splits. The original machine-specific
# location is kept as the default so existing runs behave exactly as before;
# set the ICU_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "ICU_DATA_DIR",
        "/Users/damlaortac/Desktop/ML for HC/Project 1/ICU-TimeSeries-Mortality-Prediction/data",
    )
)

# The working directory is still switched to the data directory, so relative
# paths used later in the notebook (e.g. the trainer's "autoencoder_logs"
# root dir) keep resolving to the same place they did originally.
os.chdir(DATA_DIR)

# Sets a / b / c are consumed further down as train / validation / test.
set_a_filled = pd.read_parquet(DATA_DIR / "set-a-filled.parquet")
set_b_filled = pd.read_parquet(DATA_DIR / "set-b-filled.parquet")
set_c_filled = pd.read_parquet(DATA_DIR / "set-c-filled.parquet")


class LSTMAutoencoder(pl.LightningModule):
    """Sequence autoencoder built from two LSTMs and trained with MSE.

    The encoder LSTM maps an input sequence to its final hidden state; that
    vector is repeated once per time step and fed to the decoder LSTM, whose
    outputs (of width ``input_size``) are the reconstruction.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the encoder hidden state (the embedding size).
        num_layers: Number of stacked layers used by both LSTMs.
        lr: Learning rate for the Adam optimizer.

    NOTE(review): the reconstruction is the raw decoder LSTM output, which is
    bounded to (-1, 1) by the LSTM cell equations. Inputs whose magnitude
    exceeds 1 cannot be reproduced exactly — confirm this is acceptable or
    consider a linear output layer.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=1, lr=1e-3):
        super().__init__()
        # Must run inside __init__ so Lightning can capture the constructor arguments.
        self.save_hyperparameters()
        self.lr = lr

        # Options common to both recurrent stacks; inputs are (batch, time, features).
        lstm_options = dict(num_layers=num_layers, batch_first=True)
        self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size, **lstm_options)
        self.decoder = nn.LSTM(input_size=hidden_size, hidden_size=input_size, **lstm_options)

        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        """Reconstruct ``x`` from the encoder's final hidden state."""
        _, (h_n, _) = self.encoder(x)
        n_steps = x.size(1)
        # Last layer's final hidden state, copied along a new time axis.
        context = h_n[-1].unsqueeze(1).repeat(1, n_steps, 1)
        return self.decoder(context)[0]

    def _reconstruction_loss(self, batch):
        """Return the MSE between the inputs of ``batch`` and their reconstruction."""
        inputs, _ = batch  # labels play no role in the reconstruction objective
        return self.loss_fn(self(inputs), inputs)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        loss = self._reconstruction_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the validation loss."""
        loss = self._reconstruction_loss(batch)
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """Use Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [3]:
class TimeSeriesDataModule(pl.LightningDataModule):
    """Lightning data module serving per-patient sequences from sets a/b/c.

    Set "a" is used for training (and for fitting the scaler), set "b" for
    validation and set "c" for testing.

    Args:
        feature_cols: Names of the columns used as model inputs.
        batch_size: Batch size shared by all three dataloaders.
    """

    def __init__(self, feature_cols, batch_size=32):
        super().__init__()
        self.feature_cols = feature_cols
        self.batch_size = batch_size

    def preprocess_parquet_for_lstm(self, key, scaler=None, fit_scaler=False):
        """Turn one split into ``(X, y, scaler)``.

        The split is read from the module-level DataFrame named
        ``set_{key}_filled``. Missing feature values are replaced by 0 before
        scaling. A new ``StandardScaler`` is fit on this split when
        ``fit_scaler`` is true *or* when no ``scaler`` is supplied; otherwise
        the supplied scaler is applied as-is.

        Returns:
            X: float tensor stacked over patients; ``np.stack`` requires every
               patient to contribute the same number of rows.
            y: float tensor holding each patient's first
               ``'In-hospital_death'`` value.
            scaler: the scaler that was applied.
        """
        label_col = 'In-hospital_death'
        cols = self.feature_cols

        frame = globals()[f"set_{key}_filled"].copy().sort_values(["RecordID", "Time"])
        frame[cols] = frame[cols].fillna(0)

        # Fit/transform
        needs_fit = fit_scaler or scaler is None
        if needs_fit:
            scaler = StandardScaler().fit(frame[cols])
        frame[cols] = scaler.transform(frame[cols])

        # Group by patient, each patient's rows ordered by time
        per_patient = [rows.sort_values("Time") for _, rows in frame.groupby("RecordID")]
        sequences = [rows[cols].values for rows in per_patient]
        targets = [rows[label_col].iloc[0] for rows in per_patient]

        X_tensor = torch.tensor(np.stack(sequences)).float()
        y_tensor = torch.tensor(targets).float()
        return X_tensor, y_tensor, scaler

    def setup(self, stage=None):
        """Build tensors and datasets for all three splits (``stage`` is ignored)."""
        # The scaler fit on the training split is reused for validation and test.
        self.X_train, self.y_train, train_scaler = self.preprocess_parquet_for_lstm("a", fit_scaler=True)
        self.X_val, self.y_val, _ = self.preprocess_parquet_for_lstm("b", scaler=train_scaler)
        self.X_test, self.y_test, _ = self.preprocess_parquet_for_lstm("c", scaler=train_scaler)

        self.train_dataset = TensorDataset(self.X_train, self.y_train)
        self.val_dataset = TensorDataset(self.X_val, self.y_val)
        self.test_dataset = TensorDataset(self.X_test, self.y_test)

    def _make_loader(self, dataset, **loader_kwargs):
        """Wrap ``dataset`` in a DataLoader using the module's batch size."""
        return DataLoader(dataset, batch_size=self.batch_size, **loader_kwargs)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation split."""
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        """Loader over the test split."""
        return self._make_loader(self.test_dataset)
    


In [5]:


# Model inputs are every column of set a except the identifier, time and label
# columns listed here.
feature_cols = list(
    set_a_filled.columns.difference(
        ["PatientID", "Time", "RecordID", "In-hospital_death"]
    )
)

# Build the data module and prepare all splits right away so the tensors are
# available to the cells below.
datamodule = TimeSeriesDataModule(feature_cols=feature_cols)
datamodule.setup()

In [6]:
def train_autoencoder(model, datamodule, max_epochs=20):
    """Fit ``model`` on ``datamodule`` with a Lightning ``Trainer``.

    Args:
        model: The LightningModule to train.
        datamodule: Provider of the train/validation dataloaders.
        max_epochs: Upper bound on the number of training epochs.

    Returns:
        The same ``model`` object after fitting.
    """
    trainer_options = {
        "max_epochs": max_epochs,
        "log_every_n_steps": 10,
        # Let Lightning choose the hardware.
        "accelerator": "auto",
        "devices": "auto",
        # Logs and checkpoints are written under this (relative) directory.
        "default_root_dir": "autoencoder_logs",
    }
    pl.Trainer(**trainer_options).fit(model, datamodule)
    return model

# Example usage: size the autoencoder to the number of feature columns held by
# the data module, then train it with the default settings of
# train_autoencoder (which returns the same model object it was given).
input_size = len(datamodule.feature_cols)
autoencoder = LSTMAutoencoder(input_size=input_size)
autoencoder = train_autoencoder(autoencoder, datamodule)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | encoder | LSTM    | 27.4 K | train
1 | decoder | LSTM    | 17.5 K | train
2 | loss_fn | MSELoss | 0      | train


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [7]:
def extract_embeddings(model, dataloader):
    """Encode every batch of ``dataloader`` and collect embeddings with labels.

    The model is switched to eval mode (and left there) and gradients are
    disabled. For each ``(x, y)`` batch, the embedding is the final hidden
    state of the last layer of ``model.encoder``.

    Args:
        model: Object exposing ``eval()``, ``device`` and an ``encoder`` that
            returns ``(output, (hidden_state, cell_state))``.
        dataloader: Iterable yielding ``(x, y)`` tensor pairs.

    Returns:
        A pair of NumPy arrays: embeddings stacked row-wise over all batches,
        and the concatenated labels in the same order.
    """
    model.eval()
    embedding_chunks, label_chunks = [], []
    with torch.no_grad():
        for features, targets in dataloader:
            _, (h_n, _) = model.encoder(features.to(model.device))
            # h_n[-1] has shape (batch, hidden_size)
            embedding_chunks.append(h_n[-1].cpu().numpy())
            label_chunks.append(targets.cpu().numpy())
    return np.vstack(embedding_chunks), np.concatenate(label_chunks)

In [8]:
from sklearn.linear_model import LogisticRegression

def train_linear_probe(embeddings, labels):
    """Fit a logistic-regression probe on ``embeddings`` and return it.

    Args:
        embeddings: Feature matrix, one row per sample.
        labels: Target value for each row of ``embeddings``.

    Returns:
        The fitted ``LogisticRegression`` (iteration cap raised to 1000).
    """
    return LogisticRegression(max_iter=1000).fit(embeddings, labels)

In [9]:
from sklearn.metrics import roc_auc_score, average_precision_score

def evaluate_probe(clf, embeddings, labels):
    """Print and return AuROC and AuPRC of ``clf`` on the given data.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        embeddings: Feature matrix to score.
        labels: Ground-truth labels for ``embeddings``.

    Returns:
        ``(auroc, auprc)`` computed from the scores in column 1 of
        ``predict_proba``.
    """
    positive_scores = clf.predict_proba(embeddings)[:, 1]
    auroc, auprc = (
        roc_auc_score(labels, positive_scores),
        average_precision_score(labels, positive_scores),
    )
    report = f"📊 Linear Probe Performance:\n - AuROC: {auroc:.4f}\n - AuPRC: {auprc:.4f}"
    print(report)
    return auroc, auprc

In [10]:
# Get embeddings from the frozen encoder: extract_embeddings runs the encoder
# in eval mode with gradients disabled, so the autoencoder is not updated here.
# The training dataloader shuffles, but each batch's labels are collected
# together with its embeddings, so rows and labels stay aligned.
X_train_embed, y_train = extract_embeddings(autoencoder, datamodule.train_dataloader())
X_test_embed, y_test = extract_embeddings(autoencoder, datamodule.test_dataloader())

# Train and evaluate linear probe: fit logistic regression on the training
# embeddings and report AuROC / AuPRC on the test embeddings. The validation
# split is not used in this cell.
probe = train_linear_probe(X_train_embed, y_train)
evaluate_probe(probe, X_test_embed, y_test)

📊 Linear Probe Performance:
 - AuROC: 0.8480
 - AuPRC: 0.4996


(0.8479768742726285, 0.49961778330298107)

In [12]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Positive-class probabilities predicted by the logistic-regression probe
probs = probe.predict_proba(X_test_embed)[:, 1]

# Hard 0/1 predictions at the default 0.5 cut-off
y_pred = (probs >= 0.5).astype(int)

# Threshold-dependent metrics, evaluated in the order precision, recall, F1
precision, recall, f1 = (
    score(y_test, y_pred) for score in (precision_score, recall_score, f1_score)
)

print(
    "📊 Linear Probe Evaluation (Frozen LSTM Autoencoder Embeddings):",
    f" - Precision : {precision:.4f}",
    f" - Recall    : {recall:.4f}",
    f" - F1 Score  : {f1:.4f}",
    sep="\n",
)

📊 Linear Probe Evaluation (Frozen LSTM Autoencoder Embeddings):
 - Precision : 0.6244
 - Recall    : 0.2359
 - F1 Score  : 0.3424
