# RNN

## Lib and Data Loading

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import torchmetrics

import os
from pathlib import Path
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display, clear_output
from matplotlib import pyplot as plt

# override all pandas display limits
pd.options.display.max_columns = None
pd.options.display.max_rows = None



In [10]:
# load parquet files
data_path = Path("../../../data")
notebooks_path = Path(os.getcwd())
data_dir = {}

##unsafe
# for file_path in list((notebooks_path / data_path).glob("*.parquet")):
#     print(f"Reading {file_path}")
#     # retrieve the name of the file without the extension for all OS
#     data = pd.read_parquet(file_path)
#     # if "Time" in df.columns:
#     #     df["Time"] = pd.to_datetime(df["Time"])
#     data_dir[str(file_path).replace("\\", "/").split("/")[-1].split(".")[0].replace("-", "_")] = data

for file_path in (notebooks_path / data_path).glob("*.parquet"):
    print(f"Reading {file_path}")
    var_name = file_path.stem.replace("-", "_")
    globals()[var_name] = pd.read_parquet(file_path)


ID_vars = ["PatientID", "Time", "RecordID"]
# stationary variables
stationary_vars = ["Age", "Gender", "Height", "ICUType"]
# dynamic variables
dynamic_vars = set_a.columns.difference(stationary_vars + ID_vars).tolist()

Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\Outcomes-a.parquet
Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\Outcomes-b.parquet
Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\Outcomes-c.parquet
Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\set-a.parquet
Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\set-b.parquet
Reading c:\Users\paesc\OneDrive\docs\projects\ICU-TimeSeries-Mortality-Prediction\notebooks\2_SupervisedML\Q2_RNN\..\..\..\data\set-c.parquet


## Preprocess Data Shape

In [11]:
class TimeSeriesDataModule(pl.LightningDataModule):
    def __init__(
        self,
        feature_cols,
        batch_size=32
    ):
        super().__init__()
        self.feature_cols = feature_cols
        self.batch_size = batch_size

    def preprocess_parquet_for_lstm(self, key):
        X = []
        y = globals()[f"Outcomes_{key}"]["In-hospital_death"].values

        for pid, data in globals()[f"set_{key}"].groupby("PatientID"):
            data = data.sort_values("Time")
            # TODO: Replace with real imputation
            data.fillna(0, inplace=True)
            X.append(data[self.feature_cols].values)

        return torch.tensor(np.stack(X)).float(), torch.tensor(y).float()

    def setup(self, stage=None):
        self.X_train, self.y_train = self.preprocess_parquet_for_lstm("a")
        self.X_val, self.y_val     = self.preprocess_parquet_for_lstm("b")
        self.X_test, self.y_test   = self.preprocess_parquet_for_lstm("c")

        self.train_dataset = TensorDataset(self.X_train, self.y_train)
        self.val_dataset   = TensorDataset(self.X_val, self.y_val)
        self.test_dataset  = TensorDataset(self.X_test, self.y_test)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)



## LSTM Model Class

In [17]:
class LSTMClassifier(pl.LightningModule):
    def __init__(self, input_size, hidden_size=64, num_layers=1, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Linear(hidden_size, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, x):
        # Run input through the LSTM
        _, (hidden_state, _) = self.lstm(x)  # lstm_output, (hidden_state, cell_state)

        # Take the last layer's hidden state (for stacked LSTM)
        last_hidden = hidden_state[-1]  # shape: (batch_size, hidden_dim)

        # Pass through the classification head
        logits = self.classifier(last_hidden)

        return logits

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x).squeeze(1)
        loss = self.loss_fn(logits, y.float())
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x).squeeze(1)
        loss = self.loss_fn(logits, y.float())
        preds = torch.sigmoid(logits) > 0.5
        acc = (preds.int() == y).float().mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x).squeeze(1)
        loss = self.loss_fn(logits, y.float())
        preds = torch.sigmoid(logits) > 0.5
        acc = (preds.int() == y).float().mean()
        self.log("test_loss", loss)
        self.log("test_acc", acc)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


# Callbacks for trainer
early_stop_callback = EarlyStopping(monitor='val_loss', patience=10, verbose=True, mode='min')
lr_monitor = LearningRateMonitor(logging_interval='epoch')
checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=True, filename='best-checkpoint')



# -------------------------------
# Example Training Loop
# -------------------------------
# Ensure you have torch tensors: X_train, y_train, X_val, y_val, X_test, y_test
# And shape: [n_samples, seq_len, n_features] for X_* and [n_samples] for y_*

# Instantiate
feature_cols = dynamic_vars + stationary_vars
datamodule = TimeSeriesDataModule(feature_cols=feature_cols, batch_size=64)
model = LSTMClassifier(input_size=len(feature_cols), hidden_size=64, num_layers=2)



# # Trainer
trainer = pl.Trainer(
    max_epochs=10,
    callbacks=[early_stop_callback, lr_monitor, checkpoint_callback],
    log_every_n_steps=10,
    accelerator="auto",
    devices="auto"
)

trainer.fit(model, datamodule=datamodule)
trainer.test(model, datamodule=datamodule)



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | lstm       | LSTM              | 60.7 K | train
1 | classifier | Linear            | 65     | train
2 | loss_fn    | BCEWithLogitsLoss | 0      | train
---------------------------------------------------------
60.7 K    Trainable params
0         Non-trainable params
60.7 K    Total params
0.243     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\paesc\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\paesc\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.403
Epoch 0, global step 63: 'val_loss' reached 0.40297 (best 0.40297), saving model to 'c:\\Users\\paesc\\OneDrive\\docs\\projects\\ICU-TimeSeries-Mortality-Prediction\\notebooks\\2_SupervisedML\\Q2_RNN\\lightning_logs\\version_1\\checkpoints\\best-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

c:\Users\paesc\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
c:\Users\paesc\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.8537499904632568
        test_loss            0.410361647605896
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.410361647605896, 'test_acc': 0.8537499904632568}]

## Training