# Sequence Models (GRU / LSTM / Transformer)

This notebook trains sequence models (GRU, LSTM, and Transformer) on an HDF5 dataset built from **any source** (MIMIC, eICU, NWICU, or an external dataset).


## 1) Libraries

In [None]:
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd
import tables


## 2) Paths & Configuration

In [None]:
# --- Input dataset ---------------------------------------------------------
# H5_PATH may be swapped for any other HDF5 file, for instance:
#   Path('outputs/eicu/preprocess/h5/dataset_eicu.h5')
#   Path('outputs/nwicu/preprocess/h5/dataset_features.h5')
H5_PATH = Path('outputs', 'mimic', 'preprocess', 'h5', 'dataset_mimic.h5')

# --- Training hyperparameters (forwarded to the training script) -----------
BATCH_SIZE = 64
EPOCHS = 50
MAX_SEQ_LEN = 96
SEED = 43
DEVICE = 'auto'  # one of: 'auto', 'cpu', 'cuda'

# --- Where metrics are written ---------------------------------------------
# The OUTPUT_DIR environment variable overrides the default 'outputs' folder.
OUTPUT_DIR = Path(os.getenv('OUTPUT_DIR', 'outputs'))
METRICS_CSV = OUTPUT_DIR.joinpath('reports', 'metrics_sequence_models.csv')


## 3) Inspect HDF5

In [None]:
# Fail early with a clear message when the configured dataset is missing.
dataset_present = H5_PATH.exists()
if not dataset_present:
    raise FileNotFoundError(f'HDF5 not found: {H5_PATH}')

# Open read-only and report the array shapes stored for each split.
with tables.open_file(H5_PATH, mode='r') as h5:
    root = h5.root
    print('HDF5:', H5_PATH)
    for split in ('train', 'val', 'test'):
        features = root.data[split]
        targets = root.labels[split]
        print(f'{split}: data={features.shape}, labels={targets.shape}')
    # The 'static' node is optional; only mention it when it is present.
    if hasattr(root, 'static'):
        print('Static features available:', root.static)


## 4) Train Transformer

In [None]:
# Train the Transformer model by running the training script as a child
# process; every option below comes from the configuration cell.
TRAIN_SCRIPT = Path('scripts') / 'modeling' / 'train_sequence_model.py'

cmd = [
    # Launch the script with the interpreter that runs this notebook instead
    # of whatever 'python' resolves to on PATH, so the child process uses the
    # same environment (and installed packages) as the kernel.
    sys.executable, str(TRAIN_SCRIPT),
    '--h5', str(H5_PATH),
    '--batch-size', str(BATCH_SIZE),
    '--epochs', str(EPOCHS),
    '--max-seq-len', str(MAX_SEQ_LEN),
    '--seed', str(SEED),
    '--device', DEVICE,
    '--model', 'transformer',
    '--metrics-csv', str(METRICS_CSV),
]

print('Running Transformer:')
print(' '.join(cmd))
# check=True raises CalledProcessError when the script exits non-zero, so a
# failed run stops the notebook instead of passing silently.
subprocess.run(cmd, check=True)


## 5) Train GRU

In [None]:
# Train the GRU model by running the training script as a child process;
# every option below comes from the configuration cell.
TRAIN_SCRIPT = Path('scripts') / 'modeling' / 'train_sequence_model.py'

cmd = [
    # Launch the script with the interpreter that runs this notebook instead
    # of whatever 'python' resolves to on PATH, so the child process uses the
    # same environment (and installed packages) as the kernel.
    sys.executable, str(TRAIN_SCRIPT),
    '--h5', str(H5_PATH),
    '--batch-size', str(BATCH_SIZE),
    '--epochs', str(EPOCHS),
    '--max-seq-len', str(MAX_SEQ_LEN),
    '--seed', str(SEED),
    '--device', DEVICE,
    '--model', 'gru',
    '--metrics-csv', str(METRICS_CSV),
]

print('Running GRU:')
print(' '.join(cmd))
# check=True raises CalledProcessError when the script exits non-zero, so a
# failed run stops the notebook instead of passing silently.
subprocess.run(cmd, check=True)


## 6) Train LSTM

In [None]:
# Train the LSTM model by running the training script as a child process;
# every option below comes from the configuration cell.
TRAIN_SCRIPT = Path('scripts') / 'modeling' / 'train_sequence_model.py'

cmd = [
    # Launch the script with the interpreter that runs this notebook instead
    # of whatever 'python' resolves to on PATH, so the child process uses the
    # same environment (and installed packages) as the kernel.
    sys.executable, str(TRAIN_SCRIPT),
    '--h5', str(H5_PATH),
    '--batch-size', str(BATCH_SIZE),
    '--epochs', str(EPOCHS),
    '--max-seq-len', str(MAX_SEQ_LEN),
    '--seed', str(SEED),
    '--device', DEVICE,
    '--model', 'lstm',
    '--metrics-csv', str(METRICS_CSV),
]

print('Running LSTM:')
print(' '.join(cmd))
# check=True raises CalledProcessError when the script exits non-zero, so a
# failed run stops the notebook instead of passing silently.
subprocess.run(cmd, check=True)


## 7) View Metrics

In [None]:
# Load the metrics CSV (the path handed to the training script through
# --metrics-csv) when it exists; `df` is None when the file is missing.
df = pd.read_csv(METRICS_CSV) if METRICS_CSV.exists() else None
if df is None:
    print('Metrics file not found:', METRICS_CSV)

# A notebook only auto-renders a bare expression when it is the last
# top-level statement of the cell. Placed inside an `if` body the expression
# is evaluated and discarded, so the table would never be shown. A None
# result renders nothing.
df
