In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.optim as optim
from sklearn.metrics import precision_recall_curve, roc_curve, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import RobustScaler, StandardScaler
import random
import matplotlib.pyplot as plt
import time

plt.style.use('default')



In [2]:

# --- Add this near the top of your script ---
def set_seed(seed_value=42):
    """Sets the seed for reproducibility in PyTorch, NumPy, and Python."""
    random.seed(seed_value)  # Python random module
    np.random.seed(seed_value) # Numpy module
    torch.manual_seed(seed_value) # PyTorch CPU seeding

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # if you are using multi-GPU.
        # Configure CuDNN for deterministic operations
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # Optional: Newer PyTorch versions might require this for full determinism
        # Note: This can sometimes throw errors if a deterministic implementation isn't available
        # try:
        #     torch.use_deterministic_algorithms(True)
        # except Exception as e:
        #     print(f"Warning: Could not enable deterministic algorithms: {e}")
        # Optional: Sometimes needed for deterministic matrix multiplication
        # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

    print(f"Seed set globally to {seed_value}")



# --- Call this function very early in your script ---
SEED = 42 # Choose your desired seed value
set_seed(SEED)


Seed set globally to 42


In [3]:
static_variables = ['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight']
categorical_vars = ["Gender","GCS", "MechVent"]
static_variables_we_want = ['Age', 'Gender', 'Height', 'Weight']
all_variables = ['Weight', 'Age', 'TroponinI', 'DiasABP', 'MechVent', 'HCO3', 'Cholesterol', 'HCT', 'SaO2', 'WBC', 'SysABP', 'Urine', 'ICUType', 'Gender', 'ALP', 'Creatinine', 'K', 'AST', 'Glucose', 'RespRate', 'MAP', 'FiO2', 'BUN', 'Na', 'Bilirubin', 'TroponinT', 'PaCO2', 'Height', 'GCS', 'HR', 'pH', 'PaO2', 'Lactate', 'ALT', 'NISysABP', 'RecordID', 'Platelets', 'Temp', 'Mg', 'NIDiasABP', 'Albumin', 'NIMAP']
dyn_variables = [x for x in all_variables if x not in static_variables]
dyn_variables.append('Weight_VAR')

print(len(dyn_variables), len(static_variables_we_want))

37 4


In [8]:
from data_loaders import load_outcomes, load_processed_data

train_df = load_processed_data("data/set-a_no_nan.parquet")
val_df = load_processed_data("data/set-b_no_nan.parquet")
test_df = load_processed_data("data/set-c_no_nan.parquet")

outcomes_a_dict = load_outcomes("data/Outcomes-a.txt")
outcomes_b_dict = load_outcomes("data/Outcomes-b.txt")
outcomes_c_dict = load_outcomes("data/Outcomes-c.txt")

Loaded data from data/set-a_no_nan.parquet, shape: (196000, 43)
Loaded data from data/set-b_no_nan.parquet, shape: (196000, 43)
Loaded data from data/set-c_no_nan.parquet, shape: (196000, 43)
Loaded 4000 outcomes from data/Outcomes-a.txt
Loaded 4000 outcomes from data/Outcomes-b.txt
Loaded 4000 outcomes from data/Outcomes-c.txt


In [13]:
print(train_df.columns)

Index(['RecordID', 'Time', 'Age', 'Gender', 'Height', 'Weight', 'Albumin',
       'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine',
       'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K',
       'Lactate', 'Mg', 'MAP', 'MechVent', 'Na', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC',
       'Weight_VAR'],
      dtype='object', name='Parameter')


In [11]:
outcome_dicts = [outcomes_a_dict, outcomes_b_dict, outcomes_c_dict] # USE outcomes_b_dict for index 1
data_dfs = [train_df, val_df, test_df]
set_names = ["Train", "Validation", "Test"] # For printing

outcome_labels_aligned = [] # Use a new name to avoid confusion

print("--- Aligning Labels with Data Order ---")
for i in range(3):
    print(f"\nProcessing {set_names[i]} Set:")
    current_outcome_dict = outcome_dicts[i]
    current_data_df = data_dfs[i]

    # 1. Get the DEFINITIVE order of RecordIDs from the data DataFrame
    # This order dictates how your features will be structured for the dataset
    data_record_ids_ordered = current_data_df['RecordID'].unique().astype(int)
    print(f"  Order of unique RecordIDs based on data: {data_record_ids_ordered[:10]}... (Total: {len(data_record_ids_ordered)})")

    # 2. Check if all necessary outcome IDs are present (Set Check - Good Sanity Check)
    outcome_record_ids_set = set(current_outcome_dict.keys())
    data_record_ids_set = set(data_record_ids_ordered)

    if outcome_record_ids_set != data_record_ids_set:
        print(f"  ERROR: Set of RecordIDs mismatch for {set_names[i]} set.")
        missing_in_outcomes = data_record_ids_set - outcome_record_ids_set
        missing_in_data = outcome_record_ids_set - data_record_ids_set
        if missing_in_outcomes:
            print(f"  IDs in data but not outcomes: {missing_in_outcomes}")
        if missing_in_data:
            print(f"  IDs in outcomes but not data: {missing_in_data}")
        raise ValueError(f"RecordID sets do not match for {set_names[i]} set. Cannot align labels reliably.")
    else:
        print(f"  Set of RecordIDs matches.")

    # 3. Create the labels array by looking up IDs IN THE ORDER defined by the data
    try:
        current_labels = np.array(
            [current_outcome_dict[record_id] for record_id in data_record_ids_ordered],
            dtype=np.float32 # Use float for BCELoss compatibility
        )
        outcome_labels_aligned.append(current_labels)
        print(f"  Aligned labels created. Shape: {current_labels.shape}, Type: {type(current_labels)}")
    except KeyError as e:
        print(f"  ERROR: RecordID {e} found in data but missing from outcome dictionary during alignment!")
        raise ValueError(f"KeyError during label alignment for {set_names[i]} set.")
    except Exception as e:
         print(f"  ERROR: An unexpected error occurred during label alignment for {set_names[i]}: {e}")
         raise e


# Final verification (optional)
print("\n--- Final Check ---")
if len(outcome_labels_aligned) == 3:
    assert len(data_dfs[0]['RecordID'].unique()) == len(outcome_labels_aligned[0]), f"Train length mismatch: Data {len(data_dfs[0]['RecordID'].unique())}, Labels {len(outcome_labels_aligned[0])}"
    assert len(data_dfs[1]['RecordID'].unique()) == len(outcome_labels_aligned[1]), f"Validation length mismatch: Data {len(data_dfs[1]['RecordID'].unique())}, Labels {len(outcome_labels_aligned[1])}"
    assert len(data_dfs[2]['RecordID'].unique()) == len(outcome_labels_aligned[2]), f"Test length mismatch: Data {len(data_dfs[2]['RecordID'].unique())}, Labels {len(outcome_labels_aligned[2])}"
    print("All checks passed. Labels are aligned with the unique RecordID order from data DataFrames.")
else:
    print("Could not perform final length check due to earlier errors.")

# Now 'outcome_labels_aligned' contains [y_train, y_val, y_test] NumPy arrays
# correctly ordered according to the unique patient order in train_df, val_df, test_df.

--- Aligning Labels with Data Order ---

Processing Train Set:
  Order of unique RecordIDs based on data: [132539 132540 132541 132543 132545 132547 132548 132551 132554 132555]... (Total: 4000)
  Set of RecordIDs matches.
  Aligned labels created. Shape: (4000,), Type: <class 'numpy.ndarray'>

Processing Validation Set:
  Order of unique RecordIDs based on data: [142675 142676 142680 142683 142688 142690 142691 142692 142693 142694]... (Total: 4000)
  Set of RecordIDs matches.
  Aligned labels created. Shape: (4000,), Type: <class 'numpy.ndarray'>

Processing Test Set:
  Order of unique RecordIDs based on data: [152871 152873 152875 152878 152882 152884 152885 152886 152887 152890]... (Total: 4000)
  Set of RecordIDs matches.
  Aligned labels created. Shape: (4000,), Type: <class 'numpy.ndarray'>

--- Final Check ---
All checks passed. Labels are aligned with the unique RecordID order from data DataFrames.


In [97]:
# Convert dfs into numpy arrays
def convert_df_to_np(df):
    dfs = []
    for record_id in df['RecordID'].unique():
        df_tmp = df[df['RecordID'] == record_id]
        df_tmp = df_tmp.drop(columns=['RecordID', "Time"])
        arr = df_tmp.to_numpy()
        dfs.append(arr)

    # convert list of dfs to list of tensors
    train_data = np.array(dfs)
    return train_data


train_data = convert_df_to_np(data_df[0])
val_data = convert_df_to_np(data_df[1])
test_data = convert_df_to_np(data_df[2])

y_train = outcome_labels[0]
y_val = outcome_labels[1]
y_test = outcome_labels[2]


# Standardize data
# Original shape: (n_patients, n_timepoints, n_features)
n_patients, n_timepoints, n_features = train_data.shape

# Reshape to 2D: (n_patients * n_timepoints, n_features)
train_data_2d = train_data.reshape(-1, n_features)
val_data_2d = val_data.reshape(-1, n_features)
test_data_2d = test_data.reshape(-1, n_features)

# Initialize and fit the scaler ONLY on training data
scaler = RobustScaler()
scaler.fit(train_data_2d)

# Transform all datasets
train_scaled_2d = scaler.transform(train_data_2d)
val_scaled_2d = scaler.transform(val_data_2d)
test_scaled_2d = scaler.transform(test_data_2d)

# Reshape back to 3D
train_data = train_scaled_2d.reshape(n_patients, n_timepoints, n_features)
val_data = val_scaled_2d.reshape(val_data.shape)
test_data = test_scaled_2d.reshape(test_data.shape)

print("Sklearn Standard scaled train data shape:", train_data.shape)



Sklearn Standard scaled train data shape: (4000, 49, 41)


In [78]:
# --- Training and Evaluation Functions ---

def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    start_time = time.time()

    for i, (batch_X, batch_y) in enumerate(loader):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        # Gradient clipping (optional but can help stability)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        # Print progress (optional)
        # if (i + 1) % 50 == 0:
        #      elapsed = time.time() - start_time
        #      print(f'  Batch {i+1}/{len(loader)}, Loss: {loss.item():.4f}, Time: {elapsed:.2f}s')
        #      start_time = time.time() # Reset timer

    avg_loss = total_loss / len(loader)
    return avg_loss

def evaluate(model, loader, criterion, device, return_probs=False):
    model.eval()
    total_loss = 0.0
    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X) # raw logits
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()

            all_logits.extend(outputs.cpu())
            all_labels.extend(batch_y.cpu())

    avg_loss = total_loss / len(loader)
    all_logits = torch.cat(all_logits).numpy()
    all_labels = torch.cat(all_labels).numpy().flatten()

    all_probs = 1/ (1+ np.exp(-all_logits)).flatten() # Sigmoid to get probabilities

    all_preds_05 = np.where(all_probs > 0.5, 1, 0).astype(int) # Convert probabilities to binary predictions
    # Calculate metrics
    try:
        auc = roc_auc_score(all_labels, all_preds_05) # Use probabilities for AUC if needed: roc_auc_score(all_labels, probs.flatten())
    except ValueError:
        print("Warning: ROC AUC calculation failed. Likely only one class present in this evaluation batch/set.")
        auc = 0.0 # Or handle as appropriate

    f1 = f1_score(all_labels, all_preds_05)
    acc = accuracy_score(all_labels, all_preds_05)
    prec = precision_score(all_labels, all_preds_05, zero_division=0)
    rec = recall_score(all_labels, all_preds_05, zero_division=0)
    conf_mat = confusion_matrix(all_labels, all_preds_05)

    metrics = {
        'loss': avg_loss,
        'auc': auc,
        'f1': f1,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'conf_matrix': conf_mat
    }
    if return_probs:
        return metrics, all_probs, all_labels
    else:
        return metrics



In [79]:
# --- Hyperparameters and Setup ---
model_save_path = "./data/V7_1_best_transformer_model_weighted_sampler_dropout03.pth"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu")
print("Using device:", DEVICE)

BATCH_SIZE = 32 # Adjust based on GPU memory
LEARNING_RATE = 1e-4 # Common starting point for transformers
WEIGHT_DECAY = 1e-4 # AdamW uses weight decay
EPOCHS = 30 # Start with a reasonable number, monitor validation loss/AUC
D_MODEL = 128
N_HEAD = 2
NUM_LAYERS = 2
DROPOUT = 0.3 # Increased dropout slightly


# --- Create Datasets and DataLoaders ---
train_dataset = MedicalTimeSeriesDatasetTimeGrid(train_data, y_train)
val_dataset = MedicalTimeSeriesDatasetTimeGrid(val_data, y_val)
test_dataset = MedicalTimeSeriesDatasetTimeGrid(test_data, y_test)

# --- Handle Imbalance (Method 1: Weighted Loss) ---
# Calculate weights: weight = total_samples / (num_classes * samples_in_class)
# Or simpler for binary: weight for positive class = num_negative / num_positive
# num_positives = y_train.sum()
# num_negatives = len(y_train) - num_positives
# pos_weight_val = num_negatives / num_positives
# pos_weight = torch.tensor([pos_weight_val], dtype= torch.float32,  device=DEVICE) # Wrap in tensor for BCEWithLogitsLoss
# print(f"Calculated positive class weight: {pos_weight.item():.2f}")

# # Loss Function with weighting
# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
#---------------------

# --- Handle Imbalance (Method 2: Weighted Sampler - Use either this OR weighted loss, or sometimes both) ---
# Comment out the WeightedRandomSampler lines if using only weighted loss.
# Using both can sometimes be beneficial but start with one. Let's use WeightedRandomSampler here.

class_counts = np.bincount(y_train.astype(int))
class_weights = 1. / class_counts
sample_weights = np.array([class_weights[int(t)] for t in y_train])
sample_weights = torch.from_numpy(sample_weights).double()
sampler = WeightedRandomSampler(sample_weights, len(sample_weights))


criterion = nn.BCEWithLogitsLoss() # Use this if using WeightedRandomSampler
# Use the sampler ONLY for the training loader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
# ---------------------

# For validation and test, use standard sequential loading
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Model, Optimizer ---
model = TimeSeriesTransformer(
    d_model=D_MODEL,
    nhead=N_HEAD,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
).to(DEVICE)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=3)

# --- Training Loop ---
best_val_auc = -1.0 # Or use F1 score, depending on priority
best_epoch = -1
history = {'train_loss': [], 'val_loss': [], 'val_auc': [], 'val_f1': []}

print("\n--- Starting Training ---")
for epoch in range(EPOCHS):
    start_epoch_time = time.time()

    # Training
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, DEVICE)
    history['train_loss'].append(train_loss)

    # Validation
    val_metrics = evaluate(model, val_loader, criterion, DEVICE)
    val_loss = val_metrics['loss']
    val_auc = val_metrics['auc']
    val_f1 = val_metrics['f1']
    history['val_loss'].append(val_loss)
    history['val_auc'].append(val_auc)
    history['val_f1'].append(val_f1)

    end_epoch_time = time.time()
    epoch_duration = end_epoch_time - start_epoch_time

    print(f"Epoch {epoch+1}/{EPOCHS} | Time: {epoch_duration:.2f}s")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f} | Val AUC: {val_auc:.4f} | Val F1: {val_f1:.4f}")
    print(f"  Val Accuracy: {val_metrics['accuracy']:.4f} | Val Precision: {val_metrics['precision']:.4f} | Val Recall: {val_metrics['recall']:.4f}")
    print(f"  Val Confusion Matrix:\n{val_metrics['conf_matrix']}")


    # Optional: Learning rate scheduling step based on validation loss
    scheduler.step(val_loss)

    # Save the best model based on validation AUC (or F1)
    # If using F1, ensure it's calculated correctly (might need probability adjustment/threshold tuning)
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        best_epoch = epoch
        torch.save(model.state_dict(), model_save_path)
        print(f"  * Best model saved based on Val AUC: {best_val_auc:.4f} at epoch {epoch+1}")

print(f"\n--- Training Finished ---")
print(f"Best validation AUC: {best_val_auc:.4f} achieved at epoch {best_epoch+1}")

# --- Final Evaluation on Test Set ---
print("\n--- Evaluating on Test Set ---")
# Load the best model
model.load_state_dict(torch.load(model_save_path))
print("Loaded best model weights for testing.")

test_metrics = evaluate(model, test_loader, criterion, DEVICE)

print("Test Set Performance:")
print(f"-  Test Loss: {test_metrics['loss']:.4f}")
print(f"-  Test AUC: {test_metrics['auc']:.4f}")
print(f"-  Test F1: {test_metrics['f1']:.4f}")
print(f"-  Test Accuracy: {test_metrics['accuracy']:.4f}")
print(f"-  Test Precision: {test_metrics['precision']:.4f}")
print(f"-  Test Recall: {test_metrics['recall']:.4f}")
print(f"-  Test Confusion Matrix:\n{test_metrics['conf_matrix']}")

# --- Optional: Plot training history ---
# plt.figure(figsize=(12, 5))
# plt.subplot(1, 2, 1)
# plt.plot(history['train_loss'], label='Train Loss')
# plt.plot(history['val_loss'], label='Val Loss')
# plt.title('Loss History')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.plot(history['val_auc'], label='Val AUC')
# plt.plot(history['val_f1'], label='Val F1')
# plt.title('Validation Metrics History')
# plt.xlabel('Epoch')
# plt.ylabel('Metric Value')
# plt.legend()
# plt.tight_layout()
# plt.show()


Using device: mps

--- Starting Training ---




Epoch 1/30 | Time: 3.13s
  Train Loss: 0.6457
  Val Loss: 0.6988 | Val AUC: 0.6592 | Val F1: 0.3411
  Val Accuracy: 0.5703 | Val Precision: 0.2180 | Val Recall: 0.7835
  Val Confusion Matrix:
[[1836 1596]
 [ 123  445]]
  * Best model saved based on Val AUC: 0.6592 at epoch 1
Epoch 2/30 | Time: 2.73s
  Train Loss: 0.5709
  Val Loss: 0.5760 | Val AUC: 0.7044 | Val F1: 0.3953
  Val Accuracy: 0.6780 | Val Precision: 0.2695 | Val Recall: 0.7412
  Val Confusion Matrix:
[[2291 1141]
 [ 147  421]]
  * Best model saved based on Val AUC: 0.7044 at epoch 2
Epoch 3/30 | Time: 2.71s
  Train Loss: 0.5584
  Val Loss: 0.4943 | Val AUC: 0.7149 | Val F1: 0.4331
  Val Accuracy: 0.7552 | Val Precision: 0.3227 | Val Recall: 0.6585
  Val Confusion Matrix:
[[2647  785]
 [ 194  374]]
  * Best model saved based on Val AUC: 0.7149 at epoch 3
Epoch 4/30 | Time: 2.77s
  Train Loss: 0.5189
  Val Loss: 0.4475 | Val AUC: 0.7131 | Val F1: 0.4437
  Val Accuracy: 0.7788 | Val Precision: 0.3451 | Val Recall: 0.6215
  Va