In [None]:

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --- Storage roots ---------------------------------------------------------
# Hand-made CSV files are kept under DATA_DIR.
DATA_DIR = "/data"
# Data provided by KHDP is kept under DATASETS_DIR.
DATASETS_DIR = "/datasets"

# --- Input locations -------------------------------------------------------
# Label table (one CSV under DATA_DIR).
LABELS_CSV = os.path.join(DATA_DIR, "labels_with_hf.csv")
# Tabular / comorbidity feature table (one CSV under DATA_DIR).
COMORB_CSV = os.path.join(DATA_DIR, "tabular.csv")
# Directory of the ECG registry release used for the signal files.
ECG_DIR = os.path.join(
    os.path.join(DATASETS_DIR, "ECG-Registry", "1.0.0"),
    "1.MAIN",
)

# --- Training hyper-parameters ---------------------------------------------
VGPU_BATCH_SIZE = 32  # samples per mini-batch
VGPU_EPOCHS = 20      # full passes over the training loader


In [None]:
# Load the per-patient tabular table, indexed by patient identifier.
comorb_df = pd.read_csv(COMORB_CSV, index_col="person_id")

# Each patient must appear exactly once: with a duplicated person_id a label
# lookup returns several rows, so that patient's feature "vector" would become
# a 2-D array and break batching later on. Fail early with a clear message.
if not comorb_df.index.is_unique:
    duplicated_ids = comorb_df.index[comorb_df.index.duplicated()].unique().tolist()
    raise ValueError(
        f"{COMORB_CSV} contains duplicated person_id values "
        f"(first few: {duplicated_ids[:5]}); expected one row per patient"
    )

# Everything except the outcome column is treated as an input feature.
feat_df = comorb_df.drop(columns=["hf_outcome"])

# compute mean/std once
# NOTE(review): these statistics are computed over every patient in the CSV.
# If a train/validation split is applied afterwards, validation patients
# contribute to the normalisation — consider fitting on training patients only.
mean_vals = feat_df.mean().values.astype(np.float32)
# A zero std (constant column) or NaN std (fewer than two observed values)
# is replaced by 1 so that the later division leaves the column unscaled
# instead of producing inf/NaN.
std_vals = feat_df.std().replace(0, 1).fillna(1).values.astype(np.float32)

# Convert the whole table to float32 once and map each patient id to its row,
# rather than doing one label-based lookup per patient.
feat_matrix = feat_df.to_numpy(dtype=np.float32)
comorb_map = dict(zip(feat_df.index, feat_matrix))

print(f"Loaded comorbidity for {len(comorb_map)} patients with {feat_df.shape[1]} features each")


In [None]:
class ECGDataset(Dataset):
    """Dataset that yields one ECG signal per row of a records table.

    Each row of ``records_df`` is read for three fields: ``ecg_file`` (file
    name relative to ``ecg_dir``), ``person_id`` and ``hf_outcome``.

    Args:
        ecg_dir: Directory that contains the signal files.
        records_df: Table with one row per ECG record; its index is reset so
            rows are addressed positionally.
        transform: Optional callable applied to the float32 signal tensor.
    """

    def __init__(self, ecg_dir, records_df, transform=None):
        self.ecg_dir = ecg_dir
        self.records = records_df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of ECG records."""
        return len(self.records)

    def __getitem__(self, idx):
        """Load record ``idx`` and return ``(signal, person_id, hf_outcome)``.

        Files ending in ``.npy`` are read with ``np.load``; anything else is
        read with ``torch.load``. The signal is returned as a float32 tensor;
        ``person_id`` and ``hf_outcome`` are returned as stored in the table.
        """
        row = self.records.iloc[idx]
        path = os.path.join(self.ecg_dir, row["ecg_file"])
        # adjust loader to match your filetype
        if path.endswith(".npy"):
            raw = np.load(path)
        else:
            raw = torch.load(path)
        # as_tensor accepts both ndarrays and tensors; torch.tensor() on an
        # existing tensor warns about copy-construction and copies needlessly.
        sig = torch.as_tensor(raw, dtype=torch.float32)
        if self.transform is not None:
            sig = self.transform(sig)
        return sig, row["person_id"], row["hf_outcome"]

class TabularDataset(Dataset):
    """Dataset that yields a normalised tabular feature vector per record.

    Args:
        records_df: Table with ``person_id`` and ``hf_outcome`` columns; its
            index is reset so rows are addressed positionally.
        comorb_map: Mapping from ``person_id`` to that patient's raw feature
            vector.
        mean: Per-feature offsets subtracted from the raw vector.
        std: Per-feature scales the centred vector is divided by.
    """

    def __init__(self, records_df, comorb_map, mean, std):
        self.records = records_df.reset_index(drop=True)
        self.comorb = comorb_map
        self.mean = mean
        self.std = std

    def __len__(self):
        """Return the number of records."""
        return len(self.records)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for record ``idx`` as float32 tensors.

        Raises:
            KeyError: If the record's ``person_id`` has no entry in
                ``comorb_map``.
        """
        # Read only the two cells that are needed instead of building a full
        # row Series (which is slower and may upcast mixed dtypes).
        pid = self.records["person_id"].iloc[idx]
        label = self.records["hf_outcome"].iloc[idx]
        try:
            x = self.comorb[pid]
        except KeyError:
            raise KeyError(
                f"person_id {pid!r} (record {idx}) has no tabular features in comorb_map"
            ) from None
        x_norm = (x - self.mean) / self.std
        return (
            torch.tensor(x_norm, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32),
        )

class CombinedDataset(Dataset):
    """Pair an ECG dataset with a tabular dataset, item by item.

    Item ``idx`` of ``ecg_ds`` must unpack as ``(signal, person_id, label)``
    and item ``idx`` of ``tab_ds`` as ``(features, label)``. Both datasets
    must have the same length; they are assumed to be built from the same
    records in the same order — this class does not verify the ordering.

    Args:
        ecg_ds: Dataset providing the signals.
        tab_ds: Dataset providing the tabular features.
    """

    def __init__(self, ecg_ds, tab_ds):
        assert len(ecg_ds) == len(tab_ds), "ECG vs Tabular size mismatch!"
        self.ecg_ds = ecg_ds
        self.tab_ds = tab_ds

    def __len__(self):
        """Return the number of paired records."""
        return len(self.ecg_ds)

    def __getitem__(self, idx):
        """Return ``(signal, tabular_features, label)`` for record ``idx``.

        The label is taken from the ECG dataset and converted to a float32
        tensor: the raw table value is typically an integer, which would be
        batched as an int64 tensor and rejected by ``BCEWithLogitsLoss``.
        """
        sig, _pid, y_ecg = self.ecg_ds[idx]
        x_tab, _y_tab = self.tab_ds[idx]  # label expected to equal y_ecg
        label = torch.as_tensor(y_ecg, dtype=torch.float32)
        return sig, x_tab, label


In [None]:
class ECGEncoder(nn.Module):
    """1-D convolutional encoder mapping a multi-channel signal to 64 features.

    Two Conv1d+ReLU stages (kernel 5, padding 2) with a max-pool in between,
    followed by global average pooling over the time axis.

    Args:
        in_channels: Number of input channels (leads). Defaults to 12.
    """

    def __init__(self, in_channels=12):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels, 32, 5, padding=2), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, 5, padding=2), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )

    def forward(self, x):
        """Encode ``x`` of shape (batch, in_channels, length) into (batch, 64)."""
        # The adaptive pool leaves a trailing length-1 axis; flatten it away.
        return self.conv(x).flatten(1)

class TabularEncoder(nn.Module):
    """Two-layer MLP mapping a tabular feature vector to 32 features.

    Args:
        in_dim: Number of input features per sample.
    """

    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        )

    def forward(self, x):
        """Encode ``x`` of shape (batch, in_dim) into (batch, 32)."""
        return self.net(x)

class CombinedModel(nn.Module):
    """Late-fusion classifier over ECG and tabular encodings.

    The 64-dim ECG encoding and the 32-dim tabular encoding are concatenated
    and passed through a small MLP that emits one logit per sample.

    Args:
        tab_in_dim: Number of tabular input features per sample.
    """

    def __init__(self, tab_in_dim):
        super().__init__()
        self.ecg_enc = ECGEncoder()
        self.tab_enc = TabularEncoder(tab_in_dim)
        self.classif = nn.Sequential(
            nn.Linear(64 + 32, 32), nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, sig, tab):
        """Return one logit per sample, shape (batch,).

        Args:
            sig: Batch of signals passed to the ECG encoder.
            tab: Batch of tabular feature vectors passed to the tabular encoder.
        """
        ecg_feat = self.ecg_enc(sig)
        tab_feat = self.tab_enc(tab)
        fused = torch.cat([ecg_feat, tab_feat], dim=1)
        # Squeeze only the trailing logit axis: a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample, yielding a
        # 0-d tensor that no longer matches the (1,)-shaped target.
        return self.classif(fused).squeeze(-1)


In [None]:
# Read the label table; it is used below via its person_id and hf_outcome columns.
records = pd.read_csv(LABELS_CSV)

# Split on patient ids rather than on rows, so that all records of a patient
# end up on the same side. Stratification uses the outcome of each patient's
# first row, which is listed in the same first-appearance order as `uids`.
uids = records.person_id.unique()
train_uids, val_uids = train_test_split(
    uids,
    test_size=0.2,
    stratify=records.drop_duplicates("person_id").hf_outcome,
    random_state=42,
)

# Materialise the record-level frames for each side of the split.
train_df = records.loc[records.person_id.isin(train_uids)].reset_index(drop=True)
val_df = records.loc[records.person_id.isin(val_uids)].reset_index(drop=True)

# Inverse-frequency weight per class, then one weight per training record;
# the sampler draws len(train_weights) records with replacement.
counts = train_df.hf_outcome.value_counts()
class_weights = {label: 1 / counts[label] for label in (0, 1)}
train_weights = train_df.hf_outcome.map(class_weights).values
train_sampler = WeightedRandomSampler(
    weights=train_weights,
    num_samples=len(train_weights),
    replacement=True,
)

print(f"Train/Val patients: {len(train_uids)}/{len(val_uids)}, records: {len(train_df)}/{len(val_df)}")
print(f"Class weights: {class_weights}")


In [None]:
# Training side: signal dataset + tabular dataset, zipped into one dataset.
ecg_tr = ECGDataset(ecg_dir=ECG_DIR, records_df=train_df)
tab_tr = TabularDataset(
    records_df=train_df, comorb_map=comorb_map, mean=mean_vals, std=std_vals
)
ds_tr = CombinedDataset(ecg_ds=ecg_tr, tab_ds=tab_tr)

# Validation side: same construction on the validation records, using the
# same normalisation vectors as the training side.
ecg_val = ECGDataset(ecg_dir=ECG_DIR, records_df=val_df)
tab_val = TabularDataset(
    records_df=val_df, comorb_map=comorb_map, mean=mean_vals, std=std_vals
)
ds_val = CombinedDataset(ecg_ds=ecg_val, tab_ds=tab_val)

# Training batches are drawn through the weighted sampler; validation batches
# are read in dataset order.
loader_tr = DataLoader(dataset=ds_tr, batch_size=VGPU_BATCH_SIZE, sampler=train_sampler)
loader_val = DataLoader(dataset=ds_val, batch_size=VGPU_BATCH_SIZE, shuffle=False)


In [None]:
# Seed PyTorch's global RNG so that weight initialisation is repeatable
# between runs.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CombinedModel(tab_in_dim=len(mean_vals)).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()


def _forward_loss(sig, tab_x, y):
    """Move one batch to `device` and return the BCE-with-logits loss.

    Targets are cast to float and both logits and targets are flattened to
    1-D: BCEWithLogitsLoss needs a float target of the same shape as the
    input, while batched labels may arrive as integers and a batch of one
    may come back from the model as a 0-d tensor.
    """
    sig = sig.to(device)
    tab_x = tab_x.to(device)
    target = y.to(device).float().reshape(-1)
    logits = model(sig, tab_x).reshape(-1)
    return criterion(logits, target)


for epoch in range(1, VGPU_EPOCHS + 1):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for i, (sig, tab_x, y) in enumerate(loader_tr, 1):
        loss = _forward_loss(sig, tab_x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The criterion averages over the batch; weight by batch size so the
        # epoch figure is a per-sample mean.
        train_loss += loss.item() * sig.size(0)
        if i % 10 == 0:
            print(f"Epoch {epoch}  Batch {i}/{len(loader_tr)}  Loss: {loss.item():.4f}")
    print(f"→ Epoch {epoch}  TRAIN avg loss: {train_loss/len(loader_tr.dataset):.4f}")

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for sig, tab_x, y in loader_val:
            val_loss += _forward_loss(sig, tab_x, y).item() * sig.size(0)
    print(f"→ Epoch {epoch}  VAL   avg loss: {val_loss/len(loader_val.dataset):.4f}\n")

# save final model
torch.save(model.state_dict(), "hf_risk_model.pth")
