In [1]:
# 1 - Imports & Setup

import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Directories
DATA_DIR_CSV = Path("/mnt/c/Users/orizarchi/Desktop/Courses/Resting_ECG_Dataset/dat_csv")
DATA_DIR_ANN = Path("/mnt/c/Users/orizarchi/Desktop/Courses/Resting_ECG_Dataset/ann_txt")
OUT_DIR = Path("~/projects/ecg_delineation/data/processed/segments_all").expanduser()
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Parameters
FS = 500
SEG_LEFT = 193
SEG_RIGHT = 318
SEG_LEN = SEG_LEFT + 1 + SEG_RIGHT


In [5]:
# 2 – Helper functions

def load_signal(record_id):
    """Load 12-lead ECG [12, N]."""
    sig_path = DATA_DIR_CSV / f"{record_id}.csv"
    return pd.read_csv(sig_path, header=None).values.T

def load_annotations(record_id, lead_suffix):
    path = DATA_DIR_ANN / f"{record_id}.{lead_suffix}.txt"
    if not path.exists():
        return None
    anno = pd.read_csv(
        path, skiprows=1, header=None,
        names=["type", "start", "end"]
    )
    anno = anno.dropna()
    anno["type"] = anno["type"].astype(int)
    anno["start"] = anno["start"].astype(int)
    anno["end"] = anno["end"].astype(int)
    return anno


def segment_around_qrs(sig, anno):
    """Return (segments, labels) lists for one lead."""
    segments, labels = [], []
    qrs = anno[anno["type"] == 1]

    for _, row in qrs.iterrows():
        center = (row["start"] + row["end"]) // 2
        start = center - SEG_LEFT
        end = center + SEG_RIGHT

        pad_left = max(0, -start)
        pad_right = max(0, end - (len(sig) - 1))
        start_clipped = max(0, start)
        end_clipped = min(len(sig) - 1, end)

        seg = np.zeros(SEG_LEN, dtype=np.float32)
        seg[pad_left:SEG_LEN-pad_right] = sig[start_clipped:end_clipped+1]

        label = np.full(SEG_LEN, 3, dtype=np.int64)  # 3 = background
        for _, wave in anno.iterrows():
            w_start = wave["start"] - start
            w_end = wave["end"] - start
            w_start = max(0, w_start)
            w_end = min(SEG_LEN-1, w_end)
            if w_end >= 0 and w_start < SEG_LEN:
                label[w_start:w_end+1] = wave["type"]

        segments.append(seg)
        labels.append(label)

    return segments, labels

In [None]:
# 3 – Process everything

lead_suffixes = {
    "I":"i", "II":"ii", "III":"iii",
    "aVR":"avr", "aVL":"avl", "aVF":"avf",
    "V1":"v1", "V2":"v2", "V3":"v3",
    "V4":"v4", "V5":"v5", "V6":"v6"
}

all_records = [f.stem for f in DATA_DIR_CSV.glob("*.csv")]

for record_id in tqdm(all_records, desc="Processing records"):
    sig = load_signal(record_id)
    all_segments, all_labels = [], []

    for idx, (lead, suffix) in enumerate(lead_suffixes.items()):
        anno = load_annotations(record_id, suffix)
        if anno is None: 
            continue
        segments, labels = segment_around_qrs(sig[idx], anno)
        if segments:
            all_segments.extend(segments)
            all_labels.extend(labels)

    if all_segments:
        np.savez_compressed(
            OUT_DIR / f"{record_id}.npz",
            segments=np.array(all_segments, dtype=np.float32),
            labels=np.array(all_labels, dtype=np.int64)
        )


Processing records:   0%|          | 0/2399 [00:00<?, ?it/s]

Processing records: 100%|██████████| 2399/2399 [11:12<00:00,  3.57it/s]


In [8]:
# 4 - Sanity check, load one file and print shapes

sample_file = next(OUT_DIR.glob("*.npz"))
data = np.load(sample_file)
print("File:", sample_file.name)
print("Segments shape:", data["segments"].shape)
print("Labels shape:", data["labels"].shape)


File: SI0296.npz
Segments shape: (144, 512)
Labels shape: (144, 512)
