## **TRAIN SEGMENTATION**

In [3]:
import os
import scipy.io as sio
import numpy as np
import pandas as pd
import neurokit2 as nk

# ----------------------------
# Configuration
# ----------------------------
# Length, in seconds, of each window requested from segment_ecg_pcg
WINDOW_SEC = 3.0

# Inputs: directory of per-record .mat files and the record-level label table
train_mat_dir = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\3-SPLIT_DATA\train"
labels_csv = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\3-SPLIT_DATA\train_labels.csv"

# Outputs: directory for per-segment .mat files and the segment-level label table
output_dir = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\4-SEGMENTED_DATA\train"
out_label_csv = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\4-SEGMENTED_DATA\train_segment_labels.csv"
os.makedirs(output_dir, exist_ok=True)

# ----------------------------
# Record-level labels: record id -> label
# ----------------------------
labels_df = pd.read_csv(labels_csv)
label_map = {
    record: label
    for record, label in zip(labels_df["record"], labels_df["label"])
}

# ----------------------------
# Segmentation function
# ----------------------------
def segment_ecg_pcg(ecg, pcg, fs, window_sec=3.0):
    """Cut non-overlapping, R-peak-centred windows out of paired ECG/PCG signals.

    R-peaks are located on ``ecg`` with ``nk.ecg_peaks``. For every accepted
    peak the same index range is sliced from both ``ecg`` and ``pcg`` so the
    two segments stay sample-aligned.

    Parameters
    ----------
    ecg, pcg : 1-D sequences supporting ``len`` and slicing
        The paired signals (numpy arrays in this notebook).
    fs : int
        Sampling rate, forwarded to the peak detector and used to convert
        ``window_sec`` into a sample count.
    window_sec : float, default 3.0
        Window length in seconds.

    Returns
    -------
    list of (ecg_seg, pcg_seg) tuples
        Each segment is exactly ``int(window_sec * fs)`` samples long. The
        list is empty when peak detection raises, finds no peaks, the window
        length is not positive, or no window fits inside both signals.
    """
    import warnings  # function-scope import keeps the cell self-contained

    try:
        _, peak_info = nk.ecg_peaks(ecg, sampling_rate=fs)
        rpeaks = peak_info["ECG_R_Peaks"]
    except Exception as exc:
        # Best effort: a record the detector cannot handle is skipped by the
        # caller, but the reason is surfaced instead of being swallowed.
        warnings.warn(f"R-peak detection failed: {exc!r}")
        return []

    # If no R-peaks detected, skip
    if len(rpeaks) == 0:
        return []

    half_window = int((window_sec / 2) * fs)
    window_len = int(window_sec * fs)
    if window_len <= 0:
        return []

    # Only emit windows that fit inside BOTH signals, so a shorter PCG can
    # never yield a truncated segment.
    n_samples = min(len(ecg), len(pcg))

    segments = []
    last_end = -1

    for r in rpeaks:
        start = int(r) - half_window
        # Derive the end from window_len rather than from half_window: when
        # window_len is odd, 2 * half_window is one sample short and every
        # window would otherwise be rejected for having the wrong length.
        end = start + window_len

        if start < 0 or end > n_samples:
            continue

        # Skip peaks whose window would begin at or before the end index of
        # the previously accepted window (keeps segments non-overlapping).
        if start <= last_end:
            continue

        segments.append((ecg[start:end], pcg[start:end]))
        last_end = end

    return segments


# ----------------------------
# Dataset-wide loop
# ----------------------------
segment_labels = []
total_segments = 0
skipped_records = []

# os.listdir returns entries in arbitrary order; sorting makes the segment
# files and the label CSV rows come out in a reproducible order.
for file in sorted(os.listdir(train_mat_dir)):
    if not file.endswith(".mat"):
        continue

    # Strip only the extension; str.replace would also remove ".mat"
    # occurring elsewhere in the file name.
    record_id = os.path.splitext(file)[0]

    # Check the label before touching the file so unlabeled records are
    # not loaded at all.
    if record_id not in label_map:
        print(f"Label missing for {record_id}, skipping")
        continue

    record_label = label_map[record_id]

    mat_path = os.path.join(train_mat_dir, file)
    data = sio.loadmat(mat_path)
    ecg = data["ecg"].squeeze().astype(np.float32)
    pcg = data["pcg"].squeeze().astype(np.float32)
    fs = int(data["fs"][0][0])

    segments = segment_ecg_pcg(ecg, pcg, fs, WINDOW_SEC)

    if len(segments) == 0:
        print(f"{record_id}: no valid R-peak segments, skipped")
        skipped_records.append(record_id)
        continue

    for i, (ecg_seg, pcg_seg) in enumerate(segments):
        seg_name = f"{record_id}_seg{i:03d}"

        # One .mat file per segment, carrying its source record id
        sio.savemat(
            os.path.join(output_dir, seg_name + ".mat"),
            {
                "ecg": ecg_seg,
                "pcg": pcg_seg,
                "fs": fs,
                "record_id": record_id
            }
        )

        # Every segment inherits the label of the record it was cut from
        segment_labels.append({
            "segment_id": seg_name,
            "record_id": record_id,
            "label": record_label
        })

    total_segments += len(segments)
    print(f"{record_id}: {len(segments)} segments")

# ----------------------------
# Save segment-level labels
# ----------------------------
# Explicit columns keep the CSV header present even when no segment was
# produced, so the file can still be read back with pd.read_csv.
seg_label_df = pd.DataFrame(
    segment_labels, columns=["segment_id", "record_id", "label"]
)
seg_label_df.to_csv(out_label_csv, index=False)

print(f"\n✅ Total TRAIN segments: {total_segments}")
print(f"✅ Segment labels saved to: {out_label_csv}")

if skipped_records:
    print("\nSkipped TRAIN records (no valid R-peak segments):")
    print(", ".join(skipped_records))

# -------------------------
# SEGMENTED DATA ANALYSIS
# -------------------------

# Path to segmented label file: reuse the path the labels were written to
# instead of repeating the literal, so the two cannot drift apart.
seg_label_csv = out_label_csv

# Load labels back from disk so the summary reflects the saved file
df = pd.read_csv(seg_label_csv)

# Number of segments per label
label_counts = df["label"].value_counts()

# Share of segments per label, in percent
label_percent = df["label"].value_counts(normalize=True) * 100

print("Label counts:")
print(label_counts)

print("\nLabel percentages:")
print(label_percent)

a0001: 9 segments
a0002: 6 segments
a0004: 10 segments
a0006: 5 segments
a0007: 10 segments
a0008: 10 segments
a0010: 10 segments
a0012: 9 segments
a0013: 3 segments
a0014: 10 segments
a0015: 9 segments
a0016: 9 segments
a0017: 9 segments
a0019: 8 segments
a0020: 11 segments
a0022: 10 segments
a0023: 8 segments
a0024: 6 segments
a0025: 8 segments
a0026: 9 segments
a0027: 9 segments
a0028: 9 segments
a0029: 9 segments
a0030: 11 segments
a0031: 10 segments
a0032: 3 segments
a0033: 9 segments
a0034: 7 segments
a0035: 8 segments
a0036: 10 segments
a0037: 9 segments
a0038: 10 segments
a0039: 8 segments
a0045: 10 segments
a0046: 11 segments
a0048: 11 segments
a0049: 8 segments
a0053: 9 segments
a0054: 10 segments
a0055: 11 segments
a0056: 8 segments
a0057: 11 segments
a0058: 5 segments
a0061: 9 segments
a0064: 9 segments
a0065: 9 segments
a0067: 5 segments
a0068: 11 segments
a0069: 7 segments
a0070: 8 segments
a0072: 10 segments
a0075: 9 segments
a0076: 11 segments
a0077: no valid R-peak seg

## **TEST SEGMENTATION**

In [2]:
import os
import scipy.io as sio
import numpy as np
import pandas as pd
import neurokit2 as nk

# ----------------------------
# Configuration
# ----------------------------
# Length, in seconds, of each window requested from segment_ecg_pcg
WINDOW_SEC = 3.0

# Inputs: directory of per-record .mat files and the record-level label table
test_mat_dir = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\3-SPLIT_DATA\test"
labels_csv = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\3-SPLIT_DATA\test_labels.csv"

# Outputs: directory for per-segment .mat files and the segment-level label table
output_dir = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\4-SEGMENTED_DATA\test"
out_label_csv = r"E:\PROJECTS\CARDIAC-PROJECT-UPDATED\DATASET\4-SEGMENTED_DATA\test_segment_labels.csv"
os.makedirs(output_dir, exist_ok=True)

# ----------------------------
# Record-level labels: record id -> label
# ----------------------------
labels_df = pd.read_csv(labels_csv)
label_map = {
    record: label
    for record, label in zip(labels_df["record"], labels_df["label"])
}

# ----------------------------
# Segmentation function
# ----------------------------
def segment_ecg_pcg(ecg, pcg, fs, window_sec=3.0):
    """Cut non-overlapping, R-peak-centred windows out of paired ECG/PCG signals.

    R-peaks are located on ``ecg`` with ``nk.ecg_peaks``. For every accepted
    peak the same index range is sliced from both ``ecg`` and ``pcg`` so the
    two segments stay sample-aligned.

    Parameters
    ----------
    ecg, pcg : 1-D sequences supporting ``len`` and slicing
        The paired signals (numpy arrays in this notebook).
    fs : int
        Sampling rate, forwarded to the peak detector and used to convert
        ``window_sec`` into a sample count.
    window_sec : float, default 3.0
        Window length in seconds.

    Returns
    -------
    list of (ecg_seg, pcg_seg) tuples
        Each segment is exactly ``int(window_sec * fs)`` samples long. The
        list is empty when peak detection raises, finds no peaks, the window
        length is not positive, or no window fits inside both signals.
    """
    import warnings  # function-scope import keeps the cell self-contained

    try:
        _, peak_info = nk.ecg_peaks(ecg, sampling_rate=fs)
        rpeaks = peak_info["ECG_R_Peaks"]
    except Exception as exc:
        # Best effort: a record the detector cannot handle is skipped by the
        # caller, but the reason is surfaced instead of being swallowed.
        warnings.warn(f"R-peak detection failed: {exc!r}")
        return []

    if len(rpeaks) == 0:
        return []

    half_window = int((window_sec / 2) * fs)
    window_len = int(window_sec * fs)
    if window_len <= 0:
        return []

    # Only emit windows that fit inside BOTH signals, so a shorter PCG can
    # never yield a truncated segment.
    n_samples = min(len(ecg), len(pcg))

    segments = []
    last_end = -1

    for r in rpeaks:
        start = int(r) - half_window
        # Derive the end from window_len rather than from half_window: when
        # window_len is odd, 2 * half_window is one sample short and every
        # window would otherwise be rejected for having the wrong length.
        end = start + window_len

        if start < 0 or end > n_samples:
            continue

        # Skip peaks whose window would begin at or before the end index of
        # the previously accepted window (keeps segments non-overlapping).
        if start <= last_end:
            continue

        segments.append((ecg[start:end], pcg[start:end]))
        last_end = end

    return segments

# ----------------------------
# Dataset-wide TEST loop
# ----------------------------
segment_labels = []
total_segments = 0
skipped_records = []

# os.listdir returns entries in arbitrary order; sorting makes the segment
# files and the label CSV rows come out in a reproducible order.
for file in sorted(os.listdir(test_mat_dir)):
    if not file.endswith(".mat"):
        continue

    # Strip only the extension; str.replace would also remove ".mat"
    # occurring elsewhere in the file name.
    record_id = os.path.splitext(file)[0]

    # Check the label before touching the file so unlabeled records are
    # not loaded at all.
    if record_id not in label_map:
        print(f"Label missing for {record_id}, skipping")
        continue

    record_label = label_map[record_id]

    mat_path = os.path.join(test_mat_dir, file)
    data = sio.loadmat(mat_path)
    ecg = data["ecg"].squeeze().astype(np.float32)
    pcg = data["pcg"].squeeze().astype(np.float32)
    fs = int(data["fs"][0][0])

    segments = segment_ecg_pcg(ecg, pcg, fs, WINDOW_SEC)

    if len(segments) == 0:
        print(f"{record_id}: no valid R-peak segments, skipped")
        skipped_records.append(record_id)
        continue

    for i, (ecg_seg, pcg_seg) in enumerate(segments):
        seg_name = f"{record_id}_seg{i:03d}"

        # One .mat file per segment, carrying its source record id
        sio.savemat(
            os.path.join(output_dir, seg_name + ".mat"),
            {
                "ecg": ecg_seg,
                "pcg": pcg_seg,
                "fs": fs,
                "record_id": record_id
            }
        )

        # Every segment inherits the label of the record it was cut from
        segment_labels.append({
            "segment_id": seg_name,
            "record_id": record_id,
            "label": record_label
        })

    total_segments += len(segments)
    print(f"{record_id}: {len(segments)} segments")

# ----------------------------
# Save segment-level labels
# ----------------------------
# Explicit columns keep the CSV header present even when no segment was
# produced, so the file can still be read back with pd.read_csv.
seg_label_df = pd.DataFrame(
    segment_labels, columns=["segment_id", "record_id", "label"]
)
seg_label_df.to_csv(out_label_csv, index=False)

print(f"\n✅ Total TEST segments: {total_segments}")
print(f"✅ Segment labels saved to: {out_label_csv}")

if skipped_records:
    print("\nSkipped TEST records due to R-peak detection failure:")
    print(", ".join(skipped_records))

# ----------------------------
# SEGMENTED DATA ANALYSIS (TEST)
# ----------------------------
# Read the label table back from the CSV so the summary reflects the file on disk
df = pd.read_csv(out_label_csv)

# Per-label segment counts, and the same distribution expressed in percent
label_counts = df["label"].value_counts()
label_percent = 100 * df["label"].value_counts(normalize=True)

for heading, table in (
    ("\nLabel counts (TEST):", label_counts),
    ("\nLabel percentages (TEST):", label_percent),
):
    print(heading)
    print(table)


a0003: 8 segments
a0005: 10 segments
a0009: 10 segments
a0011: 10 segments
a0018: 5 segments
a0021: 10 segments
a0040: 9 segments
a0042: 10 segments
a0043: 10 segments
a0044: 8 segments
a0047: 4 segments
a0050: 8 segments
a0051: 10 segments
a0052: 10 segments
a0059: 10 segments
a0060: 9 segments
a0062: 4 segments
a0063: 9 segments
a0066: 10 segments
a0071: 9 segments
a0073: 1 segments
a0074: 11 segments
a0080: 8 segments
a0083: 9 segments
a0085: 8 segments
a0089: 6 segments
a0090: 2 segments
a0096: 10 segments
a0101: no valid R-peak segments, skipped
a0102: 6 segments
a0108: 9 segments
a0110: 10 segments
a0121: 11 segments
a0123: 6 segments
a0129: 10 segments
a0132: 7 segments
a0137: no valid R-peak segments, skipped
a0141: 9 segments
a0143: 10 segments
a0145: 9 segments
a0147: 10 segments
a0152: 9 segments
a0164: 11 segments
a0167: 10 segments
a0170: 7 segments
a0178: 10 segments
a0180: 10 segments
a0182: 10 segments
a0184: 10 segments
a0185: 10 segments
a0186: 10 segments
a0190: 10 s

### **Dataset Expansion After Segmentation**

R-peak–centered segmentation was applied to convert record-level ECG–PCG data into fixed-length, non-overlapping 3-second segments, substantially increasing the effective dataset size.

**Training set:**
- Original records: 283 patients  
- Segmented windows: 2,271 segments  
- **Expansion factor:** ~8.0× increase

**Test set:**
- Original records: 122 patients  
- Segmented windows: 959 segments  
- **Expansion factor:** ~7.9× increase

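Overall, segmentation increased the number of available samples by approximately **8×**, enabling more effective training of deep learning models. Because segmentation is applied separately to the already-split train and test records, every segment stays in the same split as its source record, which preserves patient-wise separation and prevents data leakage.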

All segments inherit the label of their corresponding patient record, ensuring label consistency after expansion.
