In [None]:
# ==========================================================
# PTB-XL Condition Labeling Script
# Labels ECG records as Normal, Myocardial Infarction, or CAD
# ==========================================================

import pandas as pd
import ast

# === Load Dataset ===
# Input CSV (presumably the merged PTB-XL feature table, going by its name);
# the labeling step further down reads its 'scp_codes' column.
file_path = "ptbxl_merged_features.csv"   # Change this path if needed
# low_memory=False: parse the file in one pass instead of in internal chunks,
# so each column gets a single inferred dtype (at the cost of more memory
# while reading).
df = pd.read_csv(file_path, low_memory=False)

# === Define labeling function ===
def assign_condition(scp_codes_str):
    """
    Map a PTB-XL ``scp_codes`` entry to one of three condition labels.

    The checks run in priority order, so a record carrying several codes
    receives the first label that matches:

    1. 'Normal': any code starting with 'NORM'
    2. 'Myocardial Infarction': any code containing 'MI' (e.g., IMI, AMI, PMI)
    3. 'CAD': ischemic ST-T changes (any code starting with 'ISC' or 'STTC')

    Args:
        scp_codes_str: The ``scp_codes`` cell. Normally a string holding a
            Python dict literal whose keys are the diagnostic codes; an
            already-parsed dict is accepted as well.

    Returns:
        'Normal', 'Myocardial Infarction', 'CAD', or 'Other'. 'Other' is
        returned when no code matches and also when the input cannot be
        interpreted as a dict of codes (NaN, malformed text, or a literal
        that is not a dict).
    """
    if isinstance(scp_codes_str, dict):
        # Already parsed (e.g. the column was converted upstream); feeding a
        # dict to literal_eval would raise and mislabel the row as 'Other'.
        codes = scp_codes_str
    else:
        try:
            codes = ast.literal_eval(scp_codes_str)   # Convert string to dict
        except (ValueError, SyntaxError, TypeError, RecursionError):
            # Missing (NaN) or malformed cell: treat as unlabeled.
            return 'Other'
        if not isinstance(codes, dict):
            # A valid literal that is not a dict (list, number, ...) has no
            # code keys to inspect.
            return 'Other'

    # Only string keys can be diagnostic codes; skip anything else so the
    # string tests below cannot fail on a stray non-string key.
    code_set = {code for code in codes if isinstance(code, str)}

    if any(code.startswith('NORM') for code in code_set):
        return 'Normal'
    if any('MI' in code for code in code_set):
        return 'Myocardial Infarction'
    if any(code.startswith(('ISC', 'STTC')) for code in code_set):
        return 'CAD'
    return 'Other'

# === Label every record: derive 'condition' from its 'scp_codes' entry ===
df['condition'] = df['scp_codes'].map(assign_condition)

# === Keep only rows that received one of the three target labels ===
# Rows labeled 'Other' are dropped; .copy() detaches the result from df.
df_filtered = df.loc[
    df['condition'].isin(['Normal', 'Myocardial Infarction', 'CAD'])
].copy()

# === Report how many records fall into each class ===
print("Class Distribution:", df_filtered['condition'].value_counts(), sep="\n")

# === Write the labeled subset to disk (row index omitted) ===
output_file = "ptbxl_labeled_conditions.csv"
df_filtered.to_csv(path_or_buf=output_file, index=False)
print(f"✅ Labeled dataset saved as '{output_file}'")


# === Summarize the filtered frame ===
df_filtered.info()
print("Dataset shape:", df_filtered.shape)

Class Distribution:
condition
Normal                   9514
Myocardial Infarction    5229
CAD                      1546
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21238 entries, 0 to 21237
Columns: 163 entries, filename_hr to condition
dtypes: bool(3), float64(141), int64(3), object(16)
memory usage: 26.0+ MB
Dataset shape: (21238, 163)
