In [2]:
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [5]:
from google.colab import files
uploaded = files.upload()

Saving scp_statements.csv to scp_statements.csv


In [11]:
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database (1).csv


In [5]:
# ------------------------------
# A1: ECG Classification
# ------------------------------

import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# -------------------------------------------------------
# Function to load and merge ECG metadata with diagnosis
# -------------------------------------------------------
def load_and_merge_ecg_data(ecg_csv_path, scp_csv_path):
    """
    Loads PTB-XL metadata and SCP statements.
    Parses scp_codes to extract main diagnosis superclass.
    Returns merged DataFrame with selected features and target.
    """
    # Load datasets
    ecg_df = pd.read_csv(ecg_csv_path)
    scp_df = pd.read_csv(scp_csv_path)

    # Map scp_code to superclass
    scp_df = scp_df[['Unnamed: 0', 'diagnostic_class']]
    scp_df.columns = ['scp_code', 'superclass']

    # Parse scp_codes JSON-like strings in ecg_df
    def extract_superclass(scp_codes_str):
        scp_dict = ast.literal_eval(scp_codes_str)  # convert string to dict
        main_code = max(scp_dict, key=scp_dict.get)  # pick code with highest value
        if main_code in scp_df['scp_code'].values:
            return scp_df.loc[scp_df['scp_code'] == main_code, 'superclass'].values[0]
        else:
            return None

    ecg_df['superclass'] = ecg_df['scp_codes'].apply(extract_superclass)

    # Drop rows with no valid superclass
    ecg_df = ecg_df.dropna(subset=['superclass'])

    return ecg_df

# ------------------------------------------------
# Function to preprocess features and target data
# ------------------------------------------------
def preprocess_ecg_data(df, feature_columns, target_column, test_size=0.3, random_state=42):
    """
    Prepares features and target arrays.
    Handles missing values, scales features, and splits into train/test sets.
    """
    X = df[feature_columns]
    y = df[target_column]

    # Impute missing values with column mean
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=test_size, random_state=random_state, stratify=y
    )

    return X_train, X_test, y_train, y_test

# ---------------------------------------
# Function to train a classification model
# ---------------------------------------
def train_logistic_regression(X_train, y_train):
    """
    Trains a Logistic Regression model for classification.
    Returns the trained model.
    """
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model

# ------------------------------------
# Function to evaluate the trained model
# ------------------------------------
def evaluate_classification_model(model, X_test, y_test):
    """
    Evaluates the classification model and returns accuracy score.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

# -----------------
# Main program
# -----------------
if __name__ == "__main__":
    # Step 1: Load and merge data
    ecg_metadata = load_and_merge_ecg_data(
        "ptbxl_database.csv",
        "scp_statements.csv"
    )

    # Step 2: Define target and features
    target_column = 'superclass'  # real ECG diagnostic superclass
    feature_columns = ['age', 'height', 'weight']  # numeric patient features

    # Step 3: Preprocess data
    X_train, X_test, y_train, y_test = preprocess_ecg_data(
        ecg_metadata, feature_columns, target_column
    )

    # Step 4: Train model
    lr_model = train_logistic_regression(X_train, y_train)

    # Step 5: Evaluate model
    accuracy = evaluate_classification_model(lr_model, X_test, y_test)

    # Step 6: Output result
    print(f"A1 Result → Logistic Regression Accuracy (target='{target_column}'): {accuracy:.4f}")


A1 Result → Logistic Regression Accuracy (target='superclass'): 0.4468
