In [2]:
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [5]:
from google.colab import files
uploaded = files.upload()

Saving scp_statements.csv to scp_statements.csv


In [11]:
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database (1).csv


In [24]:
#A3
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # Import SimpleImputer

# ---------- Functions ----------

def load_ecg_metadata(ecg_csv_path, scp_csv_path):
    """
    Loads PTB-XL metadata and SCP statements mapping diagnostic codes to superclasses.
    """
    # Load metadata
    ecg_df = pd.read_csv(ecg_csv_path)
    scp_df = pd.read_csv(scp_csv_path)

    # Rename first column to 'scp_code' (it's unnamed in PhysioNet CSV)
    scp_df.rename(columns={scp_df.columns[0]: 'scp_code'}, inplace=True)

    # Create mapping dictionary from scp_code → diagnostic_class
    scp_mapping = scp_df.set_index('scp_code')['diagnostic_class'].to_dict()

    # Map scp_codes JSON to main diagnostic superclass
    def map_to_superclass(scp_codes_str):
        scp_codes = ast.literal_eval(scp_codes_str)
        if len(scp_codes) == 0:
            return None
        main_code = max(scp_codes, key=scp_codes.get)
        return scp_mapping.get(main_code, None)

    ecg_df['diagnosis'] = ecg_df['scp_codes'].apply(map_to_superclass)

    # Drop rows with missing target labels
    ecg_df = ecg_df.dropna(subset=['diagnosis'])

    return ecg_df


def prepare_features_and_labels(ecg_df, feature_columns, target_column):
    """
    Prepares feature matrix X and target vector y from the dataframe.
    Handles missing values and returns numpy arrays.
    """
    X = ecg_df[feature_columns]
    y = ecg_df[target_column].values

    # Handle missing values using imputation
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y


def train_logistic_regression(X_train, y_train):
    """
    Trains logistic regression model with class balancing.
    """
    model = LogisticRegression(max_iter=500, class_weight="balanced")
    model.fit(X_train, y_train)
    return model


def evaluate_model(model, X, y):
    """
    Evaluates model on given data, returns accuracy and classification report.
    """
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    report = classification_report(y, predictions, zero_division=0)
    return acc, report


# ---------- Main Program ----------

if __name__ == "__main__":
    # Step 1: Load ECG metadata
    ecg_df = load_ecg_metadata("ptbxl_database.csv", "scp_statements.csv")

    # Step 2: Define features and target
    feature_columns = ['age', 'height', 'weight']  # more than one attribute
    target_column = 'diagnosis'

    # Step 3: Prepare features and labels (includes imputation)
    X, y = prepare_features_and_labels(ecg_df, feature_columns, target_column)

    # Step 4: Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 5: Stratified train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.3, random_state=42, stratify=y
    )

    # Step 6: Train model
    lr_model = train_logistic_regression(X_train, y_train)

    # Step 7: Evaluate on train and test
    train_acc, train_report = evaluate_model(lr_model, X_train, y_train)
    test_acc, test_report = evaluate_model(lr_model, X_test, y_test)

    # Step 8: Print results
    print("=== Train Set Evaluation ===")
    print(f"Accuracy: {train_acc:.4f}")
    print(train_report)

    print("=== Test Set Evaluation ===")
    print(f"Accuracy: {test_acc:.4f}")
    print(test_report)

=== Train Set Evaluation ===
Accuracy: 0.3698
              precision    recall  f1-score   support

          CD       0.23      0.30      0.26      2320
         HYP       0.07      0.23      0.11       885
          MI       0.27      0.14      0.19      2876
        NORM       0.62      0.62      0.62      6394
        STTC       0.21      0.09      0.13      2323

    accuracy                           0.37     14798
   macro avg       0.28      0.28      0.26     14798
weighted avg       0.39      0.37      0.37     14798

=== Test Set Evaluation ===
Accuracy: 0.3749
              precision    recall  f1-score   support

          CD       0.23      0.30      0.26       995
         HYP       0.08      0.25      0.12       380
          MI       0.27      0.14      0.19      1233
        NORM       0.62      0.62      0.62      2740
        STTC       0.22      0.11      0.14       995

    accuracy                           0.37      6343
   macro avg       0.28      0.28      0