In [24]:
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database (1).csv


In [25]:
from google.colab import files
uploaded = files.upload()

Saving scp_statements.csv to scp_statements (1).csv


In [26]:
#A1
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from ast import literal_eval
import matplotlib.pyplot as plt

features_df = pd.read_csv('/content/ptbxl_database.csv')
diagnosis_df = pd.read_csv('/content/scp_statements.csv')

# ---- Step 2: Merge both dataframes ---- #
# This step seems incorrect as scp_codes is already in features_df.
# We should work directly with features_df for scp_codes.
# df = pd.concat([features_df, diagnosis_df[['scp_codes']]], axis=1)
df = features_df.copy()


# ---- Step 3: Convert string scp_codes to dictionary ---- #
# Using ast.literal_eval for safety, but handle potential errors
def safe_literal_eval(s):
    try:
        return literal_eval(s) if pd.notnull(s) else {}
    except (ValueError, SyntaxError):
        return {} # Return empty dict for malformed strings

df['scp_codes'] = df['scp_codes'].apply(safe_literal_eval)


# ---- Step 4: Label Samples (MI or NORM) ---- #
# Define MI related codes
mi_keywords = ['MI', 'IMI', 'ASMI', 'INJAS', 'INJIS', 'ALMI', 'ILMI', 'IPLMI', 'IPMI', 'LMI', 'PMI'] # Added more MI related codes
def is_mi(d):
    return any(code in d for code in mi_keywords)

# Define NORM related codes (assuming 'NORM' specifically)
norm_keywords = ['NORM']
def is_norm(d):
    return any(code in d for code in norm_keywords)


df['MI_flag'] = df['scp_codes'].apply(is_mi)
df['NORM_flag'] = df['scp_codes'].apply(is_norm)


# Filter based on flags
# We want samples that are flagged as MI OR are flagged as NORM, but not both
df_filtered = df[df['MI_flag'] | df['NORM_flag']].copy()

# Remove samples that are flagged as both MI and NORM to get distinct classes
df_filtered = df_filtered[~ (df_filtered['MI_flag'] & df_filtered['NORM_flag'])].copy()


df_mi = df_filtered[df_filtered['MI_flag']].copy()
df_norm = df_filtered[df_filtered['NORM_flag']].copy()


print(" NORM samples:", df_norm.shape[0])
print(" Myocardical samples:", df_mi.shape[0])

# ---- Step 5: Select numeric features ---- #
features_cols = ['age', 'height', 'weight']  # Update if more features needed

# Impute missing values with the mean *before* splitting
for col in features_cols:
    df_filtered[col] = df_filtered[col].fillna(df_filtered[col].mean())

# Now re-split into MI and NORM dataframes using the imputed data
df_mi_clean = df_filtered[df_filtered['MI_flag']].copy()
df_norm_clean = df_filtered[df_filtered['NORM_flag']].copy()


# ---- Step 6: Normalize Features ---- #
scaler = StandardScaler()

# Fit scaler on the combined data to ensure consistent scaling
scaled_features = scaler.fit_transform(df_filtered[features_cols])


# Separate the scaled features back into MI and NORM
features_norm = scaled_features[df_filtered['NORM_flag']].copy()
features_mi = scaled_features[df_filtered['MI_flag']].copy()


# ---- Step 7: Compute Centroid & Spread ---- #
centroid_norm = features_norm.mean(axis=0) if len(features_norm) > 0 else np.array([np.nan] * len(features_cols))
centroid_mi = features_mi.mean(axis=0) if len(features_mi) > 0 else np.array([np.nan] * len(features_cols))

spread_norm = features_norm.std(axis=0) if len(features_norm) > 0 else np.array([np.nan] * len(features_cols))
spread_mi = features_mi.std(axis=0) if len(features_mi) > 0 else np.array([np.nan] * len(features_cols))


distance = np.linalg.norm(centroid_norm - centroid_mi) if not np.isnan(centroid_norm).any() and not np.isnan(centroid_mi).any() else np.nan

print("Centroid (NORM):", centroid_norm)
print("Centroid (MI):", centroid_mi)
print("\nSpread (NORM):", spread_norm)
print("Spread (MI):", spread_mi)
print("\nEuclidean Distance between Centroids:", distance)

# ---- Optional: Visual Check for MI Candidates ---- #
if df_mi.empty:
    print("\n No MI samples found. Trying to display candidates from original df...")
    possible_mi = df[df['scp_codes'].apply(lambda d: any(k in d for k in mi_keywords))]
    print(possible_mi[features_cols + ['scp_codes']].head(10))

 NORM samples: 9513
 Myocardical samples: 5195
Centroid (NORM): [-0.22188902  0.00795002  0.00338526]
Centroid (MI): [ 0.40631959 -0.01455795 -0.00619903]

Spread (NORM): [0.75428812 1.0643319  1.07242032]
Spread (MI): [1.23857874 0.86976247 0.85153199]

Euclidean Distance between Centroids: 0.6286847596428019
