In [9]:
import os
import sys

# Make the project root importable. The star import stays ahead of the
# third-party imports so the explicit imports below take precedence on any
# name clash, exactly as before.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__init__"), '..')))
from src.utils import * 

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

Collect every patient's files into a dictionary of the form `{"patient_id": {"files": [files]}}`.

In [None]:
# Root folder that holds one sub-folder per patient, named "patient<N>".
basePath = "ECG_Database"

# File extensions collected for each patient, in the order they are stored.
RECORD_EXTENSIONS = (".dat", ".hea", ".xyz")


def _patient_number(path):
    """Return the integer left after removing "patient" from the folder name.

    Returns None when the remainder is not an integer, so unrelated folders
    (for example ".ipynb_checkpoints") can be skipped instead of crashing
    the sort with a ValueError.
    """
    try:
        return int(os.path.basename(path).replace("patient", ""))
    except ValueError:
        return None


# Patient folders ordered by their numeric suffix; non-patient folders are ignored.
patDir = sorted(
    (f.path for f in os.scandir(basePath)
     if f.is_dir() and _patient_number(f.path) is not None),
    key=_patient_number,
)

# {"patientNNN": {"files": [<.dat files>, <.hea files>, <.xyz files>]}}
patDict = {}

for patient in patDir:
    patID = os.path.basename(patient)
    # os.listdir returns entries in arbitrary order; sorting keeps the file
    # order (and everything derived from it downstream) reproducible.
    archivos = sorted(os.listdir(patient))

    allFiles = [
        f
        for extension in RECORD_EXTENSIONS
        for f in archivos
        if f.endswith(extension)
    ]

    patDict[patID] = {"files": allFiles}

Optional: print the previous dictionary to check that everything is OK.

In [None]:
# Sanity check: print the file list stored for each patient, followed by a rule.
separator = "-" * 40

for patient, data in patDict.items():
    print(f"Paciente: {patient}")

    if "files" not in data:
        print("No hay archivos registrados.")
    else:
        print("Archivos:", ", ".join(data["files"]))

    print(separator)

Extract the features of all the files and store them in another dictionary.

In [None]:
# Per-patient feature store: allFeatures[patient_id] merges the entries that
# extract_features yields for every ".dat" file of that patient.
allFeatures = {}

for patient_id, data in patDict.items():
    dat_records = [name for name in data["files"] if name.endswith(".dat")]

    merged = {}
    for record in dat_records:
        # The result is indexed by patient_id, so it is presumably shaped
        # like {patient_id: {...}} -- TODO confirm against src.utils.
        extracted = extract_features(patient_id, record)
        merged.update(extracted[patient_id])

    allFeatures[patient_id] = merged

Build a DataFrame with the files and the "possible disease" of each one.

In [None]:
# Build the label table from the extracted features and preview its first rows.
df_disease = get_disease_df(allFeatures)
df_disease.head(n=5)

Build a DataFrame of all the features instead of a dictionary.

In [None]:
# Convert the feature dictionary with features_dict_to_df and preview its first rows.
df_features = features_dict_to_df(allFeatures)
df_features.head(n=5)

Model code: an XGBoost classifier.

In [None]:
# Build a per-record key ("<patient>/<file>") so features and labels can be joined.
df_disease["ID"] = df_disease["Patient"] + "/" + df_disease["File"]
df_features["ID"] = df_features["Patient"] + "/" + df_features["File"]

# Join both DataFrames by "ID"
df_final = pd.merge(df_features, df_disease[["ID", "Max_Label"]], on="ID")

# Keep only numeric/boolean columns as inputs. The target is dropped first so
# it can never end up among the features, even if it were stored as a number.
X = df_final.drop(columns=["Max_Label"]).select_dtypes(include=["number", "bool"])
y = df_final["Max_Label"]

# Checking for classes and null values
class_counts = y.value_counts()
print("Distribución original de clases:")
print(class_counts)
print("\nValores nulos en y:", y.isnull().sum())

# Split BEFORE oversampling. Oversampling first copies the same record into
# both the training and the test set, which inflates every reported metric.
# A stratified split needs at least 2 records per class, so classes with a
# single record are kept for training only.
# NOTE(review): records of one patient can still fall on both sides of the
# split; consider a group-aware split on "Patient" if patient-level
# generalization is the goal.
is_rare = y.isin(class_counts[class_counts < 2].index)
X_train, X_test, y_train, y_test = train_test_split(
    X[~is_rare], y[~is_rare],
    stratify=y[~is_rare], test_size=0.2, random_state=42,
)
if is_rare.any():
    print("\nClases con un solo registro (solo se usan para entrenar):",
          sorted(y[is_rare].unique()))
    X_train = pd.concat([X_train, X[is_rare]])
    y_train = pd.concat([y_train, y[is_rare]])

# We code the classes into numbers (the encoder is fitted on training labels only)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Rebalance the training set only; the test set keeps the real class distribution.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res_encoded = ros.fit_resample(X_train, y_train_encoded)

# We train the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train_res, y_train_res_encoded)
y_pred_xgb = xgb_model.predict(X_test)

# We decode the predictions so we can evaluate them with the original labels.
y_pred_xgb_decoded = le.inverse_transform(y_pred_xgb)

# Cross-validation with the oversampler inside the pipeline: each fold is
# rebalanced using its own training part only, so validation rows are never
# duplicated into the data the fold model is fitted on.
cv_pipeline = Pipeline([
    ("oversample", RandomOverSampler(random_state=42)),
    ("xgb", XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)),
])

# Every training fold must contain every class, so the number of folds is
# capped by the size of the smallest training class.
n_splits = min(5, int(y_train.value_counts().min()))

print(f"\n📊 Resultados de Validación Cruzada:")
if n_splits >= 2:
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train_encoded, cv=cv, scoring='accuracy')
    print(f"Precisión promedio (CV): {cv_scores.mean():.4f}")
    print(f"Desviación estándar (CV): {cv_scores.std():.4f}")
else:
    cv_scores = None
    print("Validación cruzada omitida: alguna clase tiene menos de 2 registros en entrenamiento.")

# Classification report
print("\n📋 Clasificación con XGBoost:")
print(classification_report(y_test, y_pred_xgb_decoded))

# Confusion matrix (rows: true label, columns: predicted label)
plt.figure(figsize=(6,5))
sns.heatmap(confusion_matrix(y_test, y_pred_xgb_decoded, labels=le.classes_), 
            annot=True, fmt='d', cmap='Oranges',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Matriz de Confusión (XGBoost + Rebalanceado)")
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.show()

Plot the feature importances.

In [None]:
# Bar plot of the fitted model's feature importances against the column names of X.
importances = xgb_model.feature_importances_
features = X.columns

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=features, ax=ax)
ax.set_title("Feature importance - XGBoost")
plt.show()