In [None]:
import wfdb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
import warnings
warnings.filterwarnings('ignore')

# Télécharger et traiter les données MIT-BIH
def download_mitbih_data():
    records = ['100', '101', '102', '103', '104']
    all_signals = []
    all_labels = []
    
    for record in records:
        try:
            print(f"Téléchargement de l'enregistrement {record}...")
            
            # Télécharger les données
            signal, fields = wfdb.rdsamp(record, pn_dir='mitdb')
            annotation = wfdb.rdann(record, 'atr', pn_dir='mitdb')
            
            print(f"Signal shape: {signal.shape}")
            print(f"Nombre d'annotations: {len(annotation.sample)}")
            
            # Utiliser le premier canal
            ecg_signal = signal[:, 0]
            
            # Créer des segments autour de chaque battement détecté
            for i, (sample_idx, symbol) in enumerate(zip(annotation.sample, annotation.symbol)):
                # Prendre une fenêtre de 200 points autour de chaque battement
                start_idx = max(0, sample_idx - 100)
                end_idx = min(len(ecg_signal), sample_idx + 100)
                
                if end_idx - start_idx >= 200:
                    segment = ecg_signal[start_idx:start_idx + 200]
                    
                    # Mapper les annotations vers 5 classes principales
                    if symbol in ['N', 'L', 'R', 'e', 'j', '.']:
                        label = 'N'  # Normal
                    elif symbol in ['A', 'a', 'J', 'S']:
                        label = 'S'  # Supraventricular
                    elif symbol in ['V', 'E']:
                        label = 'V'  # Ventricular
                    elif symbol in ['F']:
                        label = 'F'  # Fusion
                    else:
                        label = 'Q'  # Unclassifiable
                    
                    all_signals.append(segment)
                    all_labels.append(label)
            
            print(f"Segments extraits de {record}: {len([l for l in all_labels if l in ['N', 'S', 'V', 'F', 'Q']])}")
            
        except Exception as e:
            print(f"Erreur avec {record}: {e}")
            continue
    
    if len(all_signals) == 0:
        print("Aucune donnée téléchargée. Création de données synthétiques pour test...")
        # Créer des données factices pour le test
        np.random.seed(42)
        for i in range(1000):
            # Signal ECG synthétique
            t = np.linspace(0, 1, 200)
            signal = np.sin(2*np.pi*t) + 0.5*np.sin(4*np.pi*t) + np.random.normal(0, 0.1, 200)
            all_signals.append(signal)
            
            # Labels aléatoires
            labels = ['N', 'S', 'V', 'F', 'Q']
            all_labels.append(np.random.choice(labels))
        
        print("1000 échantillons synthétiques créés pour le test")
    
    return np.array(all_signals), np.array(all_labels)

print("Téléchargement des données MIT-BIH...")
X, y = download_mitbih_data()
print(f"Données finales: {len(X)} segments")
print(f"Forme des données: {X.shape}")

# Vérification que les données ne sont pas vides
if len(X) == 0:
    raise ValueError("Aucune donnée n'a été téléchargée!")

Téléchargement des données MIT-BIH...
Téléchargement de l'enregistrement 100...
Signal shape: (650000, 2)
Nombre d'annotations: 2274
Segments extraits de 100: 2271
Téléchargement de l'enregistrement 101...
Signal shape: (650000, 2)
Nombre d'annotations: 1874
Segments extraits de 101: 4143
Téléchargement de l'enregistrement 102...
Signal shape: (650000, 2)
Nombre d'annotations: 2192
Segments extraits de 102: 6334
Téléchargement de l'enregistrement 103...
Signal shape: (650000, 2)
Nombre d'annotations: 2091
Segments extraits de 103: 8424
Téléchargement de l'enregistrement 104...
Signal shape: (650000, 2)
Nombre d'annotations: 2311
Segments extraits de 104: 10733
Données finales: 10733 segments
Forme des données: (10733, 200)


In [19]:
# Normalisation des signaux
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Encodage des labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Classes disponibles:", label_encoder.classes_)
print("Distribution des classes:")
unique, counts = np.unique(y, return_counts=True)
for u, c in zip(unique, counts):
    print(f"{u}: {c}")

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Application de SMOTE pour équilibrer les classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"Avant SMOTE: {len(X_train)} échantillons")
print(f"Après SMOTE: {len(X_train_balanced)} échantillons")

Classes disponibles: ['N' 'Q' 'S' 'V']
Distribution des classes:
N: 6440
Q: 4248
S: 38
V: 7
Avant SMOTE: 8586 échantillons
Après SMOTE: 20608 échantillons


In [16]:
# Dictionnaire pour stocker les résultats
results = {}

# 1. Decision Tree
print("Entraînement Decision Tree...")
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_balanced, y_train_balanced)
y_pred_dt = dt.predict(X_test)

results['Decision Tree'] = {
    'Accuracy': accuracy_score(y_test, y_pred_dt) * 100,
    'Precision': precision_score(y_test, y_pred_dt, average='weighted') * 100,
    'Recall': recall_score(y_test, y_pred_dt, average='weighted') * 100,
    'F1-Score': f1_score(y_test, y_pred_dt, average='weighted') * 100
}

# 2. Random Forest
print("Entraînement Random Forest...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_balanced, y_train_balanced)
y_pred_rf = rf.predict(X_test)

results['Random Forest'] = {
    'Accuracy': accuracy_score(y_test, y_pred_rf) * 100,
    'Precision': precision_score(y_test, y_pred_rf, average='weighted') * 100,
    'Recall': recall_score(y_test, y_pred_rf, average='weighted') * 100,
    'F1-Score': f1_score(y_test, y_pred_rf, average='weighted') * 100
}

# 3. XGBoost
print("Entraînement XGBoost...")
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_balanced, y_train_balanced)
y_pred_xgb = xgb_model.predict(X_test)

results['XGBoost'] = {
    'Accuracy': accuracy_score(y_test, y_pred_xgb) * 100,
    'Precision': precision_score(y_test, y_pred_xgb, average='weighted') * 100,
    'Recall': recall_score(y_test, y_pred_xgb, average='weighted') * 100,
    'F1-Score': f1_score(y_test, y_pred_xgb, average='weighted') * 100
}

# 4. CNN
print("Entraînement CNN...")
# Reshape pour CNN (samples, timesteps, features)
X_train_cnn = X_train_balanced.reshape(X_train_balanced.shape[0], X_train_balanced.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(f"Shape des données CNN: {X_train_cnn.shape}")

# Convertir les labels en categorical
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train_balanced)
y_test_categorical = to_categorical(y_test)

# Obtenir la taille d'entrée dynamiquement
input_shape = X_train_cnn.shape[1]
print(f"Taille d'entrée pour CNN: {input_shape}")

# Modèle CNN adapté à la taille des données
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(input_shape, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entraînement (réduit pour le test)
print("Début de l'entraînement CNN...")
history = model.fit(X_train_cnn, y_train_categorical, 
                   epochs=10, batch_size=32, 
                   validation_split=0.2, verbose=1)

# Prédictions CNN
print("Prédictions CNN...")
y_pred_cnn_prob = model.predict(X_test_cnn, verbose=0)
y_pred_cnn = np.argmax(y_pred_cnn_prob, axis=1)

results['CNN'] = {
    'Accuracy': accuracy_score(y_test, y_pred_cnn) * 100,
    'Precision': precision_score(y_test, y_pred_cnn, average='weighted') * 100,
    'Recall': recall_score(y_test, y_pred_cnn, average='weighted') * 100,
    'F1-Score': f1_score(y_test, y_pred_cnn, average='weighted') * 100
}

Entraînement Decision Tree...
Entraînement Random Forest...
Entraînement XGBoost...
Entraînement CNN...
Shape des données CNN: (20608, 200, 1)
Taille d'entrée pour CNN: 200
Début de l'entraînement CNN...
Epoch 1/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.6299 - loss: 0.7730 - val_accuracy: 1.0000 - val_loss: 0.0286
Epoch 2/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.9045 - loss: 0.2612 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 3/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.9422 - loss: 0.1641 - val_accuracy: 1.0000 - val_loss: 0.0053
Epoch 4/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9537 - loss: 0.1307 - val_accuracy: 1.0000 - val_loss: 7.4543e-04
Epoch 5/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.9629 - loss: 0.1088 - val_accuracy: 1.

In [17]:
# Création du tableau de résultats
import pandas as pd

df_results = pd.DataFrame(results).T
df_results = df_results.round(2)

print("\n=== RÉSULTATS (Tableau 2 de l'article) ===")
print(df_results)

# Comparaison avec les résultats attendus de l'article
expected_results = {
    'CNN': {'Accuracy': 99, 'Precision': 98, 'Recall': 98, 'F1-Score': 98},
    'Decision Tree': {'Accuracy': 78, 'Precision': 73, 'Recall': 76, 'F1-Score': 93},
    'Random Forest': {'Accuracy': 98, 'Precision': 97, 'Recall': 98, 'F1-Score': 98},
    'XGBoost': {'Accuracy': 97, 'Precision': 97, 'Recall': 97, 'F1-Score': 97}
}

print("\n=== RÉSULTATS ATTENDUS (Article) ===")
df_expected = pd.DataFrame(expected_results).T
print(df_expected)

print("\n=== DIFFÉRENCES ===")
for model in results.keys():
    if model in expected_results:
        print(f"\n{model}:")
        for metric in ['Accuracy', 'Precision', 'Recall', 'F1-Score']:
            obtained = results[model][metric]
            expected = expected_results[model][metric]
            diff = obtained - expected
            print(f"  {metric}: {obtained:.1f}% (attendu: {expected}%, diff: {diff:+.1f}%)")


=== RÉSULTATS (Tableau 2 de l'article) ===
               Accuracy  Precision  Recall  F1-Score
Decision Tree     98.46      98.59   98.46     98.52
Random Forest     99.49      99.44   99.49     99.38
XGBoost           99.30      99.22   99.30     99.26
CNN               98.98      99.37   98.98     99.12

=== RÉSULTATS ATTENDUS (Article) ===
               Accuracy  Precision  Recall  F1-Score
CNN                  99         98      98        98
Decision Tree        78         73      76        93
Random Forest        98         97      98        98
XGBoost              97         97      97        97

=== DIFFÉRENCES ===

Decision Tree:
  Accuracy: 98.5% (attendu: 78%, diff: +20.5%)
  Precision: 98.6% (attendu: 73%, diff: +25.6%)
  Recall: 98.5% (attendu: 76%, diff: +22.5%)
  F1-Score: 98.5% (attendu: 93%, diff: +5.5%)

Random Forest:
  Accuracy: 99.5% (attendu: 98%, diff: +1.5%)
  Precision: 99.4% (attendu: 97%, diff: +2.4%)
  Recall: 99.5% (attendu: 98%, diff: +1.5%)
  F1-Score: 