# Modèle de Prédiction des Maladies Cardiovasculaires
## Prédiction robuste avec validation croisée et analyse ROC-AUC

In [None]:
# Manipulation des données
import pandas as pd
import numpy as np

# Affichage et visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Modèles et métriques
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
)
from sklearn.preprocessing import StandardScaler

# Global plotting defaults: seaborn grid theme, default figure size and font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## 1. Chargement et Exploration des Données

In [None]:
# Locate and load the dataset in a platform-independent way.
import os
from pathlib import Path

dataset_name = 'datasetCleaned.csv'

# Candidate locations, tried in order: the bare relative path, the current
# working directory, then its parent.
possible_paths = [
    Path(dataset_name),
    Path.cwd() / dataset_name,
    Path.cwd().parent / dataset_name,
]
# __file__ is not always defined; only add the script's own directory as a
# candidate when it is.
if '__file__' in globals():
    possible_paths.append(Path(__file__).parent / dataset_name)

data_path = next((path for path in possible_paths if path.exists()), None)
if data_path is None:
    # Fail early and list every location that was tried, instead of letting
    # read_csv report only the bare relative file name.
    searched = ', '.join(str(path.resolve()) for path in possible_paths)
    raise FileNotFoundError(
        f"Dataset introuvable: aucun fichier '{dataset_name}' "
        f"dans les chemins parcourus: {searched}"
    )

print(f'Chemin du dataset: {data_path.resolve()}')
df = pd.read_csv(data_path)

# Report the size of the loaded frame.
print(f'\n‚úì Dataset charg√© avec succ√®s!')
print(f'Dimensions du dataset: {df.shape}')
print(f'Nombre de lignes: {df.shape[0]:,}')
print(f'Nombre de colonnes: {df.shape[1]}')

In [None]:
# Preview the first rows of the dataset.
print('\nAper√ßu des donn√©es:')
print(df.head())

# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
print('\nInformations sur les colonnes:')
df.info()

# Summary statistics of the numeric columns.
print('\nStatistiques descriptives:')
print(df.describe())

In [None]:
# Count missing values per column; list only the columns that have any.
print('Valeurs manquantes:')
missing = df.isnull().sum()
if missing.any():
    print(missing[missing > 0])
else:
    print('‚úì Aucune valeur manquante d√©tect√©e')

# Absolute and relative frequency of each value of the target column.
cardio_column = df['cardio']
print('\nDistribution de la variable cible (cardio):')
print(cardio_column.value_counts())
print('\nProportions:')
print(cardio_column.value_counts(normalize=True))

## 2. Visualisation et Analyse Exploratoire

In [None]:
# Plot the class balance of the target as a bar chart and a pie chart.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders its result by frequency, so its first entry is the
# majority class, not necessarily class 0. Sorting by label fixes the order
# (0 before 1) so every bar and wedge matches the hard-coded labels and
# colors below.
class_counts = df['cardio'].value_counts().sort_index()

# Chart 1: number of patients per class.
class_counts.plot(kind='bar', ax=axes[0], color=['#2ecc71', '#e74c3c'])
axes[0].set_title('Distribution de la Maladie Cardiovasculaire', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Maladie Cardiovasculaire')
axes[0].set_ylabel('Nombre de patients')
axes[0].set_xticklabels(['Non (0)', 'Oui (1)'], rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Chart 2: percentage share of each class, in the same label order.
cardio_pct = df['cardio'].value_counts(normalize=True).sort_index() * 100
axes[1].pie(cardio_pct, labels=['Non atteint', 'Atteint'], autopct='%1.1f%%',
            colors=['#2ecc71', '#e74c3c'], startangle=90)
axes[1].set_title('Proportion de Maladies Cardiovasculaires', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Text recap of the same counts and percentages.
n_patients = len(df)
n_without = (df['cardio'] == 0).sum()
n_with = (df['cardio'] == 1).sum()

print('\nBilan:')
print(f'Patients sans maladie: {n_without:,} ({n_without/n_patients*100:.1f}%)')
print(f'Patients avec maladie: {n_with:,} ({n_with/n_patients*100:.1f}%)')

In [None]:
# Heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'label': 'Corr√©lation'},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Matrice de Corr√©lation des Variables', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation of every column with the target, strongest positive first.
target_corr = correlation_matrix['cardio'].sort_values(ascending=False)
print('\nCorr√©lations avec la variable cible (cardio):')
print(target_corr)

## 3. Préparation des Données

In [None]:
# Split the frame into predictors (X) and target (y). 'cardio' is the target;
# 'Unnamed: 0' is dropped too when present (errors='ignore' tolerates its
# absence).
y = df['cardio']
X = df.drop(columns=['cardio', 'Unnamed: 0'], errors='ignore')

n_examples, n_features = X.shape
print(f'Features: {n_features}')
print(f"Nombre d'exemples: {n_examples:,}")
print('\nColonnes utilis√©es:')
print(X.columns.tolist())

In [None]:
# Hold out 20% of the rows as a test set; stratify=y keeps the class
# proportions of y in both splits, and random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Ensemble d'entra√Ænement: {len(X_train):,} exemples")
print(f'Ensemble de test: {len(X_test):,} exemples')

# Report the class balance of each split.
for heading, subset in (
    ("\nDistribution dans l'ensemble d'entra√Ænement:", y_train),
    ("\nDistribution dans l'ensemble de test:", y_test),
):
    subset_size = len(subset)
    n_negative = (subset == 0).sum()
    n_positive = (subset == 1).sum()
    print(heading)
    print(f'  - N√©gatifs: {n_negative:,} ({n_negative/subset_size*100:.1f}%)')
    print(f'  - Positifs: {n_positive:,} ({n_positive/subset_size*100:.1f}%)')

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: average per-feature mean and standard deviation after scaling.
avg_feature_mean = X_train_scaled.mean(axis=0).mean()
avg_feature_std = X_train_scaled.std(axis=0).mean()

print('‚úì Donn√©es normalis√©es avec succ√®s')
print(f"\nMoyenne des features (ensemble d'entra√Ænement): {avg_feature_mean:.6f}")
print(f"√âcart-type des features (ensemble d'entra√Ænement): {avg_feature_std:.6f}")

## 4. Entraînement du Modèle

In [None]:
# Build and fit the Random Forest classifier.
print('Entra√Ænement du mod√®le Random Forest...')

rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_params)

# The forest is fitted on the unscaled training features.
model.fit(X_train, y_train)
print('‚úì Mod√®le entra√Æn√© avec succ√®s!')

## 5. Validation Croisée

In [None]:
# Stratified 5-fold cross-validation of the model on the training split.
print('Ex√©cution de la validation crois√©e stratifi√©e (5 folds)...\n')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to compute; each key is also the scorer name passed to scikit-learn.
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')}

# Train scores are requested as well so they can be compared with the
# held-out fold scores.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=skf,
    scoring=scoring,
    return_train_score=True,
)

# Print mean, spread and per-fold detail for every metric.
separator = '=' * 60
print('R√©sultats de la Validation Crois√©e (5 folds):')
print(separator)

for metric in scoring:
    train_scores = cv_results[f'train_{metric}']
    test_scores = cv_results[f'test_{metric}']
    per_fold = [format(score, '.4f') for score in test_scores]

    print(f'\n{metric.upper()}:')
    print(f'  Train: {train_scores.mean():.4f} (+/- {train_scores.std():.4f})')
    print(f'  Test:  {test_scores.mean():.4f} (+/- {test_scores.std():.4f})')
    print(f'  D√©tails: {per_fold}')

print('\n' + separator)

In [None]:
# One subplot per metric showing the train and test score of every fold.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
colors = ['#3498db', '#e74c3c']
train_color, test_color = colors

for ax, metric in zip(axes, metrics):
    train_scores = cv_results[f'train_{metric}']
    test_scores = cv_results[f'test_{metric}']
    folds = np.arange(1, len(test_scores) + 1)
    metric_label = metric.capitalize()

    ax.plot(folds, train_scores, 'o-', label='Train', color=train_color, linewidth=2, markersize=8)
    ax.plot(folds, test_scores, 's-', label='Test', color=test_color, linewidth=2, markersize=8)
    ax.set_xlabel('Fold')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} par Fold')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.set_ylim([0, 1.05])

# The 2x3 grid has six slots for five metrics; remove the unused last one.
fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()

## 6. Évaluation du Modèle sur l'Ensemble de Test

In [None]:
# Predict on the held-out test set: hard labels plus the probability of the
# positive class (second column of predict_proba).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Label-based metrics use y_pred; ROC-AUC uses the probabilities.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

banner = '=' * 50
print(
    "M√©triques de Performance sur l'Ensemble de Test",
    banner,
    f'Accuracy (Exactitude):  {accuracy:.4f}',
    f'Precision (Pr√©cision):  {precision:.4f}',
    f'Recall (Sensibilit√©):   {recall:.4f}',
    f'F1-Score:               {f1:.4f}',
    f'ROC-AUC:                {roc_auc:.4f}',
    banner,
    sep='\n',
)

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns:
# predicted class).
cm = confusion_matrix(y_test, y_pred)

print('\nMatrice de Confusion:')
print(cm)

# Name the four cells of the 2x2 matrix, read row by row.
tn, fp, fn, tp = cm.ravel()
print('\nInterpr√©tation:')
print(f'  Vrais N√©gatifs (TN):      {tn:,}')
print(f'  Faux Positifs (FP):       {fp:,}')
print(f'  Faux N√©gatifs (FN):       {fn:,}')
print(f'  Vrais Positifs (TP):      {tp:,}')

In [None]:
# Per-class precision, recall and F1 on the test set, printed with 4 decimals.
print('\nRapport de Classification D√©taill√©:')
print('=' * 60)
class_labels = ['Non-Atteint', 'Atteint']
report_text = classification_report(y_test, y_pred, target_names=class_labels, digits=4)
print(report_text)

## 7. Analyse ROC-AUC

In [None]:
# ROC curve of the test-set probabilities and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc_score_value = auc(fpr, tpr)

# Two panels side by side: the ROC curve and the confusion-matrix heatmap.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, cm_ax = axes

# Panel 1: ROC curve, with the diagonal as the chance-level reference.
roc_ax.plot(fpr, tpr, color='#3498db', lw=2.5, label=f'ROC Curve (AUC = {roc_auc_score_value:.4f})')
roc_ax.plot([0, 1], [0, 1], color='#e74c3c', lw=2, linestyle='--', label='Al√©atoire (AUC = 0.5000)')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('Taux de Faux Positifs')
roc_ax.set_ylabel('Taux de Vrais Positifs')
roc_ax.set_title('Courbe ROC - Analyse AUC')
roc_ax.legend(loc='lower right', fontsize=11)
roc_ax.grid(alpha=0.3)

# Panel 2: the confusion matrix as an annotated heatmap.
class_labels = ['Non-Atteint', 'Atteint']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Nombre'},
)
cm_ax.set_title('Matrice de Confusion')
cm_ax.set_ylabel('Vrai Label')
cm_ax.set_xlabel('Pr√©diction')

plt.tight_layout()
plt.show()

## 8. Importance des Features

In [None]:
# Pair each feature name with the importance reported by the fitted forest,
# most important first.
importance_frame = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_frame.sort_values('importance', ascending=False)

divider = '=' * 50
print('Importance des Features:')
print(divider)
for feature_name, score in zip(feature_importance['feature'], feature_importance['importance']):
    print(f'{feature_name:15s} : {score:.4f}')
print(divider)

In [None]:
# Horizontal bar chart of the feature importances.
fig, ax = plt.subplots(figsize=(12, 6))

colors_gradient = plt.cm.viridis(np.linspace(0, 1, len(feature_importance)))
bars = ax.barh(feature_importance['feature'], feature_importance['importance'], color=colors_gradient)

# barh() places the first row at the bottom of the axis. feature_importance
# is sorted in descending order, so flip the y axis to show the most
# important feature at the top of the chart.
ax.invert_yaxis()

ax.set_xlabel('Importance', fontsize=12, fontweight='bold')
ax.set_title('Importance des Features dans le Mod√®le', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

# Write each value just right of its bar, vertically centred on the bar
# itself rather than on an assumed integer position.
for bar, val in zip(bars, feature_importance['importance']):
    bar_center = bar.get_y() + bar.get_height() / 2
    ax.text(val + 0.002, bar_center, f'{val:.4f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

## 9. Résumé et Conclusions

In [None]:
# Final recap of the data, the model configuration and its scores.
rule = '=' * 70
n_total = len(df)

# Read the configuration from the fitted objects instead of repeating
# literals, so the recap cannot drift from what was actually trained.
n_folds = len(cv_results['test_accuracy'])  # one test score per CV fold
class_balance = 'Activ√©' if model.class_weight else 'Aucun'

print(rule)
print('R√âSUM√â DU MOD√àLE DE PR√âDICTION DES MALADIES CARDIOVASCULAIRES')
print(rule)

print('\nüìä DONN√âES:')
print(f'  ‚Ä¢ Taille totale: {n_total:,} patients')
print(f'  ‚Ä¢ Nombre de features: {X.shape[1]}')
print(f"  ‚Ä¢ Ensemble d'entra√Ænement: {len(X_train):,} ({len(X_train)/n_total*100:.1f}%)")
print(f'  ‚Ä¢ Ensemble de test: {len(X_test):,} ({len(X_test)/n_total*100:.1f}%)')

print('\nü§ñ MOD√àLE:')
print('  ‚Ä¢ Type: Random Forest Classifier')
print(f"  ‚Ä¢ Nombre d'arbres: {model.n_estimators}")
print(f'  ‚Ä¢ Profondeur maximale: {model.max_depth}')
print(f'  ‚Ä¢ √âquilibre des classes: {class_balance}')

print('\nüìà PERFORMANCE (Ensemble de Test):')
print(f'  ‚Ä¢ Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'  ‚Ä¢ Precision: {precision:.4f}')
print(f'  ‚Ä¢ Recall:    {recall:.4f}')
print(f'  ‚Ä¢ F1-Score:  {f1:.4f}')
print(f'  ‚Ä¢ ROC-AUC:   {roc_auc:.4f}')

# Mean and standard deviation of the held-out fold scores. Labels are
# left-padded to 11 characters so the numbers line up.
print(f'\n‚úì VALIDATION CROIS√âE ({n_folds}-Fold):')
for label, key in (
    ('Accuracy:', 'test_accuracy'),
    ('Precision:', 'test_precision'),
    ('Recall:', 'test_recall'),
    ('F1-Score:', 'test_f1'),
    ('ROC-AUC:', 'test_roc_auc'),
):
    fold_scores = cv_results[key]
    print(f'  ‚Ä¢ {label:<11}{fold_scores.mean():.4f} ¬± {fold_scores.std():.4f}')

# Five highest-ranked rows of the (already sorted) importance table.
print('\nüéØ TOP 5 FEATURES IMPORTANTES:')
top_features = feature_importance.head(5)
for rank, (feature_name, score) in enumerate(
    zip(top_features['feature'], top_features['importance']), 1
):
    print(f'  {rank}. {feature_name:15s} - {score:.4f}')

print('\n' + rule)