ÉTAPE B : ENTRAÎNEMENT DU MODÈLE ML

B.1 — Chargement et exploration du dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Plot styling shared by the figures of this notebook: seaborn's white grid
# theme and a large default canvas.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)

# Load the Iris bunch and expose its feature matrix / label vector under the
# conventional X / y names.
iris = load_iris()
X, y = iris.data, iris.target

# One frame holding the feature columns, the integer label, and a readable
# species name derived from that label.
species_by_code = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
df['species'] = df['target'].map(species_by_code)

# Values reported in the textual overview below.
banner = "=" * 80
feature_count = X.shape[1]
class_count = len(np.unique(y))
class_labels = iris.target_names.tolist()

print(banner)
print("DATASET IRIS - INFORMATIONS G√âN√âRALES")
print(banner)
print(f"\nüìä Forme du dataset : {df.shape}")
print(f"üìä Nombre de features : {feature_count}")
print(f"üìä Nombre de classes : {class_count}")
print(f"üìä Classes : {class_labels}")
print(f"\nüìä Statistiques du dataset :")
print(df.describe())
print(f"\nüìä Distribution des classes :")
print(df['species'].value_counts())

B.2 — Visualisations exploratoires (EDA)

B.2.1 — Distribution des classes

In [3]:
# Figure 1: class distribution, shown as absolute counts (bar chart) and as
# shares of the dataset (pie chart).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count once and reuse the result, so both panels are guaranteed to show the
# same data in the same order.
species_counts = df['species'].value_counts()

# Colours are looked up per species rather than assigned by position, so a
# species keeps its colour whatever order value_counts() returns.
species_colors = {'Setosa': '#FF6B6B', 'Versicolor': '#4ECDC4', 'Virginica': '#45B7D1'}
panel_colors = [species_colors[name] for name in species_counts.index]

# Left panel: number of observations per species.
species_counts.plot(kind='bar', ax=axes[0], color=panel_colors)
axes[0].set_title('Distribution des esp√®ces (Nombre d\'observations)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Nombre d\'observations', fontsize=12)
axes[0].set_xlabel('Esp√®ce', fontsize=12)
axes[0].tick_params(axis='x', labelrotation=45)

# Right panel: share of each species. No explicit `labels=` is passed: pandas
# labels the wedges from the Series index, which cannot drift out of sync with
# the wedge values the way a hand-written label list can.
species_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=panel_colors)
axes[1].set_title('Proportion des esp√®ces', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

fig.tight_layout()
fig.savefig('1_distribution_classes.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 1_distribution_classes.png")

B.2.2 — Visualisation multidimensionnelle (Pairplot)

In [4]:
# Figure 2: pairwise scatter plots of the features, coloured by species, with
# per-feature histograms on the diagonal.
#
# sns.pairplot builds and owns its own figure, so no plt.figure() call is made
# beforehand: a figure created here would stay empty and plt.show() would
# render it as a blank extra output.
#
# `vars` restricts the grid to the feature columns. Without it the numeric
# `target` label column is plotted as if it were a fifth feature.
g = sns.pairplot(df, vars=iris.feature_names, hue='species', diag_kind='hist',
                 palette={'Setosa': '#FF6B6B', 'Versicolor': '#4ECDC4', 'Virginica': '#45B7D1'},
                 plot_kws={'alpha': 0.6, 's': 80},
                 diag_kws={'bins': 20})

# Keep the notebook-level `fig` name pointing at the figure that is saved.
fig = g.fig
fig.suptitle('Pairplot - Relations entre les features par esp√®ce',
             fontsize=16, fontweight='bold', y=1.001)

# Use the grid's own tight_layout: unlike plt.tight_layout() it keeps the
# margin reserved for the legend drawn outside the axes.
g.tight_layout()
fig.savefig('2_pairplot.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 2_pairplot.png")

B.2.3 — Heatmap de corrélation

In [5]:
# Figure 3: correlation matrix of the feature columns.
fig, ax = plt.subplots(figsize=(10, 8))

# Every column except the last two (`target`, `species`), which are labels.
feature_frame = df.iloc[:, :-2]
correlation_matrix = feature_frame.corr()

# Fixed [-1, 1] colour range centred on 0 so colours are comparable across runs.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
    center=0,
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Matrice de corr√©lation des features', fontsize=14, fontweight='bold', pad=20)

fig.tight_layout()
fig.savefig('3_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 3_correlation_heatmap.png")

B.2.4 — Boîtes à moustaches (Box plots)

In [6]:
# Figure 4: one box plot per feature, split by species, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the grid so panels can be addressed by a single index

species_palette = {'Setosa': '#FF6B6B', 'Versicolor': '#4ECDC4', 'Virginica': '#45B7D1'}

# Every column except the last two (`target`, `species`).
features = df.columns[:-2]
for idx, feature in enumerate(features):
    panel = axes[idx]
    sns.boxplot(data=df, x='species', y=feature, ax=panel, palette=species_palette)
    panel.set_title(f'Distribution de {feature}', fontsize=12, fontweight='bold')
    panel.set_ylabel(feature, fontsize=11)
    panel.set_xlabel('Esp√®ce', fontsize=11)

fig.suptitle('Box plots - Distribution des features par esp√®ce', fontsize=14, fontweight='bold', y=1.00)
fig.tight_layout()
fig.savefig('4_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 4_boxplots.png")

B.3 — Préparation des données

In [7]:
def _print_class_counts(labels):
    """Print, for each Iris class, how many entries of ``labels`` belong to it."""
    for class_id, class_name in enumerate(iris.target_names):
        count = (labels == class_id).sum()
        print(f"   - {class_name} : {count} observations")


print("\n" + "="*80)
print("PR√âPARATION DES DONN√âES")
print("="*80)

# Hold out 20% of the samples for testing; `stratify=y` keeps the class
# proportions identical in both splits and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

train_size, test_size = X_train.shape[0], X_test.shape[0]
print(f"\n‚úì Split effectu√© :")
print(f"   - Taille train : {train_size} observations (80%)")
print(f"   - Taille test : {test_size} observations (20%)")

# Sanity check of the stratification: per-class counts in each split.
print(f"\n‚úì Distribution des classes en train :")
_print_class_counts(y_train)

print(f"\n‚úì Distribution des classes en test :")
_print_class_counts(y_test)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Global (all features pooled) mean / std before and after scaling.
mean_before, mean_after = X_train.mean(), X_train_scaled.mean()
std_before, std_after = X_train.std(), X_train_scaled.std()

print(f"\n‚úì Normalisation effectu√©e (StandardScaler) :")
print(f"   - Moyenne train (avant) : {mean_before:.4f}")
print(f"   - Moyenne train (apr√®s) : {mean_after:.4f}")
print(f"   - Std train (avant) : {std_before:.4f}")
print(f"   - Std train (apr√®s) : {std_after:.4f}")

Visualisation : Avant/Après normalisation

In [8]:
# Figure 5: pooled histogram of all feature values before (left) and after
# (right) standardisation, each with its mean marked by a dashed line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], X_train, '#FF6B6B', 'Distribution des features AVANT normalisation'),
    (axes[1], X_train_scaled, '#4ECDC4', 'Distribution des features APR√àS normalisation'),
]

for panel_ax, values, fill_color, panel_title in panels:
    mean_value = values.mean()
    panel_ax.hist(values.flatten(), bins=30, color=fill_color, alpha=0.7, edgecolor='black')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel('Valeurs', fontsize=11)
    panel_ax.set_ylabel('Fr√©quence', fontsize=11)
    panel_ax.axvline(mean_value, color='red', linestyle='--', linewidth=2,
                     label=f'Moyenne: {mean_value:.2f}')
    panel_ax.legend()

fig.tight_layout()
fig.savefig('5_normalization_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 5_normalization_comparison.png")

B.4 — Entraînement de plusieurs modèles

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

print("\n" + "="*80)
print("ENTRA√éNEMENT DES MOD√àLES")
print("="*80)

# Registry filled in by each model section below: display name -> fitted
# estimator, its scores, its test-set predictions and its confusion matrix.
models_info = {}

# ----------------------------------------------------------------------------
# Model 1: decision tree
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("MOD√àLE 1 : DECISION TREE CLASSIFIER")
print("-"*80)

dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=5, random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Predict on both splits so train and test accuracy can be compared.
dt_train_pred, dt_test_pred = (
    dt_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

dt_train_accuracy = accuracy_score(y_train, dt_train_pred)
dt_test_accuracy = accuracy_score(y_test, dt_test_pred)
# Test-set precision / recall / F1, each computed with average='weighted'.
dt_precision, dt_recall, dt_f1 = (
    scorer(y_test, dt_test_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"\nüìä Hyperparam√®tres :")
for hyperparameter_line in (
    "   - max_depth : 5",
    "   - min_samples_split : 5",
    "   - random_state : 42",
):
    print(hyperparameter_line)

print(f"\nüìä Performances :")
print(f"   - Accuracy (train) : {dt_train_accuracy:.4f} ({dt_train_accuracy*100:.2f}%)")
print(f"   - Accuracy (test)  : {dt_test_accuracy:.4f} ({dt_test_accuracy*100:.2f}%)")
print(f"   - Precision        : {dt_precision:.4f}")
print(f"   - Recall           : {dt_recall:.4f}")
print(f"   - F1-Score         : {dt_f1:.4f}")

models_info['Decision Tree'] = dict(
    model=dt_model,
    train_acc=dt_train_accuracy,
    test_acc=dt_test_accuracy,
    precision=dt_precision,
    recall=dt_recall,
    f1=dt_f1,
    predictions=dt_test_pred,
    confusion_matrix=confusion_matrix(y_test, dt_test_pred),
)

print(f"\nüìã Rapport de classification :")
dt_report = classification_report(y_test, dt_test_pred, target_names=iris.target_names)
print(dt_report)

# ----------------------------------------------------------------------------
# Model 2: random forest
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("MOD√àLE 2 : RANDOM FOREST CLASSIFIER")
print("-"*80)

rf_settings = dict(n_estimators=100, max_depth=10, random_state=42, min_samples_split=5)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train_scaled, y_train)

# Predictions on the training split (to measure fit) and the held-out split.
rf_train_pred = rf_model.predict(X_train_scaled)
rf_test_pred = rf_model.predict(X_test_scaled)

# Accuracy on both splits; the remaining scores use the test split only and
# are computed with average='weighted'.
weighted = {'average': 'weighted'}
rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)
rf_precision = precision_score(y_test, rf_test_pred, **weighted)
rf_recall = recall_score(y_test, rf_test_pred, **weighted)
rf_f1 = f1_score(y_test, rf_test_pred, **weighted)

print(f"\nüìä Hyperparam√®tres :")
print("\n".join([
    "   - n_estimators : 100",
    "   - max_depth : 10",
    "   - min_samples_split : 5",
    "   - random_state : 42",
]))

print(f"\nüìä Performances :")
print(f"   - Accuracy (train) : {rf_train_accuracy:.4f} ({rf_train_accuracy*100:.2f}%)")
print(f"   - Accuracy (test)  : {rf_test_accuracy:.4f} ({rf_test_accuracy*100:.2f}%)")
print(f"   - Precision        : {rf_precision:.4f}")
print(f"   - Recall           : {rf_recall:.4f}")
print(f"   - F1-Score         : {rf_f1:.4f}")

# Same layout as the other registry entries, plus the forest's per-feature
# importances, which the feature-importance figure reads back.
rf_confusion = confusion_matrix(y_test, rf_test_pred)
models_info['Random Forest'] = {
    'model': rf_model,
    'train_acc': rf_train_accuracy,
    'test_acc': rf_test_accuracy,
    'precision': rf_precision,
    'recall': rf_recall,
    'f1': rf_f1,
    'predictions': rf_test_pred,
    'confusion_matrix': rf_confusion,
    'feature_importance': rf_model.feature_importances_,
}

print(f"\nüìã Rapport de classification :")
print(classification_report(y_test, rf_test_pred, target_names=iris.target_names))

# ============================================================================
# MODEL 3: LOGISTIC REGRESSION
# ============================================================================
print("\n" + "-"*80)
print("MOD√àLE 3 : LOGISTIC REGRESSION")
print("-"*80)

# `multi_class` is deliberately not passed. The argument is deprecated since
# scikit-learn 1.5 and scheduled for removal, and with the lbfgs solver a
# multiclass target is already fitted with the multinomial formulation by
# default, so the fitted model is the one the original call requested.
lr_model = LogisticRegression(max_iter=200, random_state=42, solver='lbfgs')
lr_model.fit(X_train_scaled, y_train)

# Predictions on the training split (to measure fit) and the held-out split.
lr_train_pred = lr_model.predict(X_train_scaled)
lr_test_pred = lr_model.predict(X_test_scaled)

# Accuracy on both splits; precision / recall / F1 on the test split only,
# computed with average='weighted'.
lr_train_accuracy = accuracy_score(y_train, lr_train_pred)
lr_test_accuracy = accuracy_score(y_test, lr_test_pred)
lr_precision = precision_score(y_test, lr_test_pred, average='weighted')
lr_recall = recall_score(y_test, lr_test_pred, average='weighted')
lr_f1 = f1_score(y_test, lr_test_pred, average='weighted')

print(f"\nüìä Hyperparam√®tres :")
print(f"   - max_iter : 200")
print(f"   - solver : lbfgs")
# Still reported: multinomial is the formulation lbfgs uses here by default.
print(f"   - multi_class : multinomial")
print(f"   - random_state : 42")

print(f"\nüìä Performances :")
print(f"   - Accuracy (train) : {lr_train_accuracy:.4f} ({lr_train_accuracy*100:.2f}%)")
print(f"   - Accuracy (test)  : {lr_test_accuracy:.4f} ({lr_test_accuracy*100:.2f}%)")
print(f"   - Precision        : {lr_precision:.4f}")
print(f"   - Recall           : {lr_recall:.4f}")
print(f"   - F1-Score         : {lr_f1:.4f}")

# Register the model under the same keys as the other entries of models_info
# so the comparison and plotting cells can treat all models uniformly.
models_info['Logistic Regression'] = {
    'model': lr_model,
    'train_acc': lr_train_accuracy,
    'test_acc': lr_test_accuracy,
    'precision': lr_precision,
    'recall': lr_recall,
    'f1': lr_f1,
    'predictions': lr_test_pred,
    'confusion_matrix': confusion_matrix(y_test, lr_test_pred)
}

print(f"\nüìã Rapport de classification :")
print(classification_report(y_test, lr_test_pred, target_names=iris.target_names))

B.5 — Comparaison des modèles

In [10]:
print("\n" + "="*80)
print("COMPARAISON DES MOD√àLES")
print("="*80)

# Comparison table: one row per registered model, one column per stored score.
# Maps each table column to the key holding that score in a models_info entry.
score_columns = {
    'Accuracy (Train)': 'train_acc',
    'Accuracy (Test)': 'test_acc',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
}
table = {'Mod√®le': list(models_info)}
for column, key in score_columns.items():
    table[column] = [entry[key] for entry in models_info.values()]
comparison_df = pd.DataFrame(table)

print("\n" + comparison_df.to_string(index=False))

# Best model = highest test accuracy (idxmax picks the first row on a tie).
test_accuracy = comparison_df['Accuracy (Test)']
best_model_name = comparison_df.loc[test_accuracy.idxmax(), 'Mod√®le']
best_accuracy = test_accuracy.max()

rule = '=' * 80
print(f"\n{rule}")
print(f"‚úì MEILLEUR MOD√àLE : {best_model_name}")
print(f"‚úì ACCURACY : {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"{rule}")

B.6 — Visualisations des performances

B.6.1 — Comparaison des précisions

In [11]:
# Figure 6: one bar chart per metric (2x2 grid), one bar per model.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Panel title -> key under which that score is stored in each models_info
# entry. A lookup table replaces the if/elif chain, so adding or swapping a
# panel is a one-line change.
metric_keys = {
    'Accuracy (Train)': 'train_acc',
    'Accuracy (Test)': 'test_acc',
    'Precision': 'precision',
    'F1-Score': 'f1',
}
metrics = list(metric_keys)
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
model_names = list(models_info)

for ax, metric in zip(axes.ravel(), metrics):
    values = [models_info[name][metric_keys[metric]] for name in model_names]

    bars = ax.bar(model_names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=2)

    # Annotate each bar with its score, as a fraction and as a percentage.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.4f}\n({height*100:.2f}%)',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    ax.set_title(metric, fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=11)

    # The axis is zoomed in to make small differences visible. A fixed floor of
    # 0.9 would hide any bar whose score is <= 0.9 and push its label out of
    # the axes, so the floor is lowered whenever a score gets that close to it.
    y_floor = max(0.0, min(0.9, min(values, default=1.0) - 0.05))
    ax.set_ylim([y_floor, 1.05])
    ax.grid(axis='y', alpha=0.3)

    # Slanted, right-aligned model names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.suptitle('Comparaison des performances des mod√®les', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('6_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 6_models_comparison.png")

B.6.2 — Matrice de confusion (Heatmaps)

In [12]:
# Figure 7: one confusion-matrix heatmap per registered model, side by side.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

class_labels = iris.target_names

for idx, (model_name, model_data) in enumerate(models_info.items()):
    panel = axes[idx]
    cm = model_data['confusion_matrix']
    test_accuracy_value = model_data["test_acc"]

    # Integer annotations: each cell is a count of test samples.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel,
        xticklabels=class_labels,
        yticklabels=class_labels,
        cbar_kws={'label': 'Nombre de pr√©dictions'},
    )

    panel.set_title(
        f'Matrice de confusion - {model_name}\n(Accuracy: {test_accuracy_value:.4f})',
        fontsize=12, fontweight='bold',
    )
    panel.set_ylabel('Vraie classe', fontsize=11)
    panel.set_xlabel('Classe pr√©dite', fontsize=11)

fig.suptitle('Matrices de confusion pour tous les mod√®les', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('7_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 7_confusion_matrices.png")

B.6.3 — Feature Importance (Random Forest)

In [13]:
# Figure 8: horizontal bar chart of the Random Forest feature importances.
fig, ax = plt.subplots(figsize=(10, 6))

feature_names = iris.feature_names
feature_importance = models_info['Random Forest']['feature_importance']

# Rank features from most to least important (argsort is ascending, hence the
# reversal), then reorder names and scores with the same permutation.
sorted_indices = np.argsort(feature_importance)[::-1]
sorted_importance = feature_importance[sorted_indices]
sorted_features = [feature_names[position] for position in sorted_indices]

bars = ax.barh(
    sorted_features,
    sorted_importance,
    color='#45B7D1',
    alpha=0.8,
    edgecolor='black',
    linewidth=2,
)

# Write each score just past the end of its bar; the bar's row index is its
# y coordinate.
for row, importance in enumerate(sorted_importance):
    ax.text(importance, row, f' {importance:.4f}', va='center', fontsize=11, fontweight='bold')

ax.set_xlabel('Importance', fontsize=12, fontweight='bold')
ax.set_title('Feature Importance - Random Forest Classifier', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

fig.tight_layout()
fig.savefig('8_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Graphique sauvegard√© : 8_feature_importance.png")

B.7 — Analyse des erreurs

In [15]:
# Error analysis of the best model. The model is looked up through
# `best_model_name` (computed in the comparison cell, B.5) instead of being
# hard-coded, so the heading and the analysed predictions always match the
# model that actually scored best.
print("\n" + "="*80)
print(f"ANALYSE DES ERREURS DU MEILLEUR MOD√àLE ({best_model_name.upper()})")
print("="*80)

best_model = models_info[best_model_name]['model']
best_predictions = models_info[best_model_name]['predictions']

# Positions (within the test split) of the misclassified samples.
incorrect_indices = np.flatnonzero(best_predictions != y_test)
error_count = len(incorrect_indices)
error_fraction = error_count / len(y_test)

print(f"\nüìä Nombre de pr√©dictions incorrectes : {error_count}")
print(f"üìä Taux d'erreur : {error_fraction*100:.2f}%")
print(f"üìä Taux de r√©ussite : {(1 - error_fraction)*100:.2f}%")

if error_count > 0:
    print(f"\nüìã D√©tail des erreurs ({error_count} observations mal classifi√©es) :")
    for idx in incorrect_indices:
        true_class = iris.target_names[y_test[idx]]
        pred_class = iris.target_names[best_predictions[idx]]
        # Unscaled feature values are shown because they are easier to read.
        # The name avoids overwriting the notebook-level `features` variable
        # defined by the box-plot cell.
        sample_features = X_test[idx]
        print(f"   - Index {idx} : Pr√©diction '{pred_class}' (r√©alit√© : '{true_class}')")
        print(f"     Features : {sample_features}")

    # Per-class error rate: misclassified samples of a class divided by the
    # number of test samples of that class.
    print(f"\nüìä Erreurs par classe :")
    misclassified_targets = y_test[incorrect_indices]
    for class_idx, class_name in enumerate(iris.target_names):
        class_errors = int(np.sum(misclassified_targets == class_idx))
        class_total = int(np.sum(y_test == class_idx))
        if class_total > 0:
            error_rate = class_errors / class_total * 100
            print(f"   - {class_name} : {class_errors}/{class_total} erreurs ({error_rate:.1f}%)")
else:
    print(f"\n‚úì Aucune erreur de pr√©diction ! (Mod√®le parfait sur l'ensemble test)")