# Source: UCI Machine Learning Repository
- Thème: Prédiction d'abandon scolaire et réussite académique
- Établissement: Enseignement supérieur au Portugal
- URL: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

In [None]:
import os
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    f1_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


# NOTE(review): this silences every warning category for the rest of the
# session, including any deprecation or convergence warnings from later cells.
warnings.filterwarnings("ignore")

# Load the semicolon-delimited dataset; the path is relative to the working directory.
df = pd.read_csv("../data/dataset.csv", delimiter=";")

## Taille du dataset

In [None]:
# Report the dataset dimensions: row count (with thousands separator) and column count.
print(f"Nombre d'observations: {df.shape[0]:,}")
print(f"Nombre de variables: {df.shape[1]}")

## Liste des variables

In [None]:
# Print every column name prefixed with its 1-based position.
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

## Typage des données

In [None]:
# Print the DataFrame summary produced by pandas (column dtypes and non-null counts).
df.info()

In [None]:
# Dataset description: summary statistics as returned by DataFrame.describe().
df.describe()

## Distribution de la variable cible (Target)

In [None]:
# Absolute and relative (percent) frequency of each Target class.
target_counts = df["Target"].value_counts()
target_pct = df["Target"].value_counts(normalize=True) * 100

# One line per class, in the order returned by value_counts().
for category in target_counts.index:
    count = target_counts[category]
    pct = target_pct[category]
    print(f"- {category}: {count:,} √©tudiants ({pct:.1f}%)")

In [None]:
# Set the default seaborn palette and open a new figure.
sns.set_palette("Set2")
plt.figure(figsize=(8, 6))

# Build the pie chart.
# Relies on target_counts being defined by an earlier cell.
colors = ["#66c2a5", "#fc8d62", "#8da0cb"]  # One colour per slice, assigned by position
explode = (0.05, 0.05, 0.05)  # Slight separation between slices (expects exactly three classes)

plt.pie(
    target_counts.values,
    labels=target_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    colors=colors,
    explode=explode,
    shadow=True,
    textprops={"fontsize": 12, "weight": "bold"},
)

plt.title("Distribution de la variable cible (Target)", fontsize=14, weight="bold", pad=20)

plt.axis("equal")  # Equal aspect ratio so the pie is drawn as a circle
plt.tight_layout()
plt.show()

## Premières lignes

In [None]:
# Show the first rows of the dataset.
df.head()

In [None]:
# Binary analysis: Dropout vs Non-Dropout
print("üéØ Classification Binaire: Dropout vs Non-Dropout\n")

# Create a new binary variable: every Target value other than "Dropout"
# is collapsed into "Non-Dropout". This adds a column to df in place.
df["Dropout_Binary"] = df["Target"].apply(lambda x: "Dropout" if x == "Dropout" else "Non-Dropout")

# Compute the counts and the proportions (in percent)
binary_counts = df["Dropout_Binary"].value_counts()
binary_pct = df["Dropout_Binary"].value_counts(normalize=True) * 100

print("Distribution:")
for category in binary_counts.index:
    count = binary_counts[category]
    pct = binary_pct[category]
    print(f"  ‚Ä¢ {category}: {count:,} √©tudiants ({pct:.1f}%)")

print(f"\nTotal: {binary_counts.sum():,} √©tudiants")

# Breakdown of the original Target classes that make up Non-Dropout
print("\nüìã Composition de 'Non-Dropout':")
non_dropout_detail = df[df["Dropout_Binary"] == "Non-Dropout"]["Target"].value_counts()
for category in non_dropout_detail.index:
    count = non_dropout_detail[category]
    # Share relative to the Non-Dropout group, not to the whole dataset
    pct = (count / binary_counts["Non-Dropout"]) * 100
    print(f"  ‚Ä¢ {category}: {count:,} ({pct:.1f}% des Non-Dropout)")

In [None]:
# Visualisation: Dropout vs Non-Dropout with pie charts.
# Relies on binary_counts / binary_pct computed by the binary-classification cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Chart 1: binary distribution.
# value_counts() orders the slices by frequency, so colours and offsets are
# looked up per label rather than assigned by position; a positional list gives
# the first colour to whichever class happens to be the most frequent.
binary_palette = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}  # Red = Dropout, green = Non-Dropout
colors_binary = [binary_palette[label] for label in binary_counts.index]
# Pull only the Dropout slice out of the pie to highlight it
explode_binary = tuple(0.1 if label == "Dropout" else 0 for label in binary_counts.index)

axes[0].pie(
    binary_counts.values,
    labels=binary_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    colors=colors_binary,
    explode=explode_binary,
    shadow=True,
    textprops={"fontsize": 12, "weight": "bold"},
)

axes[0].set_title("Classification Binaire\nDropout vs Non-Dropout", fontsize=14, weight="bold")

# Chart 2: original multi-class distribution (reminder)
colors_original = ["#66c2a5", "#fc8d62", "#8da0cb"]
target_counts = df["Target"].value_counts()
# One offset per class actually present, so the tuple always matches the data
explode_original = (0.05,) * len(target_counts)

axes[1].pie(
    target_counts.values,
    labels=target_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    colors=colors_original,
    explode=explode_original,
    shadow=True,
    textprops={"fontsize": 12, "weight": "bold"},
)

axes[1].set_title("Classification Multi-classe\n(Original)", fontsize=14, weight="bold")

plt.tight_layout()
plt.show()

# Text summary: class shares and the Non-Dropout-to-Dropout imbalance ratio
print("\nüí° Comparaison:")
print(
    f"   Approche binaire: {binary_pct['Dropout']:.1f}% Dropout vs {binary_pct['Non-Dropout']:.1f}% Non-Dropout"
)
print(f"   Ratio de d√©s√©quilibre: 1:{binary_counts['Non-Dropout'] / binary_counts['Dropout']:.2f}")

## Phase 2 : Qualité des Données

Avant d'analyser les relations entre variables, vérifions la qualité de nos données :
1. **Valeurs manquantes** - Y a-t-il des données absentes ?
2. **Doublons** - Des lignes sont-elles dupliquées ?
3. **Outliers** - Des valeurs aberrantes existent-elles ?

## 2.1 Analyse des valeurs manquantes

In [None]:
# Per-column count and percentage of missing values.
missing = df.isnull().sum()
missing_pct = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({"Manquantes": missing, "Pourcentage (%)": missing_pct.round(2)})
# Keep only the columns with at least one missing value, worst first.
missing_df = missing_df[missing_df["Manquantes"] > 0].sort_values("Manquantes", ascending=False)

if not len(missing_df):
    print("‚úÖ Aucune valeur manquante dans le dataset !")
    print(f"- Toutes les {df.shape[1]} colonnes sont compl√®tes")
    print(f"- {df.shape[0]:,} lignes sans donn√©es manquantes")
else:
    print("‚ö†Ô∏è Valeurs manquantes d√©tect√©es:\n")
    display(missing_df)

# Quick summary: missing cells over all cells of the table
total_cells = df.shape[0] * df.shape[1]
total_missing = df.isnull().sum().sum()
total_ratio = total_missing / total_cells
print(
    f"R√©sum√©: {total_missing:,} valeurs manquantes sur {total_cells:,} cellules ({total_ratio:.2%})"
)

### 2.2 Analyse des valeurs dupliquées

In [None]:
# Full duplicates (all columns identical)
duplicates_full = df.duplicated().sum()
print(
    f"üîç Lignes compl√®tement dupliqu√©es: {duplicates_full} ({(duplicates_full / len(df)) * 100:.2f}%)"
)

if duplicates_full > 0:
    print("Exemples de doublons:")
    display(df[df.duplicated(keep=False)].head(10))
else:
    print("- Aucun doublon complet d√©tect√© ‚úÖ")

# Also check partial duplicates: identical features but a different Target.
# Every column derived from Target is excluded from the feature set, including
# Dropout_Numeric, which a later cell adds; leaving it in would make a rerun of
# this cell compare the outcome as if it were a feature.
target_derived_cols = {"Target", "Dropout_Binary", "Dropout_Numeric"}
cols_without_target = [col for col in df.columns if col not in target_derived_cols]
# Rows repeating an earlier feature profile, whatever their Target ...
same_features = df.duplicated(subset=cols_without_target).sum()
# ... minus the rows that also repeat the Target. What remains is, for each
# feature profile, the number of distinct outcomes beyond the first one.
same_features_and_target = df.duplicated(subset=[*cols_without_target, "Target"]).sum()
duplicates_partial = same_features - same_features_and_target
print(f"\nüîç Doublons partiels (m√™mes features, Target diff√©rent): {duplicates_partial}")

if duplicates_partial:
    print("- Des √©tudiants avec les m√™mes caract√©ristiques ont des r√©sultats diff√©rents")
    print("- Cela peut indiquer de la variabilit√© naturelle ou des erreurs de saisie")

## Détection des outliers - Variables numériques

In [None]:
# Colour palette keyed by class label, passed explicitly to seaborn to avoid warnings
colors = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}
order = ["Dropout", "Non-Dropout"]

# Selection of the main numeric variables
numeric_cols = [
    "Age at enrollment",
    "Admission grade",
    "Previous qualification (grade)",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
]

# Draw the boxplots with seaborn on a 2x3 grid (five variables, one spare axis)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    ax = axes[i]
    sns.boxplot(
        data=df,
        x="Dropout_Binary",
        y=col,
        hue="Dropout_Binary",
        hue_order=order,
        palette=colors,
        legend=False,
        ax=ax,
    )
    ax.set_title(col, fontsize=11, weight="bold")
    ax.set_xlabel("")
    ax.set_ylabel("Valeur")

# Hide the last, unused subplot
axes[-1].axis("off")

plt.suptitle(
    "Boxplots des Variables Num√©riques Cl√©s\n(Comparaison Dropout vs Non-Dropout)",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.show()

# Reading guide printed under the figure
print("\nüí° Interpr√©tation des boxplots:")
print("- Les points au-del√† des moustaches sont des outliers potentiels")
print("- Comparez les m√©dianes (ligne horizontale) entre Dropout et Non-Dropout")
print("- Des diff√©rences marqu√©es sugg√®rent un pouvoir pr√©dictif de la variable")

In [None]:
# 2.3b Quantifying outliers with the IQR method
# WARNING: applied ONLY to CONTINUOUS variables (not to numerically encoded categoricals)
print("üìä Quantification des Outliers (M√©thode IQR)\n" + "=" * 50)


def count_outliers_iqr(df, column):
    """Count the values of ``df[column]`` lying outside the 1.5 * IQR fences.

    Returns a tuple ``(n_outliers, lower_bound, upper_bound)`` where the bounds
    are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    # Strict comparisons: values sitting exactly on a fence are not outliers
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum()), lower_bound, upper_bound


# CONTINUOUS variables only (numerically encoded categoricals are left out)
continuous_cols = [
    "Age at enrollment",
    "Admission grade",
    "Previous qualification (grade)",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

# Categorical variables excluded from the analysis (even though stored as numbers).
# In this cell the list is only used for the count printed below.
categorical_excluded = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

print(f"üìã Analyse de {len(continuous_cols)} variables continues")
print(f"   (Variables cat√©gorielles exclues: {len(categorical_excluded)})\n")

# Collect one summary row per continuous variable that has at least one IQR outlier.
outlier_summary = []
for col in continuous_cols:
    # Skip names that are absent from the loaded table
    if col in df.columns:
        count, lower, upper = count_outliers_iqr(df, col)
        pct = (count / len(df)) * 100
        if count > 0:
            outlier_summary.append(
                {
                    "Variable": col,
                    "Outliers": count,
                    "Pourcentage": f"{pct:.2f}%",
                    "Borne inf": f"{lower:.2f}",
                    "Borne sup": f"{upper:.2f}",
                }
            )

# Declare the columns explicitly: a DataFrame built from an empty list has no
# columns at all, so sorting on "Outliers" would raise KeyError and the
# "no outlier" branch below could never run.
outlier_columns = ["Variable", "Outliers", "Pourcentage", "Borne inf", "Borne sup"]
outlier_df = pd.DataFrame(outlier_summary, columns=outlier_columns)
outlier_df = outlier_df.sort_values("Outliers", ascending=False)

if not outlier_df.empty:
    print("‚ö†Ô∏è Variables continues avec outliers d√©tect√©s:\n")
    display(outlier_df)
    print(
        "\nüí° Note: Ces outliers peuvent √™tre l√©gitimes (ex: √©tudiants plus √¢g√©s, excellentes notes)"
    )
else:
    print("‚úÖ Aucun outlier significatif d√©tect√© dans les variables continues")

## Phase 3 : Analyse Bivariée - Variables Numériques vs Dropout

**Objectif** : Identifier quelles variables numériques différencient les étudiants qui abandonnent de ceux qui réussissent.

**Questions clés** :
- Les Dropout ont-ils des notes d'admission plus faibles ?
- L'âge est-il un facteur de risque ?
- La performance au 1er semestre prédit-elle l'abandon ?

In [None]:
# 3.1a Admission grade and previous qualification grade vs Dropout
print("üìä Notes d'Admission vs Dropout\n" + "=" * 50)

# Colour palette keyed by class label, passed explicitly to seaborn to avoid warnings
colors = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}
order = ["Dropout", "Non-Dropout"]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Admission grade
sns.violinplot(
    data=df,
    x="Dropout_Binary",
    y="Admission grade",
    hue="Dropout_Binary",
    hue_order=order,
    palette=colors,
    legend=False,
    ax=axes[0],
)
axes[0].set_title("Note d'admission par groupe", fontsize=12, weight="bold")
axes[0].set_xlabel("")
axes[0].set_ylabel("Note d'admission (0-200)")

# Previous qualification grade
sns.violinplot(
    data=df,
    x="Dropout_Binary",
    y="Previous qualification (grade)",
    hue="Dropout_Binary",
    hue_order=order,
    palette=colors,
    legend=False,
    ax=axes[1],
)
axes[1].set_title("Note de qualification ant√©rieure par groupe", fontsize=12, weight="bold")
axes[1].set_xlabel("")
axes[1].set_ylabel("Note qualification (0-200)")

plt.tight_layout()
plt.show()

# Comparative statistics: group means and their difference
print("\nüìà Statistiques comparatives:")
for col in ["Admission grade", "Previous qualification (grade)"]:
    dropout_mean = df[df["Dropout_Binary"] == "Dropout"][col].mean()
    non_dropout_mean = df[df["Dropout_Binary"] == "Non-Dropout"][col].mean()
    diff = non_dropout_mean - dropout_mean
    print(f"\n   {col}:")
    print(f"- Dropout:     {dropout_mean:.1f}")
    print(f"- Non-Dropout: {non_dropout_mean:.1f}")
    # NOTE(review): the label printed below comes from a fixed 5-point threshold
    # on the gap between means, not from a statistical significance test.
    print(
        f"- Diff√©rence:  {diff:+.1f} points ({'‚úÖ Significatif' if abs(diff) > 5 else '‚ö™ Faible'})"
    )

In [None]:
# 3.1b Age at enrollment vs Dropout
print("üìä √Çge √† l'inscription vs Dropout\n" + "=" * 50)

# Colour palette keyed by class label, passed explicitly to seaborn to avoid warnings
colors = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}
order = ["Dropout", "Non-Dropout"]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Violin plot
sns.violinplot(
    data=df,
    x="Dropout_Binary",
    y="Age at enrollment",
    hue="Dropout_Binary",
    hue_order=order,
    palette=colors,
    legend=False,
    ax=axes[0],
)
axes[0].set_title("Distribution de l'√¢ge par groupe", fontsize=12, weight="bold")
axes[0].set_xlabel("")
axes[0].set_ylabel("√Çge √† l'inscription")

# Overlaid histograms; density=True normalises each group separately, so the
# two shapes are comparable despite the different group sizes
for label, color in [("Dropout", "#e74c3c"), ("Non-Dropout", "#2ecc71")]:
    subset = df[df["Dropout_Binary"] == label]["Age at enrollment"]
    axes[1].hist(subset, bins=30, alpha=0.6, label=label, color=color, density=True)

axes[1].set_title("Distribution de l'√¢ge (densit√©)", fontsize=12, weight="bold")
axes[1].set_xlabel("√Çge √† l'inscription")
axes[1].set_ylabel("Densit√©")
axes[1].legend()

plt.tight_layout()
plt.show()

# Comparative statistics
print("\nüìà Statistiques sur l'√¢ge:")
dropout_age = df[df["Dropout_Binary"] == "Dropout"]["Age at enrollment"]
non_dropout_age = df[df["Dropout_Binary"] == "Non-Dropout"]["Age at enrollment"]

print("\n   Dropout:")
print(f"- Moyenne: {dropout_age.mean():.1f} ans | M√©diane: {dropout_age.median():.0f} ans")
print(f"- Min: {dropout_age.min():.0f} | Max: {dropout_age.max():.0f}")

print("\n   Non-Dropout:")
print(f"- Moyenne: {non_dropout_age.mean():.1f} ans | M√©diane: {non_dropout_age.median():.0f} ans")
print(f"- Min: {non_dropout_age.min():.0f} | Max: {non_dropout_age.max():.0f}")

# Positive diff means the Dropout group is older on average
diff = dropout_age.mean() - non_dropout_age.mean()
print(
    f"\n   üí° Les Dropout sont en moyenne {abs(diff):.1f} ans {'plus √¢g√©s' if diff > 0 else 'plus jeunes'}"
)

In [None]:
# 3.1c First-semester performance vs Dropout
print("üìä Performance 1er Semestre vs Dropout\n" + "=" * 50)

# Colour palette keyed by class label, passed explicitly to seaborn to avoid warnings
colors = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}
order = ["Dropout", "Non-Dropout"]

sem1_cols = [
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (enrolled)",
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for i, col in enumerate(sem1_cols):
    sns.boxplot(
        data=df,
        x="Dropout_Binary",
        y=col,
        hue="Dropout_Binary",
        hue_order=order,
        palette=colors,
        legend=False,
        ax=axes[i],
    )
    # Shortened title: keep only the text between the parentheses of the column name
    short_name = col.replace("Curricular units 1st sem ", "").replace("(", "").replace(")", "")
    axes[i].set_title(f"1er Sem: {short_name.capitalize()}", fontsize=11, weight="bold")
    axes[i].set_xlabel("")

plt.suptitle(
    "Variables du 1er Semestre - Comparaison Dropout vs Non-Dropout",
    fontsize=13,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.show()

# Key statistics
print("\nüìà Statistiques du 1er semestre:")
key_col = "Curricular units 1st sem (approved)"
dropout_val = df[df["Dropout_Binary"] == "Dropout"][key_col].mean()
non_dropout_val = df[df["Dropout_Binary"] == "Non-Dropout"][key_col].mean()

print("\n   Unit√©s valid√©es au 1er semestre:")
print(f"- Dropout:     {dropout_val:.2f} unit√©s en moyenne")
print(f"- Non-Dropout: {non_dropout_val:.2f} unit√©s en moyenne")
# The denominator is floored at 0.01 so a zero Dropout mean cannot divide by zero
print(
    f"- Ratio:       {non_dropout_val / max(dropout_val, 0.01):.1f}x plus d'unit√©s valid√©es pour Non-Dropout"
)

key_col2 = "Curricular units 1st sem (grade)"
dropout_grade = df[df["Dropout_Binary"] == "Dropout"][key_col2].mean()
non_dropout_grade = df[df["Dropout_Binary"] == "Non-Dropout"][key_col2].mean()
print("\n   Note moyenne au 1er semestre:")
print(f"- Dropout:     {dropout_grade:.2f}/20")
print(f"- Non-Dropout: {non_dropout_grade:.2f}/20")

In [None]:
# 3.1d Second-semester performance vs Dropout
print("üìä Performance 2√®me Semestre vs Dropout\n" + "=" * 50)

# Colour palette keyed by class label, passed explicitly to seaborn to avoid warnings
colors = {"Dropout": "#e74c3c", "Non-Dropout": "#2ecc71"}
order = ["Dropout", "Non-Dropout"]

sem2_cols = [
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (enrolled)",
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for i, col in enumerate(sem2_cols):
    sns.boxplot(
        data=df,
        x="Dropout_Binary",
        y=col,
        hue="Dropout_Binary",
        hue_order=order,
        palette=colors,
        legend=False,
        ax=axes[i],
    )
    # Shortened title: keep only the text between the parentheses of the column name
    short_name = col.replace("Curricular units 2nd sem ", "").replace("(", "").replace(")", "")
    axes[i].set_title(f"2√®me Sem: {short_name.capitalize()}", fontsize=11, weight="bold")
    axes[i].set_xlabel("")

plt.suptitle(
    "Variables du 2√®me Semestre - Comparaison Dropout vs Non-Dropout",
    fontsize=13,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.show()

# First vs second semester comparison, printed as a text table
print("\nüìà Comparaison √©volution entre semestres:")
print("\n   | Groupe       | Sem 1 (approved) | Sem 2 (approved) | √âvolution |")
print("   |--------------|------------------|------------------|-----------|")

for group in ["Dropout", "Non-Dropout"]:
    sem1_mean = df[df["Dropout_Binary"] == group]["Curricular units 1st sem (approved)"].mean()
    sem2_mean = df[df["Dropout_Binary"] == group]["Curricular units 2nd sem (approved)"].mean()
    # Positive evolution means more approved units in semester 2 than in semester 1
    evolution = sem2_mean - sem1_mean
    print(f"   | {group:12} | {sem1_mean:16.2f} | {sem2_mean:16.2f} | {evolution:+9.2f} |")

print("\nüí° Observation: Comment √©volue la performance entre les deux semestres ?")

## Phase 3.2 : Variables Catégorielles vs Dropout

**Objectif** : Identifier quelles catégories sont associées à un risque plus élevé de Dropout.

**Questions clés** :
- Certains programmes ont-ils plus d'abandons ?
- Les boursiers abandonnent-ils moins ?
- Le genre, l'état civil ou les cours du soir influencent-ils le Dropout ?

In [None]:
# 3.2a Binary variables vs Dropout
print("üìä Variables Binaires vs Taux de Dropout\n" + "=" * 50)

# Binary variables to analyse, each with the display label of its 0/1 codes
# (Daytime/evening attendance is left out: its name is problematic)
binary_vars = [
    ("Gender", {0: "Femme", 1: "Homme"}),
    ("Scholarship holder", {0: "Non boursier", 1: "Boursier"}),
    ("Debtor", {0: "Non d√©biteur", 1: "D√©biteur"}),
    ("Tuition fees up to date", {0: "Non √† jour", 1: "√Ä jour"}),
    ("Displaced", {0: "Local", 1: "D√©plac√©"}),
    ("International", {0: "National", 1: "International"}),
]

# Compute the dropout rate for each value of each variable
results = []
for var, labels in binary_vars:
    for val, label in labels.items():
        subset = df[df[var] == val]
        total = len(subset)
        dropout_count = len(subset[subset["Dropout_Binary"] == "Dropout"])
        # Guard against an empty category to avoid dividing by zero
        dropout_rate = (dropout_count / total * 100) if total > 0 else 0
        results.append(
            {
                "Variable": var,
                "Cat√©gorie": label,
                "Total": total,
                "Dropout": dropout_count,
                "Taux Dropout (%)": round(dropout_rate, 1),
            }
        )

results_df = pd.DataFrame(results)

# Display the table
print("\nüìã Taux de Dropout par variable binaire:\n")
display(results_df)

# Identify the risk factors: variables whose two rates differ by more than 5 points
print("\nüî¥ Facteurs augmentant le risque de Dropout:")
for var, labels in binary_vars:
    # rates[0] / rates[1] follow the insertion order of the label dict (code 0, then code 1)
    rates = results_df[results_df["Variable"] == var]["Taux Dropout (%)"].values
    if len(rates) == 2 and abs(rates[0] - rates[1]) > 5:
        higher = labels[0] if rates[0] > rates[1] else labels[1]
        diff = abs(rates[0] - rates[1])
        print(f"- {var}: '{higher}' ‚Üí +{diff:.1f}% de dropout")

In [None]:
# 3.2b Visualisation of the binary variables
print("üìä Visualisation du Taux de Dropout par Variable Binaire\n" + "=" * 50)

# Binary variables (without Daytime/evening attendance)
binary_vars_simple = [
    "Gender",
    "Scholarship holder",
    "Debtor",
    "Tuition fees up to date",
    "Displaced",
    "International",
]

# Overall dropout rate: identical for every subplot, so computed once up front
global_dropout_rate = (df["Dropout_Binary"] == "Dropout").mean() * 100

# pd.crosstab returns its columns in sorted order ("Dropout" before
# "Non-Dropout"). The column order is pinned explicitly so each series gets the
# colour meant for it (green = Non-Dropout, red = Dropout).
status_order = ["Non-Dropout", "Dropout"]
status_colors = ["#2ecc71", "#e74c3c"]

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for i, var in enumerate(binary_vars_simple):
    ax = axes[i]

    # Row-normalised percentages: each value of `var` sums to 100 %
    cross_tab = pd.crosstab(df[var], df["Dropout_Binary"], normalize="index") * 100
    cross_tab = cross_tab.reindex(columns=status_order, fill_value=0)

    # Plot
    cross_tab.plot(kind="bar", ax=ax, color=status_colors, edgecolor="black")
    ax.set_title(f"{var}", fontsize=11, weight="bold")
    ax.set_xlabel("")
    ax.set_ylabel("Pourcentage (%)")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
    ax.set_ylim(0, 100)

    # Reference line (overall dropout rate)
    ax.axhline(
        y=global_dropout_rate,
        color="red",
        linestyle="--",
        alpha=0.5,
        label="Taux global",
    )

    # Build the legend last and from the artists' own labels: the bar labels then
    # always match the plotted columns, and the reference line is listed too
    ax.legend(loc="upper right")

plt.suptitle(
    "Taux de Dropout vs Non-Dropout par Variable Binaire",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.show()

print(f"\nüìà Taux de dropout global: {global_dropout_rate:.1f}%")
print("   (Ligne rouge pointill√©e = r√©f√©rence)")

In [None]:
# 3.2c Study programme (Course) vs Dropout
print("üìä Programme d'√âtudes vs Taux de Dropout\n" + "=" * 50)

# Mapping from programme code to a readable name
course_mapping = {
    33: "Biofuel Production",
    171: "Animation & Multimedia",
    8014: "Social Service (soir)",
    9003: "Agronomy",
    9070: "Communication Design",
    9085: "Veterinary Nursing",
    9119: "Informatics Engineering",
    9130: "Equinculture",
    9147: "Management",
    9238: "Social Service",
    9254: "Tourism",
    9500: "Nursing",
    9556: "Oral Hygiene",
    9670: "Advertising & Marketing",
    9773: "Journalism & Communication",
    9853: "Basic Education",
    9991: "Management (soir)",
}

# Compute the dropout rate per programme: number of dropouts and group size
course_stats = (
    df.groupby("Course")
    .agg({"Dropout_Binary": lambda x: (x == "Dropout").sum(), "Target": "count"})
    .rename(columns={"Dropout_Binary": "Dropouts", "Target": "Total"})
)

course_stats["Taux Dropout (%)"] = (course_stats["Dropouts"] / course_stats["Total"] * 100).round(1)
# NOTE(review): a Course code missing from course_mapping yields a NaN label here
course_stats["Programme"] = course_stats.index.map(course_mapping)
course_stats = course_stats.sort_values("Taux Dropout (%)", ascending=False)

# Visualisation
fig, ax = plt.subplots(figsize=(12, 8))
# Bar colour by rate: red above 40 %, orange above 30 %, green otherwise
colors = [
    "#e74c3c" if x > 40 else "#f39c12" if x > 30 else "#2ecc71"
    for x in course_stats["Taux Dropout (%)"]
]

bars = ax.barh(
    course_stats["Programme"],
    course_stats["Taux Dropout (%)"],
    color=colors,
    edgecolor="black",
)
ax.set_xlabel("Taux de Dropout (%)", fontsize=12)
ax.set_title("Taux de Dropout par Programme d'√âtudes", fontsize=14, weight="bold")
# global_dropout_rate is not defined in this cell; it must already exist from an earlier one
ax.axvline(
    x=global_dropout_rate,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Moyenne ({global_dropout_rate:.1f}%)",
)
ax.legend()

# Write the value at the end of each bar
for bar, val in zip(bars, course_stats["Taux Dropout (%)"]):
    ax.text(
        bar.get_width() + 0.5,
        bar.get_y() + bar.get_height() / 2,
        f"{val}%",
        va="center",
        fontsize=9,
    )

plt.tight_layout()
plt.show()

# Summary
print("\nüî¥ Programmes √† HAUT risque (>40% dropout):")
high_risk = course_stats[course_stats["Taux Dropout (%)"] > 40]
for _, row in high_risk.iterrows():
    print(
        f"- {row['Programme']}: {row['Taux Dropout (%)']}% ({row['Dropouts']}/{row['Total']} √©tudiants)"
    )

# NOTE(review): the low-risk cut-off (25 %) differs from the green colour threshold (30 %) above
print("\nüü¢ Programmes √† FAIBLE risque (<25% dropout):")
low_risk = course_stats[course_stats["Taux Dropout (%)"] < 25]
for _, row in low_risk.iterrows():
    print(
        f"- {row['Programme']}: {row['Taux Dropout (%)']}% ({row['Dropouts']}/{row['Total']} √©tudiants)"
    )

In [None]:
# 3.2d Marital status vs Dropout
print("üìä Statut Matrimonial vs Taux de Dropout\n" + "=" * 50)

# Mapping from marital-status code to a readable label
marital_mapping = {
    1: "C√©libataire",
    2: "Mari√©(e)",
    3: "Veuf/Veuve",
    4: "Divorc√©(e)",
    5: "Union de fait",
    6: "S√©par√©(e)",
}

# Compute the dropout rate per marital status: number of dropouts and group size
marital_stats = (
    df.groupby("Marital status")
    .agg({"Dropout_Binary": lambda x: (x == "Dropout").sum(), "Target": "count"})
    .rename(columns={"Dropout_Binary": "Dropouts", "Target": "Total"})
)

marital_stats["Taux Dropout (%)"] = (
    marital_stats["Dropouts"] / marital_stats["Total"] * 100
).round(1)
marital_stats["Statut"] = marital_stats.index.map(marital_mapping)
marital_stats = marital_stats.sort_values("Taux Dropout (%)", ascending=False)

# Visualisation
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Chart 1: dropout rate, red when above the overall rate and green otherwise.
# global_dropout_rate is not defined in this cell; it must already exist from an earlier one
ax1 = axes[0]
x_positions = range(len(marital_stats))
colors = [
    "#e74c3c" if x > global_dropout_rate else "#2ecc71" for x in marital_stats["Taux Dropout (%)"]
]
bars1 = ax1.bar(x_positions, marital_stats["Taux Dropout (%)"], color=colors, edgecolor="black")
ax1.axhline(y=global_dropout_rate, color="red", linestyle="--", linewidth=2)
ax1.set_ylabel("Taux de Dropout (%)")
ax1.set_title("Taux de Dropout par Statut Matrimonial", fontsize=12, weight="bold")
ax1.set_xticks(x_positions)
ax1.set_xticklabels(marital_stats["Statut"], rotation=45, ha="right")

# Chart 2: group sizes, in the same bar order as chart 1
ax2 = axes[1]
bars2 = ax2.bar(x_positions, marital_stats["Total"], color="steelblue", edgecolor="black")
ax2.set_ylabel("Nombre d'√©tudiants")
ax2.set_title("Effectifs par Statut Matrimonial", fontsize=12, weight="bold")
ax2.set_xticks(x_positions)
ax2.set_xticklabels(marital_stats["Statut"], rotation=45, ha="right")

plt.tight_layout()
plt.show()

# Summary table
print("\nüìã Tableau r√©capitulatif:")
display(marital_stats[["Statut", "Total", "Dropouts", "Taux Dropout (%)"]].reset_index(drop=True))

# NOTE(review): the lookup below must match the label given to code 1 in
# marital_mapping exactly, and .values[0] raises IndexError if no row has it.
print(
    f"\nüí° Note: La majorit√© des √©tudiants sont c√©libataires ({marital_stats[marital_stats['Statut'] == 'C√©libataire']['Total'].values[0]} sur {len(df)})"
)

## Phase 4 : Analyse de Corrélation

**Objectif** : Identifier les variables les plus corrélées au Dropout et détecter les multicolinéarités.

**Questions clés** :
- Quelles variables numériques sont les plus liées au Dropout ?
- Y a-t-il des variables redondantes (fortement corrélées entre elles) ?
- Quelles variables prioriser pour le machine learning ?

In [None]:
# =============================================================================
# 4.1 Full correlation matrix
# =============================================================================
# Numeric version of the target (1 = Dropout, 0 = Non-Dropout); adds a column to df
df["Dropout_Numeric"] = (df["Dropout_Binary"] == "Dropout").astype(int)

# Numeric variables used for the correlation.
# Numerically encoded categorical variables are left out.
numeric_for_corr = [
    "Dropout_Numeric",  # Our target variable
    "Age at enrollment",
    "Admission grade",
    "Previous qualification (grade)",
    # First-semester performance
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    # Second-semester performance
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    # Economic indicators
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

# Correlation matrix (DataFrame.corr with its default method)
corr_matrix = df[numeric_for_corr].corr()

# Heatmap visualisation
fig, ax = plt.subplots(figsize=(16, 14))

# Mask the strict upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# Annotated heatmap.
# "RdBu" runs from red (low) to blue (high), i.e. red = negative correlation and
# blue = positive, which is the reading printed below the figure. The reversed
# map "RdBu_r" used previously showed the opposite colours.
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="RdBu",  # Red = negative correlation, blue = positive
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8, "label": "Coefficient de corr√©lation"},
    ax=ax,
)

ax.set_title(
    "Matrice de Corr√©lation - Variables Num√©riques vs Dropout\n(Triangle inf√©rieur)",
    fontsize=14,
    fontweight="bold",
    pad=20,
)

plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Reading guide for the colour scale
print("\nüìä Interpr√©tation de la heatmap :")
print("‚Ä¢ üî¥ Rouge fonc√© : Corr√©lation n√©gative forte (quand l'une augmente, l'autre diminue)")
print("‚Ä¢ üîµ Bleu fonc√© : Corr√©lation positive forte (les deux augmentent ensemble)")
print("‚Ä¢ ‚ö™ Blanc/Clair : Pas de corr√©lation significative")

In [None]:
# =============================================================================
# 4.2 Top 10 correlations with Dropout
# =============================================================================
# Correlations with the target variable (the target's own entry is dropped)
dropout_corr = corr_matrix["Dropout_Numeric"].drop("Dropout_Numeric")

# Sort by absolute value (strongest first, whether positive or negative)
dropout_corr_sorted = dropout_corr.reindex(dropout_corr.abs().sort_values(ascending=False).index)

# Take the strongest entries once. head(10) returns fewer rows when fewer than
# ten variables exist, so every position below is derived from its real length.
top_corr = dropout_corr_sorted.head(10)
bar_positions = range(len(top_corr))

# Print the Top 10
print("üéØ Top 10 Variables les plus corr√©l√©es au Dropout\n")
print("=" * 60)
for i, (var, corr) in enumerate(top_corr.items(), 1):
    direction = "‚¨ÜÔ∏è positive" if corr > 0 else "‚¨áÔ∏è n√©gative"
    print(f"{i:2}. {var:<45} {corr:+.3f} ({direction})")
print("=" * 60)

# Horizontal bar plot
fig, ax = plt.subplots(figsize=(12, 8))

# Red for positive correlations with Dropout, green for negative ones
colors = ["#e74c3c" if x > 0 else "#2ecc71" for x in top_corr]
bars = ax.barh(bar_positions, top_corr.values, color=colors)

ax.set_yticks(bar_positions)
ax.set_yticklabels(top_corr.index)
ax.invert_yaxis()  # Strongest variables at the top
ax.axvline(x=0, color="black", linewidth=0.5)
ax.set_xlabel("Coefficient de Corr√©lation avec Dropout")
ax.set_title(
    "Top 10 Variables les Plus Corr√©l√©es au Dropout\n(Rouge = ‚Üë Dropout, Vert = ‚Üì Dropout)",
    fontsize=12,
    fontweight="bold",
)

# Value annotations, placed just beyond the end of each bar
for bar, val in zip(bars, top_corr.values):
    ax.text(
        val + 0.02 if val > 0 else val - 0.02,
        bar.get_y() + bar.get_height() / 2,
        f"{val:+.3f}",
        va="center",
        ha="left" if val > 0 else "right",
        fontsize=9,
    )

plt.tight_layout()
plt.show()

# Interpretation: list the entries whose |r| exceeds 0.1, split by sign
print("\nüìä Interpr√©tation :")
print("\nüî¥ Corr√©lations POSITIVES (facteurs de risque - augmentent le dropout) :")
for var, corr in top_corr.items():
    if corr > 0.1:
        print(f"- {var}: {corr:+.3f}")

print("\nüü¢ Corr√©lations N√âGATIVES (facteurs protecteurs - diminuent le dropout) :")
for var, corr in top_corr.items():
    if corr < -0.1:
        print(f"- {var}: {corr:+.3f}")

In [None]:
# =============================================================================
# 4.3 Multicollinearity analysis
# =============================================================================
# Look for pairs of variables that are strongly correlated with each other
# (potentially redundant inputs for an ML model).

# Boolean mask of the strict upper triangle (diagonal excluded).
# NOTE: this mask is not used by the rest of this cell.
mask_upper = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# Every (row, column, r) triple above the diagonal, leaving out any pair that
# involves the target column.
n_vars = len(corr_matrix)
row_names = corr_matrix.index
col_names = corr_matrix.columns
corr_pairs = [
    (row_names[i], col_names[j], corr_matrix.iloc[i, j])
    for i in range(n_vars)
    for j in range(i + 1, n_vars)
    if "Dropout_Numeric" not in (row_names[i], col_names[j])
]

# Strongest absolute correlation first
corr_pairs = sorted(corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)

# Table of the strongly correlated pairs
print("üîç Paires de Variables avec Forte Multicolin√©arit√© (|r| > 0.7)\n")
print("=" * 80)
print(f"{'Variable 1':<40} {'Variable 2':<25} {'Corr√©lation':>10}")
print("-" * 80)

strong_pairs = [pair for pair in corr_pairs if abs(pair[2]) > 0.7]
for var1, var2, corr in strong_pairs:
    print(f"{var1:<40} {var2:<25} {corr:+.3f}")
high_corr_count = len(strong_pairs)

if not strong_pairs:
    print("Aucune paire avec |r| > 0.7")
print("=" * 80)

print(f"\nüìä Nombre de paires avec multicolin√©arit√© forte : {high_corr_count}")

# Recommendations
print("\nüí° Recommandations pour le Machine Learning :")
print("\n1. Variables CL√âS (plus corr√©l√©es au Dropout) :")
for var, corr in dropout_corr_sorted.head(5).items():
    print(f"- {var}: {corr:+.3f}")

print("\n2. Variables REDONDANTES (multicolin√©arit√©) :")
# `seen` holds the second member of every pair already reported; a pair is
# skipped when either of its members is already in that set.
seen = set()
for var1, var2, corr in corr_pairs[:5]:
    if not abs(corr) > 0.5:
        continue
    if var1 in seen or var2 in seen:
        continue
    print(f"- {var1} ‚Üî {var2} (r={corr:+.3f})")
    print("     ‚Üí Consid√©rer de ne garder qu'une des deux")
    seen.add(var2)

print("\n3. Strat√©gie recommand√©e :")
print("- Prioriser les variables de performance (notes, unit√©s valid√©es)")
print("- Attention : les variables du 1er et 2√®me semestre sont tr√®s corr√©l√©es")
print("- L'√¢ge est le seul facteur de risque positif majeur")

### Synth√®se Phase 4 : Matrice de Corr√©lation

**D√©couvertes cl√©s :**

| Aspect | R√©sultat |
|--------|----------|
| **Pr√©dicteurs les plus forts** | Notes et unit√©s valid√©es du 2√®me semestre (r ‚âà -0.57) |
| **Facteur de risque positif** | √Çge √† l'inscription (+0.254) - √©tudiants plus √¢g√©s |
| **Indicateurs économiques** | Faible corrélation avec le dropout (\|r\| < 0.05) |
| **Multicolin√©arit√© d√©tect√©e** | 11 paires avec \|r\| > 0.7 |

**Implications pour le Machine Learning :**
1. Les variables sem1 et sem2 sont fortement corr√©l√©es ‚Üí risque de redondance
2. Prioriser les **grades** (notes) sur les autres m√©triques acad√©miques
3. Le 2√®me semestre est plus pr√©dictif que le 1er
4. L'√¢ge est un facteur de risque √† ne pas n√©gliger

**Prochaine √©tape sugg√©r√©e :** Feature Engineering (cr√©er des ratios, agr√©gations) pour r√©duire la dimensionnalit√© tout en conservant l'information pr√©dictive.

## Phase 5 : Feature Engineering

**Objectif** : Cr√©er de nouvelles variables pour am√©liorer la pr√©diction et r√©duire la multicolin√©arit√© d√©tect√©e en Phase 4.

**Transformations pr√©vues :**
| Type | Description |
|------|-------------|
| **Discr√©tisation** | √Çge ‚Üí tranches (17-20, 21-25, 26-35, 36+) |
| **Regroupement** | Statut marital ‚Üí Solo/Couple |
| **Regroupement** | Qualifications ‚Üí Secondaire/Sup√©rieur |
| **Regroupement** | Programmes ‚Üí Domaines (Sant√©, Tech, Business...) |
| **Ratios** | Taux de r√©ussite par semestre |
| **Agr√©gations** | Notes moyennes, tendance de performance |

In [None]:
# =============================================================================
# 5.1 Age discretization
# =============================================================================
# Goal: turn the continuous enrollment age into categories for easier analysis.
# Rationale: age is positively correlated with dropout (+0.254).

# Age brackets (labels are purely descriptive). pd.cut uses right-closed
# intervals by default: (0, 20], (20, 25], (25, 35], (35, 100].
bins = [0, 20, 25, 35, 100]
labels = ["17-20", "21-25", "26-35", "36+"]

# New categorical column
df["Age_Group"] = pd.cut(df["Age at enrollment"], bins=bins, labels=labels)

# Dropout-rate thresholds (in %) shared by the printed indicators and the bar
# colours, so both always classify a given rate the same way.
HIGH_RATE = 35
MEDIUM_RATE = 30

# Distribution check
print("üìä Distribution des tranches d'√¢ge:")
print("=" * 50)
age_dist = df["Age_Group"].value_counts().sort_index()
for group, count in age_dist.items():
    pct = count / len(df) * 100
    print(f"  {group}: {count:,} ({pct:.1f}%)")

# Dropout rate per age bracket (observed=True keeps only brackets that occur
# and avoids the pandas FutureWarning on categorical groupers).
print("\nüéØ Taux de Dropout par tranche d'√¢ge:")
print("=" * 50)
is_dropout = df["Dropout_Binary"] == "Dropout"
dropout_by_age = (is_dropout.groupby(df["Age_Group"], observed=True).mean() * 100).sort_index()

for group, rate in dropout_by_age.items():
    indicator = "üî¥" if rate > HIGH_RATE else "üü°" if rate > MEDIUM_RATE else "üü¢"
    print(f"  {indicator} {group}: {rate:.1f}%")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: number of students per bracket
colors = ["#3498db", "#9b59b6", "#e67e22", "#e74c3c"]
ax1 = axes[0]
age_dist.plot(kind="bar", ax=ax1, color=colors, edgecolor="black", alpha=0.8)
ax1.set_title("Distribution des Tranches d'√Çge", fontsize=14, fontweight="bold")
ax1.set_xlabel("Tranche d'√¢ge")
ax1.set_ylabel("Nombre d'√©tudiants")
ax1.tick_params(axis="x", rotation=45)

# Percentage label above each bar
for i, val in enumerate(age_dist.values):
    ax1.text(i, val + 50, f"{val / len(df) * 100:.1f}%", ha="center", fontsize=10)

# Right panel: dropout rate per bracket, coloured with the same thresholds as
# the printed indicators above.
ax2 = axes[1]
colors_dropout = [
    "#e74c3c" if r > HIGH_RATE else "#f39c12" if r > MEDIUM_RATE else "#2ecc71"
    for r in dropout_by_age
]
dropout_by_age.plot(kind="bar", ax=ax2, color=colors_dropout, edgecolor="black", alpha=0.8)
ax2.axhline(
    y=is_dropout.mean() * 100,  # overall dropout rate as a reference line
    color="red",
    linestyle="--",
    linewidth=2,
    label="Moyenne globale",
)
ax2.set_title("Taux de Dropout par Tranche d'√Çge", fontsize=14, fontweight="bold")
ax2.set_xlabel("Tranche d'√¢ge")
ax2.set_ylabel("Taux de Dropout (%)")
ax2.tick_params(axis="x", rotation=45)
ax2.legend()

# Rate label above each bar
for i, val in enumerate(dropout_by_age.values):
    ax2.text(i, val + 1, f"{val:.1f}%", ha="center", fontsize=10, fontweight="bold")

plt.tight_layout()
plt.show()

print("\nüí° Insight : Les √©tudiants plus √¢g√©s (26+) ont un taux de dropout plus √©lev√©.")

In [None]:
# =============================================================================
# 5.7 Summary table of the new features
# =============================================================================

# Rebuild any engineered column that is absent (e.g. after a kernel restart).
if "Marital_Binary" not in df.columns:
    solo = [1, 3, 4, 6]  # single, widowed, divorced, separated
    df["Marital_Binary"] = df["Marital status"].isin(solo).map({True: "Solo", False: "Couple"})

if "Education_Level" not in df.columns:
    secondaire = [1, 9, 10, 12, 14, 15, 19, 38]
    superieur = [2, 3, 4, 5, 6, 39, 40, 42, 43]
    # Code -> level lookup; codes in neither list fall back to "Autre".
    level_by_code = {
        **dict.fromkeys(superieur, "Sup√©rieur"),
        **dict.fromkeys(secondaire, "Secondaire"),
    }
    df["Education_Level"] = df["Previous qualification"].map(level_by_code).fillna("Autre")

if "Course_Domain" not in df.columns:
    # Course code -> study domain
    course_domains = {
        33: "Tech",
        171: "Arts",
        8014: "Social",
        9003: "Sciences",
        9070: "Arts",
        9085: "Sant√©",
        9119: "Tech",
        9130: "Sciences",
        9147: "Business",
        9238: "Social",
        9254: "Business",
        9500: "Sant√©",
        9556: "Sant√©",
        9670: "Business",
        9773: "Arts",
        9853: "Education",
        9991: "Business",
    }
    df["Course_Domain"] = df["Course"].map(course_domains)

if "Success_Rate_Sem1" not in df.columns:
    # approved / enrolled per semester; an enrolled count of 0 becomes NaN
    # before dividing, so the ratio is NaN for those rows.
    for sem_no, sem_label in ((1, "1st"), (2, "2nd")):
        enrolled = df[f"Curricular units {sem_label} sem (enrolled)"].replace(0, np.nan)
        approved = df[f"Curricular units {sem_label} sem (approved)"]
        df[f"Success_Rate_Sem{sem_no}"] = approved / enrolled

if "Avg_Grade" not in df.columns:
    grade_sem1 = df["Curricular units 1st sem (grade)"]
    grade_sem2 = df["Curricular units 2nd sem (grade)"]
    df["Avg_Grade"] = (grade_sem1 + grade_sem2) / 2
    df["Total_Approved"] = (
        df["Curricular units 1st sem (approved)"] + df["Curricular units 2nd sem (approved)"]
    )
    # Positive when the 2nd-semester grade is higher than the 1st-semester one.
    df["Performance_Trend"] = grade_sem2 - grade_sem1

## R√©capitulatif des nouvelles features cr√©√©es

In [None]:
# Engineered features with their type and a short description. The level
# counts of the categorical features are read from the data rather than
# hard-coded, so the table cannot drift from the mappings used to build them.
new_features = [
    ("Age_Group", "Cat√©gorielle", f"{df['Age_Group'].nunique()} cat√©gories"),
    ("Marital_Binary", "Binaire", "Solo / Couple"),
    ("Education_Level", "Cat√©gorielle", f"{df['Education_Level'].nunique()} niveaux"),
    ("Course_Domain", "Cat√©gorielle", f"{df['Course_Domain'].nunique()} domaines"),
    ("Success_Rate_Sem1", "Num√©rique", "Ratio [0-1]"),
    ("Success_Rate_Sem2", "Num√©rique", "Ratio [0-1]"),
    ("Avg_Grade", "Num√©rique", "Moyenne notes"),
    ("Total_Approved", "Num√©rique", "Somme unit√©s"),
    ("Performance_Trend", "Num√©rique", "Diff√©rence grades"),
]

print(f"{'Feature':<25} {'Type':<15} {'Description':<20}")
print("-" * 60)
for name, ftype, desc in new_features:
    print(f"{name:<25} {ftype:<15} {desc:<20}")

print()
print("=" * 70)
print()

# Effect of each new feature on dropout
print("üéØ IMPACT DES NOUVELLES FEATURES SUR LE DROPOUT")
print("=" * 70)
print()

# Categorical features: spread between the highest and lowest per-category
# dropout rate (in percentage points).
print("VARIABLES CAT√âGORIELLES:")
print("-" * 40)

for feat in ["Age_Group", "Marital_Binary", "Education_Level", "Course_Domain"]:
    dropout_rate = df.groupby(feat, observed=True)["Dropout_Binary"].apply(
        lambda x: (x == "Dropout").mean() * 100
    )
    max_rate = dropout_rate.max()
    min_rate = dropout_rate.min()
    spread = max_rate - min_rate
    print(f"  {feat}: √©cart {spread:.1f}% (min: {min_rate:.1f}%, max: {max_rate:.1f}%)")

print()
print("VARIABLES NUM√âRIQUES (corr√©lation avec Dropout):")
print("-" * 40)

# Numeric features: correlation with the numeric dropout flag; a positive sign
# is labelled as a risk, anything else as protective.
for feat in [
    "Success_Rate_Sem1",
    "Success_Rate_Sem2",
    "Avg_Grade",
    "Total_Approved",
    "Performance_Trend",
]:
    corr = df[feat].corr(df["Dropout_Numeric"])
    direction = "üî¥ Risque" if corr > 0 else "üü¢ Protecteur"
    print(f"  {feat}: r = {corr:+.3f} ({direction})")

print()
print("=" * 70)
print("üí° Les nouvelles features am√©liorent la lisibilit√© et r√©duisent la multicolin√©arit√©")

### Synth√®se Phase 5 : Feature Engineering

**9 nouvelles variables cr√©√©es** pour am√©liorer l'analyse et pr√©parer le Machine Learning.

#### R√©sultats cl√©s :

| Type de transformation | Meilleure feature | Impact sur Dropout |
|------------------------|-------------------|-------------------|
| **Discr√©tisation** | Age_Group | √âcart de 36.3% entre tranches |
| **Regroupement cat√©goriel** | Course_Domain | √âcart de 34.7% entre domaines |
| **Ratio de performance** | Success_Rate_Sem2 | r = -0.705 (le plus fort !) |
| **Agr√©gation** | Avg_Grade | r = -0.551 |

#### D√©couvertes importantes :

1. **Le taux de r√©ussite est plus pr√©dictif que les notes brutes**
   - `Success_Rate_Sem2` (r = -0.705) surpasse m√™me les notes individuelles
   - Cela confirme l'importance du ratio approved/enrolled

2. **Les domaines d'√©tudes r√©v√®lent des patterns clairs**
   - Tech : 54.9% de dropout (risque tr√®s √©lev√©)
   - Sant√© : 20.3% de dropout (protection forte)

3. **Les regroupements simplifient sans perdre d'information**
   - Solo vs Couple capture l'essentiel du statut marital
   - Secondaire/Sup√©rieur est suffisant pour la qualification

#### Recommandations pour le Modeling :

‚úÖ **Variables √† privil√©gier** :
- `Success_Rate_Sem2` (r = -0.705)
- `Age_Group` (√©cart 36.3%)
- `Course_Domain` (√©cart 34.7%)

‚ùå **Variables redondantes √† √©viter** :
- Ne pas utiliser √† la fois les notes ET les ratios
- Choisir entre `Total_Approved` et `Success_Rate`

---

**L'EDA est maintenant complet !** Les donn√©es sont pr√™tes pour la phase de mod√©lisation.

## Phase 6 : Pr√©paration ML et Mod√©lisation

**Objectif** : Pr√©parer les donn√©es et entra√Æner des mod√®les de classification pour pr√©dire le dropout.

### Strat√©gie bas√©e sur l'EDA :
| Aspect | D√©cision |
|--------|----------|
| **Features cl√©s** | Ratios de performance (`Success_Rate_Sem2` r=-0.705), agr√©gations |
| **Exclusions** | Variables avec multicolin√©arit√© (grades individuels), indicateurs √©conomiques |
| **D√©s√©quilibre** | ~32% Dropout ‚Üí utiliser `class_weight='balanced'` |
| **M√©trique prioritaire** | **Recall** (ne pas manquer les √©tudiants √† risque) |

In [None]:
# =============================================================================
# 6.1 Feature and target preparation
# =============================================================================
# Feature selection follows the EDA (correlations, multicollinearity).

# Engineered numeric features. Each one is rebuilt on its own when absent, so a
# dataframe that already holds only some of them cannot leave the others
# missing. An enrolled count of 0 is replaced by NaN before dividing, which
# makes the success rate NaN for those rows.
sem1 = "Curricular units 1st sem"
sem2 = "Curricular units 2nd sem"
engineered_builders = {
    "Success_Rate_Sem1": lambda d: (
        d[f"{sem1} (approved)"] / d[f"{sem1} (enrolled)"].replace(0, np.nan)
    ),
    "Success_Rate_Sem2": lambda d: (
        d[f"{sem2} (approved)"] / d[f"{sem2} (enrolled)"].replace(0, np.nan)
    ),
    "Avg_Grade": lambda d: (d[f"{sem1} (grade)"] + d[f"{sem2} (grade)"]) / 2,
    "Total_Approved": lambda d: d[f"{sem1} (approved)"] + d[f"{sem2} (approved)"],
    "Performance_Trend": lambda d: d[f"{sem2} (grade)"] - d[f"{sem1} (grade)"],
}
for feature_name, build in engineered_builders.items():
    if feature_name not in df.columns:
        df[feature_name] = build(df)

# Numeric model inputs
numeric_features = [
    "Success_Rate_Sem1",
    "Success_Rate_Sem2",
    "Avg_Grade",
    "Total_Approved",
    "Age at enrollment",
    "Admission grade",
    "Performance_Trend",
]

# Categorical features created during feature engineering
categorical_features = [
    "Age_Group",
    "Course_Domain",
    "Marital_Binary",
    "Education_Level",
]

# Binary features (already encoded as 0/1)
binary_features = [
    "Tuition fees up to date",
    "Scholarship holder",
    "Debtor",
    "Gender",
    "Displaced",
]

# Report any selected feature that is absent from df. The categorical features
# are not rebuilt in this cell, so they must already exist.
all_features = numeric_features + categorical_features + binary_features
missing_cols = [col for col in all_features if col not in df.columns]
if missing_cols:
    print(f"‚ö†Ô∏è Colonnes manquantes : {missing_cols}")
else:
    print("‚úÖ Toutes les features sont disponibles")

# X (features) and y (binary target)
X = df[all_features].copy()
y = (df["Dropout_Binary"] == "Dropout").astype(int)  # 1 = Dropout, 0 = Non-Dropout

print("\nDimensions:")
print(f"  X : {X.shape}")
print(f"  y : {y.shape}")
print("\nDistribution du target:")
print(f"  Dropout (1)     : {y.sum()} ({y.mean() * 100:.1f}%)")
print(f"  Non-Dropout (0) : {len(y) - y.sum()} ({(1 - y.mean()) * 100:.1f}%)")

In [None]:
# =============================================================================
# 6.2 Missing-value handling
# =============================================================================
# The Success_Rate ratios can be NaN (enrolled == 0 is turned into NaN before
# the division that builds them).

print("üìä Valeurs manquantes dans X:")
print("=" * 50)
missing = X.isnull().sum()
missing_pct = missing / len(X) * 100
missing_df = (
    pd.DataFrame({"Manquantes": missing, "Pourcentage": missing_pct})
    .loc[lambda frame: frame["Manquantes"] > 0]
    .sort_values("Manquantes", ascending=False)
)

if missing_df.empty:
    print("‚úÖ Aucune valeur manquante!")
else:
    display(missing_df)

    # Strategy: fill numeric columns with their median.
    # (NaN here means zero enrolled units, i.e. a student probably struggling.)
    # NOTE(review): the median is computed on the full X, before the train/test
    # split performed in the next section, so test rows influence the fill value.
    print("\nüîß Strat√©gie d'imputation : M√©diane pour les variables num√©riques")

    for col in missing_df.index:
        if col not in numeric_features:
            continue  # only numeric columns are imputed
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"  {col}: NaN ‚Üí {median_val:.3f} (m√©diane)")

    print("\n‚úÖ Apr√®s imputation:")
    print(f"  Valeurs manquantes restantes: {X.isnull().sum().sum()}")

In [None]:
# =============================================================================
# 6.3 Preprocessing pipeline and train/test split
# =============================================================================

# Column-wise preprocessing: scale numeric columns, one-hot encode categorical
# ones (first level dropped), pass binary columns through unchanged; any other
# column is discarded.
categorical_encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", categorical_encoder, categorical_features),
        ("bin", "passthrough", binary_features),
    ],
    remainder="drop",
)

# Stratified 80/20 split (keeps the class proportions in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("üìä Split Train/Test (stratifi√©):")
print("=" * 50)
# Same three-line report for each part of the split
for split_name, X_part, y_part in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    n_rows = len(X_part)
    n_dropout = y_part.sum()
    dropout_share = y_part.mean()
    print(f"\n  {split_name}: {n_rows} samples ({n_rows / len(X) * 100:.0f}%)")
    print(f"    - Dropout: {n_dropout} ({dropout_share * 100:.1f}%)")
    print(f"    - Non-Dropout: {len(y_part) - n_dropout} ({(1 - dropout_share) * 100:.1f}%)")

# Confirm that the dropout share is the same in both parts
train_rate = y_train.mean() * 100
test_rate = y_test.mean() * 100
print(f"\n‚úÖ Proportions pr√©serv√©es: Train {train_rate:.1f}% vs Test {test_rate:.1f}%")

In [None]:
# =============================================================================
# 6.4 Baseline model: Logistic Regression
# =============================================================================
# Why Logistic Regression as the baseline?
# - Interpretable (coefficients reflect feature importance)
# - Fast to train
# - Reference point for the more complex models

# Classifier with class re-weighting to handle the class imbalance
baseline_classifier = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)

# Full pipeline: preprocessing followed by the classifier
baseline_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", baseline_classifier),
    ]
)

# Fit on the training part
print("üîÑ Entra√Ænement du mod√®le baseline (Logistic Regression)...")
baseline_model.fit(X_train, y_train)
print("‚úÖ Mod√®le entra√Æn√©!")

# Hard predictions and probability of the positive class (column 1)
y_pred = baseline_model.predict(X_test)
class_probabilities = baseline_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Evaluation report
separator = "=" * 60
print(f"\n{separator}")
print("üìä √âVALUATION DU MOD√àLE BASELINE")
print(separator)

print("\nüìã Rapport de Classification:")
print("-" * 40)
print(classification_report(y_test, y_pred, target_names=["Non-Dropout", "Dropout"]))

# Key metrics for the Dropout class
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("\nüéØ M√©triques Prioritaires (classe Dropout):")
print("-" * 40)
metric_lines = (
    f"  Recall (Dropout)  : {recall:.3f} ‚≠ê (priorit√© : ne pas manquer les √† risque)",
    f"  F1-Score (Dropout): {f1:.3f}",
    f"  AUC-ROC           : {auc:.3f}",
)
for line in metric_lines:
    print(line)

In [None]:
# =============================================================================
# 6.5 Result visualization
# =============================================================================
# Draws the baseline's confusion matrix and ROC curve side by side from
# y_test, y_pred and y_pred_proba, then prints the four confusion-matrix cells.

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confusion matrix (left panel)
ax1 = axes[0]
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Non-Dropout", "Dropout"])
disp.plot(ax=ax1, cmap="Blues", values_format="d")
ax1.set_title(
    "Matrice de Confusion\n(Logistic Regression Baseline)",
    fontsize=14,
    fontweight="bold",
)

# ROC curve (right panel), with the diagonal of a random classifier for reference
ax2 = axes[1]
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax2, color="#3498db", lw=2)
ax2.plot([0, 1], [0, 1], "k--", lw=1, label="Random (AUC=0.5)")
# The title reuses the `auc` value computed in the baseline evaluation cell.
ax2.set_title(f"Courbe ROC (AUC = {auc:.3f})", fontsize=14, fontweight="bold")
ax2.legend(loc="lower right")

plt.tight_layout()
plt.show()

# Confusion-matrix interpretation: flattening the 2x2 matrix row by row gives
# the cells in the order TN, FP, FN, TP (rows = true class, columns = predicted).
tn, fp, fn, tp = cm.ravel()
print("\nüìä Interpr√©tation de la Matrice de Confusion:")
print("=" * 60)
print(f"  ‚úÖ Vrais N√©gatifs (TN)  : {tn} Non-Dropouts correctement identifi√©s")
print(f"  ‚úÖ Vrais Positifs (TP)  : {tp} Dropouts correctement identifi√©s")
print(f"  ‚ö†Ô∏è Faux Positifs (FP)   : {fp} Non-Dropouts class√©s Dropout (fausse alerte)")
print(f"  ‚ùå Faux N√©gatifs (FN)   : {fn} Dropouts manqu√©s (risque principal!)")
# Share of actual dropouts in the test set that the model flags (TP / positives)
print(
    f"\nüí° Sur {y_test.sum()} √©tudiants √† risque, le mod√®le en identifie {tp} ({tp / y_test.sum() * 100:.1f}%)"
)

### Synth√®se Phase 6 : Mod√®le Baseline

**R√©sultats du mod√®le Logistic Regression (Baseline)** :

| M√©trique | Valeur | Interpr√©tation |
|----------|--------|----------------|
| **Recall (Dropout)** | 83.1% | Sur 284 √©tudiants √† risque, 236 sont identifi√©s |
| **F1-Score (Dropout)** | 81.4% | Bon √©quilibre pr√©cision/rappel |
| **AUC-ROC** | 93.1% | Excellente capacit√© de discrimination |
| **Accuracy** | 87.8% | Performance globale |

**Analyse des erreurs** :
- **48 Faux N√©gatifs** : √âtudiants √† risque non d√©tect√©s (erreur critique ‚ùå)
- **60 Faux Positifs** : Fausses alertes (co√ªt acceptable)

**Conclusion** :
Le mod√®le baseline Logistic Regression atteint d√©j√† des performances tr√®s satisfaisantes gr√¢ce :
1. Aux features engineered (`Success_Rate_Sem2` avec r = -0.705)
2. √Ä la gestion du d√©s√©quilibre via `class_weight='balanced'`
3. √Ä un bon pr√©traitement (StandardScaler + OneHotEncoder)

**Prochaines √©tapes sugg√©r√©es** :
1. Tester d'autres mod√®les (Random Forest, XGBoost, etc.)
2. Optimiser les hyperparam√®tres avec GridSearchCV
3. Explorer SMOTE vs class_weight pour le d√©s√©quilibre
4. Ajuster le seuil de d√©cision pour maximiser le Recall

## Phase 6.2 : Comparaison de Mod√®les et Optimisation

**Objectif** : Tester plusieurs algorithmes et optimiser le meilleur mod√®le.

### Mod√®les √† comparer :
| Niveau | Mod√®le | Caract√©ristiques |
|--------|--------|------------------|
| Baseline | Logistic Regression | D√©j√† fait (Recall=83.1%, AUC=93.1%) |
| Interm√©diaire | Random Forest | Robuste, feature importance native |
| Intermédiaire | Gradient Boosting | Performant ; entraîné ici sans pondération de classes (pas de `class_weight`) |
| Interm√©diaire | SVM (RBF) | Bon pour donn√©es non-lin√©aires |
| Avanc√© | XGBoost | √âtat de l'art, tr√®s performant |

### Strat√©gie d'√©valuation :
- **Cross-validation 5-fold stratifi√©e** pour comparer √©quitablement
- **M√©trique principale** : F1-Score (√©quilibre pr√©cision/rappel)
- **M√©trique secondaire** : Recall (ne pas manquer les dropouts)

### Comparaison de Modèles avec Cross-Validation

In [None]:
# scale_pos_weight for XGBoost: number of negative (majority) training samples
# divided by the number of positive (minority) ones.
n_positive = y_train.sum()
scale_pos_weight = (len(y_train) - n_positive) / n_positive

# Candidate models, registered in the order in which they will be evaluated.
models = {}
models["Logistic Regression"] = LogisticRegression(
    class_weight="balanced", max_iter=1000, random_state=42
)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1
)
# No class re-weighting is configured for this estimator.
models["Gradient Boosting"] = GradientBoostingClassifier(n_estimators=100, random_state=42)
models["SVM (RBF)"] = SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42)
models["XGBoost"] = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="logloss",
    verbosity=0,
)

print(f"\nMod√®les √† comparer : {len(models)}")
for name in models:
    print(f"  - {name}")

In [None]:
# =============================================================================
# 6.2.2 Stratified 5-fold cross-validation
# =============================================================================
# Stratification keeps the class proportions in every fold.

# Imported here because the notebook's import cell only brings in
# cross_val_score from sklearn.model_selection.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model mean and standard deviation of each metric
results = {
    "Model": [],
    "F1 Mean": [],
    "F1 Std": [],
    "Recall Mean": [],
    "Recall Std": [],
    "AUC Mean": [],
    "AUC Std": [],
}

# Result-column prefix -> scikit-learn scorer name. All three metrics are
# computed in a single cross_validate run, so each fold is fitted once per
# model instead of once per metric.
scoring = {"F1": "f1", "Recall": "recall", "AUC": "roc_auc"}

print("üîÑ Cross-validation en cours...")
print("=" * 70)

for name, model in models.items():
    # Full pipeline (preprocessing + classifier) so preprocessing is refit
    # inside every fold.
    pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", model)])

    # One array of five fold scores per metric, under the key "test_<prefix>".
    cv_scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    f1_scores = cv_scores["test_F1"]
    recall_scores = cv_scores["test_Recall"]
    auc_scores = cv_scores["test_AUC"]

    # Store the summary statistics
    results["Model"].append(name)
    for prefix, scores in (("F1", f1_scores), ("Recall", recall_scores), ("AUC", auc_scores)):
        results[f"{prefix} Mean"].append(scores.mean())
        results[f"{prefix} Std"].append(scores.std())

    print(f"\nüìä {name}:")
    print(f"   F1-Score : {f1_scores.mean():.3f} (¬±{f1_scores.std():.3f})")
    print(f"   Recall   : {recall_scores.mean():.3f} (¬±{recall_scores.std():.3f})")
    print(f"   AUC-ROC  : {auc_scores.mean():.3f} (¬±{auc_scores.std():.3f})")

print("\n" + "=" * 70)
print("‚úÖ Cross-validation termin√©e!")

In [None]:
# =============================================================================
# 6.2.3 Model comparison table
# =============================================================================

# Cross-validation results as a dataframe, best mean F1-score first
results_df = pd.DataFrame(results).sort_values("F1 Mean", ascending=False)


# Display formatting
def format_score(mean, std):
    """Return *mean* and *std* as one display string, three decimals each."""
    return "{:.3f} ¬±{:.3f}".format(mean, std)


# Human-readable table: one "mean ±std" string per metric and per model.
# Maps each displayed column header to the column prefix used in results_df.
metric_columns = {"F1-Score": "F1", "Recall": "Recall", "AUC-ROC": "AUC"}

display_df = pd.DataFrame(
    {
        "Modèle": results_df["Model"],
        **{
            header: [
                format_score(m, s)
                for m, s in zip(results_df[f"{prefix} Mean"], results_df[f"{prefix} Std"])
            ]
            for header, prefix in metric_columns.items()
        },
    }
)

print("📊 Comparaison des Modèles (Cross-Validation 5-Fold)")
print("=" * 70)
display(display_df.reset_index(drop=True))

# results_df is ordered by descending mean F1, so its first row is the best
# model according to that metric.
top_row = results_df.iloc[0]
best_model_name = top_row["Model"]
best_f1 = top_row["F1 Mean"]
best_recall = top_row["Recall Mean"]

print(f"\n🏆 Meilleur modèle (F1-Score) : {best_model_name}")
print(f"   F1 = {best_f1:.3f}, Recall = {best_recall:.3f}")

In [None]:
# =============================================================================
# 6.2.4 Comparative Visualisation
# =============================================================================

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Shared inputs for the three panels.
models_names = results_df["Model"].tolist()
colors = ["#3498db", "#2ecc71", "#e74c3c", "#9b59b6"]

# One entry per panel: (results_df column prefix, x-axis label, title,
# x-axis limits, gap between the error-bar tip and the value label).
panels = [
    ("F1", "F1-Score", "F1-Score par Modèle", (0.65, 0.85), 0.005),
    ("Recall", "Recall", "Recall (Dropout) par Modèle", (0.65, 0.85), 0.005),
    ("AUC", "AUC-ROC", "AUC-ROC par Modèle", (0.85, 0.95), 0.002),
]

for ax, (prefix, xlabel, title, xlim, label_gap) in zip(axes, panels):
    means = results_df[f"{prefix} Mean"]
    stds = results_df[f"{prefix} Std"]

    # Horizontal bars with the cross-validation standard deviation as error bar.
    ax.barh(models_names, means, xerr=stds, color=colors, alpha=0.8, capsize=5)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight="bold")
    ax.set_xlim(*xlim)

    # Write the mean value just to the right of each error bar.
    for i, (v, s) in enumerate(zip(means, stds)):
        ax.text(v + s + label_gap, i, f"{v:.3f}", va="center", fontsize=10)

plt.tight_layout()
plt.show()

# Name the best-recall model from the computed scores rather than from a
# hard-coded result, so the message stays true when the data or models change.
best_recall_row = results_df.loc[results_df["Recall Mean"].idxmax()]

print("\n💡 Observation : Les performances sont très proches entre les modèles.")
print(
    f"   {best_recall_row['Model']} a le meilleur Recall "
    f"({best_recall_row['Recall Mean']:.1%}), important pour détecter les dropouts."
)

### 6.2.5 Optimisation des Hyperparamètres

**Résultats de la comparaison** :
- Les 4 modèles ont des performances très similaires (F1 entre 0.77 et 0.79)
- **Logistic Regression** a le meilleur **Recall** (79.9%) - crucial pour notre objectif
- **SVM (RBF)** a le meilleur **F1-Score** (79.0%)

**Choix** : Nous allons optimiser **Logistic Regression** car :
1. Meilleur Recall (priorité pour détecter les dropouts)
2. Plus interprétable (coefficients analysables)
3. Plus rapide à entraîner et déployer

In [None]:
# =============================================================================
# 6.2.6 GridSearchCV for Logistic Regression
# =============================================================================

# Only C and the penalty type are actually searched (5 x 2 candidates); the
# single-value entries pin the remaining settings for every candidate.
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],  # inverse regularisation strength
    "classifier__penalty": ["l1", "l2"],  # regularisation type
    "classifier__solver": ["saga"],  # solver that supports both l1 and l2
    "classifier__class_weight": ["balanced"],  # compensates class imbalance
    "classifier__max_iter": [2000],  # iteration budget for convergence
}

# Same preprocessing as in the model comparison, with an untuned classifier
# whose parameters are overridden by the grid.
base_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)

print("🔄 Optimisation des hyperparamètres en cours...")
print("   (GridSearchCV avec 5-fold cross-validation)")
print("=" * 60)

# Candidates are ranked by F1 on stratified, shuffled 5-fold splits.
grid_search = GridSearchCV(
    base_pipeline,
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring="f1",
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

print("\n" + "=" * 60)
print("✅ Optimisation terminée!")
print("\n🏆 Meilleurs hyperparamètres:")
for param, value in grid_search.best_params_.items():
    # Drop the pipeline step prefix so only the estimator's parameter name shows.
    print(f"   {param.replace('classifier__', '')}: {value}")
print(f"\n📊 Meilleur F1-Score (CV): {grid_search.best_score_:.3f}")

In [None]:
# =============================================================================
# 6.2.7 Evaluation of the Tuned Model on the Test Set
# =============================================================================

# best_estimator_ is the winning pipeline; GridSearchCV refits it on the whole
# training set by default, so no extra fit is needed here.
best_model = grid_search.best_estimator_

# Hard predictions and probability of the positive class (column 1).
y_pred_opt = best_model.predict(X_test)
y_pred_proba_opt = best_model.predict_proba(X_test)[:, 1]

print("=" * 60)
print("📊 ÉVALUATION DU MODÈLE OPTIMISÉ (Test Set)")
print("=" * 60)

print("\n📋 Rapport de Classification:")
print("-" * 40)
print(classification_report(y_test, y_pred_opt, target_names=["Non-Dropout", "Dropout"]))

# Headline metrics for the positive (Dropout) class.
recall_opt = recall_score(y_test, y_pred_opt)
f1_opt = f1_score(y_test, y_pred_opt)
auc_opt = roc_auc_score(y_test, y_pred_proba_opt)

print("\n🎯 Métriques Prioritaires (classe Dropout):")
print("-" * 40)
print(f"  Recall (Dropout)  : {recall_opt:.3f}")
print(f"  F1-Score (Dropout): {f1_opt:.3f}")
print(f"  AUC-ROC           : {auc_opt:.3f}")

# Side-by-side comparison with the baseline scores (recall, f1 and auc come
# from an earlier cell).
print("\n📈 Comparaison avec le Baseline:")
print("-" * 40)
for label, baseline_value, optimised_value in (
    ("Recall    ", recall, recall_opt),
    ("F1-Score  ", f1, f1_opt),
    ("AUC-ROC   ", auc, auc_opt),
):
    # Difference on the 0-100 scale; the "+" format flag writes the sign for
    # gains as well as losses.
    delta = (optimised_value - baseline_value) * 100
    print(f"  {label}: {baseline_value:.3f} → {optimised_value:.3f} ({delta:+.1f}%)")

In [None]:
# =============================================================================
# 6.2.8 Final Visualisation: Confusion Matrix and Feature Importance
# =============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: confusion matrix of the tuned model on the test set.
ax1 = axes[0]
cm_opt = confusion_matrix(y_test, y_pred_opt)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_opt, display_labels=["Non-Dropout", "Dropout"])
disp.plot(ax=ax1, cmap="Blues", values_format="d")
ax1.set_title(
    "Matrice de Confusion\n(Logistic Regression Optimisé)",
    fontsize=14,
    fontweight="bold",
)

# Right panel: signed logistic-regression coefficients, ranked by magnitude.
ax2 = axes[1]

classifier = best_model.named_steps["classifier"]
coefficients = classifier.coef_[0]  # first (only, for a binary target) coefficient row

# Rebuild the post-transformation column names.
# NOTE(review): assumes the ColumnTransformer outputs numeric, then "cat"
# (one-hot), then binary columns in that order — confirm against the cell
# that defines `preprocessor`.
feature_names = (
    numeric_features
    + list(
        best_model.named_steps["preprocessor"]
        .named_transformers_["cat"]
        .get_feature_names_out(categorical_features)
    )
    + binary_features
)

# Ascending sort so that the largest magnitudes end up last (tail) and are
# drawn at the top of the horizontal bar chart.
coef_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Coefficient": coefficients,
        "Abs_Coefficient": np.abs(coefficients),
    }
).sort_values("Abs_Coefficient", ascending=True)

# Ten most influential features.
top_features = coef_df.tail(10)

# Red marks coefficients that raise the dropout probability (positive), green
# those that lower it, matching the legend in the title and the printed
# interpretation below. The previous condition (c < 0 -> red) was inverted.
colors = ["#e74c3c" if c > 0 else "#2ecc71" for c in top_features["Coefficient"]]
ax2.barh(top_features["Feature"], top_features["Coefficient"], color=colors)
ax2.axvline(x=0, color="black", linestyle="-", linewidth=0.5)
ax2.set_xlabel("Coefficient (impact sur le dropout)", fontsize=11)
ax2.set_title(
    "Top 10 Features les Plus Influentes\n(Rouge = ↑ risque, Vert = ↓ risque)",
    fontsize=14,
    fontweight="bold",
)

plt.tight_layout()
plt.show()

# Reading guide for the coefficient signs.
print("\n💡 Interprétation des Coefficients:")
print("=" * 60)
print("  🔴 Coefficient POSITIF → Augmente le risque de dropout")
print("  🟢 Coefficient NÉGATIF → Diminue le risque de dropout")

### Synthèse Phase 6.2 : Comparaison et Optimisation

**Résultats de la comparaison (Cross-Validation 5-Fold)** :

| Modèle | F1-Score | Recall | AUC-ROC |
|--------|----------|--------|---------|
| SVM (RBF) | 0.790 ±0.015 | 0.790 ±0.019 | 0.908 ±0.010 |
| Logistic Regression | 0.784 ±0.021 | 0.799 ±0.024 | 0.913 ±0.012 |
| Gradient Boosting | 0.783 ±0.013 | 0.726 ±0.012 | 0.911 ±0.013 |
| Random Forest | 0.774 ±0.018 | 0.713 ±0.012 | 0.906 ±0.013 |

**Choix et Optimisation** :
- Modèle choisi : **Logistic Regression** (meilleur Recall, interprétable)
- Hyperparamètres optimaux : `C=1`, `penalty='l1'`, `solver='saga'`

**Résultats du modèle optimisé (Test Set)** :

| Métrique | Baseline | Optimisé | Changement |
|----------|----------|----------|------------|
| Recall (Dropout) | 83.1% | 83.1% | +0.0% |
| F1-Score (Dropout) | 81.4% | 81.5% | +0.1% |
| AUC-ROC | 93.1% | 93.1% | +0.0% |

**Conclusion** :
1. Les 4 modèles testés ont des performances très similaires
2. L'optimisation des hyperparamètres n'a pas significativement amélioré les résultats
3. Cela confirme que les **features engineered** sont le facteur clé de la performance
4. Le modèle baseline était déjà proche de l'optimal grâce à l'EDA de qualité

**Top 5 Features les plus importantes** (Logistic Regression avec L1) :
1. `Success_Rate_Sem2` - Taux de réussite 2ème semestre
2. `Success_Rate_Sem1` - Taux de réussite 1er semestre
3. `Tuition fees up to date` - Frais de scolarité à jour
4. `Age_Group_36+` - Étudiants de 36 ans et plus
5. `Avg_Grade` - Moyenne des notes

## Phase 6.3 : Déploiement du Modèle

**Objectif** : Sauvegarder le modèle entraîné et créer une interface de prédiction réutilisable.

Cette phase comprend :
1. Sauvegarde du pipeline complet (prétraitement + modèle)
2. Fonction de prédiction pour de nouvelles données
3. Documentation et exemple d'utilisation

In [None]:
# =============================================================================
# 6.3.1 SAVING THE MODEL
# =============================================================================

# Versioned output file; its parent directory is created on demand so the
# directory name is written only once.
model_filename = "../models/dropout_predictor_v1.joblib"
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Persist the whole pipeline (preprocessing + classifier). best_model comes
# from the evaluation cell (grid_search.best_estimator_).
joblib.dump(best_model, model_filename)

# Size of the written file, in KB.
file_size = os.path.getsize(model_filename) / 1024

print("=" * 60)
print("MODELE SAUVEGARDE AVEC SUCCES")
print("=" * 60)
print(f"\nFichier: {model_filename}")
print(f"Taille: {file_size:.1f} KB")
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Summary of what was saved; the hyperparameters are read from the fitted
# classifier step, looked up once.
saved_classifier = best_model.named_steps["classifier"]

print("\nContenu du pipeline:")
print("   1. ColumnTransformer (prétraitement)")
print("      - StandardScaler pour variables numériques")
print("      - OneHotEncoder pour variables catégorielles")
print("      - Passthrough pour variables binaires")
print("   2. LogisticRegression (modèle optimisé)")
print(f"      - C={saved_classifier.C}")
print(f"      - penalty={saved_classifier.penalty}")
print(f"      - class_weight={saved_classifier.class_weight}")

In [None]:
# =============================================================================
# 6.3.2 REUSABLE PREDICTION FUNCTION
# =============================================================================

# Exclusive upper probability bounds paired with their risk label, in
# ascending order; anything at or above the last bound is critical.
_RISK_BANDS = ((0.3, "🟢 FAIBLE"), (0.5, "🟡 MODÉRÉ"), (0.7, "🟠 ÉLEVÉ"))
_TOP_RISK = "🔴 CRITIQUE"


def _risk_level(proba):
    """Map a dropout probability to one of the four risk labels."""
    for upper_bound, label in _RISK_BANDS:
        if proba < upper_bound:
            return label
    return _TOP_RISK


def predict_dropout(student_data, model_path="../models/dropout_predictor_v1.joblib", model=None):
    """
    Predict the dropout risk of one student.

    Parameters
    ----------
    student_data : dict or pd.DataFrame
        Student data with the features the model expects. A dict is turned
        into a one-row DataFrame. When a DataFrame with several rows is given,
        only its first row is scored.
    model_path : str
        Path of the saved model; used only when ``model`` is None.
    model : object, optional
        Already-loaded estimator exposing ``predict`` and ``predict_proba``.
        Passing it avoids re-reading the model file on every call.

    Returns
    -------
    dict
        ``"prediction"`` ("Dropout" or "Non-Dropout"), ``"probability"``
        (dropout probability rounded to 3 decimals) and ``"risk_level"``
        (one of four labels, switching at 0.3, 0.5 and 0.7).

    Raises
    ------
    ValueError
        If ``student_data`` contains no row.
    """
    import pandas as pd

    if model is None:
        # joblib is only needed when the model has to be read from disk.
        import joblib

        # NOTE: joblib.load unpickles the file; only load trusted model files.
        model = joblib.load(model_path)

    # Normalise the input to a DataFrame; copy so the caller's frame is never
    # touched.
    if isinstance(student_data, dict):
        student_df = pd.DataFrame([student_data])
    else:
        student_df = student_data.copy()

    if len(student_df) == 0:
        raise ValueError("student_data ne contient aucun étudiant")

    prediction = model.predict(student_df)[0]
    # Column 1 holds the probability of the positive (Dropout) class; float()
    # yields a plain Python number that serialises cleanly.
    proba = float(model.predict_proba(student_df)[0, 1])

    return {
        "prediction": "Dropout" if prediction == 1 else "Non-Dropout",
        "probability": round(proba, 3),
        "risk_level": _risk_level(proba),
    }


# Smoke test of predict_dropout on one example from the test set.
print("=" * 60)
print("🧪 TEST DE LA FONCTION DE PRÉDICTION")
print("=" * 60)

# First student of the test set (not a random one). The 0:1 slice keeps a
# one-row DataFrame, which is the input type predict_dropout handles.
test_sample = X_test.iloc[0:1]
actual_label = y_test.iloc[0]

print("\n📋 Données de l'étudiant test:")
print(test_sample.to_string())

result = predict_dropout(test_sample)

# Reduce both sides to booleans once so the correctness check reads plainly.
predicted_dropout = result["prediction"] == "Dropout"
actual_dropout = actual_label == 1

print("\n🎯 Résultat de la prédiction:")
print(f"   Prédiction     : {result['prediction']}")
print(f"   Probabilité    : {result['probability']:.1%}")
print(f"   Niveau de risque: {result['risk_level']}")
print(f"   Valeur réelle  : {'Dropout' if actual_dropout else 'Non-Dropout'}")
print(f"   Correct        : {'✅' if predicted_dropout == actual_dropout else '❌'}")

### Documentation - Utilisation du Modèle

#### Features Requises

Le modèle attend un DataFrame avec les 16 colonnes suivantes :

| Feature | Type | Description |
|---------|------|-------------|
| `Success_Rate_Sem1` | float | Ratio unités validées/inscrites au 1er semestre (0-1) |
| `Success_Rate_Sem2` | float | Ratio unités validées/inscrites au 2ème semestre (0-1) |
| `Avg_Grade` | float | Moyenne des notes sur les 2 semestres (0-20) |
| `Total_Approved` | int | Total des unités validées (sem1 + sem2) |
| `Age at enrollment` | int | Âge à l'inscription |
| `Admission grade` | float | Note d'admission (0-200) |
| `Performance_Trend` | float | Différence grade sem2 - grade sem1 |
| `Age_Group` | str | '17-20', '21-25', '26-35', ou '36+' |
| `Course_Domain` | str | 'Tech', 'Santé', 'Business', 'Social', 'Arts', 'Education' |
| `Marital_Binary` | str | 'Solo' ou 'Couple' |
| `Education_Level` | str | 'Secondaire', 'Supérieur', ou 'Autre' |
| `Tuition fees up to date` | int | 1 si frais à jour, 0 sinon |
| `Scholarship holder` | int | 1 si boursier, 0 sinon |
| `Debtor` | int | 1 si débiteur, 0 sinon |
| `Gender` | int | 1 = Homme, 0 = Femme |
| `Displaced` | int | 1 si déplacé, 0 sinon |

#### Exemple d'utilisation

```python
import joblib
import pandas as pd

# Charger le modèle
model = joblib.load('models/dropout_predictor_v1.joblib')

# Créer les données d'un étudiant
student = pd.DataFrame([{
    'Success_Rate_Sem1': 0.8,
    'Success_Rate_Sem2': 0.7,
    'Avg_Grade': 12.5,
    'Total_Approved': 10,
    'Age at enrollment': 20,
    'Admission grade': 140.0,
    'Performance_Trend': -1.0,
    'Age_Group': '17-20',
    'Course_Domain': 'Tech',
    'Marital_Binary': 'Solo',
    'Education_Level': 'Secondaire',
    'Tuition fees up to date': 1,
    'Scholarship holder': 0,
    'Debtor': 0,
    'Gender': 1,
    'Displaced': 0
}])

# Prédire
prediction = model.predict(student)[0]
probability = model.predict_proba(student)[0, 1]

print(f"Prédiction: {'Dropout' if prediction == 1 else 'Non-Dropout'}")
print(f"Probabilité de dropout: {probability:.1%}")
```

### Synthèse Phase 6.3 : Déploiement du Modèle

#### Réalisations

1. **Modèle Sauvegardé** : `models/dropout_predictor_v1.joblib` (5.7 KB)
   - Pipeline complet incluant prétraitement + modèle
   - Portable et réutilisable sans dépendance au notebook

2. **Fonction de Prédiction** : `predict_dropout()`
   - Accepte dict ou DataFrame
   - Retourne prédiction, probabilité et niveau de risque
   - Catégorisation du risque en 4 niveaux (Faible/Modéré/Élevé/Critique)

3. **Documentation** : Guide complet des features requises

#### Performance du Modèle Déployé

| Métrique | Valeur |
|----------|--------|
| **Recall (Dropout)** | 83.1% |
| **F1-Score** | 81.5% |
| **AUC-ROC** | 93.1% |
| **Précision globale** | 88% |

#### Top 3 Prédicteurs

1. `Success_Rate_Sem2` - Taux de réussite 2ème semestre
2. `Success_Rate_Sem1` - Taux de réussite 1er semestre  
3. `Tuition fees up to date` - Frais de scolarité à jour

---

## Conclusion de l'Analyse

Ce projet a démontré une approche complète de machine learning pour la prédiction du dropout étudiant :

1. **EDA approfondie** - Exploration des 37 variables, identification des patterns
2. **Feature Engineering** - Création de 9 nouvelles features pertinentes
3. **Modélisation** - Comparaison de 4 algorithmes, optimisation des hyperparamètres
4. **Déploiement** - Modèle portable et fonction de prédiction réutilisable

**Insight clé** : La qualité du feature engineering (ratios de réussite) a eu plus d'impact que le choix de l'algorithme. Les variables académiques du 2ème semestre sont les meilleurs prédicteurs du dropout.