In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from alyra_ai_ml.data.loader import rename_columns, strip_column_names
from alyra_ai_ml.schemas import StudentDataSchemaRenamed

# Notebook-wide seaborn theme applied to every figure below.
sns.set_theme(style="darkgrid")

# Semicolon-separated CSV, located relative to the notebook's working directory.
DATASET = "../data/dataset.csv"


# Read the file, then normalise the header with the two project helpers
# (same order as before: strip first, rename second).
loaded_df = pd.read_csv(DATASET, sep=";")
raw_df = rename_columns(strip_column_names(loaded_df))
print(f"Raw DF: {raw_df.shape}")

# NOTE(review): lazy=True presumably asks the schema to collect every failure
# before reporting — confirm against the schema library's documentation.
df = StudentDataSchemaRenamed.validate(raw_df, lazy=True)
print(f"Validated DF: {df.shape}")

# Row-count delta between the raw frame and the validated frame.
difference = len(raw_df) - len(df)
print(f"Difference: {difference}")

# Add an integer-encoded copy of the Target labels for the correlation cells.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df["Target"])
df["Target_Encoded"] = encoded_target

In [None]:
# Display the (rows, columns) shape of the validated frame, which by now also
# carries the Target_Encoded column added in the loading cell.
df.shape

In [None]:
# Transposed preview: one line per column, one column per sampled row.
df.head().transpose()

In [None]:
# Print every column name as a plain Python list.
print(df.columns.tolist())

In [None]:
# Thematic column groups reused by the exploration cells further down.
Categories_features = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
Admission_Num = [
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
]
Binary_Fields = [
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# Both semesters expose the same six measures; only the semester label differs.
_CURRICULAR_MEASURES = (
    "credited",
    "enrolled",
    "evaluations",
    "approved",
    "grade",
    "without evaluations",
)
CurricularUnits1stSem_features = [
    f"Curricular units 1st sem ({measure})" for measure in _CURRICULAR_MEASURES
]
CurricularUnit2ndSem_features = [
    f"Curricular units 2nd sem ({measure})" for measure in _CURRICULAR_MEASURES
]

# "Target" is deliberately not part of this group.
Macroeconomic_Indicators = ["Unemployment rate", "Inflation rate", "GDP"]

# Flat list of every grouped column, used as a sanity check against the
# total number of columns in the frame.
x = [
    *Categories_features,
    *Admission_Num,
    *Binary_Fields,
    *CurricularUnit2ndSem_features,
    *Macroeconomic_Indicators,
    *CurricularUnits1stSem_features,
]
print(len(x))
print(len(df.columns))

In [None]:
# Transposed preview of the macroeconomic indicator columns only.
df.loc[:, Macroeconomic_Indicators].head().transpose()

## Nettoyage des noms des colonnes
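Certaines colonnes contiennent des espaces en fin de nom ou des tabulations (`\t`). On les nettoie avec la méthode `strip` de Python : ce nettoyage est appliqué dès le chargement des données, via `strip_column_names`. La cellule suivante permet de vérifier le résultat.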

In [None]:
# Transposed preview, to eyeball the cleaned column names.
df.head().transpose()

## Vérification des valeurs manquantes
- isna()
- isnull()

In [None]:
# Per-column NaN counts, then a single verdict on the whole frame.
missing = df.isna().sum()
total_missing = missing.sum()
if total_missing != 0:
    print("Il y a des valeurs manquantes")
    print(total_missing)
else:
    print("Aucune valeur manquante")

In [None]:
# Same check as the previous cell, written with isnull().
missing = df.isnull().sum()
has_missing = bool(missing.sum())
print("Il y a des valeurs manquantes" if has_missing else "Aucune valeur manquante")
if has_missing:
    print(missing[missing > 0].sum())

## Vérification des valeurs dupliquées

In [None]:
# Count fully duplicated rows once, then report.
duplicate_count = df.duplicated().sum()
if duplicate_count != 0:
    print("Il y a des valeurs dupliquées")
    print(duplicate_count)
else:
    print("Aucune valeur dupliquée")

In [None]:
# Summary statistics for three of the coded application columns, one line per column.
described_cols = ["Marital status", "Application mode", "Application order"]
df.loc[:, described_cols].describe().transpose()

## Etude de la Variable Cible (aka Target)

In [None]:
# Target distribution: absolute counts and relative frequencies.
target_series = df["Target"]
target_counts = target_series.value_counts()
target_pct = target_series.value_counts(normalize=True)

In [None]:
# Bar chart and pie chart of the Target distribution, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fixed display order and colour for each Target class.
COLORS = {"Dropout": "#1f77b4", "Enrolled": "#ff7f0e", "Graduate": "#2ca02c"}

target_order = list(COLORS)
colors = list(COLORS.values())

# value_counts() sorts by frequency, not by class name. Re-align the counts on
# target_order so that values, labels and colours all refer to the same class
# (the pie previously paired frequency-ordered values with COLORS-ordered labels).
# fill_value=0 keeps the cell running if a class is absent from the data.
ordered_counts = target_counts.reindex(target_order, fill_value=0)

ax1 = axes[0]
bars = ax1.bar(target_order, ordered_counts.to_numpy(), color=colors)
ax1.set_xlabel("Statut")
ax1.set_ylabel("Nombre d'étudiants")

ax2 = axes[1]
pie = ax2.pie(ordered_counts, labels=target_order, autopct="%1.1f%%", colors=colors)

plt.suptitle("Répartition des étudiants par statut")
plt.tight_layout()
plt.show()

## Étude des Outliers
- Méthode des Z-Score
- Méthode des IQR

Travail sur les variables quantitatives

### Méthode IQR (écart interquartile)

In [None]:
# Boxplots of the macroeconomic indicators before any outlier handling.
# `columns` is reused by the IQR cells that follow. To inspect the admission
# variables instead, set it to
# ["Age at enrollment", "Admission grade", "Previous qualification (grade)"].
# (The earlier version assigned that list and immediately overwrote it, and
# ended with a stray plt.figure() that rendered an empty figure.)
columns = ["Unemployment rate", "Inflation rate", "GDP"]

plt.figure()
sns.boxplot(data=df[columns])
plt.title("Avant traitement des outliers")
plt.show()

In [None]:
# IQR fences: values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
q1 = df[columns].quantile(0.25)
q3 = df[columns].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# A row is flagged as soon as one of the selected columns leaves its fences.
outlier_mask = ((df[columns] < lower) | (df[columns] > upper)).any(axis=1)
# NOTE: despite its name, outlier_df holds the rows that are NOT flagged,
# i.e. the data once the outliers have been removed.
outlier_df = df[~outlier_mask]

# Counterpart of the "Avant traitement des outliers" plot. The trailing empty
# plt.figure() of the earlier version is gone: it only rendered a blank figure.
plt.figure()
sns.boxplot(data=outlier_df[columns])
plt.title("Après traitement des outliers")
plt.show()

In [None]:
# One boxplot per column, drawn from the outlying values only (those outside
# the lower/upper fences computed in the IQR cell).
# squeeze=False always returns a 2-D array of axes, so indexing also works
# when `columns` holds a single name (plt.subplots would otherwise return a
# bare Axes object and axes[idx] would fail).
fig, axes = plt.subplots(1, len(columns), figsize=(12, 4), squeeze=False)
axes = axes.ravel()
for ax, col in zip(axes, columns):
    mask = (df[col] < lower[col]) | (df[col] > upper[col])
    outliers = df.loc[mask, col]
    ax.boxplot(outliers.dropna())
    ax.set_title(f"{col}\n({len(outliers)} outliers)")
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the admission grade (30 bins).
df.hist(column="Admission grade", figsize=(20, 15), bins=30)

In [None]:
# Histogram grids for both semesters, sharing the same figure settings.
hist_settings = {"figsize": (20, 15), "bins": 30}
df[CurricularUnits1stSem_features].hist(**hist_settings)
df[CurricularUnit2ndSem_features].hist(**hist_settings)

In [None]:
# Share of each marital-status code, in percent.
df["Marital status"].value_counts(normalize=True).mul(100)

In [None]:
# Share of each nationality code, in percent.
df["Nationality"].value_counts(normalize=True).mul(100)

In [None]:
# Correlation between the curricular-unit features of both semesters and the target.
# The integer-encoded target is used on purpose: the string column "Target" is
# silently discarded by corr(numeric_only=True), so it never showed up in the matrix.
plt.figure(figsize=(20, 16))
corr_matrix = df[
    CurricularUnits1stSem_features
    + CurricularUnit2ndSem_features
    + ["Target_Encoded"]
].corr(numeric_only=True)

# annot=False: hide the numbers to avoid clutter in the small cells.
# cmap="coolwarm": red for positive correlation, blue for negative.
sns.heatmap(
    corr_matrix,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title("Corrélation des unités curriculaires avec Target")
plt.show()

In [None]:
# Grid of histograms, six per row, for every float64/int64 column.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
n_cols = 6
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    df[col].hist(ax=ax, bins=30, edgecolor="black")
    # Titles are cut at 25 characters so they fit above each panel.
    ax.set_title(col[:25], fontsize=10)
    ax.tick_params(labelsize=8)

# Hide the panels left over at the end of the grid.
for spare_ax in axes[len(numeric_cols):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix over every float64/int64 column, colour scale centred on 0.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
numeric_corr = df[numeric_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 12))
sns.heatmap(numeric_corr, annot=False, cmap="coolwarm", center=0, ax=heatmap_ax)
heatmap_ax.set_title("Matrice de corrélation")
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Distribution of four key numeric variables, split by Target class (2x2 grid).
key_numeric = [
    "Admission grade",
    "Age at enrollment",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
]

# Class order and colours shared by every panel.
target_levels = ["Dropout", "Enrolled", "Graduate"]
target_palette = {"Dropout": "#e74c3c", "Enrolled": "#f39c12", "Graduate": "#27ae60"}

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for panel, col in zip(axes, key_numeric):
    sns.boxplot(
        data=df,
        x="Target",
        y=col,
        order=target_levels,
        hue="Target",
        hue_order=target_levels,
        palette=target_palette,
        legend=False,
        ax=panel,
    )
    panel.set_title(f"{col} par Target")

plt.tight_layout()
plt.show()

In [None]:
# For each categorical variable: stacked bars of the Target proportions
# within every category (each bar sums to 1).
categorical_cols = [
    "Marital status",
    "Gender",
    "Scholarship holder",
    "Debtor",
    "Tuition fees up to date",
]
target_levels = ["Dropout", "Enrolled", "Graduate"]
stack_colors = ["#e74c3c", "#f39c12", "#27ae60"]

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    # normalize="index" turns every row of the table into proportions.
    crosstab = pd.crosstab(df[col], df["Target"], normalize="index")
    crosstab[target_levels].plot(kind="bar", stacked=True, ax=ax, color=stack_colors)
    ax.set_title(f"{col} vs Target (proportions)")
    ax.set_xlabel(col)
    ax.set_ylabel("Proportion")
    ax.legend(title="Target")
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Correlation of every float64/int64 column with the integer-encoded target.
# Target_Encoded is created in the loading cell, where LabelEncoder is already
# imported, so the duplicate import and the commented-out encoding are dropped.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
correlations = (
    df[numeric_cols]
    .corrwith(df["Target_Encoded"])
    .sort_values(key=abs, ascending=False)
)

# Remove the target's trivial self-correlation once and reuse the result.
# errors="ignore" keeps the cell running if Target_Encoded was not picked up by
# the dtype filter above (e.g. the encoder produced a dtype other than int64).
feature_correlations = correlations.drop("Target_Encoded", errors="ignore")
bar_colors = [
    "#27ae60" if value > 0 else "#e74c3c" for value in feature_correlations
]

plt.figure(figsize=(10, 12))
feature_correlations.plot(kind="barh", color=bar_colors)
plt.title("Corrélation des variables avec Target")
plt.xlabel("Coefficient de corrélation")
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every remaining column with the encoded target; the string
# Target column is removed first.
df.drop(columns="Target").corr().loc[:, "Target_Encoded"]

In [None]:
# Render the frame (without the string Target column) as an image:
# one pixel per cell.
frame_without_target = df.drop(columns="Target")
fig = px.imshow(frame_without_target)
fig.show()

In [None]:
# Fraction of missing values per column, largest first.
df.isna().mean().sort_values(ascending=False)

In [None]:
# Pie chart of the encoded target classes.
df["Target_Encoded"].value_counts().plot(kind="pie")

In [None]:
# Visual map of missing cells: one line per row of the frame, one column per variable.
plt.figure(figsize=(20, 10))
null_mask = df.isna()
sns.heatmap(null_mask, cbar=False)