In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OrdinalEncoder


In [40]:
# Load '../data/data_cleaned.csv' into `df`; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv('../data/data_cleaned.csv', sep=',')

In [41]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Target' labels as integers in a new 'Target_encoded' column and
# remove the original 'Target' column. `le` stays bound so the fitted label
# mapping remains available afterwards.
le = LabelEncoder()
df = (
    df
    .assign(Target_encoded=le.fit_transform(df['Target']))
    .drop(columns='Target')
)

In [42]:
# Columns to convert to the pandas `category` dtype (the encoded target included)
col = [
    'Marital_status',
    'Application_mode',
    'Course',
    'Previous_qualification',
    'Mother_qualification',
    'Father_qualification',
    'Mother_occupation',
    'Father_occupation',
    'Target_encoded',
]

# One astype call with a column -> dtype mapping; all other columns are untouched
df = df.astype({name: 'category' for name in col})

In [43]:
# NOTE: a notebook cell only renders its last expression, so the head() preview is
# computed but not displayed here; info() prints the column/dtype summary to stdout.
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   Marital_status                                4424 non-null   category
 1   Application_mode                              4424 non-null   category
 2   Application_order                             4424 non-null   int64   
 3   Course                                        4424 non-null   category
 4   Daytime/evening_attendance                    4424 non-null   int64   
 5   Previous_qualification                        4424 non-null   category
 6   Previous_qualification_grade                  4424 non-null   float64 
 7   Nationality                                   4424 non-null   int64   
 8   Mother_qualification                          4424 non-null   category
 9   Father_qualification                          4424 n

# Data Redundancy

In [44]:
# Names of all columns whose dtype is categorical, in their original column order
cats = [
    name
    for name, dtype in df.dtypes.items()
    if isinstance(dtype, pd.CategoricalDtype)
]
print(cats)

['Marital_status', 'Application_mode', 'Course', 'Previous_qualification', 'Mother_qualification', 'Father_qualification', 'Mother_occupation', 'Father_occupation', 'Target_encoded']


In [45]:
# Working copy of `df` without these three columns; `df` itself is left unchanged
stud_selected = df.drop(
    columns=['Nationality', 'International', 'Educational_special_needs']
)

In [31]:
# Print the column / non-null count / dtype summary of `stud_selected`
stud_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 34 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   Marital_status                                4424 non-null   category
 1   Application_mode                              4424 non-null   category
 2   Application_order                             4424 non-null   int64   
 3   Course                                        4424 non-null   category
 4   Daytime/evening_attendance                    4424 non-null   int64   
 5   Previous_qualification                        4424 non-null   category
 6   Previous_qualification_grade                  4424 non-null   float64 
 7   Mother_qualification                          4424 non-null   category
 8   Father_qualification                          4424 non-null   category
 9   Mother_occupation                             4424 n

## Numerical features

In [9]:
# Averaging academic performance data across two semesters
# stud_selected['avg_credited'] = stud_selected[['Curricular_units_1st_sem_credited','Curricular_units_2nd_sem_credited']].mean(axis = 1)

# stud_selected['avg_enrolled'] = stud_selected[['Curricular_units_1st_sem_enrolled','Curricular_units_2nd_sem_enrolled']].mean(axis = 1)

# stud_selected['avg_evaluations'] = stud_selected[['Curricular_units_1st_sem_evaluations','Curricular_units_2nd_sem_evaluations']].mean(axis = 1)

# stud_selected['avg_approved'] = stud_selected[['Curricular_units_1st_sem_approved','Curricular_units_2nd_sem_approved']].mean(axis = 1)

# stud_selected['avg_grade'] = stud_selected[['Curricular_units_1st_sem_grade','Curricular_units_2nd_sem_grade']].mean(axis = 1)

# stud_selected['avg_without_evaluations'] = stud_selected[['Curricular_units_1st_sem_without_evaluations','Curricular_units_2nd_sem_without_evaluations']].mean(axis = 1)

# stud_selected['approval_rate'] = stud_selected['avg_approved'] / stud_selected['avg_evaluations'].replace(0, 1)



| Feature | Motivazione |
| --- | --- |
| **Prestazione media** | Compatta le due note in un’unica metrica di qualità. |
| **Efficienza** | Normalizzi per il numero di unità: evita che chi segue più corsi sembri “peggiore” solo perché ha più valutazioni. |
| **Trend / miglioramento** | Cattura se lo studente recupera o peggiora; spesso è più predittivo del valore assoluto. |
| **Carico di lavoro** | Indicatori di impegno complessivo, utili per dropout-risk. |
| **Flag di rischio precoce** | Variabile binaria che riassume il primo semestre. |


In [46]:
def _pass_rate(approved, enrolled):
    """Return approved / enrolled per row, with 0.0 wherever enrolled is not > 0.

    The guard turns a zero enrolment into 0.0 instead of the inf/NaN the raw
    division would give. A missing (NaN) enrolment also fails the `> 0` test, so
    it is reported as 0.0 as well, not NaN. Switch the fill value to np.nan here
    if "no enrolment" should stay distinguishable from "nothing passed".
    """
    return np.where(enrolled > 0, approved / enrolled, 0.0)


# Average performance: row-wise mean of the two semester grades
stud_selected["avg_grade"] = stud_selected[
    ["Curricular_units_1st_sem_grade", "Curricular_units_2nd_sem_grade"]
].mean(axis=1)

# Pass-rate per semester (approved units / enrolled units)
stud_selected["pass_rate_1st"] = _pass_rate(
    stud_selected["Curricular_units_1st_sem_approved"],
    stud_selected["Curricular_units_1st_sem_enrolled"],
)
stud_selected["pass_rate_2nd"] = _pass_rate(
    stud_selected["Curricular_units_2nd_sem_approved"],
    stud_selected["Curricular_units_2nd_sem_enrolled"],
)

# Change in pass-rate from the first to the second semester
stud_selected["pass_rate_delta"] = (
    stud_selected["pass_rate_2nd"] - stud_selected["pass_rate_1st"]
)


# Deltas (trend): second-semester value minus first-semester value
stud_selected["grade_delta"] = (
    stud_selected["Curricular_units_2nd_sem_grade"]
    - stud_selected["Curricular_units_1st_sem_grade"]
)
stud_selected["approved_delta"] = (
    stud_selected["Curricular_units_2nd_sem_approved"]
    - stud_selected["Curricular_units_1st_sem_approved"]
)

# Workload aggregates: totals over both semesters
stud_selected["total_enrolled"] = (
    stud_selected["Curricular_units_1st_sem_enrolled"]
    + stud_selected["Curricular_units_2nd_sem_enrolled"]
)
stud_selected["total_credited"] = (
    stud_selected["Curricular_units_1st_sem_credited"]
    + stud_selected["Curricular_units_2nd_sem_credited"]
)

# Early-warning flag
#stud_selected["early_fail_flag"] = (stud_selected["pass_rate_1st"] < 0.50).astype(int)

Quando due set di variabili sono altamente correlati perché misurano lo stesso concetto in momenti diversi, trasforma le seconde in segnali di progresso (differenze, rapporti, medie); spesso è l’andamento, non il valore assoluto, a distinguere chi si laurea da chi abbandona.

In [47]:
# Per-semester ratio: evaluations / (evaluations + without_evaluations).
# The denominator is floored at 1 with .clip(lower=1), so a row whose two counts
# sum to 0 is divided by 1 rather than by 0.
stud_selected["eval_completion_rate_1st"] = stud_selected[
    "Curricular_units_1st_sem_evaluations"
].div(
    stud_selected["Curricular_units_1st_sem_evaluations"]
    .add(stud_selected["Curricular_units_1st_sem_without_evaluations"])
    .clip(lower=1)
)

stud_selected["eval_completion_rate_2nd"] = stud_selected[
    "Curricular_units_2nd_sem_evaluations"
].div(
    stud_selected["Curricular_units_2nd_sem_evaluations"]
    .add(stud_selected["Curricular_units_2nd_sem_without_evaluations"])
    .clip(lower=1)
)

# Second-semester rate minus first-semester rate
stud_selected["eval_completion_delta"] = stud_selected["eval_completion_rate_2nd"].sub(
    stud_selected["eval_completion_rate_1st"]
)

0 → 1: quanto lo studente si presenta davvero agli esami.

Delta > 0: migliora l’impegno; <br>
Delta < 0: cala (segnale di rischio).

In [48]:
# Raw per-semester columns that the engineered features above were built from,
# listed as 1st/2nd-semester pairs per quantity.
raw_semester_cols = [
    'Curricular_units_1st_sem_credited',
    'Curricular_units_2nd_sem_credited',
    'Curricular_units_1st_sem_enrolled',
    'Curricular_units_2nd_sem_enrolled',
    'Curricular_units_1st_sem_evaluations',
    'Curricular_units_2nd_sem_evaluations',
    'Curricular_units_1st_sem_approved',
    'Curricular_units_2nd_sem_approved',
    'Curricular_units_1st_sem_grade',
    'Curricular_units_2nd_sem_grade',
    'Curricular_units_1st_sem_without_evaluations',
    'Curricular_units_2nd_sem_without_evaluations',
]

# Remove them now that only the aggregated versions are needed
stud_selected = stud_selected.drop(raw_semester_cols, axis=1)

## ENCODING

In [49]:
# Identify categorical columns (category dtype only; object columns are not selected).
# NOTE: the list also contains 'Target_encoded', because the target was cast to
# category together with the features.
cat_cols = stud_selected.select_dtypes(include=["category"]).columns.tolist()
print("Categorical features:", cat_cols)

Categorical features: ['Marital_status', 'Application_mode', 'Course', 'Previous_qualification', 'Mother_qualification', 'Father_qualification', 'Mother_occupation', 'Father_occupation', 'Target_encoded']


In [50]:
def imbalance_report(df: pd.DataFrame, cat_columns, rare_thresh=0.01):
    """
    Return a DataFrame with imbalance metrics for each categorical column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns to inspect.
    cat_columns : iterable of column labels
        Columns to summarise. May be empty, in which case an empty report
        with the usual columns is returned.
    rare_thresh : float, default 0.01
        Relative frequency strictly below which a category counts as rare.

    Returns
    -------
    pd.DataFrame
        Indexed by ``feature`` and sorted by ``dominant_share`` (descending):
        - dominant_share: relative frequency (0-1) of the most frequent category;
                          NaN when the column has no rows
        - rare_share    : summed relative frequency of categories below `rare_thresh`
        - n_categories  : number of distinct categories actually observed
                          (missing values count as one category)
    """
    metrics = ["dominant_share", "rare_share", "n_categories"]
    rows = []
    for col in cat_columns:
        freqs = df[col].value_counts(normalize=True, dropna=False)
        # For category-dtype columns value_counts also lists declared-but-unobserved
        # categories with frequency 0 (NaN when the column has no rows). Keep only
        # the observed ones so they do not inflate n_categories.
        freqs = freqs[freqs > 0]
        rows.append({
            "feature": col,
            # max() instead of iloc[0]: no reliance on sort order, no IndexError on empty
            "dominant_share": freqs.max() if len(freqs) else float("nan"),
            "rare_share": freqs[freqs < rare_thresh].sum(),
            "n_categories": len(freqs),
        })
    # Naming the columns up front keeps set_index/sort_values valid for an empty report.
    report = pd.DataFrame(rows, columns=["feature", *metrics])
    return report.set_index("feature").sort_values("dominant_share", ascending=False)

# Build the report for every column in `cat_cols`; categories with a relative
# frequency below 0.01 are counted as rare. The report is already sorted by
# dominant_share in descending order, so head() starts with the most skewed feature.
imbalance_df = imbalance_report(stud_selected, cat_cols, rare_thresh=0.01)
print(imbalance_df.head(16))    # show the most skewed first


                        dominant_share  rare_share  n_categories
feature                                                         
Marital_status                0.885850    0.007911             6
Previous_qualification        0.840190    0.035036            17
Target_encoded                0.499322    0.000000             3
Application_mode              0.386076    0.030515            18
Mother_occupation             0.356465    0.034358            32
Father_qualification          0.273282    0.040461            34
Mother_qualification          0.241637    0.029611            29
Father_occupation             0.228300    0.025316            46
Course                        0.173146    0.002712            17


| Soglia | Interpretazione |
| --- | --- |
| `dominant_share ≥ 0.85` | Variabile dominata da un’unica modalità → alto rischio che le altre colonne “valgano zero”. |
| `rare_share ≥ 0.20` | Molte modalità con frequenza < 1 % → sparsità elevata in one-hot, possibile over-fitting. |
| `n_categories ≫ 50` | Cardinalità alta, anche se distribuzione non estrema. |


In [36]:
# Print the column / non-null count / dtype summary of `stud_selected`
stud_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Marital_status                4424 non-null   category
 1   Application_mode              4424 non-null   category
 2   Application_order             4424 non-null   int64   
 3   Course                        4424 non-null   category
 4   Daytime/evening_attendance    4424 non-null   int64   
 5   Previous_qualification        4424 non-null   category
 6   Previous_qualification_grade  4424 non-null   float64 
 7   Mother_qualification          4424 non-null   category
 8   Father_qualification          4424 non-null   category
 9   Mother_occupation             4424 non-null   category
 10  Father_occupation             4424 non-null   category
 11  Admission_grade               4424 non-null   float64 
 12  Displaced                     4424 non-null   in

In [51]:
# ---------------------------------------------------------------------
# 0) Imports
# ---------------------------------------------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder          # pip install category_encoders
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

# ---------------------------------------------------------------------
# 1) Prepare X and y  (raw dataframe → split)
# ---------------------------------------------------------------------
RANDOM_STATE = 1     # single seed shared by the split, the forest and the CV folds

X = stud_selected.drop(columns='Target_encoded')               # raw, **un-encoded**
y = stud_selected["Target_encoded"]                            # three classes

# Make sure y is a plain integer vector.
# Text labels are mapped to ints first; the column is then cast explicitly. The cast
# is what actually matters when 'Target_encoded' arrives as category dtype: the
# `object` check alone never fires for it, so y would otherwise stay categorical.
if y.dtype == "object":
    y = y.map({c: i for i, c in enumerate(y.unique())})
y = y.astype(int)

train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y   # stratify keeps class balance
)

# ---------------------------------------------------------------------
# 2) Pre-processing blocks
# ---------------------------------------------------------------------
onehot_cols = ["Marital_status", "Application_mode", "Course"]   # low-cardinality
target_cols = ["Previous_qualification", "Mother_occupation",
               "Father_qualification", "Mother_qualification",
               "Father_occupation"]                              # high-cardinality

# Everything that is neither object nor category is passed through unchanged
num_cols = X.select_dtypes(exclude=["object", "category"]).columns.tolist()

# NOTE(review): category_encoders' TargetEncoder documents support for binomial and
# continuous targets only. With three classes it presumably encodes the mean of the
# 0/1/2 label, which imposes an ordering on the classes — confirm this is intended
# or consider a multiclass-aware encoding.
preprocess = ColumnTransformer([
    ("oh", OneHotEncoder(handle_unknown="ignore", drop="if_binary"), onehot_cols),
    ("te", TargetEncoder(cols=target_cols, smoothing=5.0),           target_cols),
    ("num", "passthrough",                                           num_cols)
])

# ---------------------------------------------------------------------
# 3) Random-Forest classifier
# ---------------------------------------------------------------------
clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced",        # handles slight class imbalance
    n_jobs=-1,
    random_state=RANDOM_STATE
)

# Encoders live inside the pipeline, so they are re-fitted on the training part of
# every fit call (including each CV fold below).
pipe = Pipeline([
    ("prep", preprocess),
    ("rf",   clf)
])

# ---------------------------------------------------------------------
# 4) Fit on training data
# ---------------------------------------------------------------------
pipe.fit(train_X, train_y)

# ---------------------------------------------------------------------
# 5) Validation on held-out 20 %
# ---------------------------------------------------------------------
val_pred = pipe.predict(val_X)
macro_f1 = f1_score(val_y, val_pred, average="macro")

print(f"Validation macro-F1: {macro_f1:.3f}")
print(classification_report(val_y, val_pred))

# ---------------------------------------------------------------------
# 6) (Optional) Five-fold stratified CV for a more stable estimate
# ---------------------------------------------------------------------
# NOTE: the CV runs on the full X / y, i.e. it also covers the rows held out above;
# it is a separate estimate, not a check on the same train/validation split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(pipe, X, y, cv=cv, scoring="f1_macro")
print(f"5-fold CV macro-F1: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


Validation macro-F1: 0.726
              precision    recall  f1-score   support

           0       0.85      0.74      0.79       284
           1       0.55      0.50      0.52       159
           2       0.82      0.91      0.86       442

    accuracy                           0.78       885
   macro avg       0.74      0.72      0.73       885
weighted avg       0.78      0.78      0.78       885





5-fold CV macro-F1: 0.719 ± 0.022
