## Load the Cleaned Data

In [9]:
# Import necessary libraries
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Read the preprocessed dataset from disk into a DataFrame
data_path = '../data/processed/cleaned_data.csv'
df_cleaned = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows as a quick sanity check of the load
df_cleaned.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,-1.965152,0,-0.287638,0.124386,0.765761,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,0.661712,0,0.876222,-1.105222,0.347199,2
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,-1.965152,0,-0.287638,0.124386,0.765761,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,0.418247,0,-0.813253,-1.466871,-1.375511,2
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,0.533572,0,0.876222,-1.105222,0.347199,2


In [10]:
# Separate the label from the predictors:
#   y -> the 'Target' column
#   X -> every remaining column
y = df_cleaned['Target']
X = df_cleaned.drop(columns='Target')

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — with imbalanced classes, consider
# passing stratify=y (this would change every metric reported below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
### Random Forest Classifier ###
# Fit a forest with library-default hyperparameters on the training split.
# random_state is fixed so repeated runs build the same trees.
print("Training Random Forest...")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train, y=y_train)

Training Random Forest...


In [13]:
# Score the fitted forest on the held-out test split
rf_y_pred = rf_model.predict(X_test)

# Compute both metrics first, then report them
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)

Random Forest Accuracy: 0.7612951807228916
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80       441
           1       0.54      0.30      0.38       245
           2       0.77      0.92      0.84       642

    accuracy                           0.76      1328
   macro avg       0.71      0.67      0.67      1328
weighted avg       0.74      0.76      0.74      1328



In [24]:
# Show the per-feature importance scores learned by the Random Forest.
# The array carries no labels; entries presumably follow the column order of X.
rf_importances = rf_model.feature_importances_
print("Random Forest Feature Importances:", rf_importances)

Random Forest Feature Importances: [0.00365778 0.021736   0.01548694 0.03380309 0.00290838 0.00567241
 0.03788051 0.00181116 0.02024337 0.02203108 0.02491145 0.02683422
 0.04140607 0.00797871 0.00099606 0.01052645 0.03612934 0.0096475
 0.01988892 0.03882768 0.00159306 0.0061153  0.01843568 0.03683555
 0.08683239 0.05992648 0.00466178 0.00557963 0.02237594 0.04242251
 0.14674548 0.11382768 0.00497404 0.02180379 0.02145823 0.02403532]


In [15]:
# Save Random Forest model
rf_model_path = '../models/random_forest_model.pkl'
# Create the target folder first so the dump does not fail when ../models/
# is missing (e.g. on a fresh checkout).
Path(rf_model_path).parent.mkdir(parents=True, exist_ok=True)
# Serialize the fitted estimator to disk with joblib
joblib.dump(rf_model, rf_model_path)
print(f"Random Forest model saved to {rf_model_path}")

Random Forest model saved to ../models/random_forest_model.pkl


In [21]:
### XGBoost Classifier ###
!pip install xgboost
from xgboost import XGBClassifier
print("Training XGBoost...")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 508.0 kB/s eta 0:04:05
   ---------------------------------------- 0.5/124.9 MB 508.0 kB/s eta 0:04:05
   ---------------------------------------- 0.8/124.9 MB 568.6 kB/s eta 0:03:39
   ---------------------------------------- 0.8/124.9 MB 568.6 kB/s eta 0:03:39
   ---------------------------------------- 0.8/124.9 MB 568.6 kB/s eta 0:03:39
   -----------------------

Parameters: { "use_label_encoder" } are not used.



In [22]:
# Score the fitted XGBoost model on the held-out test split
xgb_y_pred = xgb_model.predict(X_test)

# Compute both metrics first, then report them
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_report = classification_report(y_test, xgb_y_pred)
print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Classification Report:\n", xgb_report)

XGBoost Accuracy: 0.7680722891566265
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.76      0.79       441
           1       0.56      0.40      0.47       245
           2       0.79      0.92      0.85       642

    accuracy                           0.77      1328
   macro avg       0.72      0.69      0.70      1328
weighted avg       0.76      0.77      0.76      1328



In [25]:
# Show the per-feature importance scores learned by the XGBoost model.
# The array carries no labels; entries presumably follow the column order of X.
xgb_importances = xgb_model.feature_importances_
print("XGBoost Feature Importances:", xgb_importances)

XGBoost Feature Importances: [0.01418724 0.01627711 0.01729712 0.02663462 0.01845397 0.0149558
 0.01558631 0.01654263 0.01430725 0.01589114 0.01926053 0.01331599
 0.01604368 0.01250485 0.00839298 0.03181771 0.10180705 0.01959576
 0.02895857 0.02259554 0.         0.0185565  0.06107422 0.0312
 0.02847164 0.01404556 0.01552831 0.01932123 0.02659729 0.02513027
 0.22149074 0.01888542 0.02470326 0.01526918 0.01677992 0.0185205 ]


In [23]:
# Save XGBoost model
xgb_model_path = '../models/xgboost_model.pkl'
# Create the target folder first so the dump does not fail when ../models/
# is missing (e.g. on a fresh checkout).
Path(xgb_model_path).parent.mkdir(parents=True, exist_ok=True)
# Serialize the fitted estimator to disk with joblib.
# NOTE(review): a pickled model is generally tied to the library version that
# wrote it; xgboost's own save_model() format may be the more portable choice.
joblib.dump(xgb_model, xgb_model_path)
print(f"XGBoost model saved to {xgb_model_path}")

XGBoost model saved to ../models/xgboost_model.pkl
