In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv("data/train.csv", index_col="id")

# Pre-process

In [3]:
Categorical = ["Marital status", 
               "Application mode", 
               "Application order", 
               "Course", 
               "Previous qualification", 
               "Nacionality", 
               "Mother's qualification", 
               "Father's qualification", 
               "Mother's occupation", 
               "Father's occupation"]
Boolean = ["Daytime/evening attendance", 
           "Displaced", 
           "Educational special needs", 
           "Debtor", 
           "Tuition fees up to date", 
           "Gender", 
           "Scholarship holder", 
           "International" ]
Continuous = ["Previous qualification (grade)", 
              "Admission grade", 
              "Age at enrollment", 
              "Curricular units 1st sem (credited)", 
              "Curricular units 1st sem (enrolled)", 
              "Curricular units 1st sem (evaluations)", 
              "Curricular units 1st sem (approved)", 
              "Curricular units 1st sem (grade)", 
              "Curricular units 1st sem (without evaluations)",
              "Curricular units 2nd sem (credited)", 
              "Curricular units 2nd sem (enrolled)", 
              "Curricular units 2nd sem (evaluations)", 
              "Curricular units 2nd sem (approved)", 
              "Curricular units 2nd sem (grade)", 
              "Curricular units 2nd sem (without evaluations)",
              "Unemployment rate",
              "Inflation rate",
              "GDP"]

In [36]:
# Separate features and target
X = train.drop(columns=['Target'])
y = train['Target']
y.head(5)

id
0    Graduate
1     Dropout
2     Dropout
3    Enrolled
4    Graduate
Name: Target, dtype: object

In [37]:
X = pd.get_dummies(X, columns=Categorical)

In [38]:
# Encode target column
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [39]:
#y = pd.get_dummies(y)
#y.head()

# Train

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # For classification
    'num_class': len(target_encoder.classes_),  # Number of unique classes in target
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 3,
    'lambda': 4,
    'alpha': 1,
    'subsample': 1,
    'colsample_bytree': 1
}

# Train the model
start_time = time.time()
model = xgb.train(params, dtrain, num_boost_round=2000, evals=[(dval, 'eval')], early_stopping_rounds=20, verbose_eval=False)
end_time = time.time()

# Predict on validation set
y_val_pred = model.predict(dval)

# Predict on training set
y_train_pred = model.predict(dtrain)

# Evaluate
val_accuracy = accuracy_score(y_val, y_val_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")

best_iteration = model.best_iteration
print(f"Number of boosting rounds used: {best_iteration + 1}")
print(f"Time taken: {round(end_time - start_time)} seconds")

Validation Accuracy: 0.8351
Training Accuracy: 0.8469
Number of boosting rounds used: 804
Time taken: 39 seconds
