In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time
from sklearn.metrics import accuracy_score

In [2]:
# Load the training data from disk, using the "id" column as the row index.
train = pd.read_csv(
    "data/train.csv",
    index_col="id",
)

# Pre-process

In [3]:
# Feature-column groups of the training frame.
# NOTE(review): the names are presumably the exact CSV headers (including the
# spelling "Nacionality") -- confirm against data/train.csv before editing.

# Columns that get one-hot encoded further down the notebook.
Categorical = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Flag-like columns (not referenced by the visible cells).
Boolean = [
    "Daytime/evening attendance",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# Numeric columns (not referenced by the visible cells). The twelve
# per-semester columns follow a single naming pattern, so they are generated
# in order (all "1st" measures, then all "2nd" measures).
Continuous = [
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    *(
        f"Curricular units {semester} sem ({measure})"
        for semester in ("1st", "2nd")
        for measure in (
            "credited",
            "enrolled",
            "evaluations",
            "approved",
            "grade",
            "without evaluations",
        )
    ),
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

In [4]:
# Pull the label column out of the frame; every remaining column is a feature.
y = train["Target"]
X = train.drop(columns=["Target"])

In [5]:
# One-hot encode the categorical feature columns.
# The encoding is derived from `train` rather than from `X` itself so this cell
# is idempotent: re-running `X = pd.get_dummies(X, columns=Categorical)` on an
# already-encoded `X` raises KeyError, because the original categorical
# columns have been replaced by their dummy columns.
X = pd.get_dummies(train.drop(columns=["Target"]), columns=Categorical)

In [6]:
# Encode the target labels as integer class ids.
# The encoder is always fitted on the raw `train["Target"]` column rather than
# on `y`, so re-running this cell cannot refit it on already-encoded integers
# (which would replace the original labels stored in `target_encoder.classes_`
# and make `inverse_transform` return integers instead of label names).
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train["Target"])

# Train

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid: 3**5 = 243 candidates, each cross-validated on 3 folds
# (729 fits in total), so this cell is expensive to re-run.
param_grid = {
    'eta': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'n_estimators': [100, 200, 300]
}

# Base estimator whose hyper-parameters are searched.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(target_encoder.classes_))

# Exhaustive grid search scored by accuracy. refit=True (the default, spelled
# out here) retrains the best configuration on all of X_train once the search
# has finished.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=True,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyper-parameters.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Reuse the estimator GridSearchCV already refitted instead of fitting a fresh
# XGBClassifier(**best_params): that avoids a redundant training run and keeps
# the base settings (objective, num_class) that were actually cross-validated,
# which the bare constructor call silently dropped.
best_model = grid_search.best_estimator_

# Predict on the held-out split.
y_pred = best_model.predict(X_val)

# Evaluate. Four decimals, matching the precision the native-API training cell
# reports, so the two validation scores can be compared directly.
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

In [35]:
# Wrap the train/validation splits in XGBoost's native DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # For classification; predict() yields class ids
    'num_class': len(target_encoder.classes_),  # Number of unique classes in target
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
}

# Train for at most 500 rounds, stopping once the validation mlogloss has not
# improved for 10 consecutive rounds. perf_counter() is used for the timing
# because it is a monotonic clock meant for measuring intervals.
# NOTE(review): the same validation split drives early stopping and the
# accuracy reported below, so that score is likely somewhat optimistic.
start_time = time.perf_counter()
model = xgb.train(params, dtrain, num_boost_round=500, evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=False)
end_time = time.perf_counter()

# Predict with the trees up to and including the best iteration only.
# xgb.train returns the booster from the last round, which includes the rounds
# trained after the best score was reached; bounding the iteration range keeps
# the evaluated model consistent with the round count reported below.
best_iteration = model.best_iteration
y_pred = model.predict(dval, iteration_range=(0, best_iteration + 1)).astype(int)  # float class ids -> int labels

# Evaluate
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Number of boosting rounds used: {best_iteration + 1}")
print(f"Time taken: {end_time - start_time} seconds")

Validation Accuracy: 0.8338
Number of boosting rounds used: 223
Time taken: 14.54142713546753 seconds
