In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled training rows and the unlabelled submission rows;
# in both files the "id" column becomes the DataFrame index.
train, submission_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv")
)

# Pre-process

In [3]:
# Column groups of the dataset, by the kind of value each column holds.

# Coded columns; these are the ones passed to pd.get_dummies further down.
Categorical = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Flag-like columns.
Boolean = [
    "Daytime/evening attendance",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# Numeric columns: admission data, then the same six per-semester statistics
# for the 1st and 2nd semester, then the macro-economic indicators.
Continuous = [
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    *(
        f"Curricular units {semester} sem ({statistic})"
        for semester in ("1st", "2nd")
        for statistic in (
            "credited",
            "enrolled",
            "evaluations",
            "approved",
            "grade",
            "without evaluations",
        )
    ),
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

In [4]:
# Pull the label column out of the training frame; everything else is a feature.
y = train['Target']
X_train_raw = train.drop(columns=['Target'])

# Peek at the first five labels
y.head()

id
0    Graduate
1     Dropout
2     Dropout
3    Enrolled
4    Graduate
Name: Target, dtype: object

In [5]:
# Combine train and test for consistent encoding: one-hot encoding the
# stacked frame gives both splits exactly the same dummy columns.
combined = pd.concat([X_train_raw, submission_test], axis=0)

# Apply get_dummies to the categorical columns only
combined_encoded = pd.get_dummies(combined, columns=Categorical)

# Split back into train and test by row position rather than by id label.
# concat keeps the training rows first, so the first len(X_train_raw) rows
# are the training set. A label-based .loc lookup would return rows from
# BOTH frames for any id that happens to appear in train and test.
n_train_rows = len(X_train_raw)
X = combined_encoded.iloc[:n_train_rows]
X_test = combined_encoded.iloc[n_train_rows:]

In [6]:
# Encode target column as integer class codes.
# The labels are read straight from `train` instead of from `y`, so this cell
# is idempotent: re-running it no longer refits the encoder on the already
# encoded integers (which would make inverse_transform return numbers
# instead of the original label strings).
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train['Target'])

In [7]:
# Disabled experiment: one-hot encoding of the target. Left commented out;
# the LabelEncoder integer codes are what the training cell consumes.
#y = pd.get_dummies(y)
#y.head()

# Train

In [8]:
# NOTE(review): the hold-out split below is disabled, so the model is fitted on
# every labelled row and no validation score is computed in this notebook.
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Convert to DMatrix for XGBoost.
# dtrain carries the encoded features plus the integer labels; dtest carries
# the submission features only.
dtrain = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(X_test)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # For classification
    'num_class': len(target_encoder.classes_),  # Number of unique classes in target
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 3,
    'lambda': 4,
    'alpha': 1,
    'subsample': 1,
    'colsample_bytree': 1
}

# Train the model.
# Elapsed time is measured with perf_counter(): it is monotonic, so a
# system-clock adjustment during this multi-minute run cannot distort
# (or make negative) the reported duration, unlike time.time().
start_time = time.perf_counter()
model = xgb.train(params, dtrain, num_boost_round=2000, verbose_eval=False)
end_time = time.perf_counter()
print(f"Time taken: {round(end_time - start_time)} seconds")

Time taken: 134 seconds


In [10]:
# Predict on the submission test set (dtest wraps X_test; this notebook
# has no validation split).
y_pred = model.predict(dtest)

In [19]:
# Map the predicted class codes back to the original label strings.
# The predictions need an integer cast before they can be decoded; astype(int)
# does it in one vectorized step instead of a Python-level loop over every row.
target_list = target_encoder.inverse_transform(y_pred.astype(int))

In [25]:
# Assemble the submission table: one row per test id with its decoded label.
submission_columns = {
    'id': submission_test.index,
    'Target': target_list,
}
df = pd.DataFrame(submission_columns)

# Persist it as CSV; the positional row index is not written out.
df.to_csv('data/output.csv', index=False)

In [24]:
# Inspect the index of the submission frame (the values used for the 'id' column of the output file).
submission_test.index

Index([ 76518,  76519,  76520,  76521,  76522,  76523,  76524,  76525,  76526,
        76527,
       ...
       127520, 127521, 127522, 127523, 127524, 127525, 127526, 127527, 127528,
       127529],
      dtype='int64', name='id', length=51012)