In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
import datetime
import optuna
import pprint
import joblib

from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize, PolynomialFeatures, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.base import clone
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

from scipy import stats

import xgboost

import gc

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

gc.collect()

26436

In [28]:
train_df = pd.read_csv("data/train.csv", index_col=0)
test_df = pd.read_csv("data/test.csv", index_col=0)
original_df = pd.read_csv("data/original.csv",sep=";")
train_features = test_df.columns

cat_features = ['Marital status', 'Application mode', "Application order", 'Course', "Daytime/evening attendance",
                'Previous qualification', 'Nacionality', "Mother's qualification", 
                "Father's qualification", "Mother's occupation",
                "Father's occupation", "Displaced", "Educational special needs", "Debtor", "Tuition fees up to date",
                "Gender", "Scholarship holder", "International"]
cont_features = [feature for feature in train_features if feature not in cat_features]

In [81]:
for col in train_df.columns:
    print(f"{col:>40} nunique {train_df[col].nunique()}")

                          Marital status nunique 6
                        Application mode nunique 22
                       Application order nunique 8
                                  Course nunique 19
              Daytime/evening attendance nunique 2
                  Previous qualification nunique 21
          Previous qualification (grade) nunique 110
                             Nacionality nunique 18
                  Mother's qualification nunique 35
                  Father's qualification nunique 39
                     Mother's occupation nunique 40
                     Father's occupation nunique 56
                         Admission grade nunique 668
                               Displaced nunique 2
               Educational special needs nunique 2
                                  Debtor nunique 2
                 Tuition fees up to date nunique 2
                                  Gender nunique 2
                      Scholarship holder nunique 2
                   

In [29]:
for feat in cat_features:
    dtype = pd.CategoricalDtype(categories=list(set(train_df[feat]) | set(test_df[feat]) | set(original_df[feat])), ordered=False)
    for df in [train_df, test_df, original_df]:
        df[feat] = df[feat].astype(dtype)

combined_df = pd.concat([train_df, original_df], axis=0, ignore_index=True)

In [30]:
poly_feats = joblib.load("AutogluonModels/ag-misc/poly_feats.pkl")

train_cont = combined_df[cont_features]
test_cont = test_df[cont_features]

poly_train = poly_feats.transform(train_cont)
poly_train = pd.DataFrame(poly_train, index=train_cont.index, columns=poly_feats.get_feature_names_out())

poly_test = poly_feats.transform(test_cont)
poly_test = pd.DataFrame(poly_test, index=test_cont.index, columns=poly_feats.get_feature_names_out())

train_data = combined_df.drop(columns=cont_features)
train_data = pd.concat([train_data, poly_train], axis=1)
test_data = test_df.drop(columns=cont_features)
test_data = pd.concat([test_data, poly_test], axis=1)

train_data = TabularDataset(train_data)
test_data = TabularDataset(test_data)

# ag_poly_orig = TabularPredictor.load("AutogluonModels/ag-20240619_194852")
# ag_poly_orig.feature_importance(train_data)
fi = pd.read_csv("ag_feature_importances.csv", index_col=0)

filtered_importances = fi[abs(fi["importance"]) - 1e-4 > 0]

fig = px.bar(x=filtered_importances.index, y=filtered_importances["importance"])
fig.update_layout(
    xaxis=dict(
        showticklabels=False
    )
)

In [31]:
study = optuna.load_study(
    storage="sqlite:///optuna.sqlite3",
    study_name="aug_xgb_v1",
)

In [32]:
sorted_trials = sorted(study.trials, key=lambda x: x.value if x.value else 0)
top_params = [(t.value, t.params) for t in sorted_trials[:-21:-1]]
for value, param_dict in top_params:
    param_dict["value"] = value

top_params = pd.DataFrame([t[1] for t  in top_params])
top_params.insert(0, "value", top_params.pop("value"))
top_params

Unnamed: 0,value,include_orig,prune_low_freq,prune_low_importance,poly_feats,use_standardscaler,use_robustscaler,classifier,n_estimators,eta,gamma,max_depth,max_leaves,colsample_bytree,colsample_bylevel,colsample_bynode,reg_lambda,reg_alpha,grow_policy,min_child_weight,max_delta_step,poly_feats_degrees,with_centering,with_scaling
0,0.831608,False,True,False,False,False,False,xgboost,1883,0.112997,0.963429,3,1346,0.838586,0.807475,0.70647,3,8,lossguide,70.27952,94.63438,,,
1,0.83068,False,False,False,True,True,True,xgboost,282,0.41117,0.033985,3,3611,0.62613,0.993347,0.103194,7,7,depthwise,628.25377,98.79208,2.0,True,False
2,0.830197,False,True,False,False,False,True,xgboost,648,0.295632,1.409754,3,778,0.916791,0.923863,0.680886,1,8,depthwise,50.156449,64.305984,,False,False
3,0.829962,False,False,False,True,True,True,xgboost,2564,0.364086,1.832124,3,2982,0.528489,0.82678,0.246813,4,6,depthwise,14.623085,40.438325,2.0,True,False
4,0.829609,False,False,False,True,True,True,xgboost,968,0.499941,0.293968,3,4280,0.809267,0.9308,0.154274,6,4,depthwise,702.534889,87.735757,2.0,True,False
5,0.829164,False,True,False,False,False,True,xgboost,101,0.427706,1.321473,3,2039,0.938425,0.610336,0.475127,0,19,depthwise,42.303674,63.856475,,False,False
6,0.829151,False,False,False,True,True,True,xgboost,3618,0.551059,0.297905,4,4569,0.942594,0.990392,0.176987,5,9,depthwise,507.267807,71.320731,2.0,True,False
7,0.828864,False,True,False,False,False,True,xgboost,530,0.294177,1.284759,3,962,0.897616,0.610439,0.70499,1,20,depthwise,76.918538,65.417646,,False,False
8,0.828838,False,False,False,True,True,True,xgboost,2252,0.48106,0.208399,3,3864,0.692294,0.994614,0.140059,7,5,depthwise,646.09797,86.958329,2.0,True,False
9,0.828707,False,False,False,True,True,True,xgboost,2536,0.441049,0.152439,3,28,0.706814,0.998392,0.160563,6,4,depthwise,672.756739,82.725194,2.0,True,False


In [33]:
sp_rows = 5
sp_cols = 5
sp_ids = [(r, c) for r in range(1, sp_rows+1) for c in range(1, sp_cols+1)]
fig = make_subplots(rows=sp_rows, cols=sp_cols, subplot_titles=top_params.columns)

for sp, column in zip(sp_ids, top_params.columns):
    if column not in ["id"]:
        fig.append_trace(
            go.Histogram(
                histfunc="count",
                x=top_params[column],
                name=column,
            ),
            row=sp[0],
            col=sp[1],
        )

variable_kstests = pd.DataFrame()
for column in top_params.columns:
    if top_params[column].dtype not in ["object"]:
        ks_test = stats.kstest(top_params[column], stats.norm.cdf)
        kd_pd = pd.DataFrame({
            "statistic":ks_test.statistic,
            "pvalue":ks_test.pvalue
        },index=[column])   

        variable_kstests = pd.concat([variable_kstests, kd_pd], axis=0)
        
print(variable_kstests)

fig.update_layout(height=1000, width=1800, title_text="Histograms of all columns")
fig.show()
#fig.write_image("var_hists.png")
gc.collect()


                      statistic        pvalue
value                  0.795917  4.373034e-14
include_orig           0.500000  3.787595e-05
prune_low_freq         0.500000  3.787595e-05
prune_low_importance   0.500000  3.787595e-05
poly_feats             0.500000  3.787595e-05
use_standardscaler     0.500000  3.787595e-05
use_robustscaler       0.591345  3.397160e-07
n_estimators           1.000000  0.000000e+00
eta                    0.537306  6.297666e-06
gamma                  0.513555  2.012841e-05
max_depth              0.998650  8.073266e-58
max_leaves             1.000000  0.000000e+00
colsample_bytree       0.668240  2.489991e-09
colsample_bylevel      0.686239  6.770519e-10
colsample_bynode       0.532093  8.176501e-06
reg_lambda             0.798650  3.273501e-14
reg_alpha              0.948650  3.249539e-26
min_child_weight       1.000000  0.000000e+00
max_delta_step         1.000000  0.000000e+00
poly_feats_degrees          NaN           NaN


4000

In [77]:
more_important_features = fi[fi["importance"] > 1e-3]
more_important_features

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Tuition fees up to date,0.02864,0.001352,5.942267e-07,5,0.031424,0.025856
Course,0.01340,0.003338,4.260248e-04,5,0.020272,0.006528
Scholarship holder,0.00976,0.001152,2.289540e-05,5,0.012133,0.007387
Father's occupation,0.00364,0.002032,8.024572e-03,5,0.007823,-0.000543
Application mode,0.00336,0.000654,1.640861e-04,5,0.004707,0.002013
...,...,...,...,...,...,...
Curricular units 1st sem (evaluations)^2 Curricular units 2nd sem (evaluations),0.00116,0.000410,1.595589e-03,5,0.002004,0.000316
Age at enrollment Curricular units 2nd sem (enrolled)^2,0.00112,0.000335,8.525787e-04,5,0.001809,0.000431
Curricular units 1st sem (approved) Curricular units 1st sem (grade) Curricular units 2nd sem (approved),0.00112,0.000867,2.232672e-02,5,0.002906,-0.000666
Curricular units 1st sem (grade) Curricular units 2nd sem (approved) Curricular units 2nd sem (grade),0.00108,0.000657,1.065582e-02,5,0.002433,-0.000273


In [68]:
def objective(trial):
    xgb_model = xgboost.XGBClassifier(
        enable_categorical=True,
        n_jobs=-1,

        n_estimators=trial.suggest_int("n_estimators", 1000, 5000),
        gamma=trial.suggest_float("gamma", 0, 2),
        max_depth=trial.suggest_int("max_depth", 2, 5),
        max_leaves=trial.suggest_int("max_leaves", 1000, 6000),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.5, 1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.05, 1),
        reg_lambda=trial.suggest_float("reg_lambda", 0, 10),
        reg_alpha=trial.suggest_float("reg_alpha", 0, 10),
        grow_policy=trial.suggest_categorical(
                "grow_policy", ["depthwise", "lossguide"]
        ),
        min_child_weight=trial.suggest_float("min_child_weight", 0, 1000),
        max_delta_step=trial.suggest_float("max_delta_step", 50, 1000)
    )
    
    # poly_feats = PolynomialFeatures(trial.suggest_int("poly_feats_high_degree", 1, 3))

    # robust_scaler = None
    # if trial.suggest_categorical("use_robustscaler", [True, False]):
    #     robust_scaler = RobustScaler(
    #         with_centering=trial.suggest_categorical("with_centering", [True, False]),
    #         with_scaling=trial.suggest_categorical("with_scaling", [True, False]),
    #     )

    label_enc = LabelEncoder()
    x = train_data[more_important_features.index]
    y = label_enc.fit_transform(train_data["Target"])
    # x = train_df.drop(columns=["Target"])
    # y = label_enc.fit_transform(train_df["Target"])
    # kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
    kfold = StratifiedKFold(n_splits=5)

    rfecv = RFECV(
        estimator=xgb_model,
        step=5,
        cv=kfold,
        scoring="accuracy",
        min_features_to_select=1,
        n_jobs=-1
    )
    rfecv.fit(x,y)
    cv_results = rfecv.cv_results_

    trial.set_user_attr("best_features_count", int(rfecv.n_features_))
    trial.set_user_attr("mean_test_score", [float(i) for i in cv_results["mean_test_score"]])
    trial.set_user_attr("std_test_score", [float(i) for i in cv_results["std_test_score"]])

    y_preds = rfecv.predict(x)
    # xgb_model.fit(x, y)
    # y_preds = xgb_model.predict(x)
    return accuracy_score(y, y_preds)

In [70]:
# optuna.delete_study(study_name="optimized_xgb_v2", storage="sqlite:///optuna.sqlite3")

study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///optuna.sqlite3",
    study_name="optimized_xgb_v2",
    load_if_exists=True
)

study.optimize(objective, n_trials=50)

[I 2024-06-21 00:22:28,073] Using an existing study with name 'optimized_xgb_v2' instead of creating a new one.


[I 2024-06-21 00:24:54,377] Trial 10 finished with value: 0.8276173062192681 and parameters: {'n_estimators': 1778, 'gamma': 1.4050802075420419, 'max_depth': 3, 'max_leaves': 3012, 'colsample_bytree': 0.5099105080567855, 'colsample_bylevel': 0.8214587008484072, 'colsample_bynode': 0.12235140242755654, 'reg_lambda': 0.6560699703436619, 'reg_alpha': 7.890292375901586, 'grow_policy': 'depthwise', 'min_child_weight': 35.96416965551831, 'max_delta_step': 968.8106311587065}. Best is trial 2 with value: 0.8384275159991105.
[I 2024-06-21 00:29:17,604] Trial 11 finished with value: 0.8439499888809271 and parameters: {'n_estimators': 3160, 'gamma': 0.15563127404979826, 'max_depth': 5, 'max_leaves': 1056, 'colsample_bytree': 0.8603376837333011, 'colsample_bylevel': 0.8410853752141421, 'colsample_bynode': 0.6754096761398442, 'reg_lambda': 2.8778005526731456, 'reg_alpha': 9.040655843252802, 'grow_policy': 'lossguide', 'min_child_weight': 8.636579454882735, 'max_delta_step': 941.9576381343791}. Best

In [76]:
print(study.best_params)
best_model = xgboost.XGBClassifier(
        enable_categorical=True,
        n_jobs=-1,
        **study.best_params
    )

label_enc = LabelEncoder()
x = train_data[more_important_features.index]
y = label_enc.fit_transform(train_data["Target"])

best_model.fit(x, y)
test_preds = best_model.predict(test_data[more_important_features.index])
test_preds = label_enc.inverse_transform(test_preds)

out_pd = pd.DataFrame(index=test_data.index)
out_pd["Target"] = list(test_preds)
out_pd.to_csv("optimized_xgb_v2.csv", columns=["Target"], index=True)

{'n_estimators': 2157, 'gamma': 0.006252158249428007, 'max_depth': 5, 'max_leaves': 2001, 'colsample_bytree': 0.9492903718787332, 'colsample_bylevel': 0.8030169059938119, 'colsample_bynode': 0.3696029439392706, 'reg_lambda': 1.0819082868484293, 'reg_alpha': 8.111300991263777, 'grow_policy': 'lossguide', 'min_child_weight': 264.90623838902513, 'max_delta_step': 742.7952645023429}
