In [1]:
from tools import *
from data import *
from model_xgb import hyperparam_tuning, select_features_xgb
import json

In [None]:
# Run `hyperparam_tuning` (imported from model_xgb) on the training data,
# leaving every optional argument at the helper's default.
tuned_results = hyperparam_tuning(X_train, y_train)

# Persistence is intentionally disabled. Uncomment to overwrite
# "params_xgb.json" with the winning parameters
# (assumes the returned object exposes `best_params_` — TODO confirm).
# with open("params_xgb.json", "w") as outfile:
#     json.dump(tuned_results.best_params_, outfile)

In [None]:
# Repeat the search with an explicit n_iter=1000. This rebinds `tuned_results`,
# so the result of the earlier default-argument call is no longer reachable.
tuned_results = hyperparam_tuning(X_train, y_train, n_iter=1000)

# Persistence is intentionally disabled. Uncomment to overwrite
# "params_xgb.json" with the winning parameters
# (assumes the returned object exposes `best_params_` — TODO confirm).
# with open("params_xgb.json", "w") as outfile:
#     json.dump(tuned_results.best_params_, outfile)

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFECV
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Jointly tune XGBoost hyperparameters and select features with RFECV.
def hyperparam_tuning_with_feature_selection(X_train, y_train, k=5, n_iter=20) -> "tuple[RandomizedSearchCV, RFECV]":
    """Randomized hyperparameter search over a preprocess -> RFECV -> XGBoost pipeline.

    Args:
        X_train: Training features. Must provide ``select_dtypes`` (e.g. a
            pandas DataFrame); ``object`` columns are one-hot encoded and all
            other columns are passed through unchanged.
        y_train: Training labels. The number of distinct values is passed to
            XGBoost as ``num_class``.
        k: Number of stratified, shuffled cross-validation folds. The same
            splitter is used by both RFECV and the randomized search.
        n_iter: Number of parameter settings sampled by the randomized search.

    Returns:
        A ``(random_search, feature_selector)`` tuple: the fitted
        ``RandomizedSearchCV`` and the *fitted* ``RFECV`` step taken from its
        refit best pipeline (so ``support_`` / ``ranking_`` are available).

    Note:
        The ``classifier__*`` parameters only tune the final classifier step.
        The estimator used inside RFECV is a separate copy that keeps its
        constructor defaults.
    """
    # Imported locally so the function keeps working even when the notebook-level
    # name `xgb` has been rebound to something other than the xgboost module.
    from xgboost import XGBClassifier

    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Separate categorical and numerical columns
    cat_cols = X_train.select_dtypes(include=['object']).columns
    num_cols = X_train.select_dtypes(exclude=['object']).columns

    # Create a preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', num_cols),  # Keep numeric columns as they are
            # handle_unknown='ignore': a validation fold may contain a category
            # that was absent from the corresponding training fold; the default
            # ('error') would abort the whole search in that case.
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ]
    )

    # XGBoost classifier
    xgb_model = XGBClassifier(objective="multi:softmax", num_class=len(np.unique(y_train)),
                              eval_metric="mlogloss", random_state=42)

    # Feature selection (recursive feature elimination with cross-validation)
    feature_selector = RFECV(estimator=xgb_model, step=1, cv=cv, scoring='accuracy')

    # Build a pipeline with preprocessing, feature selection, and model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        ('classifier', xgb_model)
    ])

    # Hyperparameter grid for RandomizedSearchCV
    param_dist = {
        'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'classifier__max_depth': [3, 6, 9, 12],
        'classifier__n_estimators': [100, 300, 500],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
        'classifier__gamma': [0, 1, 5],
        'classifier__min_child_weight': [1, 3, 5]
    }

    # Randomized search for hyperparameters
    random_search = RandomizedSearchCV(
        pipeline, param_distributions=param_dist, n_iter=n_iter,
        scoring='accuracy', cv=cv, verbose=2, n_jobs=-1, random_state=42
    )

    # Fit the random search
    random_search.fit(X_train, y_train)

    # Output best results
    print("Best parameters found:", random_search.best_params_)
    print("Best accuracy score:", random_search.best_score_)

    # The search fits clones of `pipeline`, so the local `feature_selector`
    # template is never fitted. Hand back the fitted selector from the refit
    # best pipeline instead.
    fitted_selector = random_search.best_estimator_.named_steps['feature_selection']
    return random_search, fitted_selector


In [3]:
# Call the hyperparameter tuning function with feature selection
# (5 CV folds, 20 sampled parameter settings) and unpack the returned
# search object and RFECV selector.
random_search, feature_selector = hyperparam_tuning_with_feature_selection(X_train, y_train, k=5, n_iter=20)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

In [7]:
# Call `select_features_xgb` once per requested feature count, from 12 down
# to 1, storing each result under that count. An explicit loop is used so
# that results gathered so far remain in `selected_features` if the cell is
# interrupted part-way through.
selected_features = dict()
for i in reversed(range(1, 13)):
    selected_features[i] = select_features_xgb(X_train, y_train, n_features=i)

Selected Features: ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Selected Features: ['x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Selected Features: ['x2', 'x3', 'x4', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Selected Features: ['x2', 'x3', 'x4', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']


KeyboardInterrupt: 

In [3]:
# Feature subset used for the model fitted in this cell.
selected_features = ['x2', 'x3', 'x4', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']

# Load the stored XGBoost parameters from disk (text read mode is the default).
with open('params_xgb.json') as params_file:
    xgb_params = json.load(params_file)

# NOTE(review): this rebinds `xgb`, which an earlier cell imports as the
# xgboost module alias (`import xgboost as xgb`). After this cell runs, code
# that expects `xgb` to be the module will see this Model instance instead.
xgb = Model(
    model_type='xgb',
    selected_features=selected_features,
    xgb_params=xgb_params,
)
xgb.fit(X_train, y_train)

In [4]:
# Cross-validate the 'xgb' model type on the current `selected_features` and
# `xgb_params`. Per the captured output, `train_kcv` reports mean ± spread for
# cross-validation accuracy and log loss.
train_kcv(
    model_type='xgb',
    X_train=X_train, 
    y_train=y_train,
    selected_features=selected_features,
    xgb_params=xgb_params
)

Cross-validation accuracy: 0.6936 ± 0.0108
Cross-validation log loss: 0.6919 ± 0.0081


In [6]:
# Sweep the feature count from 13 down to 1: choose the subset for each count,
# then cross-validate the 'xgb' model on it. Subsets are stored by count.
selected_features = dict()
for i in reversed(range(1, 14)):
    print(f"RUNNING ITERATION FOR {i} FEATURES")
    if i == 13:
        # Baseline: use every column x1..x13, so no selection run is needed.
        # The comprehension's `i` lives in its own scope and does not disturb
        # the loop counter used as the dictionary key.
        selected_features[i] = [f"x{i}" for i in range(1, 14)]
    else:
        selected_features[i] = select_features_xgb(X_train, y_train, n_features=i)
    train_kcv(
        model_type='xgb',
        X_train=X_train,
        y_train=y_train,
        selected_features=selected_features[i],
        xgb_params=xgb_params,
    )
    print()

RUNNING ITERATION FOR 13 FEATURES
Cross-validation accuracy: 0.6924 ± 0.0092
Cross-validation log loss: 0.6932 ± 0.0087

RUNNING ITERATION FOR 12 FEATURES
Selected Features: ['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Cross-validation accuracy: 0.6890 ± 0.0094
Cross-validation log loss: 0.6927 ± 0.0086

RUNNING ITERATION FOR 11 FEATURES
Selected Features: ['x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Cross-validation accuracy: 0.6928 ± 0.0103
Cross-validation log loss: 0.6926 ± 0.0083

RUNNING ITERATION FOR 10 FEATURES
Selected Features: ['x2', 'x3', 'x4', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Cross-validation accuracy: 0.6936 ± 0.0108
Cross-validation log loss: 0.6919 ± 0.0081

RUNNING ITERATION FOR 9 FEATURES
Selected Features: ['x2', 'x3', 'x4', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13']
Cross-validation accuracy: 0.6948 ± 0.0126
Cross-validation log loss: 0.6918 ± 0.0086

RUNNING ITERATION FOR 8 FEATURES
Selected Featur

In [11]:
# Hand-picked feature subsets to compare against each other.
selected_features_2 = [
    ['x4', 'x8', 'x9', 'x10', 'x11'],
    ['x1', 'x2', 'x3', 'x5'],
    ['x6', 'x7', 'x10', 'x12', 'x13']
]
# Cross-validate the 'xgb' model type once per subset, printing the subset
# first so each block of metrics can be attributed to it.
for _features in selected_features_2: 
    print(f"FEATURES: {_features}")
    train_kcv(
        model_type='xgb',
        X_train=X_train, 
        y_train=y_train,
        selected_features=_features,
        xgb_params=xgb_params
    )
    print()

FEATURES: ['x4', 'x8', 'x9', 'x10', 'x11']
Cross-validation accuracy: 0.6958 ± 0.0100
Cross-validation log loss: 0.6879 ± 0.0083

FEATURES: ['x1', 'x2', 'x3', 'x5']
Cross-validation accuracy: 0.3962 ± 0.0209
Cross-validation log loss: 1.0955 ± 0.0063

FEATURES: ['x6', 'x7', 'x10', 'x12', 'x13']
Cross-validation accuracy: 0.4086 ± 0.0149
Cross-validation log loss: 1.0886 ± 0.0066

