In [1]:
from tools import *
from data import *
from model_xgb import hyperparam_tuning
import json 

In [8]:
# Run the search with hyperparam_tuning's default settings, then persist the
# winning parameter set as JSON so it can be reloaded without re-tuning.
tuned_results = hyperparam_tuning(X_train, y_train)

with open("model_xgb_params.json", mode="w") as params_file:
    json.dump(tuned_results.best_params_, fp=params_file)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'subsample': 0.6, 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
Best accuracy score: 0.6933999999999999


In [9]:
# Repeat the search with a much larger candidate budget (n_iter=1000).
# NOTE: this writes to the same JSON file as the previous cell and replaces it.
tuned_results = hyperparam_tuning(X_train, y_train, n_iter=1000)

with open("model_xgb_params.json", mode="w") as params_file:
    json.dump(tuned_results.best_params_, fp=params_file)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Best parameters found: {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}
Best accuracy score: 0.6932


In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFECV
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def hyperparam_tuning_with_feature_selection(
    X_train, y_train, k=5, n_iter=20, random_state=42
) -> "tuple[RandomizedSearchCV, RFECV]":
    """Tune an XGBoost classifier with RFECV feature selection in one pipeline.

    The pipeline one-hot encodes categorical columns, runs recursive feature
    elimination with cross-validation (RFECV) on the encoded matrix, and fits
    an XGBoost classifier on the selected features. ``RandomizedSearchCV``
    samples ``n_iter`` hyperparameter combinations for the final classifier
    and scores each one by accuracy over ``k`` stratified folds.

    Only ``classifier__*`` parameters are searched; the XGBoost model used
    inside RFECV keeps its base settings. RFECV (``step=1``) runs its own
    cross-validated elimination inside every search fit, so runtime grows
    quickly with the number of encoded features.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Must support ``select_dtypes``; columns of dtype
        ``object`` or ``category`` are one-hot encoded, all others are passed
        through unchanged.
    y_train : array-like
        Training labels. The number of distinct values sets ``num_class``.
    k : int, default=5
        Number of stratified folds, used by both the search and RFECV.
    n_iter : int, default=20
        Number of hyperparameter combinations sampled by the search.
    random_state : int, default=42
        Seed shared by the fold splitter, both XGBoost models and the
        randomized search.

    Returns
    -------
    (RandomizedSearchCV, RFECV)
        The fitted search object and the *fitted* RFECV step taken from
        ``best_estimator_`` (so ``support_`` / ``ranking_`` are available).
    """
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)

    # Separate categorical and numerical columns. 'category' dtype is treated
    # as categorical as well; otherwise such columns would be passed through
    # unencoded to the downstream estimators.
    categorical_dtypes = ['object', 'category']
    cat_cols = X_train.select_dtypes(include=categorical_dtypes).columns
    num_cols = X_train.select_dtypes(exclude=categorical_dtypes).columns

    # Preprocessing: numeric columns are kept as they are, categorical columns
    # are one-hot encoded. handle_unknown='ignore' prevents a CV fit from
    # failing when a validation fold contains a category that was absent from
    # the corresponding training fold.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ]
    )

    # Base XGBoost configuration shared by the selector's estimator and the
    # final classifier. Two independent instances are created so that the
    # searched classifier__* parameters can never alias the RFECV estimator.
    xgb_kwargs = dict(
        objective="multi:softmax",
        num_class=len(np.unique(y_train)),
        eval_metric="mlogloss",
        random_state=random_state,
    )
    selector_model = xgb.XGBClassifier(**xgb_kwargs)
    classifier_model = xgb.XGBClassifier(**xgb_kwargs)

    # Feature selection (recursive feature elimination with cross-validation)
    feature_selector = RFECV(estimator=selector_model, step=1, cv=cv, scoring='accuracy')

    # Build a pipeline with preprocessing, feature selection, and model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        ('classifier', classifier_model)
    ])

    # Hyperparameter candidates for RandomizedSearchCV (final classifier only)
    param_dist = {
        'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'classifier__max_depth': [3, 6, 9, 12],
        'classifier__n_estimators': [100, 300, 500],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
        'classifier__gamma': [0, 1, 5],
        'classifier__min_child_weight': [1, 3, 5]
    }

    # Randomized search for hyperparameters
    random_search = RandomizedSearchCV(
        pipeline, param_distributions=param_dist, n_iter=n_iter,
        scoring='accuracy', cv=cv, verbose=2, n_jobs=-1, random_state=random_state
    )

    # Fit the random search
    random_search.fit(X_train, y_train)

    # Output best results
    print("Best parameters found:", random_search.best_params_)
    print("Best accuracy score:", random_search.best_score_)

    # RandomizedSearchCV fits clones of the pipeline, so the local
    # `feature_selector` above is never fitted. Return the fitted selector
    # from the refitted best pipeline instead.
    fitted_selector = random_search.best_estimator_.named_steps['feature_selection']
    return random_search, fitted_selector


In [22]:
# Run the combined feature-selection + hyperparameter search:
# 5 stratified folds, 50 sampled parameter combinations.
random_search, feature_selector = hyperparam_tuning_with_feature_selection(
    X_train,
    y_train,
    k=5,
    n_iter=50,
)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found: {'classifier__subsample': 0.6, 'classifier__n_estimators': 100, 'classifier__min_child_weight': 3, 'classifier__max_depth': 6, 'classifier__learning_rate': 0.05, 'classifier__gamma': 1, 'classifier__colsample_bytree': 1.0}
Best accuracy score: 0.6908
