### nb_ChatGPT_XGBoost_Classifier

Use the XGBoost classifier through its scikit-learn interface in Python.

Use make_classification() to generate 10,000 rows of synthetic data
<br>with 6 numeric features and one binary target.

Split the data into train and test sets,
<br>then train the model and evaluate its accuracy.

In [1]:
import itertools
import os, pickle
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
# Generate synthetic data: 10,000 rows with 6 numeric features
# (3 informative, 1 redundant) and one target; flip_y=0.1 adds label noise.
X, y = make_classification(
    n_samples=10000,
    n_features=6,
    n_informative=3,
    n_redundant=1,
    flip_y=0.1,
    random_state=123,
)

In [3]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [4]:
# Instantiate an XGBoost classifier with the default hyperparameters
# (this only creates the estimator; training happens in model.fit).
model = xgb.XGBClassifier()

In [5]:
# Fit the default-parameter classifier on the training split.
model.fit(X_train, y_train)

In [6]:
# Evaluate the accuracy of the default model on the held-out test set.
# This serves as the baseline for the parameter grid explored further below.
acc = model.score(X_test, y_test)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.837


In [7]:
# Grid of hyperparameter values to try, one model per combination.
# Earlier experiments covered more values; those that gave poor accuracy
# were dropped to keep the search small.
param_grid = {
    'max_depth': [3, 5, 7, 10, 14],    # library default: 6
    'learning_rate': [0.2, 0.1],       # 0.01 was tried and removed
    'subsample': [0.7],                # library default: 1
    'n_estimators': [100, 200, 300],   # library default: 100
}

In [8]:
# Exhaustive grid search: fit one classifier per parameter combination and
# record its test-set accuracy and pickled model size (in KB) in `results`,
# keyed by a human-readable description of the combination.
# NOTE(review): accuracy is measured on the same test split that is later used
# to choose the preferred configuration, so the reported numbers are likely
# optimistic; consider a separate validation split or cross-validation.
results = {}
grid = itertools.product(
    param_grid['max_depth'],
    param_grid['learning_rate'],
    param_grid['subsample'],
    param_grid['n_estimators'],
)
# product() varies the last factor fastest, i.e. the same order as nesting
# the four loops by hand (max_depth outermost, n_estimators innermost).
for ii, (max_depth, learning_rate, subsample, n_estimators) in enumerate(grid, start=1):
    xgb_model = xgb.XGBClassifier(
        random_state=42,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        n_estimators=n_estimators,
    )
    # Key string identifying this combination; it is also what gets printed.
    ss = f"max_depth:{max_depth:2d},learn_rate:{learning_rate:5.3f}"
    ss +=f",subsample:{subsample:3.1f},n_estim:{n_estimators:3d}"
    xgb_model.fit(X_train, y_train)
    acc = xgb_model.score(X_test, y_test)
    # Measure the pickled size in memory: same byte count as dumping to a
    # file, but no scratch file is left behind in the working directory.
    fsize_kb = round(len(pickle.dumps(xgb_model)) / 1024.0)
    results[ss] = {'acc':acc,'kb':fsize_kb}
    print(f"{ii:3} : {str(ss):25}: {str(results[ss]):30}")

  1 : max_depth: 3,learn_rate:0.200,subsample:0.7,n_estim:100: {'acc': 0.84, 'kb': 116}      
  2 : max_depth: 3,learn_rate:0.200,subsample:0.7,n_estim:200: {'acc': 0.842, 'kb': 226}     
  3 : max_depth: 3,learn_rate:0.200,subsample:0.7,n_estim:300: {'acc': 0.8325, 'kb': 338}    
  4 : max_depth: 3,learn_rate:0.100,subsample:0.7,n_estim:100: {'acc': 0.838, 'kb': 117}     
  5 : max_depth: 3,learn_rate:0.100,subsample:0.7,n_estim:200: {'acc': 0.8425, 'kb': 226}    
  6 : max_depth: 3,learn_rate:0.100,subsample:0.7,n_estim:300: {'acc': 0.841, 'kb': 337}     
  7 : max_depth: 5,learn_rate:0.200,subsample:0.7,n_estim:100: {'acc': 0.848, 'kb': 217}     
  8 : max_depth: 5,learn_rate:0.200,subsample:0.7,n_estim:200: {'acc': 0.842, 'kb': 424}     
  9 : max_depth: 5,learn_rate:0.200,subsample:0.7,n_estim:300: {'acc': 0.8395, 'kb': 630}    
 10 : max_depth: 5,learn_rate:0.100,subsample:0.7,n_estim:100: {'acc': 0.856, 'kb': 224}     
 11 : max_depth: 5,learn_rate:0.100,subsample:0.7,n_estim:20

In [9]:
# Let us only consider entries where acc > 0.85

# --------------------------------------------
def parse_key(ss):
    """Turn a result key back into a dict of numeric parameters.

    The key is a comma-separated list of ``name:value`` pairs, e.g.
    ``"max_depth: 5,learn_rate:0.100"``. Every value is converted with
    ``float``; handy for filtering results by individual grid parameters.
    A part that does not contain exactly one ``:`` or whose value is not
    a number raises ``ValueError``.
    """
    pairs = (chunk.split(":") for chunk in ss.split(","))
    return {name: float(value) for name, value in pairs}

# --------------------------------------------
# Keep (and print) only the grid-search entries whose test accuracy is
# strictly above the threshold; the survivors are collected in `res2`.
MIN_ACC = 0.85  # entries with acc <= MIN_ACC are dropped
res2 = {}
for key, metrics in results.items():
    if metrics['acc'] <= MIN_ACC:
        continue
    res2[key] = metrics
    print(f"{key} => {metrics}")

max_depth: 5,learn_rate:0.100,subsample:0.7,n_estim:100 => {'acc': 0.856, 'kb': 224}
max_depth: 7,learn_rate:0.100,subsample:0.7,n_estim:100 => {'acc': 0.8505, 'kb': 415}
max_depth:10,learn_rate:0.100,subsample:0.7,n_estim:100 => {'acc': 0.855, 'kb': 793}
max_depth:10,learn_rate:0.100,subsample:0.7,n_estim:200 => {'acc': 0.8505, 'kb': 1437}
max_depth:14,learn_rate:0.100,subsample:0.7,n_estim:100 => {'acc': 0.8565, 'kb': 1242}


In [10]:
# Interesting - the quality of the model is not very sensitive to the parameters.
# A good choice, combining good accuracy with a small model size:
# max_depth: 5,learn_rate:0.100,subsample:0.7,n_estim:100 
# => {'acc': 0.856, 'kb': 224}