In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_absolute_error, roc_auc_score

%run util_lab.ipynb

In [6]:
# https://xgboost.readthedocs.io/en/stable/python/index.html

# Load Data

In [8]:
# get_data_lab() is not defined in this notebook; presumably it is provided by
# the `%run util_lab.ipynb` cell above (TODO confirm). Its three return values
# are unpacked as X, Y and df_lab. NOTE(review): X and Y are re-derived from
# df_lab in the following cell, so only df_lab is actually relied on.
X, Y, df_lab = get_data_lab()

In [9]:
# Features are every column except the last one; the last column is the target.
# Both are converted to plain NumPy arrays.
X, Y = (
    df_lab.iloc[:, :-1].to_numpy(),
    df_lab.iloc[:, -1].to_numpy(),
)

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Build Model

In [12]:
# Baseline model: only the random seed is fixed; no other hyperparameter is set.
clf = XGBClassifier(random_state=0)

In [13]:
# Train the baseline classifier on the training split.
clf.fit(Xtrain, Ytrain)

In [14]:
# Predict the held-out split with the baseline model and report its accuracy.
YtestPred = clf.predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=YtestPred)

0.6979166666666666

In [None]:
# Per-class precision, recall and F1 for the baseline model's test predictions.
print(classification_report(Ytest, YtestPred))

              precision    recall  f1-score   support

           0       0.74      0.67      0.71        52
           1       0.65      0.73      0.69        44

    accuracy                           0.70        96
   macro avg       0.70      0.70      0.70        96
weighted avg       0.70      0.70      0.70        96



In [20]:
# Mean absolute error between the true and predicted class labels.
# NOTE(review): with 0/1 labels this is simply the misclassification rate
# (1 - accuracy), so it carries no information beyond accuracy_score.

print(mean_absolute_error(Ytest, YtestPred))

0.3020833333333333


In [22]:
# Check for overfitting: score the baseline model on the data it was trained on.
# A training accuracy far above the held-out test accuracy means the model has
# memorised the training set rather than generalised.

YtrainPred = clf.predict(Xtrain)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first,
# matching the other evaluation cells in this notebook. The value itself is the
# same either way because accuracy is symmetric in its two arguments.
accuracy_score(Ytrain, YtrainPred)

1.0

# Hyperparameter Tuning

In [24]:
# List every hyperparameter exposed by the baseline classifier to see what can
# be tuned. NOTE(review): None presumably means "not set explicitly" — only
# random_state was passed when clf was constructed.
clf.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 0,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [26]:
# Search space for the grid search over XGBClassifier hyperparameters.
param_grid_XB = {
    # Step-size shrinkage per boosting round, from aggressive (1.0) to very small.
    'learning_rate': [1.0, 0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    # Number of boosting rounds (trees).
    'n_estimators': [1, 10, 50, 100, 150, 200, 250, 300, 500, 1000],
    # Maximum tree depth. The range starts at 1 (stumps): XGBoost documents
    # max_depth=0 as "no limit on depth", so including 0 would add fully
    # unconstrained trees to a grid that is otherwise limited to shallow ones.
    'max_depth': range(1, 4)
}

In [35]:
# Exhaustive search over param_grid_XB, ranking candidates by 10-fold
# cross-validated accuracy; n_jobs=-1 runs the fits in parallel.
grid_search_XB = GridSearchCV(
    XGBClassifier(random_state=42),
    param_grid_XB,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [36]:
# Run the cross-validated search on the training split only, so the test split
# stays untouched for the final evaluation.
grid_search_XB.fit(Xtrain, Ytrain)

In [37]:
# Best hyperparameter combination found and its mean cross-validated accuracy.
grid_search_XB.best_params_, grid_search_XB.best_score_

({'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 100},
 np.float64(0.7713438735177867))

In [39]:
# Take the best estimator from the grid search and measure its accuracy on the
# held-out test split.
model = grid_search_XB.best_estimator_
Ypred = model.predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=Ypred)

0.8229166666666666

In [41]:
# Per-class precision, recall and F1 for the tuned model's test predictions.
print(classification_report(Ytest, Ypred))

              precision    recall  f1-score   support

           0       0.84      0.83      0.83        52
           1       0.80      0.82      0.81        44

    accuracy                           0.82        96
   macro avg       0.82      0.82      0.82        96
weighted avg       0.82      0.82      0.82        96



In [42]:
# Overfitting check for the tuned model: accuracy on its own training data,
# to be compared against its test accuracy.
YtrainPred = model.predict(Xtrain)
accuracy_score(y_true=Ytrain, y_pred=YtrainPred)

0.8161434977578476

Exception ignored in: <function ResourceTracker.__del__ at 0x103ec1b20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10d561b20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1068edb20>
Traceback (most recent call last