# XGBoost 2

In [1]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

In [2]:
# loading and exploring the datas
import warnings


warnings.filterwarnings("ignore")

diamonds = sns.load_dataset("diamonds")

diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
diamonds.shape


(53940, 10)

In [4]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [5]:
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [6]:
# building XGBoost DMatrix
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

In [7]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [8]:
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [10]:
# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [13]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

In [14]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [15]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [17]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 555.607


In [18]:
# using validation sets during training
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 100

In [19]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

In [20]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:2874.29379	validation-rmse:2817.38773
[1]	train-rmse:2092.07711	validation-rmse:2054.73630
[2]	train-rmse:1549.52687	validation-rmse:1526.30592
[3]	train-rmse:1184.46798	validation-rmse:1174.90119
[4]	train-rmse:941.09127	validation-rmse:943.28272
[5]	train-rmse:784.58014	validation-rmse:796.09651
[6]	train-rmse:685.75110	validation-rmse:705.22245
[7]	train-rmse:624.67281	validation-rmse:653.32563
[8]	train-rmse:584.19599	validation-rmse:620.30404
[9]	train-rmse:558.77667	validation-rmse:599.24504
[10]	train-rmse:543.85303	validation-rmse:586.99790
[11]	train-rmse:531.92694	validation-rmse:578.68120
[12]	train-rmse:523.08456	validation-rmse:571.73527
[13]	train-rmse:515.67753	validation-rmse:567.19913
[14]	train-rmse:510.77594	validation-rmse:564.66402
[15]	train-rmse:506.68519	validation-rmse:563.21547
[16]	train-rmse:502.96796	validation-rmse:561.80880
[17]	train-rmse:498.90184	validation-rmse:560.36561
[18]	train-rmse:492.74859	validation-rmse:558.46274
[19]	train-rms

In [21]:
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 100

evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=10 # Every ten rounds
)

[0]	validation-rmse:2817.38773	train-rmse:2874.29379
[10]	validation-rmse:586.99790	train-rmse:543.85303
[20]	validation-rmse:556.44229	train-rmse:487.42071
[30]	validation-rmse:554.68339	train-rmse:460.86396
[40]	validation-rmse:552.62130	train-rmse:444.03762
[50]	validation-rmse:553.50718	train-rmse:430.07110
[60]	validation-rmse:555.44368	train-rmse:418.57995
[70]	validation-rmse:555.06703	train-rmse:406.77489
[80]	validation-rmse:555.00800	train-rmse:394.18070
[90]	validation-rmse:555.74725	train-rmse:382.65353
[99]	validation-rmse:555.60692	train-rmse:373.74308


In [23]:
# XGBoost early stopping
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 5000

evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=500 #initial it was 250
)

[0]	validation-rmse:2817.38773	train-rmse:2874.29379
[500]	validation-rmse:563.29248	train-rmse:197.72375
[1000]	validation-rmse:572.18689	train-rmse:121.69016
[1500]	validation-rmse:576.42698	train-rmse:84.17960
[2000]	validation-rmse:578.58142	train-rmse:60.88001
[2500]	validation-rmse:579.96944	train-rmse:46.47975
[3000]	validation-rmse:580.78514	train-rmse:36.59124
[3500]	validation-rmse:581.36774	train-rmse:29.47363
[4000]	validation-rmse:581.86083	train-rmse:23.90580
[4500]	validation-rmse:582.07031	train-rmse:19.97500
[4999]	validation-rmse:582.23127	train-rmse:17.15835


In [24]:
n = 10000


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=50,
   # Activate early stopping
   early_stopping_rounds=50
)

[0]	validation-rmse:2817.38773	train-rmse:2874.29379
[50]	validation-rmse:553.50718	train-rmse:430.07110
[100]	validation-rmse:555.39120	train-rmse:372.49371
[150]	validation-rmse:557.80361	train-rmse:333.62595
[200]	validation-rmse:559.48104	train-rmse:300.06795
[250]	validation-rmse:561.30944	train-rmse:276.85755
[300]	validation-rmse:561.43599	train-rmse:257.34345
[350]	validation-rmse:561.82074	train-rmse:239.54154
[400]	validation-rmse:563.09188	train-rmse:223.46175
[450]	validation-rmse:563.07923	train-rmse:209.80627
[500]	validation-rmse:563.29248	train-rmse:197.72375
[550]	validation-rmse:565.24722	train-rmse:187.28444
[600]	validation-rmse:566.42436	train-rmse:176.54386
[650]	validation-rmse:567.17392	train-rmse:166.61162
[700]	validation-rmse:568.01290	train-rmse:157.61657
[750]	validation-rmse:568.62872	train-rmse:150.03843
[800]	validation-rmse:569.21259	train-rmse:143.23111
[850]	validation-rmse:569.80695	train-rmse:137.52825
[900]	validation-rmse:570.23810	train-rmse:132.

In [25]:
# XGBoost cross validation
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 1000

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20
)

In [26]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.530912,9.57651,2877.437274,37.09354
1,2089.327469,8.31729,2094.021636,24.828795
2,1550.617973,5.223297,1558.386252,18.540267
3,1183.812759,5.19342,1195.032441,13.47158
4,941.203113,4.539805,958.728828,9.479449


In [27]:
best_rmse = results['test-rmse-mean'].min()

best_rmse

549.311480649509

In [29]:
# XGBoost classification
from sklearn.preprocessing import OrdinalEncoder

X, y = diamonds.drop("cut", axis=1), diamonds[['cut']]

# Encode y to numeric
y_encoded = OrdinalEncoder().fit_transform(y)

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to pd.Categorical
for col in cats:
   X[col] = X[col].astype('category')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=1, stratify=y_encoded)

In [30]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [31]:
params = {"objective": "multi:softprob", "tree_method": "gpu_hist", "num_class": 5}
n = 1000

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)

In [32]:
results.keys()

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [33]:
# XGBoost Naive vs XGBoost Sklearn
import xgboost as xgb

# Train a model using the scikit-learn API
xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=3, enable_categorical=True)
xgb_classifier.fit(X_train, y_train)

# Convert the model to a native API model
model = xgb_classifier.get_booster()