<a href="https://colab.research.google.com/github/mukheshbabu/machine-learning/blob/main/XG_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install --user xgboost



**Loading and Exploring the Data**

In [28]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

diamonds = sns.load_dataset("diamonds")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [29]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [30]:
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


**How to Build an XGBoost DMatrix**

In [31]:
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

In [32]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
  X[col] = X[col].astype('category')

In [33]:
X.dtypes

Unnamed: 0,0
carat,float64
cut,category
color,category
clarity,category
depth,float64
table,float64
x,float64
y,float64
z,float64


In [34]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [35]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

**Python XGBoost Regression**

In [57]:
import numpy as np

# Define hyperparameters for XGBoost
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "hist"}

# Train the XGBoost model
bst = xgb.train(params, dtrain_reg, num_boost_round=100)

# Generate predictions
predicted = bst.predict(dtest_reg)

# Convert y_test to NumPy array
actual = y_test.values.flatten()

# Calculate MSE and RMSE
mse = np.mean((actual - predicted) ** 2)
rmse = np.sqrt(mse)

print("MSE:", mse)
print("RMSE:", rmse)

MSE: 305655.6237797839
RMSE: 552.8613060974551


**Training**

In [58]:
# Define hyperparameters


n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

**Evaluation**

In [59]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

In [60]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)

# Calculate MSE and then take the square root to get RMSE
mse = mean_squared_error(y_test, preds)  # Remove squared=False
rmse = np.sqrt(mse)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 552.861


**Using Validation Sets During Training**

In [61]:
params = {"objective": "reg:squarederror", "tree_method": "hist"}
n = 100

evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=10 # Every ten rounds
)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[10]	validation-rmse:592.03160	train-rmse:548.36512
[20]	validation-rmse:558.53485	train-rmse:491.09887
[30]	validation-rmse:555.51015	train-rmse:469.58201
[40]	validation-rmse:554.45666	train-rmse:454.32953
[50]	validation-rmse:554.13365	train-rmse:438.68033
[60]	validation-rmse:551.57888	train-rmse:425.38361
[70]	validation-rmse:549.26109	train-rmse:414.71115
[80]	validation-rmse:549.03952	train-rmse:405.41008
[90]	validation-rmse:551.87206	train-rmse:391.04269
[99]	validation-rmse:552.86131	train-rmse:383.48826


**XGBoost Early Stopping**

In [62]:
params = {"objective": "reg:squarederror", "tree_method": "hist"}
n = 5000

evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=250
)


[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[250]	validation-rmse:557.61263	train-rmse:283.21559
[500]	validation-rmse:564.77532	train-rmse:201.44074
[750]	validation-rmse:570.41258	train-rmse:155.76096
[1000]	validation-rmse:574.22590	train-rmse:127.04692
[1250]	validation-rmse:575.74609	train-rmse:105.13407
[1500]	validation-rmse:577.88702	train-rmse:87.17633
[1750]	validation-rmse:578.86698	train-rmse:75.08646
[2000]	validation-rmse:579.64879	train-rmse:64.86890
[2250]	validation-rmse:580.07445	train-rmse:56.54684
[2500]	validation-rmse:580.65263	train-rmse:50.04183
[2750]	validation-rmse:581.19185	train-rmse:44.39520
[3000]	validation-rmse:581.63495	train-rmse:39.38436
[3250]	validation-rmse:582.03607	train-rmse:35.32653
[3500]	validation-rmse:582.04309	train-rmse:31.85327
[3750]	validation-rmse:582.38781	train-rmse:28.94276
[4000]	validation-rmse:582.60757	train-rmse:26.57816
[4250]	validation-rmse:582.65767	train-rmse:24.39706
[4500]	validation-rmse:582.92145	train-rmse:

In [63]:
n = 10000


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=50,
   # Activate early stopping
   early_stopping_rounds=50
)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[50]	validation-rmse:554.13365	train-rmse:438.68033
[100]	validation-rmse:553.73941	train-rmse:381.96310
[150]	validation-rmse:551.25006	train-rmse:339.03503
[200]	validation-rmse:556.18693	train-rmse:308.00219
[250]	validation-rmse:557.61263	train-rmse:283.21559
[300]	validation-rmse:559.05657	train-rmse:260.57418
[350]	validation-rmse:560.65574	train-rmse:243.73253
[400]	validation-rmse:562.32328	train-rmse:228.60311
[450]	validation-rmse:563.18017	train-rmse:213.77568
[500]	validation-rmse:564.77532	train-rmse:201.44074
[550]	validation-rmse:566.71920	train-rmse:191.07173
[600]	validation-rmse:567.56507	train-rmse:181.24398
[650]	validation-rmse:568.47124	train-rmse:172.37143
[700]	validation-rmse:569.60635	train-rmse:164.75530
[750]	validation-rmse:570.41258	train-rmse:155.76096
[800]	validation-rmse:570.93443	train-rmse:149.22645
[850]	validation-rmse:571.72429	train-rmse:142.83847
[900]	validation-rmse:572.77341	train-rmse:136.

**XGBoost Cross Validation**

In [64]:
params = {"objective": "reg:squarederror", "tree_method": "hist"}
n = 1000

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20
)

In [65]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.224552,9.424846,2876.318793,36.995997
1,2088.350837,7.595382,2093.063623,25.351925
2,1552.629638,4.97414,1560.552731,19.550836
3,1185.994963,4.133544,1198.669943,14.648669
4,943.402904,4.757288,962.349383,11.724038


In [66]:
best_rmse = results['test-rmse-mean'].min()

best_rmse
550.8959336674216

550.8959336674216

**XGBoost Classification**

In [67]:
from sklearn.preprocessing import OrdinalEncoder

X, y = diamonds.drop("cut", axis=1), diamonds[['cut']]

# Encode y to numeric
y_encoded = OrdinalEncoder().fit_transform(y)

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to pd.Categorical
for col in cats:
   X[col] = X[col].astype('category')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=1, stratify=y_encoded)

In [68]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [70]:
params = {"objective": "multi:softprob", "tree_method": "hist", "num_class": 5}
n = 1000

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)

In [72]:
results.keys()

pd.Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',

      'train-auc-std', 'train-merror-mean', 'train-merror-std',

      'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',

      'test-auc-std', 'test-merror-mean', 'test-merror-std'],

     dtype='object')

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

**XGBoost Native vs. XGBoost Sklearn**

In [73]:
import xgboost as xgb

# Train a model using the scikit-learn API
xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=3, enable_categorical=True)
xgb_classifier.fit(X_train, y_train)

# Convert the model to a native API model
model = xgb_classifier.get_booster()