# Histogram-based Gradient Boosting

## Import required libraries

In [1]:
import numpy as np
import pandas as pd
from joblib import dump
from matplotlib import pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from skopt.space import Integer, Real

## Load dataset and discretize popularity

In [2]:
# Read the merged dataset (path is relative to this notebook's directory).
df = pd.read_csv('../../Dataset/data_merged.csv')

# Binarize the target: 1 when popularity >= threshold, otherwise 0.
# NOTE(review): the threshold is hard-coded; the name suggests it is the mean
# popularity of the dataset, but nothing in this notebook verifies that.
mean_popularity = 42

# Vectorized comparison instead of a Python-level loop over every row;
# the explicit int64 cast keeps the 0/1 integer labels the loop produced.
df["popularity"] = (df["popularity"] >= mean_popularity).astype("int64")
df["popularity"].head()

0    0
1    0
2    0
3    0
4    0
Name: popularity, dtype: int64

## Split features, labels and train-test data

In [3]:
# Target vector: the binarized popularity label.
y = df['popularity']
# Feature matrix: every column except the target.
X = df.drop(columns=['popularity'])

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

## Histogram-based gradient boosting

In [5]:
# Baseline model: library defaults apart from the iteration cap.
# random_state is pinned because scikit-learn's default early_stopping='auto'
# holds out a random validation split on training sets above 10,000 samples,
# so an unseeded fit is not repeatable from run to run. The value 42 matches
# the seed given to the estimator inside the Bayesian search further down.
gb_clf = HistGradientBoostingClassifier(
    loss='log_loss',
    max_iter=1000,
    random_state=42,
)

In [6]:
# Train the baseline classifier on the training split.
gb_clf.fit(X=X_train, y=y_train)

In [7]:
# Accuracy of the baseline model on the data it was trained on.
train_accuracy = gb_clf.score(X_train, y_train)
train_accuracy

0.900140800092899

In [8]:
# Accuracy of the baseline model on the held-out test split.
test_accuracy = gb_clf.score(X_test, y_test)
test_accuracy

0.8907855774255357

In [9]:
# Class predictions of the baseline model for the held-out test split.
y_pred = gb_clf.predict(X=X_test)

In [10]:
# Confusion matrix of the baseline predictions against the true test labels.
baseline_cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(baseline_cm)

# Per-class precision / recall / F1 summary for the same predictions.
print("Classification report\n")
baseline_report = classification_report(y_true=y_test, y_pred=y_pred)
print(baseline_report)

[[24374  1057]
 [ 2705  6310]]
Classification report

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     25431
           1       0.86      0.70      0.77      9015

    accuracy                           0.89     34446
   macro avg       0.88      0.83      0.85     34446
weighted avg       0.89      0.89      0.89     34446



## Bayesian Search

In [11]:
# Bayesian hyper-parameter search over the histogram gradient-boosting model.
# random_state is set on the search itself (not only on the estimator) so that
# the sequence of sampled configurations is repeatable across runs.
opt = BayesSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    {
        # Search space: one entry per tuned hyper-parameter.
        'max_iter' : Integer(100, 200),
        'learning_rate' : Real(0.01, 1),
        'max_leaf_nodes' : Integer(10,100),
        'min_samples_leaf' : Integer(10,100),
    },
    n_iter=128,        # number of configurations to evaluate
    cv=3,              # 3-fold cross-validation for each configuration
    random_state=42,
)

In [None]:
# Run the search on the training split. This evaluates n_iter configurations
# with cv-fold cross-validation each, so it is the slow cell of the notebook.
opt.fit(X=X_train, y=y_train)


FileNotFoundError: [Errno 2] No such file or directory: 'Saved models/GradBoost'

In [13]:

# Persist the fitted search object to disk with joblib (`dump` is imported in
# the import cell at the top of the notebook). The file name is kept exactly
# as-is so anything that loads this artifact by name keeps working.
dump(opt, "Saved models_gradiant_boosting.pkl")

['Saved models_gradiant_boosting.pkl']

## Best parameters and scores obtained

In [14]:
# Cross-validated score of the best configuration found by the search.
best_cv_score = opt.best_score_
print("val. score: %s" % best_cv_score)

# Score of the best configuration on the held-out test split.
held_out_score = opt.score(X_test, y_test)
print("test score: %s" % held_out_score)

# Hyper-parameter values of the best configuration.
best_params_text = str(opt.best_params_)
print("best params: %s" % best_params_text)

val. score: 0.8917653718864309
test score: 0.8906113917435987
best params: OrderedDict([('learning_rate', 0.07864585802245953), ('max_iter', 200), ('max_leaf_nodes', 69), ('min_samples_leaf', 33)])


In [15]:
# Class predictions of the tuned model for the held-out test split
# (replaces the baseline predictions stored under the same name).
y_pred = opt.predict(X=X_test)

In [16]:
# Confusion matrix of the tuned model's predictions against the true test labels.
tuned_cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(tuned_cm)

# Per-class precision / recall / F1 summary for the same predictions.
print("Classification report\n")
tuned_report = classification_report(y_true=y_test, y_pred=y_pred)
print(tuned_report)

[[24370  1061]
 [ 2707  6308]]
Classification report

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     25431
           1       0.86      0.70      0.77      9015

    accuracy                           0.89     34446
   macro avg       0.88      0.83      0.85     34446
weighted avg       0.89      0.89      0.89     34446



When the number of iterations, `max_leaf_nodes`, `min_samples_leaf`, etc. are increased, the learning rate selected by the search goes down correspondingly, and the results remain more or less the same.

## Visualize results with partial dependence plots

In [18]:
# Disabled partial-dependence plot of the search results; kept for reference.
# NOTE(review): before re-enabling, check two things visible in this notebook:
#   * `plot_objective` is not imported in the import cell shown above;
#   * "n_estimators" is not one of the searched dimensions (the search space
#     uses max_iter, learning_rate, max_leaf_nodes and min_samples_leaf).
# plot_objective(opt.optimizer_results_[0],
#                    dimensions=["n_estimators", "learning_rate"],
#                    n_minimum_search=int(1e8))