# Histogram-based Gradient Boosting

## Import required libraries

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Load dataset and discretize popularity

In [None]:
# Load the merged dataset; the path is relative to this notebook's location.
df = pd.read_csv('../../Dataset/data_merged.csv')

# Popularity cut-off used to build the binary target.
# NOTE(review): hard-coded rather than computed from the data -- presumably the
# (rounded) mean popularity of the dataset; confirm before reusing elsewhere.
mean_popularity = 42

# Binarize the target with a single vectorized comparison instead of a Python
# loop over every row: 1 if popularity >= cut-off, else 0. Missing values
# compare False and therefore map to 0, exactly as the row-wise version did.
df["popularity"] = (df["popularity"] >= mean_popularity).astype("int64")
df["popularity"].head()

## Split features, labels and train-test data

In [None]:
# Label vector: the binarized popularity column.
y = df['popularity']
# Feature matrix: every column except the label.
X = df.drop(columns=['popularity'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=21,
)

## Histogram-based gradient boosting

In [None]:
# Baseline model: up to 1000 boosting iterations, all other settings default.
#
# `loss` is intentionally left at its default. The explicit value
# 'binary_crossentropy' was deprecated in scikit-learn 1.1 and removed in 1.3,
# while the default resolves to the binary log-loss for a two-class target on
# both old and new releases, so omitting it keeps the cell portable.
#
# `random_state` is fixed (same seed as the estimator used in the Bayesian
# search below) so that repeated runs of the notebook give the same model.
gb_clf = HistGradientBoostingClassifier(max_iter=1000, random_state=42)

In [None]:
# Fit the baseline classifier on the training split.
gb_clf.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the data it was trained on (for scikit-learn
# classifiers `score` reports mean accuracy). Useful as an overfitting check
# when compared against the score on held-out data.
gb_clf.score(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out test split.
gb_clf.score(X_test, y_test)

In [None]:
# Predicted class labels for the test split, kept in `y_pred` for the
# confusion matrix and classification report.
y_pred = gb_clf.predict(X_test)

In [None]:
# Summarise baseline test-set performance: the confusion matrix first, then
# the per-class classification report under its own heading.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(baseline_cm, "Classification report\n", baseline_report, sep="\n")

## Bayesian Search

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective
from matplotlib import pyplot as plt

In [None]:
# Bayesian hyper-parameter search over the histogram gradient-boosting model.
#
# Search space (bounds as given to skopt):
#   max_iter          integer in [100, 200]
#   learning_rate     real    in [0.01, 1]
#   max_leaf_nodes    integer in [10, 100]
#   min_samples_leaf  integer in [10, 100]
#
# 128 candidate settings are evaluated, each with 3-fold cross-validation.
# `random_state` is set on the search itself (not only on the estimator) so
# that the sequence of sampled candidates -- and therefore `best_params_` --
# is reproducible between runs.
# NOTE(review): a 'log-uniform' prior is a common choice for learning rates;
# the prior is left at its default here to keep the search space unchanged.
opt = BayesSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    {
        'max_iter': Integer(100, 200),
        'learning_rate': Real(0.01, 1),
        'max_leaf_nodes': Integer(10, 100),
        'min_samples_leaf': Integer(10, 100),
    },
    n_iter=128,
    cv=3,
    random_state=42,
)

In [None]:
import os

from joblib import dump

# Destination of the persisted search object.
model_path = "Saved models/GradBoost"

# Create the output directory up front: the search below is expensive, and a
# missing directory would otherwise only surface as an error when dumping,
# after all the fitting work has already been done.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Run the Bayesian search on the training split.
opt.fit(X_train, y_train)

# Persist the fitted search object (including the refit best estimator).
dump(opt, model_path)

## Best parameters and scores obtained

In [None]:
# Report, in order: the best cross-validation score found by the search, the
# score of the search object on the held-out test split, and the winning
# hyper-parameters.
search_summary = (
    ("val. score: %s", opt.best_score_),
    ("test score: %s", opt.score(X_test, y_test)),
    ("best params: %s", str(opt.best_params_)),
)
for line_template, reported_value in search_summary:
    print(line_template % reported_value)

In [None]:
# Predict test-split labels with the fitted search object. This rebinds
# `y_pred`, replacing the baseline model's predictions computed earlier.
y_pred = opt.predict(X_test)

In [None]:
# Summarise test-set performance of the tuned model: the confusion matrix
# first, then the per-class classification report under its own heading.
tuned_cm = confusion_matrix(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)
print(tuned_cm, "Classification report\n", tuned_report, sep="\n")

When the number of iterations, `max_leaf_nodes`, `min_samples_leaf`, etc. are increased, the learning rate automatically goes down and the results remain more or less the same.

## Visualize results with partial dependence plots

In [None]:
# NOTE(review): this plot is left disabled. The labels below do not match the
# search space defined for `opt` (max_iter, learning_rate, max_leaf_nodes,
# min_samples_leaf); in particular "n_estimators" is not one of the tuned
# parameters. Update the labels before re-enabling the call.
# plot_objective(opt.optimizer_results_[0],
#                    dimensions=["n_estimators", "learning_rate"],
#                    n_minimum_search=int(1e8))