# Adaptive Boosting Technique

## Load required libraries

In [None]:
!pip install pandas
!pip install scikit-optimize
!pip install matplotlib
!pip install scikit-learn

In [33]:
import numpy as np
import pandas as pd
import sklearn
from joblib import dump
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_objective
from skopt.space import Categorical
from skopt.space import Integer
from skopt.space import Real

In [14]:
# Load the merged dataset and turn the numeric `popularity` column into a
# binary label: 1 when popularity >= the cut-off, 0 otherwise.
df = pd.read_csv('../../Dataset/data_merged.csv')

# NOTE(review): the name suggests this cut-off is the mean popularity of the
# dataset -- confirm, or derive it from the data instead of hard-coding it.
mean_popularity = 42

# Vectorised comparison instead of a Python-level loop over the column.
# Missing values compare as False and therefore map to 0, as before.
df["popularity"] = (df["popularity"] >= mean_popularity).astype("int64")
df["popularity"].head()

0    0
1    0
2    0
3    0
4    0
Name: popularity, dtype: int64

In [15]:
# Baseline booster: 200 depth-1 decision trees combined with the SAMME
# algorithm at a learning rate of 1.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    algorithm="SAMME",
    learning_rate=1,
    n_estimators=200,
)

## Prepare the data and split it into train and test sets

In [16]:
# The `popularity` column is the target; every remaining column is a feature.
y = df['popularity']
X = df.drop(columns=['popularity'])

In [17]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=21,
    stratify=y,
)

## Train the model

In [18]:
# Fit the baseline booster; the labels are passed as a flat 1-D array.
ada_clf.fit(X=X_train, y=np.asarray(y_train).ravel())

## Test the model

In [21]:
# Predictions on the training split, used for the training accuracy below.
y_train_pred = ada_clf.predict(X=X_train)

In [22]:
# Accuracy of the baseline model on the training split.
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.8809150554491088

In [27]:
# Predictions of the baseline model on the held-out test split.
y_pred = ada_clf.predict(X=X_test)

In [28]:
# Accuracy of the baseline model on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8823666027985833

In [29]:
# Baseline model on the test split: confusion matrix, then per-class metrics.
# One print call with a newline separator emits the same lines as five prints.
print(
    'final result:',
    "confusion matrix:",
    confusion_matrix(y_test, y_pred),
    "classification report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

final result:
confusion matrix:
[[23907  1519]
 [ 2533  6487]]
classification report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     25426
           1       0.81      0.72      0.76      9020

    accuracy                           0.88     34446
   macro avg       0.86      0.83      0.84     34446
weighted avg       0.88      0.88      0.88     34446



## Bayesian Optimization for finding appropriate hyperparameters

In [None]:
# Bayesian hyperparameter search for AdaBoost (SAMME) over the number of
# estimators and the learning rate: 32 sampled configurations, each scored
# with 3-fold cross-validation.
#
# Both the search and the estimator are seeded so that Restart & Run All
# reproduces the same search trajectory, scores and best parameters.
#
# NOTE(review): the recorded best `n_estimators` sits on the upper bound of
# the range (250) -- consider widening the range in a follow-up run.
opt = BayesSearchCV(
    AdaBoostClassifier(algorithm='SAMME', random_state=21),
    {
        'n_estimators' : Integer(50, 250),
        'learning_rate' : Real(0.01, 1.0)
    },
    n_iter=32,
    cv=3,
    random_state=21,
)

In [36]:
# Run the Bayesian hyperparameter search on the training split.
# The fitted search object is saved to disk in a later cell.
opt.fit(X=X_train, y=y_train)



In [41]:
# Persist the fitted search object to disk with joblib so the search does not
# have to be re-run. `dump` comes from the notebook's import cell.
dump(opt, "Saved models.pkl")

['Saved models.pkl']

## Best parameters and scores obtained

In [42]:
# Summary of the search: best cross-validation score, score of the search
# object on the held-out test split, and the winning hyperparameters.
print(
    "val. score: %s" % opt.best_score_,
    "test score: %s" % opt.score(X_test, y_test),
    "best params: %s" % str(opt.best_params_),
    sep="\n",
)

val. score: 0.8829689949486151
test score: 0.8845148928758056
best params: OrderedDict([('learning_rate', 0.370003869483996), ('n_estimators', 250)])


In [43]:
# Predictions of the tuned search object on the held-out test split.
# This rebinds `y_pred`, which previously held the baseline predictions.
y_pred = opt.predict(X=X_test)

In [44]:
# Tuned model on the test split: confusion matrix, then per-class metrics.
# One print call with a newline separator emits the same text as four prints.
print(
    "Confusion matrix:\n",
    confusion_matrix(y_test, y_pred),
    "\n\nClassification report\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

Confusion matrix:

[[24119  1307]
 [ 2671  6349]]


Classification report

              precision    recall  f1-score   support

           0       0.90      0.95      0.92     25426
           1       0.83      0.70      0.76      9020

    accuracy                           0.88     34446
   macro avg       0.86      0.83      0.84     34446
weighted avg       0.88      0.88      0.88     34446

