# **Ensemble Methods**

In [3]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

from sklearn.svm import SVC

from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor


from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor


from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

import catboost as cb
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from mlxtend.classifier import  StackingClassifier
from mlxtend.regressor import  StackingRegressor



In [4]:
# Load the Pokemon stats table. The path is relative, so it is resolved
# against the notebook's current working directory.
pokemon = pd.read_csv(filepath_or_buffer="Pokemon.csv")

In [5]:
# Encode the target flag as 0/1 integers.
# A direct cast replaces the previous one-hot round trip
# (get_dummies(..., drop_first=True).values), which produced a dtype that
# depends on the pandas version and yields no column at all when only one
# class is present. The cast is also idempotent, so re-running this cell is safe.
# NOTE(review): assumes read_csv parsed 'Legendary' as booleans - confirm.
pokemon['Legendary'] = pokemon['Legendary'].astype(int)

In [6]:
# Preview the first five rows to check the columns and the encoded target.
pokemon.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,0
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,0


In [7]:
# Columns excluded from the model inputs: the '#' number, the name, the two
# type columns, and the target itself.
non_feature_cols = ['#', "Name", "Legendary", "Type 1", "Type 2"]

X = pokemon.drop(non_feature_cols, axis=1)
y = pokemon['Legendary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=101,
)

## Voting  
For classification problems

Majority Voting: Mode

Use an odd number of classifiers (at least three) so that a binary majority vote cannot end in a tie.

In [8]:
# Instantiate the individual models
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_lr = LogisticRegression(class_weight="balanced")
clf_dt = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)

# Create and fit the voting classifier
clf_vote = VotingClassifier(
    estimators=[('knn', clf_knn), ('lr', clf_lr), ('dt', clf_dt)]
)
clf_vote.fit(X_train, y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('lr',
                              LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2'...
                              Decision

In [9]:
# Predict labels for the held-out rows, then derive both summaries from the
# same predictions.
pred_vote = clf_vote.predict(X_test)
score_vote = f1_score(y_test, pred_vote)
report = classification_report(y_test, pred_vote)

# Headline F1 first, followed by the per-class breakdown.
print('F1-Score: {:.3f}'.format(score_vote))
print(report)

F1-Score: 0.703
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       141
           1       0.72      0.68      0.70        19

    accuracy                           0.93       160
   macro avg       0.84      0.82      0.83       160
weighted avg       0.93      0.93      0.93       160



In [11]:
# Load the character-predictions table and print its column summary
# (non-null counts and dtypes). The path is relative to the working directory.
got_csv_path = 'character-predictions.csv'
got = pd.read_csv(got_csv_path)
got.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No               1946 non-null   int64  
 1   actual             1946 non-null   int64  
 2   pred               1946 non-null   int64  
 3   alive              1946 non-null   float64
 4   plod               1946 non-null   float64
 5   name               1946 non-null   object 
 6   title              938 non-null    object 
 7   male               1946 non-null   int64  
 8   culture            677 non-null    object 
 9   dateOfBirth        433 non-null    float64
 10  DateoFdeath        444 non-null    float64
 11  mother             21 non-null     object 
 12  father             26 non-null     object 
 13  heir               23 non-null     object 
 14  house              1519 non-null   object 
 15  spouse             276 non-null    object 
 16  book1              1946 

## **Averaging**

Both classification and regression

Soft Voting: Mean

Regression: mean of the predicted values

Classification: mean of the predicted probabilities

In [12]:
# Base models for the averaging ensemble. SVC is created with
# probability=True so that it can supply the class probabilities that soft
# voting averages.
clf_lr = LogisticRegression(class_weight='balanced')
clf_dt = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)
clf_svm = SVC(probability=True, class_weight='balanced', random_state=500)

estimators = [('lr', clf_lr), ('dt', clf_dt), ('svm', clf_svm)]

# voting='soft' turns the voting classifier into an averaging classifier.
# Notes:
#   * VotingRegressor has no `voting` parameter.
#   * Per-model weights can be supplied with weights=[w_1, w_2, ..., w_N].
clf_avg = VotingClassifier(estimators, voting='soft')
clf_avg.fit(X_train, y_train)

# Accuracy of the averaged predictions on the held-out split.
pred_avg = clf_avg.predict(X_test)
acc_avg = accuracy_score(y_test, pred_avg)
print('Accuracy: {:.2f}'.format(acc_avg))

Accuracy: 0.93


## **Bootstrap Aggregating**

**Bootstrapping requires:**

Random subsamples

Using replacement


**Heterogeneous:**

Different algorithms (fine-tuned)

Small number of estimators

Voting, Averaging, and Stacking


**Homogeneous:**

The same algorithm ("weak" model)

Large number of estimators

Bagging and Boosting

## **Bagging Classifier**

In [13]:
#Take a sample with replacement 
X_train_sample = X_train.sample(frac=1.0, replace=True, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

# Build a "weak" Decision Tree classifier
clf = DecisionTreeClassifier(max_depth=4, max_features=2, random_state=500)

# Fit the model to the training sample
clf.fit(X_train_sample, y_train_sample)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=500, splitter='best')

In [14]:
# Instantiate the base model
clf_dt = DecisionTreeClassifier(max_depth=4)

# Build and train the Bagging classifier
clf_bag = BaggingClassifier(
  n_estimators=21,
  base_estimator=clf_dt,
  oob_score=True,
  bootstrap=True,
  max_features=5,
  random_state=500)
clf_bag.fit(X_train, y_train)

print('OOB-Score: {:.3f}'.format(clf_bag.oob_score_))
# Predict the labels of the test set
pred = clf_bag.predict(X_test)

# Show the F1-score
print('F1-Score: {:.3f}'.format(f1_score(y_test, pred)))

OOB-Score: 0.966
F1-Score: 0.706


## **Gradual Learning**

Principle: iterative learning

Dependent estimators

Learning different tasks for the same goal

Sequential building

## **AdaBoost**

In [15]:
# Instantiate a normalized linear regression model
reg_lm = LinearRegression(normalize=True)

# Build and fit an AdaBoost regressor
reg_ada = AdaBoostRegressor(reg_lm,
                            n_estimators=12,
                            learning_rate=2,
                            random_state=500)
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)

# Evaluate the performance using the RMSE
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE: {:.3f}'.format(rmse))

RMSE: 0.288


## **Gradient Boosting**

In [16]:
# Build and fit a Gradient Boosting classifier
clf_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=500)
clf_gbm.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = clf_gbm.predict(X_test)

# Evaluate the performance based on the accuracy
acc = accuracy_score(y_test, pred)
print('Accuracy: {:.3f}'.format(acc))

# Get and show the Confusion Matrix
cm = confusion_matrix(y_test, pred)
print(cm)

Accuracy: 0.938
[[137   4]
 [  6  13]]


## **XGBoost and Light Gradient Boosting Machine**

In [17]:
# Instantiate (but do not fit here) one classifier from each library with
# matching ensemble size and learning rate.
clf_xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

# max_depth=-1 is LightGBM's "no depth limit" setting.
clf_lgb = LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=-1)

In [18]:
# Build and fit a CatBoost regressor
reg_cat = cb.CatBoostRegressor(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=500)
reg_cat.fit(X_train, y_train)

# Calculate the predictions on the set set
pred = reg_cat.predict(X_test)

# Evaluate the performance using the RMSE
rmse_cat = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE (CatBoost): {:.3f}'.format(rmse_cat))

0:	learn: 0.2460623	total: 47.4ms	remaining: 426ms
1:	learn: 0.2348788	total: 47.8ms	remaining: 191ms
2:	learn: 0.2242178	total: 48.2ms	remaining: 113ms
3:	learn: 0.2145248	total: 48.6ms	remaining: 72.9ms
4:	learn: 0.2068429	total: 49ms	remaining: 49ms
5:	learn: 0.2001496	total: 49.6ms	remaining: 33ms
6:	learn: 0.1929999	total: 50.1ms	remaining: 21.5ms
7:	learn: 0.1876807	total: 50.8ms	remaining: 12.7ms
8:	learn: 0.1830909	total: 51.4ms	remaining: 5.71ms
9:	learn: 0.1792090	total: 52.8ms	remaining: 0us
RMSE (CatBoost): 0.239


In [19]:
# Build and fit a XGBoost regressor
reg_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=500)
reg_xgb.fit(X_train, y_train)

# Build and fit a LightGBM regressor
reg_lgb = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, seed=500)
reg_lgb.fit(X_train, y_train)

# Calculate the predictions and evaluate both regressors
pred_xgb = reg_xgb.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
pred_lgb = reg_lgb.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, pred_lgb))

print('Extreme: {:.3f}, Light: {:.3f}'.format(rmse_xgb, rmse_lgb))

Extreme: 0.193, Light: 0.189


## **Stacking**

In [20]:
# ---- First layer: fit both base classifiers on the training split ----
clf_dt = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)
clf_dt.fit(X_train, y_train)

# 5-nearest neighbours using the 'ball_tree' search algorithm.
clf_knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
clf_knn.fit(X_train, y_train)

# Baseline: test accuracy of each base model on its own.
acc_dt = accuracy_score(y_test, clf_dt.predict(X_test))
acc_knn = accuracy_score(y_test, clf_knn.predict(X_test))
print('Decision Tree: {:0.4f}'.format(acc_dt))
print('5-Nearest Neighbors: {:0.4f}'.format(acc_knn))

# ---- Second-layer training set: original features + base predictions ----
# NOTE(review): these are in-sample predictions - the base models were fit on
# this same X_train - so the meta-features seen during training are likely
# cleaner than the ones seen at test time. Out-of-fold predictions would
# avoid that; left as is to mirror what the stacking cell further down does.
pred_dt = clf_dt.predict(X_train)
pred_knn = clf_knn.predict(X_train)

# Re-use the training index so the concat below aligns row by row.
pred_df = pd.DataFrame(
    {'pred_dt': pred_dt, 'pred_knn': pred_knn},
    index=X_train.index,
)
X_train_2nd = pd.concat([X_train, pred_df], axis=1)

# Meta estimator trained on the widened feature table.
clf_stack = DecisionTreeClassifier(random_state=500)
clf_stack.fit(X_train_2nd, y_train)

# ---- Second-layer test set: same construction on the held-out rows ----
pred_dt = clf_dt.predict(X_test)
pred_knn = clf_knn.predict(X_test)

pred_df = pd.DataFrame(
    {'pred_dt': pred_dt, 'pred_knn': pred_knn},
    index=X_test.index,
)
X_test_2nd = pd.concat([X_test, pred_df], axis=1)

# Final predictions come from the meta estimator.
pred_stack = clf_stack.predict(X_test_2nd)

# Accuracy of the stacked model on the test set.
print('Accuracy: {:0.4f}'.format(accuracy_score(y_test, pred_stack)))

Decision Tree: 0.9437
5-Nearest Neighbors: 0.9125
Accuracy: 0.9250


## **Mlxtend Stacking**

Individual estimators are trained on the
complete features

The meta-estimator is trained using the
predictions as the only meta-features

The meta-estimator can be trained with
labels or probabilities as target

In [21]:
# First-layer classifiers (left unfitted; the stacking wrapper fits them).
clf_dt = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)
clf_knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
first_layer = [clf_dt, clf_knn]

# Second-layer meta classifier.
clf_meta = DecisionTreeClassifier(random_state=500)

# use_features_in_secondary=True feeds the meta classifier the original
# features in addition to the first-layer predictions.
clf_stack = StackingClassifier(
    classifiers=first_layer,
    meta_classifier=clf_meta,
    use_features_in_secondary=True,
)
clf_stack.fit(X_train, y_train)

# Test-set accuracy of the stacked model.
pred_stack = clf_stack.predict(X_test)
acc_stack = accuracy_score(y_test, pred_stack)
print("Accuracy: {:0.4f}".format(acc_stack))

Accuracy: 0.9250


In [22]:
# Instantiate the 1st-layer regressors.
# LinearRegression is created without `normalize=True`: that argument was
# deprecated in scikit-learn 1.0 and removed in 1.2, so passing it raises a
# TypeError on current releases. Rescaling the inputs does not change the
# predictions of an unpenalised least-squares fit, so results are unaffected.
reg_dt = DecisionTreeRegressor(min_samples_leaf=11, min_samples_split=33, random_state=500)
reg_lr = LinearRegression()
reg_ridge = Ridge(random_state=500)

# Instantiate the 2nd-layer regressor
reg_meta = LinearRegression()

# Build the Stacking regressor: the meta regressor combines the outputs of
# the three first-layer models.
reg_stack = StackingRegressor(
    regressors=[reg_dt, reg_lr, reg_ridge],
    meta_regressor=reg_meta)
reg_stack.fit(X_train, y_train)

# Evaluate the performance on the test set using the MAE metric. y_test holds
# the 0/1 'Legendary' flag, so the error is measured against 0/1 labels.
pred = reg_stack.predict(X_test)
print('MAE: {:.3f}'.format(mean_absolute_error(y_test, pred)))

MAE: 0.074
