In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(10)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Hard Voting

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

hard_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard')
hard_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [7]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
            
hvc_predict = hard_voting_clf.predict(X_test)            
print("Hard voting clasifier accuracy: ", accuracy_score(y_test, hvc_predict))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
Hard voting clasifier accuracy:  0.912


# Soft Voting

In [8]:
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

soft_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
soft_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

In [9]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, soft_voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


# Bagging & Pasting

In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=False, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.92


In [12]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

0.856


In [13]:
lr_bag_clf = BaggingClassifier(
    LogisticRegression(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
lr_bag_clf.fit(X_train, y_train)
lr_y_pred = lr_bag_clf.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, lr_y_pred))

0.84


# Out of bag samples

#### oob_decision_function_ is returning calss probabilities as the base esitmator (DecisionTree) has got predict_proba() method.

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

In [17]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

# Random Forest.

In [18]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, \
                                                 n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)  

y_pred_rf = rnd_clf.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [22]:
#from sklearn.datasets import fetch_mldata
digit_dataset = pd.read_csv('../dataset/digit_recognizer_train.csv')
digit_X = digit_dataset.iloc[:, 1:]
digit_y = digit_dataset['label']
digit_X_train, digit_X_test, digit_y_train, digit_y_test = \
                                    train_test_split(digit_X, digit_y, test_size=0.2)

In [23]:
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(digit_X_train, digit_y_train)

RandomForestClassifier(random_state=42)

# Feature Importance
  * if you look at a single Decision Tree, important features are likely to appear closer to the root of the tree, 
    while unimportant features will often appear closer to the leaves (or not at all). 
    It is therefore possible to get an estimate of a feature’s importance by computing the average depth at which it 
    appears across all trees in the forest. Scikit-Learn computes this automatically for every feature after training. 
    You can access the result using the feature\_importances\_ variable.

In [24]:
rnd_clf.feature_importances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.95685473e-06, 6.58732394e-07,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.86013982e-06, 2.57240945e-06,
       1.67148095e-06, 6.91803381e-06, 2.37880357e-06, 3.01191413e-06,
       2.85137901e-06, 5.45432808e-06, 0.00000000e+00, 1.81954707e-06,
       6.10203147e-06, 2.95866817e-06, 1.28444593e-06, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.30028644e-06,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [25]:
for feature, imp_score in sorted(zip(digit_dataset.columns, \
                                    rnd_clf.feature_importances_), key=lambda x: x[1], reverse=True):
    if(imp_score > 0.0001):
        print(feature, imp_score)

pixel432 0.008733417110655927
pixel460 0.008413993554574251
pixel405 0.008090404130597752
pixel349 0.00794537962198465
pixel377 0.007596250021411368
pixel408 0.007212378662906189
pixel404 0.007091795798950185
pixel459 0.006920766891424367
pixel346 0.006825317663935314
pixel376 0.0068053939235530655
pixel541 0.006646485555546839
pixel209 0.006507646876740392
pixel154 0.006484927980250387
pixel488 0.0063485886552726
pixel542 0.006344805713061915
pixel350 0.006239575252241664
pixel374 0.006071960041463447
pixel317 0.006050664550252324
pixel289 0.006026288204801286
pixel153 0.00598510873813774
pixel210 0.0059657885974518975
pixel514 0.005835821498373061
pixel322 0.005769873298390828
pixel595 0.005647756467591681
pixel380 0.005620488495407564
pixel436 0.005601724003420501
pixel401 0.005555138491047325
pixel513 0.005484107582954661
pixel568 0.005476731947757119
pixel485 0.005470913941614827
pixel540 0.005386683264786126
pixel345 0.005361211599027789
pixel400 0.005314602306428785
pixel373 0.0

In [26]:
digit_y_pred = rnd_clf.predict(digit_X_train)

In [27]:
from sklearn.metrics import f1_score
print(f1_score(digit_y_train, digit_y_pred, average="weighted"))

1.0


In [28]:
digit_y_test_pred = rnd_clf.predict(digit_X_test)
print(f1_score(digit_y_test, digit_y_test_pred, average="weighted"))

0.9664259789444235


# Boosting

### Ada Boost
*combine multiple week learner to make good prediction

In [29]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=2), n_estimators=500,
        algorithm="SAMME.R", learning_rate=0.5, random_state=42
    )
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.5, n_estimators=500, random_state=42)

In [30]:
from sklearn.metrics import accuracy_score
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.856

# Gradient Boosting

In [31]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [32]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [33]:
y_cap = tree_reg1.predict(X)

In [34]:
y_cap.shape

(100,)

In [35]:
y2 = y - y_cap
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [36]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [37]:
y4 = y3 - tree_reg3.predict(X)
tree_reg4 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg4.fit(X, y4)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [38]:
print("x Sample :", X[1,0])
print("y - value:", y[1])

x Sample : 0.45071430640991617
y - value: 0.594479790484422


In [51]:
gen = ((i*10 for i in range(100)))

In [57]:
next(gen)

40