## Task 13:stacking

In [1]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.18.0


In [6]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.preprocessing import LabelEncoder

In [7]:
import numpy as np
import pandas as pd 
data = pd.read_csv('./mushrooms_sample.csv')

In [11]:
le=LabelEncoder()
data['class'] = le.fit_transform(data['class'])
encoded_data = pd.get_dummies(data)
y = encoded_data['class'].values.reshape(-1,1).ravel()
X = encoded_data

## Train by base 3 model


In [12]:
RANDOM_SEED = 42

clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=RANDOM_SEED)
clf3 = GaussianNB()
lr = LogisticRegression()

# Starting from v0.16.0, StackingCVRegressor supports
# `random_state` to get deterministic result.
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],  
                            use_probas=True,
                            meta_classifier=lr,  
                            random_state=RANDOM_SEED)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], ['KNN', 'Random Forest', 'Naive Bayes','StackingClassifier']):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 1.00 (+/- 0.00) [KNN]
Accuracy: 1.00 (+/- 0.00) [Random Forest]
Accuracy: 1.00 (+/- 0.00) [Naive Bayes]
Accuracy: 1.00 (+/- 0.00) [StackingClassifier]


## Involve Grid search

In [16]:
from sklearn.model_selection import GridSearchCV

# Initializing models

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=RANDOM_SEED)
clf3 = GaussianNB()
lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], 
                            meta_classifier=lr,
                            random_state=RANDOM_SEED)

params = {'kneighborsclassifier__n_neighbors': [1,3,5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta_classifier__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=5,
                    refit=True)
grid.fit(X, y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 3, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 3, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 50}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 3, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
1.000 +/- 0.00 {'kneighborsclassifier__n_neighbors': 3, 'meta_classifier__C': 10.0, 'randomforestclassifier_