In [24]:
import numpy as np

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

# Voting

In [2]:
# Toy binary-classification dataset: 500 noisy samples from make_moons.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)

# Hold out a test split using train_test_split's default proportions;
# the fixed seed keeps the split identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# Three different model families used as members of the voting ensemble,
# each created with the same seed.
log_clf, rnd_clf, svm_clf = (
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(random_state=42),
)

In [4]:
# Hard voting: the ensemble is configured with voting='hard' over the
# three named base estimators.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)
# Kept as the cell's last expression so the fitted estimator is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [5]:
# Train every model on the training split and print "<ClassName> <accuracy>"
# measured on the held-out test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


In [6]:
# Example using predict_proba: the SVC is created with probability=True
# so it can be used in the soft-voting ensemble.
log_clf, rnd_clf, svm_clf = (
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(random_state=42, probability=True),
)

In [7]:
# Soft voting: same three named estimators, but configured with voting='soft'.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)
# Kept as the cell's last expression so the fitted estimator is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

In [8]:
# Train every model on the training split and print "<ClassName> <accuracy>"
# measured on the held-out test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


# Bagging/pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Bagging: 500 decision trees, each trained on 100 instances sampled with
# replacement (bootstrap=True) from the training split, using all CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    # Seed the resampling so reruns produce the same ensemble, matching the
    # random_state=42 convention used by every other estimator in this notebook.
    random_state=42,
)
bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)

## OOB Score

In [11]:
# Same bagging setup without a max_samples cap, and with oob_score=True so
# the ensemble is scored on the instances each tree did not see in training.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    # Seed the resampling so the OOB score below is reproducible, matching the
    # random_state=42 convention used by every other estimator in this notebook.
    random_state=42,
)
bag_clf.fit(X_train, y_train)
# Last expression: displayed as the cell output.
bag_clf.oob_score_

0.896

In [12]:
# Test-set accuracy of the bagging ensemble, for comparison with oob_score_.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.92

In [14]:
# Out-of-bag decision-function row for the training instance at index 1
# (one value per class).
bag_clf.oob_decision_function_[1]

array([0.38624339, 0.61375661])

# Ejercicios
8. Load the MNIST data (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [42]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Download MNIST from OpenML as plain arrays (as_frame=False) rather than a DataFrame.
# The version is pinned because OpenML can host several active versions of a dataset;
# fixing it keeps the data identical across reruns.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [17]:
# Positional split: first 50,000 rows for training, the next 10,000 for
# validation, and everything from row 60,000 onward for testing.
X_train, y_train = X[:50_000], y[:50_000]
X_valid, y_valid = X[50_000:60_000], y[50_000:60_000]
X_test, y_test = X[60_000:], y[60_000:]

In [18]:
# Individual MNIST classifiers. The SVC needs probability=True so that it
# exposes class probabilities for the soft-voting ensemble below.
et_clf = ExtraTreesClassifier(random_state=42, n_jobs=-1)
rnd_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
svm_clf = SVC(random_state=42, probability=True)

# Soft-voting ensemble over the three classifiers. The ExtraTrees member is
# registered as 'et' (it was previously labelled 'lr', a leftover from the
# logistic-regression example), so the estimator names match the models.
voting_clf = VotingClassifier(
    estimators=[
        ('et', et_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
    n_jobs=-1,
)

# Fit each model on the training set and report accuracy on the validation set.
# NOTE(review): the three base models are fitted here on their own and the
# ensemble is fitted as well, so this cell is expensive to rerun.
for clf in (et_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    print(clf.__class__.__name__, accuracy_score(y_valid, y_pred))

ExtraTreesClassifier 0.9743
RandomForestClassifier 0.9736
SVC 0.9802
VotingClassifier 0.9813


In [19]:
# Score the already-fitted models on the test set; nothing is refitted here.
for clf in (et_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

ExtraTreesClassifier 0.9703
RandomForestClassifier 0.968
SVC 0.9785
VotingClassifier 0.9783


9. Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Train a classifier on this new training set. Congratulations, you have just trained a blender, and together with the classifiers it forms a stacking ensemble! Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier?

In [20]:
# Predictions of the three base classifiers on the validation set, in the
# order ExtraTrees, RandomForest, SVC.
y_valid_et, y_valid_rnd, y_valid_svm = (
    base.predict(X_valid) for base in (et_clf, rnd_clf, svm_clf)
)

# Blender training data: one column per base classifier, with the true
# validation labels (copied) as the target.
x_train_blender = np.c_[y_valid_et, y_valid_rnd, y_valid_svm]
y_train_blender = y_valid.copy()

In [36]:
# Predictions of the three base classifiers on the test set, in the
# order ExtraTrees, RandomForest, SVC.
y_test_et, y_test_rnd, y_test_svm = (
    base.predict(X_test) for base in (et_clf, rnd_clf, svm_clf)
)

# Blender evaluation data: one column per base classifier, with the true
# test labels (copied) as the target.
x_test_blender = np.c_[y_test_et, y_test_rnd, y_test_svm]
y_test_blender = y_test.copy()

In [None]:
# One-hot encode the base classifiers' predictions so the blender treats them
# as categories. handle_unknown='ignore' makes the fitted encoder emit an
# all-zero group for a prediction value it never saw during fit, instead of
# raising, when it is later applied to other data.
ohe = OneHotEncoder(handle_unknown='ignore')

# NOTE: this overwrites x_train_blender with its encoded form, so the cell is
# not idempotent. Rebuild x_train_blender from the base predictions before
# running it again.
x_train_blender = ohe.fit_transform(x_train_blender)

In [40]:
# Encode the test features with the encoder already fitted on the blender's
# training features. Calling fit_transform here would relearn the categories
# from test-set predictions: that leaks test data into preprocessing and can
# produce a column layout that differs from the one the blender was trained on.
# NOTE: this overwrites x_test_blender, so rebuild it before rerunning the cell.
x_test_blender = ohe.transform(x_test_blender)

In [51]:
### Train a few candidate blenders and see how each one does
rf_clf = RandomForestClassifier(random_state=42, max_features=6, n_jobs=-1)
dt_clf = DecisionTreeClassifier(random_state=42, max_features=6)
et_clf_ = ExtraTreesClassifier(random_state=42, max_features=6)
kn_clf = KNeighborsClassifier(n_jobs=-1)

# Fit each blender on the encoded validation-set predictions, then print
# "<ClassName> <accuracy>" measured on the encoded test-set predictions.
for clf in (rf_clf, dt_clf, et_clf_, kn_clf):
    y_pred_blender = clf.fit(x_train_blender, y_train_blender).predict(x_test_blender)
    print(type(clf).__name__, accuracy_score(y_test_blender, y_pred_blender))

RandomForestClassifier 0.9762
DecisionTreeClassifier 0.9751
ExtraTreesClassifier 0.9753
KNeighborsClassifier 0.9745


El blender y el voting classifier funcionan más o menos igual: en el conjunto de test el mejor blender (0.9762) se equivoca en unas 20 instancias más que el voting classifier (0.9783) sobre 10.000, una diferencia que considero despreciable.