# Ensemble Learning

Siguiendo:
- "Ensemble Learning for AI Developers. Learn Bagging, Stacking, and Boosting Methods with Use Cases".

## Chapter 1. Why Ensemble Techniques are Needed

## Chapter 2. Mixing Training Data

In [9]:
# 2.1 Training a Decision Tree Using scikit-learn

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size = 0.2, random_state = 123)

tree = DecisionTreeClassifier()
tree.fit(train_X, train_Y)

print(tree.score(test_X, test_Y))

0.9666666666666667


In [10]:
# Listing 2-2. Training Random Forest Using scikit-learn with Number of Decision Trees = 4

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
train_X, test_X, train_Y, test_Y = train_test_split(X, y,
test_size = 0.1, random_state = 123)
forest = RandomForestClassifier(n_estimators=8)
forest = forest.fit(train_X, train_Y)

print(forest.score(test_X, test_Y))

rf_output = forest.predict(test_X)
print(rf_output)

1.0
[1 2 2 1 0 2 1 0 0 1 2 0 1 2 2]


### Sampling Without Replacement (WOR)

In [14]:
# Listing 2-3. Sampling Without Replacement in scikit-learn

from sklearn.utils import resample
import numpy as np
# Random seed fixed so result could be replicated by Reader
np.random.seed(123)
#data to be sampled
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Number of divisions needed
num_divisions = 2
list_of_data_divisions = []
for x in range(0, num_divisions):
   sample = resample(data, replace=False, n_samples=5)
   list_of_data_divisions.append(sample)
print('Samples', list_of_data_divisions)
# Output: Samples [[8, 1, 6, 7, 4], [4, 6, 5, 3, 8]]

Samples [[8, 1, 6, 7, 4], [4, 6, 5, 3, 8]]


### Sammpling with Replacement (WR)

In [13]:
#Listing 2-4. Sampling with Replacement in scikit-learn

from sklearn.utils import resample
import numpy as np

# Random seed fixed so result could be replicated by Reader
np.random.seed(123)
# data to be sampled
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Number of divisions needed
num_divisions = 3
list_of_data_divisions = []

for x in range(0, num_divisions):
   sample = resample(data, replace=True, n_samples=4)
   list_of_data_divisions.append(sample)
print("Samples", list_of_data_divisions)

Samples [[3, 3, 7, 2], [4, 7, 2, 1], [2, 1, 1, 4]]


### Bagging

In [42]:
# Listing 2-5. Bagging from Primitives

from sklearn.utils import resample
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import numpy as np
from sklearn.metrics import accuracy_score

# data to be sampled
n_samples = 100
X,y = make_classification(n_samples=n_samples, n_features=4,
n_informative=2, n_redundant=0, random_state=0, shuffle=False)

#divide data into train and test set
X_train, X_test, y_train, y_test =  train_test_split(X, y,
test_size = 0.1, random_state = 123)

# Number of divisions needed
num_divisions = 3
list_of_data_divisions = []

# Divide data into divisions
for x in range(0, num_divisions):
    X_train_sample, y_train_sample = resample(X_train, y_train, replace=True, n_samples=7)
    sample = [X_train_sample, y_train_sample]
    list_of_data_divisions.append(sample)
    #print(list_of_data_divisions)

# Learn a Classifier for each data divisions
learners = []
for data_division in list_of_data_divisions:
    data_x = data_division[0]
    data_y = data_division[1]
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(data_x, data_y)
    learners.append(decision_tree)

# Combine output of all classifiers using voting
predictions = []
for i in range(len(y_test)):
    counts = [0 for _ in range(num_divisions)]
    for j , learner in enumerate(learners):
        prediction = learner.predict([X_test[i]])
        if prediction == 1:
            counts[j] = counts[j] + 1
    final_predictions = np.argmax(counts)
    predictions.append(final_predictions)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
# Output: Accuracy: 0.9

Accuracy: 0.9


In [55]:
# Listing 2-6. Bagging scikit-learn

from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=4,
                          n_informative=2, n_redundant=0,
                          random_state=0, shuffle=False)

#divide data into train and test set
X_train, X_test, y_train, y_test =  train_test_split(X, y,
test_size = 0.2, random_state = 123)
clf = BaggingClassifier(base_estimator=SVC(),
n_estimators=10, random_state=0).fit(X_train, y_train)
print(clf.score(X_test, y_test))
# Output: 0.9

0.85


### K-Folds Cross-validation

In [57]:
import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)

# Output:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

KFold(n_splits=2, random_state=None, shuffle=False)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]


#### Stratified K-Folds Cross-validation

In [60]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])

skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
print(skf)
# Output:
# StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
# Output:
# TRAIN: [1 3] TEST: [0 2]
# TRAIN: [0 2] TEST: [1 3]

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]


## Chapter 3. Mixing Models

### Voting Ensembles

#### Listing 3-1. Max Voting Ensemble

In [1]:
#Listing 3-1. Max Voting Ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
import numpy as np

In [2]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3, stratify=y, random_state=123)

In [3]:
### k-Nearest Neighbors (k-NN)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_

In [4]:
### Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
params_rf = {'n_estimators': [50, 100, 200]}
rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(X_train, y_train)
rf_best = rf_gs.best_estimator_

In [5]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=123,
solver='liblinear', penalty='l2', max_iter=5000)
C = np.logspace(1, 4, 10)
params_lr = dict(C=C)
lr_gs = GridSearchCV(log_reg, params_lr, cv=5, verbose=0)
lr_gs.fit(X_train, y_train)
lr_best = lr_gs.best_estimator_

In [22]:
# combine all three Voting Ensembles
from sklearn.ensemble import VotingClassifier
estimators=[
    ('knn', knn_best), 
    ('rf', rf_best), 
    ('log_reg',lr_best)
]
ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)

print("knn_gs.score: ", knn_best.score(X_test, y_test))
print("rf_gs.score: ", rf_best.score(X_test, y_test))
print("log_reg.score: ", lr_best.score(X_test, y_test))
print("ensemble.score: ", ensemble.score(X_test, y_test))

knn_gs.score:  0.9239766081871345
rf_gs.score:  0.9532163742690059
log_reg.score:  0.9415204678362573
ensemble.score:  0.9473684210526315


#### Averaging/Soft Voting

In [24]:
# Listing 3-2. Averaging

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
import numpy as np

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3, stratify=y, random_state=0)

In [16]:
### k-Nearest Neighbors (k-NN)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
knn_gs_predictions = knn_gs.predict(X_test)

In [17]:
### Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
params_rf = {'n_estimators': [50, 100, 200]}
rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(X_train, y_train)
rf_best = rf_gs.best_estimator_
rf_gs_predictions = rf_gs.predict(X_test)

In [18]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=123,
solver='liblinear', penalty='l2', max_iter=5000)
C = np.logspace(1, 4, 10)
params_lr = dict(C=C)
lr_gs = GridSearchCV(log_reg, params_lr, cv=5, verbose=0)
lr_gs.fit(X_train, y_train)
lr_best = lr_gs.best_estimator_
log_reg_predictions = lr_gs.predict(X_test)

In [21]:
# combine all three by averaging the Ensembles results
#average_prediction = (log_reg_predictions + knn_gs_predictions + rf_gs_predictions)/3.0

# Alternatively combine all through using VotingClassifier with voting='soft' parameter

# combine all three Voting Ensembles
from sklearn.ensemble import VotingClassifier
estimators=[
    ('knn', knn_best), 
    ('rf', rf_best), 
    ('log_reg', lr_best)
]
ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)

print("knn_gs.score: ", knn_gs.score(X_test, y_test))
print("rf_gs.score: ", rf_gs.score(X_test, y_test))
print("log_reg.score: ", lr_gs.score(X_test, y_test))
print("ensemble.score: ", ensemble.score(X_test, y_test))

knn_gs.score:  0.9239766081871345
rf_gs.score:  0.9532163742690059
log_reg.score:  0.9415204678362573
ensemble.score:  0.9473684210526315


### Hyperparameter Tuning Ensembles

- Instead of relying on different models to make ensemble models, you use a good machine learning model and train this model using different hyperparameter settings.

In [26]:
# Listing 3-3. Hyperparameter Tuning Ensembles
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
import numpy as np

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3, stratify=y, random_state=0)

In [27]:
### Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf_1 = RandomForestClassifier(random_state=0, n_estimators=10)
rf_1.fit(X_train, y_train)

rf_2 = RandomForestClassifier(random_state=0, n_estimators=50)
rf_2.fit(X_train, y_train)

rf_3 = RandomForestClassifier(random_state=0, n_estimators=100)
rf_3.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [28]:
# combine all three Voting Ensembles
from sklearn.ensemble import VotingClassifier
estimators = [('rf_1', rf_1), ('rf_2', rf_2), ('rf_3', rf_3)]
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X_train, y_train)

print("rf_1.score: ", rf_1.score(X_test, y_test))
print("rf_2.score: ", rf_2.score(X_test, y_test))
print("rf_3.score: ", rf_3.score(X_test, y_test))
print("ensemble.score: ", ensemble.score(X_test, y_test))

rf_1.score:  0.935672514619883
rf_2.score:  0.9473684210526315
rf_3.score:  0.9532163742690059
ensemble.score:  0.9473684210526315


### Horizontal Voting Ensembles

Take a look at Listing 3-4 where we have implemented horizontal voting ensembles using keras, tensorflow and scikit learn libraries.
Ver págs 41-42.

In [30]:
# Listing 3-4. Horizontal Voting Ensembles
# Probar en Colab 

### Snapshot Ensembles

- Snapshot ensembles are an extension of a horizontal voting ensemble. Instead of saving models after the minimum threshold, you modify the learning rate of the model itself.
- When training a machine learning model, it is often desirable to start the initial higher learning and then slowly decrease the learning rate.

## Chapter 4. Mixing Combinations

- Introduce and explain boosting.
- Examine how to implement boosting using scikit-learn.
- Introduce and explain stacking.
- Examine how to implement boosting using scikit-learn.
- Look at other examples of mixing combinations.

### Boosting

- We start with a collection of learners. Each ML learner is trained on a particular subset of training objects. If a model learner has a weak performance, we could provide greater emphasis to that particular learner. This is known as boosting.
- One of the simplest but most important of boosting techniques, AdaBoost.

### AdaBoost

In [31]:
#Listing 4-1. AdaBoost Using scikit-learn
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

X, y = load_iris(return_X_y=True)
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X, y, cv=5)
print(scores.mean())

0.9466666666666665


### Gradient Boosting

In [34]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0 ).fit(X, y)

scores = cross_val_score(clf, X, y, cv=5)

print(scores.mean())

0.9225


### XGBoost

In [38]:
# Listing 4-3. XGBoost Example on Breast Cancer Dataset Using scikit-learn and XGBoost Library

import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# read in data
iris = load_breast_cancer()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y,

                                                    test_size=0.2, random_state=42)
# use DMatrix for xgbosot
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# set xgboost params
param = {
   'max_depth': 5,  # the maximum depth of each tree
   'eta': 0.3,  # the training step for each iteration
   'objective': 'multi:softprob',  # error evaluation formulticlass training
   'num_class': 3}  # the number of classes that exist in this datset
num_round = 200  # the number of training iterations
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
preds_rounded = np.argmax(preds, axis=1)
print(accuracy_score(y_test, preds_rounded))

0.9649122807017544


### Stacking

- Key idea: instead of using trivial functions (such as hard voting) to aggregate the predictions of all learners in an ensemble, we train a model to perform this aggregation.
- Base Learners, Meta Learner

In [40]:
# Listing 4-4. Stacking Classifier Using scikit-learn

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

X, y = load_iris(return_X_y=True)
estimators = [
    ("rf", RandomForestClassifier(n_estimators=10, random_state=42)),
    ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
]
clf = StackingClassifier(estimators=estimators, final_estimator=
LogisticRegression())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)

0.9473684210526315

In [42]:
# Listing 4-5. Stacking Regression Using scikit-learn

from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

X, y = load_diabetes(return_X_y=True)
estimators = [("lr", RidgeCV()), ("svr", LinearSVR(random_state=42))]
reg = StackingRegressor(
   estimators=estimators,
   final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)
reg.fit(X_train, y_train).score(X_test, y_test)

0.3642619780615395

## Chapter 5. Using Ensemble Learning Libraries

- ML-Ensemble, a Python-based open source library that wraps scikit ensemble classes to offer a high-level API.
- Scale XGBoost via Dask, a flexible library for parallel computing in Python. Dask and XGBoost can work together to train gradient-boosted trees in parallel.
- Learn boosting using Microsoft LightGBM.
- Introduce AdaNet, a lightweight TensorFlow-based framework for learning neural network architecture, but is also used for learning to ensemble models.

### ML-Ensemble

also: mlens

In [43]:
!pip install mlens

Collecting mlens
  Downloading mlens-0.2.3-py2.py3-none-any.whl (227 kB)
[K     |████████████████████████████████| 227 kB 561 kB/s eta 0:00:01
Installing collected packages: mlens
Successfully installed mlens-0.2.3


In [50]:
# ---Data setup----
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons
seed = 42

X, y = make_moons(n_samples=10000,noise=0.4, random_state=seed)

# --- 1. Initialize ---
from mlens.ensemble import SuperLearner
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

# --- 2. Build the first layer ---
ensemble.add([RandomForestClassifier(random_state=seed),
SVC(random_state=seed)])

# --- 3. Attach the final meta learner
ensemble.add_meta(LogisticRegression(solver="sag", max_iter=2000))

# --- Train ---
ensemble.fit(X_train, y_train)

# --- Predict ---
preds = ensemble.predict(X_test)

print("Fit data:\n%r" % ensemble.data)

Fit data:
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  svc                          0.01     0.00  0.14  0.02  0.02  0.00
layer-1  randomforestclassifier       0.00     0.00  0.57  0.01  0.04  0.00



Multilayer Ensembles

In [52]:
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# Build the first layer
ensemble.add([RandomForestClassifier(random_state=seed),LogisticRegression(random_state=seed)])

# Build the 2nd layer
ensemble.add([LogisticRegression(random_state=seed),
SVC(random_state=seed)])

# Attach the final meta estimator
ensemble.add_meta(SVC(random_state=seed))

SuperLearner(array_check=None, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=7270, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...d97670>)],
   n_jobs=-1, name='group-12', raise_on_exception=True, transformers=[])],
   verbose=1)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=42, sample_size=20,
       scorer=<function accuracy_score at 0x7fd20bd97670>, shuffle=False,
       verbose=2)

### Ensemble Model Selection

#### Evaluator

In [53]:
from mlens.model_selection import Evaluator
from scipy.stats import randint
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

#### Preprocessing

The mlens preprocessing feature helps you compare the models across a set of preprocessing pipelines. It does this via a class that acts as a transformer, allowing you to use lower or incoming layers as a “preprocessing” step, so that you need only evaluate the metalearners iteratively. Let’s look at the code to understand it better.

In [59]:
from mlens.model_selection import Evaluator
from mlens.ensemble import SequentialEnsemble #--1
from mlens.metrics import make_scorer
from scipy.stats import uniform, randint

base_learners = [RandomForestClassifier(random_state=seed), SVC(probability=True)] #--2

proba_transformer = SequentialEnsemble(model_selection=True, random_state=seed) \
                        .add('blend', base_learners, proba=True) #--3

class_transformer = SequentialEnsemble(model_selection=True, random_state=seed) \
                        .add('blend', base_learners, proba=False) #--4

preprocessing = {
    'proba': [('layer-1', proba_transformer)],
    'class': [('layer-1', class_transformer)]
} #--5

meta_learners = [
    SVC(random_state=seed), 
    ('rf',RandomForestClassifier(random_state=seed))
] #--6
        
params = {
    'svc': {'C': uniform(0, 10)},
    'class.rf': {'max_depth': randint(2, 10)},
    'proba.rf': {'max_depth': randint(2, 10), 'max_features': uniform(0.5, 0.5)}
} #--7

scorer = make_scorer(accuracy_score) #--8
evaluator = Evaluator(scorer=scorer, random_state=seed, cv=2) #--9
evaluator.fit(X, y, meta_learners, params,
preprocessing=preprocessing, n_iter=2)#--10

from pandas import DataFrame
df = DataFrame(evaluator.results) #--11
df

Unnamed: 0,test_score-m,test_score-s,train_score-m,train_score-s,fit_time-m,fit_time-s,pred_time-m,pred_time-s,params
class.rf,0.8628,0.0016,0.8606,0.0002,5.041326,0.25275,0.452895,0.063201,{'max_depth': 8}
class.svc,0.8628,0.0016,0.8606,0.0002,1.717284,0.038328,0.079342,0.044751,{'C': 3.745401188473625}
proba.rf,0.86,0.0002,0.8734,0.0046,4.437818,0.012086,0.565088,0.07324,"{'max_depth': 5, 'max_features': 0.97535715320..."
proba.svc,0.8625,0.0007,0.8606,0.001,0.866501,0.12227,0.137853,0.078443,{'C': 3.745401188473625}


Remember to turn off! parameter -> model_selection=True

### Dask c/ XGBoost (y Numpy, Pandas, etc.)

Fuera de scope ahora.

### LightGBM

Fuera de scope ahora.

### AdaNet

Ejemplos en Google Colab (https://github.com/tensorflow/adanet). The notebooks are well annotated and provide ready-to-use boilerplate code to use in your ML tasks.

## Chapter 6. Tips and Best Practices

- Feature selection using a random forest model. It should not come as a surprise that feature selection and feature relevance benefits from the performance and interpretation of machine learning algorithms.
- Feature transformations with ensembles of trees.
- Building a preprocessing pipeline for a random forest regressor.
- Isolation forests, an efficient algorithm for outlier detection, especially in high-dimensional datasets.

In [65]:
from sklearn import datasets

iris = datasets.load_iris() # -1
feature_list = iris.feature_names # -2
print(feature_list)
['sepal length (cm)',
'sepal width (cm)',
'petal length (cm)',
'petal width (cm)']

X = iris.data # -3
y = iris.target # -4
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=42) # -5
rf_clf = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1) # -6
rf_clf.fit(X_train, y_train) # -7

for name, score in zip(iris["feature_names"], rf_clf.feature_importances_):
     print(name, score) # -8

y_pred = clf.predict(X_test) # -9
accuracy_score(y_test, y_pred) # -10

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
sepal length (cm) 0.09906957842524829
sepal width (cm) 0.03880497890715764
petal length (cm) 0.4152569088750478
petal width (cm) 0.4468685337925464


0.98

In [69]:
from sklearn.feature_selection import SelectFromModel 

sfm = SelectFromModel(rf_clf, threshold=0.15) # - 11
sfm.fit(X_train, y_train) # -12
X_important_train = sfm.transform(X_train) # -13
X_important_test = sfm.transform(X_test)
rf_clf_important = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1) # -14


rf_clf_important.fit(X_important_train, y_train)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [71]:
y_important_pred = rf_clf_important.predict(X_important_test) # - 15
accuracy_score(y_test, y_important_pred)

1.0

### Feature Transformations with Ensembles of Trees