In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import numpy as np

X, y = make_moons(n_samples=1000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

RandomForestRegressor not shown

### RandomForestClassifier performance

In [2]:
from sklearn.ensemble import RandomForestClassifier

# more or less the same as passing a DecisionTreeClassifier into a BaggingClassifier
# However, Random Forest is more random b/c it chooses the best split among a random subset of features
# => higher bias, less variance = better model
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [3]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred_rf, y_test)

0.82

### Performance Comparison with its roughly equivalent Bagging + Decision Tree setup

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

In [6]:
accuracy_score(y_pred_bag, y_test)

0.84

### Extra Trees
Extremely random trees: choose a random split within a random subspace of features

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

extra_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
extra_clf.fit(X_train, y_train)
y_pred_extra = extra_clf.predict(X_test)

In [10]:
accuracy_score(y_pred_extra, y_test)

0.835

### Feature Importance

In [11]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

# create a list of tuples (feature name, importance)
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_): 
    print(name, score)

sepal length (cm) 0.09899771879420918
sepal width (cm) 0.023099322668078674
petal length (cm) 0.45100478764527424
petal width (cm) 0.426898170892438


Petal length and width are the most important features  
**Random forests are good for seeing which features actually matter, esp if you need to perform feature selection**