In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load dataset

In [2]:
# Load the breast cancer dataset bundled with scikit-learn. The features and
# labels are read from this object later via data.data and data.target.
data = load_breast_cancer()

# View dataset

In [3]:
# Display the loaded object to inspect its fields; the output shows a 'data'
# feature array and a 'target' array of 0/1 labels.
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

# Extract the attributes

In [4]:
# Feature matrix and label vector used by every model below.
X = data.data
y = data.target

# Split the data into a training set and a testing set

In [5]:
# Hold out part of the data for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Train and test a Decision Tree

In [6]:
# Fit a single decision tree as a baseline for the random forests trained later.
# random_state is pinned (matching the forests' random_state=0) because tree
# construction is randomized; without it the reported accuracy can change on
# every run. Re-run this cell to refresh any previously recorded accuracy.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
print("Accuracy of Decision Tree on test data: {:.2f}".format(dt.score(X_test, y_test)))

Accuracy of Decision Tree on test data: 0.91


# Train random forest

In [7]:
# 100-tree forest; oob_score=True keeps the out-of-bag estimate that is read
# later through clf.oob_score_.
clf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

# Estimate accuracy on testing data

In [8]:
# Score the 100-tree forest on the held-out test split.
print(
    "Accuracy of Random forest on test data: {:.2f}".format(
        clf.score(X_test, y_test)
    )
)

Accuracy of Random forest on test data: 0.97


# Out-of-bag error

In [9]:
# The out-of-bag error is reported as the complement of the forest's OOB score.
print(
    "The out-of-bag error is: {:.2f}".format(
        1 - clf.oob_score_
    )
)


The out-of-bag error is: 0.04


# Changing parameters will affect accuracy and out-of-bag error

In [10]:
# Same setup with only 5 trees, to see how a smaller forest changes the results.
# With so few trees scikit-learn warns that some inputs do not have OOB scores,
# so treat the out-of-bag error from this model with caution.
clf2 = RandomForestClassifier(
    n_estimators=5,
    oob_score=True,
    random_state=0,
)
clf2.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
# Report test accuracy and out-of-bag error for the 5-tree forest.
print(
    "Accuracy of Random forest on test data: {:.2f}".format(
        clf2.score(X_test, y_test)
    )
)
print(
    "The out-of-bag error is: {:.2f}".format(
        1 - clf2.oob_score_
    )
)

Accuracy of Random forest on test data: 0.94
The out-of-bag error is: 0.12
