In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [2]:
def load_dataset_from_files(training, testing):
    """Load a training/testing pair of svmlight-format files.

    Both files are parsed by one ``load_svmlight_files`` call so that the two
    feature matrices are guaranteed to have the same number of columns.
    Loading each file on its own infers the feature count per file, which
    produces mismatched widths whenever the highest-indexed feature does not
    occur in one of the files.

    Parameters
    ----------
    training : path of the svmlight file holding the training split.
    testing : path of the svmlight file holding the testing split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` exactly as produced by
        scikit-learn's svmlight loader.
    """
    X_train, y_train, X_test, y_test = datasets.load_svmlight_files([training, testing])
    return X_train, y_train, X_test, y_test

# Load the training and testing splits from the local datasets directory.
X_train, y_train, X_test, y_test = load_dataset_from_files(
    training='datasets/letter.scale.tr',
    testing='datasets/letter.scale.t',
)

In [3]:
# Fixed seed so that Restart & Run All rebuilds the same forests and curves;
# without it every run samples different trees and the plots are not reproducible.
RANDOM_STATE = 42

# Two 100-tree ensembles that differ only in the algorithm (random forest vs.
# extremely randomized trees); both split on entropy and train on all cores.
rfc = RFC(n_estimators=100, criterion="entropy", n_jobs=-1, random_state=RANDOM_STATE)
etc = ETC(n_estimators=100, criterion="entropy", n_jobs=-1, random_state=RANDOM_STATE)

In [4]:
# Training-subset sizes to evaluate: 1000 through 9000 in steps of 1000, then 10500.
train_sizes = list(range(1000, 10000, 1000)) + [10500]

In [5]:
# One prefix of the training data per entry in train_sizes: the first `n` rows
# of the features and the matching first `n` labels.
X_train_cases = [X_train[:n] for n in train_sizes]
y_train_cases = [y_train[:n] for n in train_sizes]

In [6]:
# Result accumulators, filled in place by train_classifier (one entry per
# training-subset size). Kept separate per ensemble: rfc_* and etc_*.
rfc_accuracies, rfc_node_count, rfc_tree_depth = [], [], []
etc_accuracies, etc_node_count, etc_tree_depth = [], [], []

In [7]:
def count_tree_depths(classifier):
    """Return the mean ``tree_.max_depth`` over ``classifier.estimators_``.

    Raises ``ZeroDivisionError`` if the classifier has no estimators.
    """
    depth_total = 0
    tree_total = 0
    for member in classifier.estimators_:
        depth_total += member.tree_.max_depth
        tree_total += 1
    return depth_total / tree_total

In [8]:
def count_nodes(classifier):
    """Return the mean ``tree_.node_count`` over ``classifier.estimators_``.

    Raises ``ZeroDivisionError`` if the classifier has no estimators.
    """
    sizes = []
    for member in classifier.estimators_:
        sizes.append(member.tree_.node_count)
    total_nodes = sum(sizes)
    n_trees = len(sizes)
    return total_nodes / n_trees

In [9]:
def train_classifier(X_train, y_train, classifier, accuracies, tree_depths, node_counts, X_test, y_test):
    """Fit ``classifier`` on each training subset and record its metrics.

    Parameters
    ----------
    X_train, y_train : parallel iterables of feature sets and label sets;
        they are walked in lockstep and the classifier is refit on each pair.
    classifier : estimator exposing ``fit`` and ``score``, compatible with
        ``count_tree_depths`` and ``count_nodes``.
    accuracies, tree_depths, node_counts : lists extended in place with one
        value per training subset.
    X_test, y_test : held-out data passed to ``classifier.score`` after every fit.

    Returns
    -------
    None; results are delivered through the three output lists.
    """
    for subset_features, subset_labels in zip(X_train, y_train):
        classifier.fit(subset_features, subset_labels)

        test_accuracy = classifier.score(X_test, y_test)
        accuracies.append(test_accuracy)

        mean_depth = count_tree_depths(classifier)
        tree_depths.append(mean_depth)

        mean_nodes = count_nodes(classifier)
        node_counts.append(mean_nodes)

In [10]:
# Random forest: fit on every training subset and collect its metrics.
train_classifier(
    X_train=X_train_cases,
    y_train=y_train_cases,
    classifier=rfc,
    accuracies=rfc_accuracies,
    tree_depths=rfc_tree_depth,
    node_counts=rfc_node_count,
    X_test=X_test,
    y_test=y_test,
)

# Extremely randomized trees: same subsets, separate result lists.
train_classifier(
    X_train=X_train_cases,
    y_train=y_train_cases,
    classifier=etc,
    accuracies=etc_accuracies,
    tree_depths=etc_tree_depth,
    node_counts=etc_node_count,
    X_test=X_test,
    y_test=y_test,
)

In [11]:
# Direct Bokeh output to the notebook so the show() calls below render inline.
output_notebook()

In [12]:
# Test-set accuracy of both ensembles as a function of training-subset size.
p = figure(title="Letter dataset", x_axis_label="Train size", y_axis_label="Accuracy")
# `legend_label` replaces the glyph keyword `legend`, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
p.line(train_sizes, rfc_accuracies, legend_label="Random Forest", line_width=2, color="blue")
p.line(train_sizes, etc_accuracies, legend_label="Extremely Randomized Trees", line_width=2, color="red")
show(p)

In [13]:
# Average number of nodes per tree for both ensembles versus training-subset size.
q = figure(title="Letter dataset", x_axis_label="Train size", y_axis_label="Average # of nodes")
# `legend_label` replaces the glyph keyword `legend`, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
q.line(train_sizes, rfc_node_count, legend_label="Random Forest", line_width=2, color="blue")
q.line(train_sizes, etc_node_count, legend_label="Extremely Randomized Trees", line_width=2, color="red")
show(q)

In [14]:
# Average maximum tree depth for both ensembles versus training-subset size.
r = figure(title="Letter dataset", x_axis_label="Train size", y_axis_label="Average tree depth")
# `legend_label` replaces the glyph keyword `legend`, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
r.line(train_sizes, rfc_tree_depth, legend_label="Random Forest", line_width=2, color="blue")
r.line(train_sizes, etc_tree_depth, legend_label="Extremely Randomized Trees", line_width=2, color="red")
show(r)