In [None]:
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [None]:
def load_dataset_from_files(training, testing):
    """Load a train/test pair of svmlight-format files.

    Parameters
    ----------
    training : str
        Path of the svmlight/libsvm file holding the training split.
    testing : str
        Path of the svmlight/libsvm file holding the test split.

    Returns
    -------
    dict
        Mapping with the keys 'X_train', 'y_train', 'X_test' and 'y_test'.
    """
    # Imported locally on purpose: this notebook rebinds the global name
    # `datasets` to the dict returned by this function, so reaching the
    # loader through the module-level `datasets` would fail on any later
    # call (for example when the cell is re-run).
    from sklearn.datasets import load_svmlight_files

    # Loading both files in one call keeps the number of feature columns
    # identical for the two splits, which loading them one by one does not
    # guarantee.
    X_train, y_train, X_test, y_test = load_svmlight_files([training, testing])
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

# NOTE: this rebinds the name `datasets` (imported from sklearn at the top of
# the notebook) to the dict of loaded splits; later cells index this dict.
datasets = load_dataset_from_files(
    training='datasets/letter.scale.tr',
    testing='datasets/letter.scale.t',
)

In [None]:
# The two ensembles being compared, configured identically: 100 trees,
# entropy split criterion, n_jobs=-1 for parallel fitting.
# A fixed random_state makes the accuracy / node-count curves reproducible
# across kernel restarts; the seed value itself is arbitrary.
forest = RFC(n_estimators=100, criterion="entropy", n_jobs=-1, random_state=42)
trees = ETC(n_estimators=100, criterion="entropy", n_jobs=-1, random_state=42)

In [None]:
# Training-set sizes to evaluate: 1000 through 9000 in steps of 1000, then 10500.
train_sizes = list(range(1000, 10000, 1000)) + [10500]

In [None]:
# One training subset per entry of train_sizes: the first `n_rows` samples
# of the training split and the matching labels.
X_train_cases = [datasets['X_train'][:n_rows] for n_rows in train_sizes]
y_train_cases = [datasets['y_train'][:n_rows] for n_rows in train_sizes]

In [None]:
# Result accumulators filled by the training loop, one value per training
# size: test accuracy and average tree size for each ensemble.
accuracies_forest, accuracies_trees = [], []
avg_nodes_count_forest, avg_nodes_count_trees = [], []

In [None]:
def count_avg_nodes(classifier):
    """Return the mean number of tree nodes per estimator of an ensemble.

    Parameters
    ----------
    classifier : object
        A fitted ensemble exposing ``estimators_``, a sequence of estimators
        that each expose ``tree_.node_count``.

    Returns
    -------
    float
        Sum of ``tree_.node_count`` over all estimators divided by the
        number of estimators.

    Raises
    ------
    ValueError
        If ``classifier.estimators_`` is empty.
    """
    estimators = classifier.estimators_
    # len() rather than truthiness so array-like containers work as well.
    if len(estimators) == 0:
        raise ValueError("classifier has no fitted estimators to average over")
    # Accumulate over *all* estimators; the counters must not be reset per
    # iteration, otherwise only the last tree would be reported.
    nodes_total = sum(estimator.tree_.node_count for estimator in estimators)
    return nodes_total / len(estimators)

In [None]:
# Held-out split used for scoring. It is read from the `datasets` dict
# because the cells above bind X_test / y_test only as locals inside
# load_dataset_from_files, never at module level.
X_test, y_test = datasets['X_test'], datasets['y_test']

# Empty the accumulators in place so that re-running this cell does not
# append a second set of points (which would no longer line up with
# train_sizes when plotted).
for results in (accuracies_forest, accuracies_trees,
                avg_nodes_count_forest, avg_nodes_count_trees):
    del results[:]

# Refit both ensembles on each training subset, then record test accuracy
# and average tree size for that subset.
for X, y in zip(X_train_cases, y_train_cases):
    forest.fit(X, y)
    trees.fit(X, y)
    accuracies_forest.append(forest.score(X_test, y_test))
    accuracies_trees.append(trees.score(X_test, y_test))
    avg_nodes_count_forest.append(count_avg_nodes(forest))
    avg_nodes_count_trees.append(count_avg_nodes(trees))

In [None]:
# Configure Bokeh so that the show() calls in the following cells render
# inline in the notebook.
output_notebook()

In [None]:
# Test accuracy as a function of training-set size, one line per ensemble.
# NOTE(review): `legend=` is the older Bokeh glyph keyword; recent Bokeh
# releases expect `legend_label=` - confirm against the installed version.
p = figure(
    title="Letter dataset",
    x_axis_label="Train size",
    y_axis_label="Accuracy",
)
for series_values, series_label, series_color in (
    (accuracies_forest, "Random Forest", "blue"),
    (accuracies_trees, "Extremely Randomized Trees", "red"),
):
    p.line(train_sizes, series_values, legend=series_label, line_width=2, color=series_color)
show(p)

In [None]:
# Average number of nodes per tree as a function of training-set size, one
# line per ensemble.
# NOTE(review): `legend=` is the older Bokeh glyph keyword; recent Bokeh
# releases expect `legend_label=` - confirm against the installed version.
q = figure(
    title="Letter dataset",
    x_axis_label="Train size",
    y_axis_label="Average # of nodes",
)
for series_values, series_label, series_color in (
    (avg_nodes_count_forest, "Random Forest", "blue"),
    (avg_nodes_count_trees, "Extremely Randomized Trees", "red"),
):
    q.line(train_sizes, series_values, legend=series_label, line_width=2, color=series_color)
show(q)