In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [2]:
# Load the letter train/test splits from svmlight-format files.
training_data = datasets.load_svmlight_file('datasets/letter.scale.tr')
X_train, y_train = training_data
# Pin the test matrix to the training feature count. Without n_features the
# width is inferred per file from the highest feature index that appears in
# it, so the two matrices could end up with different numbers of columns and
# scoring on X_test would fail with a shape mismatch.
testing_data = datasets.load_svmlight_file(
    'datasets/letter.scale.t', n_features=X_train.shape[1]
)
X_test, y_test = testing_data

In [3]:
# Fixed seed so both ensembles, and therefore the accuracies reported below,
# are reproducible when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

# Both ensembles share the same size and split criterion so that the
# comparison only varies the tree-building strategy.
forest = RFC(n_estimators=100, criterion="entropy", n_jobs=-1,
             random_state=RANDOM_STATE)
trees = ETC(n_estimators=100, criterion="entropy", n_jobs=-1,
            random_state=RANDOM_STATE)

In [4]:
# Training-set sizes to evaluate: 1000 through 9000 in steps of 1000,
# followed by a final case of 10500 rows.
train_sizes = list(range(1000, 10000, 1000)) + [10500]

In [5]:
# One training subset per entry of train_sizes: the first n_rows rows of the
# training matrix and the matching labels, in the same order as train_sizes.
X_train_cases = [X_train[:n_rows] for n_rows in train_sizes]
y_train_cases = [y_train[:n_rows] for n_rows in train_sizes]

In [6]:
# Accumulators for the test accuracy of each classifier; one value is
# appended per training subset by the fitting loop.
accuracies_forest, accuracies_trees = [], []

In [7]:
# Start from empty result lists in this cell so that re-running it does not
# append a second set of scores to results left over from a previous run
# (which would also break the length match with train_sizes when plotting).
accuracies_forest = []
accuracies_trees = []

# Fit both classifiers on each training subset and record their accuracy on
# the held-out test set. The estimators are reused across iterations; with
# the default warm_start=False every fit() call builds a new ensemble.
for X, y in zip(X_train_cases, y_train_cases):
    forest.fit(X, y)
    trees.fit(X, y)
    accuracies_forest.append(forest.score(X_test, y_test))
    accuracies_trees.append(trees.score(X_test, y_test))

In [8]:
# Test-set accuracy of the random forest, one entry per training subset
accuracies_forest

[0.79700000000000004,
 0.85740000000000005,
 0.89059999999999995,
 0.90659999999999996,
 0.91879999999999995,
 0.92359999999999998,
 0.93640000000000001,
 0.93979999999999997,
 0.94699999999999995,
 0.94940000000000002]

In [9]:
# Test-set accuracy of the extra-trees classifier, one entry per training subset
accuracies_trees

[0.81779999999999997,
 0.88300000000000001,
 0.90259999999999996,
 0.92159999999999997,
 0.93100000000000005,
 0.94320000000000004,
 0.94679999999999997,
 0.9516,
 0.95720000000000005,
 0.96360000000000001]

In [10]:
# Configure Bokeh so that show() renders figures inline in the notebook
output_notebook()

In [11]:
# Learning curves: test accuracy of each classifier against training-set size.
p = figure(title="Letter dataset", x_axis_label="Train size", y_axis_label="Accuracy")

# (legend text, accuracy series, line colour) for each classifier
curves = [
    ("Random Forest", accuracies_forest, "blue"),
    ("Extremely Randomized Trees", accuracies_trees, "red"),
]
for label, accuracies, color in curves:
    # `legend_label` replaces the `legend` keyword, which was deprecated in
    # Bokeh 1.4 and is no longer accepted by current releases.
    p.line(train_sizes, accuracies, legend_label=label, line_width=2, color=color)

# The curves rise towards the top-right corner, where the legend sits by
# default; move it out of the way of the data.
p.legend.location = "bottom_right"
show(p)