In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier as RFC

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [2]:
# Load the training and test splits from their svmlight-format files.
# Both files are read in a single load_svmlight_files call so the two feature
# matrices are guaranteed to have the same number of columns; parsing them
# separately infers the width per file, which can disagree if one file never
# mentions the highest-numbered feature.
X_train, y_train, X_test, y_test = datasets.load_svmlight_files(
    ['letter.scale.tr', 'letter.scale.t'])
# Keep the original (features, labels) pairs available under their old names.
training_data = (X_train, y_train)
testing_data = (X_test, y_test)

In [3]:
# Random forest with 100 trees. A fixed random_state makes the randomised
# tree construction repeatable, so re-running the notebook from a fresh kernel
# reproduces the same scores instead of slightly different ones each time.
forest = RFC(n_estimators=100, random_state=0)

In [4]:
# Training-set sizes to evaluate: 1000 through 9000 in steps of 1000, then 10500.
# NOTE(review): 10500 is presumably the full training set size — confirm.
train_sizes = list(range(1000, 10000, 1000)) + [10500]

In [5]:
# For every requested size, take the first `size` rows of the training
# features and the matching first `size` labels. The two lists are parallel
# to train_sizes (same order, same length).
X_train_cases = [X_train[:size] for size in train_sizes]
y_train_cases = [y_train[:size] for size in train_sizes]

In [6]:
# Start with an empty list that will collect one test score per training subset.
accuracies = list()

In [7]:
# Refit the forest on each progressively larger training subset and record
# its score on the held-out test set.
# The result list is (re)initialised in this cell so that running the cell
# more than once does not append duplicate scores to those of an earlier run,
# which would leave `accuracies` longer than `train_sizes`.
accuracies = []
for X_subset, y_subset in zip(X_train_cases, y_train_cases):
    forest.fit(X_subset, y_subset)
    accuracies.append(forest.score(X_test, y_test))

In [8]:
# Show the recorded test-set scores (appended in the order of the training subsets).
accuracies

[0.7984,
 0.86240000000000006,
 0.89000000000000001,
 0.90500000000000003,
 0.91820000000000002,
 0.92420000000000002,
 0.93679999999999997,
 0.94120000000000004,
 0.94799999999999995,
 0.94979999999999998]

In [9]:
# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

In [10]:
# Learning curve: test-set accuracy as a function of training-set size.
p = figure(title="Letter dataset", x_axis_label="Train size", y_axis_label="Accuracy")
# `legend_label` replaces the `legend` keyword, which Bokeh deprecated in 1.4
# and which current releases no longer accept on glyph methods.
p.line(train_sizes, accuracies, legend_label="Random Forest", line_width=2)
show(p)