In [None]:
import numpy as np
import pickle
import util
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Seed NumPy's global RNG; the same value is reused for the split below.
seed = 229
np.random.seed(seed)

# Read the training data through the project's util helper.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

# Reserve a fifth of the rows as a hold-out set (used by the learning curves).
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Training
If a fitted model has already been saved to `models/rforest.pickle`, skip ahead to `Evaluation` to produce the plots and the prediction output.

In [None]:
# Cross-validated search over tree depth for a 10-tree random forest,
# scored with the project's gini scorer.
# NOTE(review): warm_start=True is set on the base estimator; presumably
# GridSearchCV clones it for every candidate, so confirm it is needed.
random_forests = RandomForestRegressor(
    n_estimators=10,
    warm_start=True,
    random_state=0,
    n_jobs=-1,
)

# Candidate depths run from 1 up to X.shape[1] inclusive.
params_to_try = {'max_depth': range(1, X.shape[1] + 1)}

# NOTE(review): the search is fitted on the full (X, Y), not on the
# X_train/y_train split -- confirm this is intended.
c_validator = GridSearchCV(
    estimator=random_forests,
    param_grid=params_to_try,
    scoring=util.gini_scorer,
)
c_validator.fit(X, Y)

In [None]:
# Persist the fitted grid search to disk. The context managers close each
# file even if pickling raises part-way through.
with open('models/rforest_cv.pickle', 'wb') as rf_cv_out:
    pickle.dump(c_validator, rf_cv_out)

# NOTE(review): this writes the same GridSearchCV object as above, not
# c_validator.best_estimator_ -- confirm that is intended, since the
# Evaluation section loads this file as the model.
with open('models/rforest.pickle', 'wb') as rf_out:
    pickle.dump(c_validator, rf_out)

# Evaluation

In [None]:
# Load the previously saved model. Pickle data is binary, so the file must be
# opened with 'rb'; the context manager closes the handle after loading.
# Only unpickle files produced by this notebook: pickle.load can execute
# arbitrary code from an untrusted file.
with open('models/rforest.pickle', 'rb') as rf_in:
    rf_model = pickle.load(rf_in)

In [None]:
# Get the learning-curve data for the loaded model from the util helper,
# using the train / hold-out split made earlier.
curves = util.learning_curves(rf_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = curves

# Print both gini series, separated by a blank line.
for shown in (traingini, '', testgini):
    print(shown)

In [None]:
# Plot train and test gini against training-set size, then save as EPS.
plt.figure()
for scores, series_label in ((traingini, 'train gini'), (testgini, 'test gini')):
    plt.plot(trainsizes, scores, label=series_label)
plt.title('Random forests')
plt.xlabel('Training set size')
plt.ylabel('Normalized Gini coefficient')
plt.legend()
plt.savefig('figures/learning_curves/rforest.eps', format='eps', dpi=1000)

In [None]:
# Display the open figure(s).
plt.show()

# Output

In [None]:
# Refit the loaded model on the full training data (X, Y) before predicting.
# NOTE(review): what this refit does depends on the type of the pickled
# object (GridSearchCV vs. a warm-started forest) -- confirm it retrains.
rf_model.fit(X, Y)

In [None]:
# Hand the model and the test file to the util prediction helper.
# presumably the third argument is the output CSV path -- confirm in util.
testfile = 'data/test.csv'
util.make_prediction(
    rf_model,
    testfile,
    'predictions/rforest.csv',
)