In [14]:
import pandas as pd
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn import covariance
from sklearn import ensemble
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.decomposition import PCA
import string
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import math
import string
import os
import datetime

In [7]:
train_best_features = pd.read_csv("train_best_features.csv")
test_best_features = pd.read_csv("test_best_features.csv")




In [11]:
x_train = train_best_features.drop(['zestimate_amount'],1)
y_train = train_best_features['zestimate_amount']

In [12]:
x_test = test_best_features.drop(['zestimate_amount'],1)
y_test = test_best_features['zestimate_amount']

In [None]:
# Parameters for tuning
n_trees = np.arange(10, 100, 20)  # Trees and depth are explored on an exponentially growing space,
depths = np.arange(2, 10)   # since it is assumed that trees and depth will add accuracy in a decaying fashion.
pca_dims = range(5,15)

# To keep track of the best model
best_score = 0
# Run grid search for model with 5-fold cross validation
print '5-fold cross validation:'
for trees in n_trees:
    for depth in depths:
        for pca_dim in pca_dims:
            # Cross validation for every experiment
            k_folds = cross_validation.KFold(x_train.shape[0], n_folds=5, shuffle=True)
            scores = []
            for train_indices, validation_indices in k_folds:
                # Generate training data
                x_train_cv = x_train.iloc[train_indices].values
                y_train_cv = y_train[train_indices].values
                # Generate validation data
                x_validate = x_train.iloc[validation_indices].values
                y_validate = y_train[validation_indices].values

                #Project to the data onto axes
                pca = PCA(n_components=pca_dim)
                pca.fit(x_train_cv)
                x_train_reduced = pca.transform(x_train_cv)
                x_validate_reduced = pca.transform(x_validate)

                # Fit random forest on training data
                model = ensemble.RandomForestClassifier(n_estimators=trees, max_depth=depth)
                model.fit(x_train_reduced, y_train_cv)
                # Score on validation data
                scores += [model.score(x_validate_reduced, y_validate)]

            # Record and report accuracy
            average_score = np.mean(scores)
            print "Trees:", trees, "Depth:", depth, "PCA dim: ", pca_dim, "Score:", average_score

            # Update our record of the best parameters see so far
            if average_score > best_score:
                best_score = average_score
                best_trees = trees
                best_depth = depth
            
# Fit model on entire train set using chosen number of trees and depth
model = ensemble.RandomForestRegressor(n_estimators=best_trees, max_depth=best_depth)
model.fit(x_train, y_train)
print 'Chosen number of trees, depth:', best_trees, ',', best_depth
print 'Test accuracy:', model.score(x_test, y_test)


5-fold cross validation:
Trees: 10 Depth: 2 PCA dim:  5 Score: 0.00124246441064
Trees: 10 Depth: 2 PCA dim:  6 Score: 0.00124246441064
Trees: 10 Depth: 2 PCA dim:  7 Score: 0.00124237305632
Trees: 10 Depth: 2 PCA dim:  8 Score: 0.00114690779335
Trees: 10 Depth: 2 PCA dim:  9 Score: 0.00143371467671
