In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
import pandas as pd
import numpy as np
#from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn import covariance
from sklearn import ensemble
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.decomposition import PCA 
import string
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import math
import string
import os
import datetime

In [3]:
## Loading Data 
test_data = pd.read_csv("data/test_imputed.csv")
train_data = pd.read_csv("data/train_imputed.csv")

In [4]:
## Separating our predictor and response variables 
x_train = train_data.drop(['zestimate_amount'],1)
y_train = train_data['zestimate_amount']
x_test = test_data.drop(['zestimate_amount'],1)
y_test = test_data['zestimate_amount']

## Random Forest Model

It is likely that at least one of our features will not be linearly related to the house price estimate. A linear regression model would not be able to effectively represent those relationships. Tree based models, however, can approximate non-linear relationships. Our implementation using random forests seeks to detect the most significant features 

## Parameter Optimization  
We tried: PCA, number of trees, depths 

In [8]:
pca = PCA(n_components=20)
pca.fit(x_train)
x_train_reduced = pca.transform(x_train)

In [13]:
# Parameters for tuning
n_trees = np.arange(10, 100, 20)  # Trees and depth are explored on an exponentially growing space,
depths = np.arange(10, 50, 10)   # since it is assumed that trees and depth will add accuracy in a decaying fashion.

In [14]:
# To keep track of the best model
best_score = 0
# Run grid search for model with 5-fold cross validation
#print '5-fold cross validation:'
for trees in n_trees:
    for depth in depths:
        # Cross validation for every experiment
        k_folds = cross_validation.KFold(x_train.shape[0], n_folds=5, shuffle=True)
        scores = []
        for train_indices, validation_indices in k_folds:
            # Generate training data
            x_train_cv = x_train.iloc[train_indices].values
            y_train_cv = y_train[train_indices].values
            # Generate validation data
            x_validate = x_train.iloc[validation_indices].values
            y_validate = y_train[validation_indices].values

            # Fit random forest on training data
            model = ensemble.RandomForestRegressor(n_estimators=trees, max_depth=depth)
            #model.fit(x_train_reduced, y_train_cv)
            model.fit(x_train_cv, y_train_cv)
            # Score on validation data
            #scores += [model.score(x_validate_reduced, y_validate)]
            scores += [model.score(x_validate, y_validate)]

            # Record and report accuracy
            average_score = np.mean(scores)
            #print "Trees:", trees, "Depth:", depth, "Score:", average_score

            # Update our record of the best parameters see so far
            if average_score > best_score:
                best_score = average_score
                best_trees = trees
                best_depth = depth


## Best Model 

In [28]:
# Fit model on entire train set using chosen number of trees and depth
model = ensemble.RandomForestRegressor(n_estimators=best_trees, max_depth=best_depth)
model.fit(x_train, y_train)
print 'Chosen number of trees, depth:', best_trees, ',', best_depth
print 'Test accuracy:', model.score(x_test, y_test)

Chosen number of trees, depth: 50 , 30
Chosen number of trees, depth: 70 , 30
Test accuracy: 0.554358358253


## Most Significant Features

In [29]:
print "Top 10 Features sorted by their Score:"
#print sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), x_train.columns), 
#             reverse=True)
sorted(zip(map(lambda x: round(x, 4), model.feature_importances_[0:10]), x_train.columns), reverse = True)

Top 10 Features sorted by their Score:


[(0.3375, 'home_size'),
 (0.0745, 'latitude'),
 (0.0704, 'property_size'),
 (0.0603, 'longitude'),
 (0.0514, 'bathrooms'),
 (0.0326, 'bedrooms'),
 (0.0309, 'year_built'),
 (0.003, 'tax_year'),
 (0.0008, 'schools_Mean ELA Score'),
 (0.0007, 'schools_Number Tested')]

In [30]:
## To fit model on entire data set 
x = pd.concat([x_train, x_test])
y = pd.concat([y_train, y_test]) 

In [31]:
#model = ensemble.RandomForestRegressor(n_estimators=best_trees, max_depth=35)
model = ensemble.RandomForestRegressor(n_estimators=70, max_depth=30)
model.fit(x, y)
#predictions = model.predict(df_to_predict)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=70, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [32]:
## Saving model to pickle file 
#import pickle
#s = pickle.dumps(model)
#pickle.dump(model, open("model.p", "wb"))

In [15]:
#model2 = pickle.load(open("model.p","rb"))