In [19]:
from IPython.display import HTML

HTML('''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

## KNN Model

In [34]:
import pandas as pd
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn import covariance
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor as KNN
import string
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import math
import string
import os
import datetime
from sklearn.decomposition import PCA

#### Import data

In [43]:
train_data = pd.read_csv("train_imputed.csv")
train_data.head()

Unnamed: 0,tax_value,property_size,zestimate_amount,bathrooms,tax_year,latitude,bedrooms,year_built,home_size,longitude,...,neighborhood_Morningside Heights,neighborhood_Murray Hill,neighborhood_North Sutton Area,neighborhood_Soho,neighborhood_Tribeca,neighborhood_Upper East Side,neighborhood_Upper West Side,neighborhood_Washington Heights,neighborhood_West Village,neighborhood_Yorkville
0,2331000.0,2000.0,849427.0,1.0,2015.0,40.724448,1.0,1910.0,400.0,-73.980284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1642000.0,2250.0,3769688.0,2.0,2015.0,40.72442,3.0,1900.0,10800.0,-73.9802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1674000.0,2250.0,7332356.0,1.0,2015.0,40.724385,2.0,1900.0,10800.0,-73.98012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2331000.0,2000.0,1991289.0,1.0,2015.0,40.724332,4.0,1910.0,3328.0,-73.980007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2331000.0,2000.0,3769688.0,1.0,2015.0,40.724845,2.0,1910.0,3328.0,-73.980804,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After importing the data, we split it up into x and y, with "zestimate_amount" as our y-value and everything else as our x.  We then created a function to fit and evaluate a KNN model within cross-validation.

In [44]:
x_train = train_data.drop(['zestimate_amount', 'tax_value'],1)
y_train = train_data['zestimate_amount']

In [45]:
def k_fold_knn(x, y, n_neighbors, num_folds):
    
    kfold = cross_validation.KFold(len(x), num_folds, shuffle = True)

    # Iterate over folds
    cv_rsquared = 0
    
    for train, test in kfold:
        x_train_cv = x.iloc[train].values
        y_train_cv = y.iloc[train].values
        x_test_cv = x.iloc[test].values
        y_test_cv = y.iloc[test].values
        
        # Fit linear regression model and evaluate
        knn_model = KNN(n_neighbors = n_neighbors)
        knn_model.fit(x_train_cv, y_train_cv)
        rsquared = knn_model.score(x_test_cv, y_test_cv)
    
        # Cummulative accuracy across folds
        cv_rsquared += rsquared

    # Return average accuracy across folds
    return cv_rsquared * 1.0 / num_folds

#### Full dataset - cross validation on KNN models with different numbers of neighbors

In [46]:
for i in range(2, 30): 
    rsquared = k_fold_knn (x_train, y_train, i, 3)
    print "R-squared for k=" + str(i) + " neighbors: " + str(rsquared)

R-squared for k=2 neighbors: -0.182994502508
R-squared for k=3 neighbors: -0.0230282455552
R-squared for k=4 neighbors: 0.121640672374
R-squared for k=5 neighbors: 0.165927482577
R-squared for k=6 neighbors: 0.169773025484
R-squared for k=7 neighbors: 0.192667973845
R-squared for k=8 neighbors: 0.220719222771
R-squared for k=9 neighbors: 0.233594383476
R-squared for k=10 neighbors: 0.230564589895
R-squared for k=11 neighbors: 0.247367657995
R-squared for k=12 neighbors: 0.260496278806
R-squared for k=13 neighbors: 0.23703399939
R-squared for k=14 neighbors: 0.250350537406
R-squared for k=15 neighbors: 0.251118738487
R-squared for k=16 neighbors: 0.274174241377
R-squared for k=17 neighbors: 0.268559089408
R-squared for k=18 neighbors: 0.296962739348
R-squared for k=19 neighbors: 0.265635719198
R-squared for k=20 neighbors: 0.258332451695
R-squared for k=21 neighbors: 0.270276226284
R-squared for k=22 neighbors: 0.272154271115
R-squared for k=23 neighbors: 0.259284386248
R-squared for k=

The number of neighbors that resulted in the highest cross-validated r-squared was 18, so this appears to be the best value for k.

#### Final KNN model for full dataset


In [47]:
final_knn_model = KNN(n_neighbors = 16)
final_knn_model.fit(x_train.values, y_train.values)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=16, p=2,
          weights='uniform')

#### Performance of final model on test data

In [48]:
test_data = pd.read_csv("test_imputed.csv")

In [51]:
x_test = test_data.drop(['zestimate_amount', 'tax_value'],1)
y_test = test_data['zestimate_amount']

In [52]:
score = final_knn_model.score(x_test.values, y_test.values)
print "Score on test set: " + str(score)

Score on test set: 0.324316012934


We then wanted to see if KNN would work better on a smaller number of predictors.  We decided to run the model again using only the predictors that we had chosen as being highly correlated with our y-value during the data cleaning stage, and also using PCA to reduce the number of dimensions.

#### Subset of predictors and PCA - cross validation on KNN models with different numbers of neighbors and different numbers of PCA components

In [53]:
train_best_features = pd.read_csv("train_best_features.csv")
test_best_features = pd.read_csv("test_best_features.csv")

In [54]:
x_train = train_best_features.drop(['zestimate_amount'],1)
y_train = train_best_features['zestimate_amount']

In [55]:
x_test = test_best_features.drop(['zestimate_amount'],1)
y_test = test_best_features['zestimate_amount']

In [56]:
def k_fold_knn_with_pca(pca_dims, neighbors, num_folds, x, y):
    
    kfold = cross_validation.KFold(len(x), num_folds, shuffle = True)

    # Iterate over folds
    cv_rsquared = 0
    
    for train, test in kfold:
        x_train_cv = x.iloc[train].values
        y_train_cv = y.iloc[train].values
        x_test_cv = x.iloc[test].values
        y_test_cv = y.iloc[test].values
        
        #Project to the data onto axes
        pca = PCA(n_components=pca_dims)
        pca.fit(x_train_cv)
        x_train_reduced = pca.transform(x_train_cv)
        x_test_reduced = pca.transform(x_test_cv)

        
        # Fit linear regression model and evaluate
        knn_model = KNN(n_neighbors = neighbors)
        knn_model.fit(x_train_reduced, y_train_cv)
        rsquared = knn_model.score(x_test_reduced, y_test_cv)
    
        # Cummulative accuracy across folds
        cv_rsquared += rsquared

    # Return average accuracy across folds
    return cv_rsquared * 1.0 / num_folds

In [57]:
pca_dims = range(5,15)
knn_neighbors = range(20,80,2)

best_r2 = 0
best_pca_dim = 0
best_knn_k = 0
for pca_dim in pca_dims:
    for knn_neighbor in knn_neighbors:
        current_r2 = k_fold_knn_with_pca(pca_dim, knn_neighbor, 4, x_train, y_train)
        print "PCA Dimensions: ", pca_dim, "\tKNN # Neighbors: ", knn_neighbor, "\tR^2: ", current_r2
        if current_r2 > best_r2:
            best_r2 = current_r2
            best_pca_dim = pca_dim
            best_knn_k = knn_neighbor
            
print "BEST HYPERPARAMETERS FOR KNN WITH WITH PCA:"
print "PCA Dimensions: ", best_pca_dim
print "KNN # Neighbors: ", best_knn_k
print "Resulting R^2:", best_r2
            

PCA Dimensions:  5 	KNN # Neighbors:  20 	R^2:  0.21287124594
PCA Dimensions:  5 	KNN # Neighbors:  22 	R^2:  0.205856231085
PCA Dimensions:  5 	KNN # Neighbors:  24 	R^2:  0.224175088622
PCA Dimensions:  5 	KNN # Neighbors:  26 	R^2:  0.219332967775
PCA Dimensions:  5 	KNN # Neighbors:  28 	R^2:  0.220625627773
PCA Dimensions:  5 	KNN # Neighbors:  30 	R^2:  0.221476817902
PCA Dimensions:  5 	KNN # Neighbors:  32 	R^2:  0.216463675034
PCA Dimensions:  5 	KNN # Neighbors:  34 	R^2:  0.231162636932
PCA Dimensions:  5 	KNN # Neighbors:  36 	R^2:  0.237302708865
PCA Dimensions:  5 	KNN # Neighbors:  38 	R^2:  0.227979953605
PCA Dimensions:  5 	KNN # Neighbors:  40 	R^2:  0.22977781995
PCA Dimensions:  5 	KNN # Neighbors:  42 	R^2:  0.254292295837
PCA Dimensions:  5 	KNN # Neighbors:  44 	R^2:  0.235447501256
PCA Dimensions:  5 	KNN # Neighbors:  46 	R^2:  0.253594903322
PCA Dimensions:  5 	KNN # Neighbors:  48 	R^2:  0.229908132533
PCA Dimensions:  5 	KNN # Neighbors:  50 	R^2:  0.2411617

The number of PCA components/number of neighbors that resulted in the highest cross-validated R-squared was 6 PCA dimensions and 52 neighbors, so these appears to be the optimal values.

#### Final KNN model for reduced number of predictors and PCA

In [58]:
pca = PCA(n_components=6)
pca.fit(x_train)
x_train_reduced = pca.transform(x_train)
x_test_reduced = pca.transform(x_test)


final_knn_model = KNN(n_neighbors = 52)
final_knn_model.fit(x_train_reduced, y_train.values)

final_knn_model.score(x_test_reduced, y_test.values)

0.23823575327221724