## KNN Model

In [10]:
import pandas as pd
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn import covariance
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor as KNN
import string
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import math
import string
import os
import datetime

#### Train data

In [2]:
# Read the training CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index being read back as data; confirm whether it is meant to be
# used as a feature (the test CSV would need the same treatment).
train_data = pd.read_csv("train_imputed.csv")
train_data.head()

Unnamed: 0,tax_value,property_size,zestimate_amount,bathrooms,tax_year,latitude,bedrooms,year_built,home_size,longitude,...,precinct_string_26.0,precinct_string_28.0,precinct_string_30.0,precinct_string_32.0,precinct_string_33.0,precinct_string_34.0,precinct_string_5.0,precinct_string_6.0,precinct_string_7.0,precinct_string_9.0
0,2290000.0,2000.0,849427.0,1.0,2015.0,40.724448,1.0,1910.0,400.0,-73.980284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1674000.0,2250.0,7332356.0,1.0,2015.0,40.724385,2.0,1900.0,10800.0,-73.98012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2290000.0,2000.0,1991289.0,1.0,2015.0,40.724332,4.0,1910.0,3374.0,-73.980007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2837000.0,650.0,1225236.0,1.0,2015.0,40.724225,1.0,1876.0,650.0,-73.97972,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,6701000.0,2250.0,12785150.0,1.0,2015.0,40.724125,2.0,1901.0,4168.0,-73.979495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Split the training frame into features (every column except the target)
# and the target column itself.
# `axis` is passed by keyword: newer pandas releases no longer accept it
# positionally in DataFrame.drop, while the keyword form works everywhere.
x_train = train_data.drop(['zestimate_amount'], axis=1)
y_train = train_data['zestimate_amount']

In [4]:
def k_fold_knn(x, y, n_neighbors, num_folds, random_state=None):
    """Cross-validate a KNN regressor and return its mean per-fold score.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, aligned row-for-row with ``x``.
    n_neighbors : int
        Number of neighbors given to the KNN regressor.
    num_folds : int
        Number of cross-validation folds.
    random_state : int or None, optional
        Seed forwarded to the shuffled ``KFold`` split. The default (None)
        keeps the original behavior of a different shuffle on every call;
        pass an integer to make the folds, and therefore the returned score,
        reproducible.

    Returns
    -------
    float
        Sum of the per-fold ``score`` values (used as R-squared by the
        callers) divided by ``num_folds``.
    """
    kfold = cross_validation.KFold(len(x), num_folds, shuffle=True,
                                   random_state=random_state)

    fold_scores = []

    # Iterate over folds; `train` and `test` are positional row indices.
    for train, test in kfold:
        x_train_cv = x.iloc[train].values
        y_train_cv = y.iloc[train].values
        x_test_cv = x.iloc[test].values
        y_test_cv = y.iloc[test].values

        # Fit a KNN regressor on the training folds and score the held-out fold
        knn_model = KNN(n_neighbors=n_neighbors)
        knn_model.fit(x_train_cv, y_train_cv)
        fold_scores.append(knn_model.score(x_test_cv, y_test_cv))

    # Average score across folds (the 1.0 forces float division on Python 2)
    return sum(fold_scores) * 1.0 / num_folds

#### Cross validation on KNN models with different numbers of neighbors

In [5]:
# Evaluate k = 2..19 with 3-fold cross-validation and report each mean score.
for i in range(2, 20):
    rsquared = k_fold_knn(x_train, y_train, i, 3)
    # A parenthesized single-argument print produces the same output on
    # Python 2 and is also valid syntax on Python 3.
    print("R-squared for k=" + str(i) + " neighbors: " + str(rsquared))

R-squared for k=2 neighbors: -0.139457140202
R-squared for k=3 neighbors: -0.0519592013019
R-squared for k=4 neighbors: 0.0728092888566
R-squared for k=5 neighbors: 0.131940395986
R-squared for k=6 neighbors: 0.121840242375
R-squared for k=7 neighbors: 0.185382664388
R-squared for k=8 neighbors: 0.179128618436
R-squared for k=9 neighbors: 0.169990798949
R-squared for k=10 neighbors: 0.206888224988
R-squared for k=11 neighbors: 0.221261058311
R-squared for k=12 neighbors: 0.200937665989
R-squared for k=13 neighbors: 0.216549541963
R-squared for k=14 neighbors: 0.224713162052
R-squared for k=15 neighbors: 0.260843313602
R-squared for k=16 neighbors: 0.226978146675
R-squared for k=17 neighbors: 0.222088904868
R-squared for k=18 neighbors: 0.216274333501
R-squared for k=19 neighbors: 0.23181202076


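In the run shown above, the highest cross-validated R-squared (about 0.26) was obtained with 15 neighbors; the scores for k from 10 to 19 all fall between roughly 0.20 and 0.26. Because the folds are shuffled randomly, the best k varies from run to run. The final model below uses 16 neighbors.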

#### Final model

In [6]:
# Build the 16-neighbor regressor and train it on the full training set in
# one expression; fit() hands back the estimator, so the name is bound to
# the fitted model.
final_knn_model = KNN(n_neighbors=16).fit(x_train.values, y_train.values)
# Leave the fitted estimator as the cell's last expression so its repr shows.
final_knn_model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=16, p=2,
          weights='uniform')

#### Performance of final model on test data

In [7]:
# Read the test CSV into a DataFrame; it is split into features and target
# in the next cell the same way as the training data.
test_data = pd.read_csv("test_imputed.csv")

In [8]:
# Split the test frame into features (every column except the target) and
# the target column, mirroring the training split.
# `axis` is passed by keyword: newer pandas releases no longer accept it
# positionally in DataFrame.drop, while the keyword form works everywhere.
x_test = test_data.drop(['zestimate_amount'], axis=1)
y_test = test_data['zestimate_amount']

In [9]:
# Score the fitted final model on the test features and targets; the notebook
# treats this score as R-squared, as in the cross-validation helper.
final_knn_model.score(x_test.values, y_test.values)

0.25336598409790168