## KNN Model

In [None]:
# ONLY USE THIS BLOCK OF CODE ON NICASIA'S COMPUTER 
import sys
sys.path.append("/anaconda/lib/python2.7/site-packages")
#####

import pandas as pd
import numpy as np
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn import cross_validation
from sklearn import covariance
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor as KNN
import string
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import math
import string
import os
import datetime
from sklearn.decomposition import PCA

In [None]:
# Load the imputed training set and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="train_imputed.csv")
train_data.head(n=5)

In [None]:
# Separate the regression target from the predictor columns.
y_train = train_data['zestimate_amount']
x_train = train_data.drop(['zestimate_amount'], axis=1)

In [None]:
def k_fold_knn(x, y, n_neighbors, num_folds, random_state=None):
    """Estimate a KNN regressor's R^2 with shuffled k-fold cross-validation.

    Parameters
    ----------
    x : pandas object supporting ``.iloc`` / ``.values``
        Predictor matrix; rows are selected positionally per fold.
    y : pandas object supporting ``.iloc`` / ``.values``
        Target values, aligned row-for-row with ``x``.
    n_neighbors : int
        Number of neighbors passed to the KNN regressor.
    num_folds : int
        Number of cross-validation folds.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KFold`` to control how rows are shuffled into folds.
        The default (None) keeps the original behaviour: a different random
        split on every call. Pass a fixed integer to get reproducible folds
        and to compare different ``n_neighbors`` values on identical splits.

    Returns
    -------
    float
        The mean of the per-fold R^2 scores.
    """
    kfold = cross_validation.KFold(len(x), num_folds, shuffle=True,
                                   random_state=random_state)

    fold_scores = []
    for train, test in kfold:
        # Materialise each fold as plain numpy arrays.
        x_train_cv = x.iloc[train].values
        y_train_cv = y.iloc[train].values
        x_test_cv = x.iloc[test].values
        y_test_cv = y.iloc[test].values

        # Fit the KNN regressor on the training fold and record its R^2
        # on the held-out fold.
        knn_model = KNN(n_neighbors=n_neighbors)
        knn_model.fit(x_train_cv, y_train_cv)
        fold_scores.append(knn_model.score(x_test_cv, y_test_cv))

    # Average R^2 across folds (the 1.0 factor forces float division on Python 2).
    return sum(fold_scores) * 1.0 / num_folds

#### Cross validation on KNN models with different numbers of neighbors

In [None]:
for i in range(2, 20): 
    rsquared = k_fold_knn (x_train, y_train, i, 3)
    print "R-squared for k=" + str(i) + " neighbors: " + str(rsquared)

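The number of neighbors that resulted in the highest cross-validated R-squared was 18, so this will be our final model. (The cross-validation folds are shuffled without a fixed seed, so the best value can change from run to run. Note that the cell below currently sets `n_neighbors = 16`, which should be reconciled with this result.)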

#### Final model


In [None]:
# NOTE(review): the markdown above reports 18 as the best neighbor count, but
# this cell uses 16 -- confirm which value is intended.
final_knn_model = KNN(n_neighbors=16).fit(x_train.values, y_train.values)
final_knn_model

#### Performance of final model on test data

In [None]:
# Load the imputed held-out test set.
test_data = pd.read_csv(filepath_or_buffer="test_imputed.csv")

In [None]:
# Separate the regression target from the predictor columns of the test set.
y_test = test_data['zestimate_amount']
x_test = test_data.drop(['zestimate_amount'], axis=1)

In [None]:
# Score of the fitted KNN model on the held-out test set.
final_knn_model.score(X=x_test.values, y=y_test.values)

### ADDING PCA TO REDUCE FEATURE SPACE:


In [None]:
def k_fold_knn_with_pca(pca_dims, neighbors, num_folds, x, y, random_state=None):
    """Estimate the R^2 of a PCA + KNN regressor with shuffled k-fold CV.

    Within each fold the PCA projection is fitted on the training rows only
    and then applied to both the training and the held-out rows, so the
    held-out rows never influence the projection.

    Parameters
    ----------
    pca_dims : int
        Number of principal components to keep (``PCA(n_components=...)``).
    neighbors : int
        Number of neighbors passed to the KNN regressor.
    num_folds : int
        Number of cross-validation folds.
    x : pandas object supporting ``.iloc`` / ``.values``
        Predictor matrix; rows are selected positionally per fold.
    y : pandas object supporting ``.iloc`` / ``.values``
        Target values, aligned row-for-row with ``x``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KFold`` to control how rows are shuffled into folds.
        The default (None) keeps the original behaviour: a different random
        split on every call. Pass a fixed integer to get reproducible folds
        and to compare hyperparameter settings on identical splits.

    Returns
    -------
    float
        The mean of the per-fold R^2 scores.
    """
    kfold = cross_validation.KFold(len(x), num_folds, shuffle=True,
                                   random_state=random_state)

    fold_scores = []
    for train, test in kfold:
        # Materialise each fold as plain numpy arrays.
        x_train_cv = x.iloc[train].values
        y_train_cv = y.iloc[train].values
        x_test_cv = x.iloc[test].values
        y_test_cv = y.iloc[test].values

        # Learn the principal axes from the training fold only, then project
        # both the training and the held-out rows onto them.
        pca = PCA(n_components=pca_dims)
        pca.fit(x_train_cv)
        x_train_reduced = pca.transform(x_train_cv)
        x_test_reduced = pca.transform(x_test_cv)

        # Fit the KNN regressor in the reduced space and record its R^2
        # on the held-out fold.
        knn_model = KNN(n_neighbors=neighbors)
        knn_model.fit(x_train_reduced, y_train_cv)
        fold_scores.append(knn_model.score(x_test_reduced, y_test_cv))

    # Average R^2 across folds (the 1.0 factor forces float division on Python 2).
    return sum(fold_scores) * 1.0 / num_folds

In [None]:
pca_dims = range(5,15)
knn_neighbors = range(20,50, 2)

best_r2 = 0
best_pca_dim = 0
best_knn_k = 0
for pca_dim in pca_dims:
    for knn_neighbor in knn_neighbors:
        current_r2 = k_fold_knn_with_pca(pca_dim, knn_neighbor, 4, x_train, y_train)
        # print "PCA Dimensions: ", pca_dim, "\tKNN # Neighbors: ", knn_neighbor, "\tR^2: ", current_r2
        if current_r2 > best_r2:
            best_r2 = current_r2
            best_pca_dim = pca_dim
            best_knn_k = knn_neighbor
            
print "BEST HYPERPARAMETERS FOR KNN WITH WITH PCA:"
print "PCA Dimensions: ", best_pca_dim
print "KNN # Neighbors: ", best_knn_k
print "Resulting R^2:", best_r2
            

Final model: PCA projection followed by KNN, fitted on the training data and scored on the test data:

In [None]:
# Learn the principal axes from the training features only, then project both
# the training and the test features onto them.
# NOTE(review): 12 components and 30 neighbors are hard-coded here; presumably
# they are the values printed by the grid search -- confirm.
pca = PCA(n_components=12).fit(x_train)
x_train_reduced = pca.transform(x_train)
x_test_reduced = pca.transform(x_test)

# This rebinds final_knn_model, replacing the KNN model fitted without PCA.
final_knn_model = KNN(n_neighbors=30).fit(x_train_reduced, y_train.values)

# Score of the PCA + KNN model on the held-out test set.
final_knn_model.score(x_test_reduced, y_test.values)