# Housing Prices in King County, WA: K Nearest Neighbors
Goal
- Use K Nearest Neighbors to create a model that predicts the sale price of homes given various attributes about the house

## Obtain Data

In [15]:
# global imports

# sklearn features
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# pandas features
import pandas as pd

# suppress scientific notation: display floats with two decimal places
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# import dataframes
%store -r dfs

# assign dataframes to variables
X_train = dfs[0]
X_val = dfs[1]
X_test = dfs[2]
y_train = dfs[3]
y_val = dfs[4]
y_test = dfs[5] 

# check importing data frames worked
df = [X_train, X_val, X_test, y_train, y_val, y_test]
for d in df:
    print(d.shape)

(3181, 18)
(682, 18)
(682, 18)
(3181, 1)
(682, 1)
(682, 1)


## Train K Nearest Neighbors Model
Model Evaluation
- Use model to make predictions for price given test predictors
- Compute metrics to compare predictions with actual price for test dataset
    - Minimize mean absolute error and mean squared error
        - Mean absolute error: the average absolute difference between the observed price and predicted price
        - Mean squared error: the average squared difference between observed price and predicted price
            - Gives a higher weight than mean absolute error for large errors
            
Steps to Train the K Nearest Neighbors Model
1. Train a base model with default parameters
2. Evaluate base model using validation dataset
3. Train a model with preliminary best parameters chosen through random search
    - Test a wide range of parameter values
        - Choose the set of parameters that minimizes mean squared error
    - Random search is appropriate for preliminary estimates
        - Has a faster runtime than grid search
            - This is because it samples a fixed number of parameter combinations instead of trying all of them
4. Evaluate random search model using validation dataset
5. Train a model with best parameters chosen through grid search
    - Test a narrow range of parameter values
        - Choose the set of parameters that minimizes mean squared error
    - Grid search is appropriate for final estimates
        - Tries all combinations of parameters
6. Evaluate model using validation dataset

In [3]:
# unfitted KNN regressor with default parameters; passed as the estimator to the
# random search and grid search below
knn_model = KNeighborsRegressor()

# function that evaluates the model
def evaluate(model, test_pred, test_resp):
    """Print the mean absolute error and mean squared error of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    test_pred : predictors of the evaluation dataset (validation or test)
    test_resp : observed responses matching ``test_pred``

    Returns
    -------
    None; the two metrics are printed.
    """
    predicted = model.predict(test_pred)
    scorers = (
        ("Mean Absolute Error", mean_absolute_error),
        ("Mean Squared Error", mean_squared_error),
    )
    # compute both metrics before printing anything
    scores = [(label, scorer(test_resp, predicted)) for label, scorer in scorers]
    for label, score in scores:
        print("%s: %s" % (label, score))

In [4]:
# fit a KNN regressor with default parameters on the training split
# (fit returns the estimator itself, so the call can be chained)
base_model = KNeighborsRegressor().fit(X_train, y_train.values.ravel())

# report its errors on the validation split
evaluate(model=base_model, test_pred=X_val, test_resp=y_val.values.ravel())

Mean Absolute Error: 135067.20515441496
Mean Squared Error: 47564433689.56244


In [5]:
# candidate values sampled by the random search
# NOTE(review): 'algorithm' and 'leaf_size' select how neighbors are looked up;
# presumably they affect runtime rather than predictions -- confirm before
# spending search iterations on them
random_grid = dict(
    n_neighbors=[2, 5, 10, 15],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[10, 30, 50, 70],
    p=[1, 2],
)

# try 100 sampled parameter combinations, each scored with 3-fold
# cross-validation on negative mean squared error; the seed fixes the sample
knn_random = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=123,
)
knn_random.fit(X_train, y_train.values.ravel())

# output best parameters from random search
knn_random.best_params_

{'weights': 'uniform',
 'p': 1,
 'n_neighbors': 5,
 'leaf_size': 30,
 'algorithm': 'auto'}

In [6]:
# take the best estimator found by the random search and report its errors
# on the validation split
best_random = knn_random.best_estimator_
evaluate(model=best_random, test_pred=X_val, test_resp=y_val.values.ravel())

Mean Absolute Error: 132503.49680547096
Mean Squared Error: 46947276498.21297


In [7]:
# narrow grid around the random search result: only n_neighbors and leaf_size
# vary, the remaining parameters are held at a single value
params_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform'],
    algorithm=['kd_tree'],
    leaf_size=[20, 25, 30, 35, 40],
    p=[1],
)

# try every combination in the grid, each scored with 3-fold cross-validation
# on negative mean squared error
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=params_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
knn_grid.fit(X_train, y_train.values.ravel())

# output best parameters from grid search
knn_grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 20,
 'n_neighbors': 5,
 'p': 1,
 'weights': 'uniform'}

In [8]:
# take the best estimator found by the grid search and report its errors
# on the validation split
best_grid = knn_grid.best_estimator_
evaluate(model=best_grid, test_pred=X_val, test_resp=y_val.values.ravel())

Mean Absolute Error: 132503.49680547096
Mean Squared Error: 46947276498.21297


## Final K Nearest Neighbors Model
- Train a model with best parameters found from grid search
- Evaluate the model using the test dataset
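    - The model does not appear to be overfitted to the validation dataset
        - The mean absolute error for the validation and test datasets is similar
        - The mean squared error is higher for the test dataset than for the validation dataset, so a few test predictions have large errors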

In [9]:
# check model performance on test dataset
# Build the final model from the grid search result instead of retyping the
# parameters by hand, so this cell cannot drift out of sync with the search.
final_model = KNeighborsRegressor(**knn_grid.best_params_)
final_model.fit(X_train, y_train.values.ravel())
evaluate(final_model, X_test, y_test.values.ravel())

Mean Absolute Error: 131426.58780384227
Mean Squared Error: 58914690993.47741


## Dataframe for Analysis
Steps to Create Dataframe for Test Data
1. Create dataframe knn_test_data with test rows by making a copy of y_test
2. Add columns 'predictions', 'difference', and 'abs_difference' to knn_test_data
3. Inner join data_pandas and knn_test_data
    - knn_test_data now has columns from data_pandas
4. Save the dataframe
    - Dataframe used in *WashingtonHouseSales-KNearestNeighborsAnalysis* program

In [22]:
# create dataframe as an independent copy of y_test, so that adding columns to
# knn_test_data does not also modify y_test
knn_test_data = y_test.copy()
knn_test_data.head()

Unnamed: 0,price
2760,212000.0
1520,681500.0
594,255000.0
2083,636000.0
2564,400000.0


In [23]:
# create predictions dataframe
# Build a fresh frame from y_test rather than adding columns in place and then
# dropping 'price': the cell can now be re-run safely and leaves y_test untouched.
predictions = final_model.predict(X_test)
difference = y_test['price'] - predictions  # observed minus predicted price
knn_test_data = pd.DataFrame(
    {
        'predictions': predictions,
        'difference': difference,
        'abs_difference': difference.abs(),
    },
    index=y_test.index,  # keep the original row labels for the later index join
)
knn_test_data.head()

Unnamed: 0,predictions,difference,abs_difference
2760,364000.0,-152000.0,152000.0
1520,792625.0,-111125.0,111125.0
594,254600.0,400.0,400.0
2083,509598.0,126402.0,126402.0
2564,370100.0,29900.0,29900.0


In [24]:
# restore data_pandas, previously saved with %store
%store -r data_pandas
# preview the first rows
data_pandas.head()

Unnamed: 0,price,bedroom,bathroom,sqftLot,floors,waterfront,view,condition,sqftAbove,sqftBelow,yrBuilt,yrWorked,location
0,313000.0,3.0,1.5,7912,1.5,0,0,3,1340,0,1955,2005,Seattle
1,260000.0,4.0,2.0,8625,1.0,0,0,4,1480,0,1974,1974,South Urban
2,469000.0,2.0,1.0,4400,1.0,0,0,3,1030,0,1924,2011,Seattle
3,1135000.0,4.0,2.75,8103,1.0,0,3,3,1970,1400,1970,2014,Seattle
4,409500.0,4.0,2.75,13000,1.0,0,0,3,1320,820,1968,1997,North


In [25]:
# inner join on the index to attach the columns of data_pandas to each
# prediction row, then replace the index with a fresh 0..n-1 range
# NOTE: this cell overwrites knn_test_data and resets its index, so re-running
# it on its own would join against the wrong rows; re-run the predictions cell first
knn_test_data = (
    knn_test_data
    .merge(data_pandas, how='inner', left_index=True, right_index=True)
    .reset_index(drop=True)
)
knn_test_data.head()

Unnamed: 0,predictions,difference,abs_difference,price,bedroom,bathroom,sqftLot,floors,waterfront,view,condition,sqftAbove,sqftBelow,yrBuilt,yrWorked,location
0,364000.0,-152000.0,152000.0,212000.0,2.0,1.5,1525,2.0,0,0,3,1020,0,2004,2004,Seattle
1,792625.0,-111125.0,111125.0,681500.0,5.0,2.75,11700,1.0,0,0,3,1630,1630,1964,2000,East Urban
2,254600.0,400.0,400.0,255000.0,3.0,1.75,9720,1.0,0,0,3,1050,500,1976,1976,South Rural
3,509598.0,126402.0,126402.0,636000.0,2.0,1.75,3600,1.5,0,0,5,1230,0,1925,1925,Seattle
4,370100.0,29900.0,29900.0,400000.0,3.0,2.25,11266,2.0,0,0,3,2140,0,1986,1986,East Rural


In [26]:
# save dataframe with %store so it can be restored elsewhere with `%store -r knn_test_data`
%store knn_test_data

Stored 'knn_test_data' (DataFrame)
