# Housing Prices in King County, WA: K Nearest Neighbors
Goal
- Use K Nearest Neighbors to create a model that predicts the sale price of homes given various attributes about the house

## Obtain Data

In [2]:
# global imports

# sklearn features
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [3]:
# import dataframes
%store -r dfs

# assign dataframes to variables
X_train = dfs[0]
X_val = dfs[1]
X_test = dfs[2]
y_train = dfs[3]
y_val = dfs[4]
y_test = dfs[5] 

# check importing data frames worked
df = [X_train, X_val, X_test, y_train, y_val, y_test]
for d in df:
    print(d.shape)

(3181, 18)
(682, 18)
(682, 18)
(3181, 1)
(682, 1)
(682, 1)


## Train K Nearest Neighbors Model
Model Evaluation
- Use model to make predictions for price given test predictors
- Compute metrics to compare predictions with actual price for test dataset
    - Minimize mean absolute error and mean squared error
        - Mean absolute error: the average absolute difference between the observed price and predicted price
        - Mean squared error: the average squared difference between observed price and predicted price
            - Gives more weight to large errors than mean absolute error does
            
Steps to Train the K Nearest Neighbors Model
1. Train a base model with default parameters
2. Evaluate base model using validation dataset
3. Train a model with preliminary best parameters chosen through random search
    - Test a wide range of parameter values
        - Choose the set of parameters that minimizes mean squared error
    - Random search is appropriate for preliminary estimates
        - Has a faster runtime than grid search
            - This is because it does not try all parameter values
4. Evaluate random search model using validation dataset
5. Train a model with best parameters chosen through grid search
    - Test a narrow range of parameter values
        - Choose the set of parameters that minimizes mean squared error
    - Grid search is appropriate for final estimates
        - Tries all combinations of parameters
6. Evaluate grid search model using validation dataset

In [4]:
# general KNN Regressor: an unfitted estimator with default parameters,
# used as the `estimator` argument of the random search and grid search cells
knn_model = KNeighborsRegressor()

# function that evaluates the model
def evaluate(model, test_pred, test_resp):
    """Print the mean absolute error and mean squared error of a fitted model.

    Despite the parameter names, this notebook calls the function with both
    the validation split and the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    test_pred : predictors handed to ``model.predict``
    test_resp : observed responses the predictions are compared against

    Returns
    -------
    None; the two metrics are printed.
    """
    predicted = model.predict(test_pred)
    # compute both metrics up front, then report them in a fixed order
    scores = [
        ("Mean Absolute Error: %s", mean_absolute_error(test_resp, predicted)),
        ("Mean Squared Error: %s", mean_squared_error(test_resp, predicted)),
    ]
    for template, score in scores:
        print(template % (score,))

In [5]:
# baseline: fit a default-parameter KNN on the training split,
# then report its errors on the validation split
base_model = KNeighborsRegressor().fit(X_train, y_train.values.ravel())
evaluate(base_model, X_val, y_val.values.ravel())

Mean Absolute Error: 135067.20515441496
Mean Squared Error: 47564433689.56244


In [7]:
# random search grid (4 * 2 * 4 * 4 * 2 = 256 possible combinations)
# NOTE(review): 'algorithm' and 'leaf_size' look like neighbor-lookup settings
# rather than model settings — confirm against the sklearn docs whether
# searching over them is worthwhile
random_grid = {
    'n_neighbors': [2, 5, 10, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50, 70],
    'p': [1,2]
}

# fit random search: 100 sampled combinations, each scored with 3-fold CV on
# negative mean squared error.
# random_state fixes which combinations are sampled, so re-running the notebook
# reproduces the same candidates and the same best_params_.
knn_random = RandomizedSearchCV(estimator = knn_model, param_distributions = random_grid, n_iter = 
                               100, cv = 3, scoring = 'neg_mean_squared_error', random_state = 42)
knn_random.fit(X_train, y_train.values.ravel())

# output best parameters from random search
knn_random.best_params_

{'weights': 'uniform',
 'p': 1,
 'n_neighbors': 5,
 'leaf_size': 70,
 'algorithm': 'kd_tree'}

In [8]:
# evaluate best random search model
# the search was fitted on the training split only, so the validation split
# used here is data the search did not see
best_random = knn_random.best_estimator_
evaluate(best_random, X_val, y_val.values.ravel())

Mean Absolute Error: 132503.49680547096
Mean Squared Error: 46947276498.21297


In [9]:
# narrow grid built around the values the random search preferred
params_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform'],
    'algorithm': ['kd_tree'],
    'leaf_size': list(range(60, 81, 5)),  # 60, 65, 70, 75, 80
    'p': [1],
}

# exhaustive search: every combination in the grid is scored with 3-fold CV
# on negative mean squared error
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=params_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
knn_grid.fit(X_train, y_train.values.ravel())

# show the winning combination
knn_grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 60,
 'n_neighbors': 5,
 'p': 1,
 'weights': 'uniform'}

In [10]:
# evaluate best grid search model
# scored on the validation split, the same data used for the base and random
# search models, so the three sets of printed errors are directly comparable
best_grid = knn_grid.best_estimator_
evaluate(best_grid, X_val, y_val.values.ravel())

Mean Absolute Error: 132503.49680547096
Mean Squared Error: 46947276498.21297


## Final K Nearest Neighbors Model
- Train a model with best parameters found from grid search
- Evaluate the model using the test dataset
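    - The model does not appear to be overfitted
        - The mean absolute errors for the validation and test datasets are similar (about 132,500 vs. 131,400)
        - The mean squared errors for the validation and test datasets are of the same order (about 4.7e10 vs. 5.9e10), though the test value is roughly 25% higher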

In [5]:
# check model performance on test dataset
# Build the final model directly from the grid search winner instead of
# re-typing its values, so this cell cannot drift out of sync with the search
# if the notebook is re-run.
# (winner recorded in this notebook: algorithm='kd_tree', leaf_size=60,
#  n_neighbors=5, p=1, weights='uniform')
final_model = KNeighborsRegressor(**knn_grid.best_params_)
final_model.fit(X_train, y_train.values.ravel())
evaluate(final_model, X_test, y_test.values.ravel())

Mean Absolute Error: 131426.58780384227
Mean Squared Error: 58914690993.47741
