In [14]:
# Render matplotlib figures inline in the notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Raise the number of DataFrame columns pandas will display.
# see: http://uyamazak.hatenablog.com/entry/2016/09/29/163534
# Use the full option key; the shorthand "display.max_column" only resolves
# through pandas' pattern matching and can break if a similar option is added.
pd.set_option("display.max_columns", 101)

In [2]:
# Load kc_house_data.csv from the current directory; the 'date' column is
# parsed as dates. df_data is read as a global by the search code below.
df_data = pd.read_csv("./kc_house_data.csv", parse_dates=['date'])

In [38]:
# 5-fold shuffled splitter with a fixed seed so every parameter combination
# is scored on the same folds. Also read as a global by the search functions.
kf = KFold(n_splits=5, random_state=1234, shuffle=True)

# Per-feature multipliers applied before the distance computation, plus k.
parameters = {
    'scale_sqft_living': [0.01, 0.1,  1, 10, 100],
    'scale_grade': [0.01, 0.1, 1, 10, 100],
    'n_neighbors': [1, 2, 3, 4, 5],
}

# The raw features and the target do not depend on the parameters or the fold,
# so extract them from the DataFrame once.
features = df_data[['sqft_living', 'grade', 'lat', 'long']].values
y = df_data['price'].values

results = []
for param in ParameterGrid(parameters):
    # lat/long keep a multiplier of 1; only sqft_living and grade are rescaled.
    scale = [param['scale_sqft_living'], param['scale_grade'], 1, 1]
    n_neighbors = param['n_neighbors']
    # Scaling depends on the parameters only, not on the fold.
    X = features * scale

    maes = []
    rmses = []
    for train_index, test_index in kf.split(df_data):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        neigh = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)
        neigh.fit(X_train, y_train)
        y_pred = neigh.predict(X_test)

        maes.append(mean_absolute_error(y_test, y_pred))
        rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    # One row per combination: (params, mean RMSE over folds, mean MAE over folds)
    results.append((param, np.mean(rmses), np.mean(maes)))

In [42]:
def KNeighborSearch(parameters, top_n=5):
    """Grid-search a k-NN regressor on ``df_data`` and display the best settings.

    Features are 'sqft_living', 'grade', 'lat' and 'long'; the target is
    'price'. Each feature column is multiplied by a per-feature scale before
    fitting (lat/long always use 1). Every combination is scored with the
    module-level ``kf`` splitter.

    Parameters
    ----------
    parameters : dict
        Grid passed to ``ParameterGrid``. Must provide the keys
        'scale_sqft_living', 'scale_grade' and 'n_neighbors'.
    top_n : int, optional
        Number of best combinations to display (default 5).

    Returns
    -------
    None
        The ``top_n`` rows with the lowest mean MAE are shown with
        ``display`` as ``(param, mean_rmse, mean_mae)`` tuples.
    """
    # Neither the raw features nor the target depend on the grid or the fold.
    features = df_data[['sqft_living', 'grade', 'lat', 'long']].values
    y = df_data['price'].values

    results = []
    for param in ParameterGrid(parameters):
        scale = [param['scale_sqft_living'], param['scale_grade'], 1, 1]
        n_neighbors = param['n_neighbors']
        # Scaling depends on the parameters only, so do it once per combination.
        X = features * scale

        maes = []
        rmses = []
        for train_index, test_index in kf.split(df_data):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            neigh = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)
            neigh.fit(X_train, y_train)
            y_pred = neigh.predict(X_test)

            maes.append(mean_absolute_error(y_test, y_pred))
            rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)))

        results.append((param, np.mean(rmses), np.mean(maes)))

    # Rank by mean MAE (tuple index 2), lowest first.
    sortedResults = sorted(results, key=lambda result: result[2])
    display(sortedResults[:top_n])

In [62]:
def KNeighborSearch2(parameters, top_n=5):
    """Grid-search a k-NN regressor on ``df_data``, including 'yr_built'.

    Features are 'sqft_living', 'grade', 'yr_built', 'lat' and 'long'; the
    target is 'price'. Each feature column is multiplied by a per-feature
    scale before fitting (lat/long always use 1). Every combination is scored
    with the module-level ``kf`` splitter.

    Parameters
    ----------
    parameters : dict
        Grid passed to ``ParameterGrid``. Must provide 'scale_sqft_living',
        'scale_grade', 'scale_yr_built' and 'n_neighbors'. 'weights' and
        'metric' are optional; when a grid omits them the estimator defaults
        ('uniform' and 'minkowski') are used.
    top_n : int, optional
        Number of best combinations to display (default 5).

    Returns
    -------
    None
        The ``top_n`` rows with the lowest mean MAE are shown with
        ``display`` as ``(param, mean_rmse, mean_mae)`` tuples.
    """
    # Neither the raw features nor the target depend on the grid or the fold.
    features = df_data[['sqft_living', 'grade', 'yr_built', 'lat', 'long']].values
    y = df_data['price'].values

    results = []
    for param in ParameterGrid(parameters):
        scale = [param['scale_sqft_living'], param['scale_grade'], param['scale_yr_built'], 1, 1]
        n_neighbors = param['n_neighbors']
        # Grids without these keys used to raise KeyError; fall back to the
        # KNeighborsRegressor defaults so such grids keep working.
        weights = param.get('weights', 'uniform')
        metric = param.get('metric', 'minkowski')
        # Scaling depends on the parameters only, so do it once per combination.
        X = features * scale

        maes = []
        rmses = []
        for train_index, test_index in kf.split(df_data):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            neigh = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1, weights=weights, metric=metric)
            neigh.fit(X_train, y_train)
            y_pred = neigh.predict(X_test)

            maes.append(mean_absolute_error(y_test, y_pred))
            rmses.append(np.sqrt(mean_squared_error(y_test, y_pred)))

        results.append((param, np.mean(rmses), np.mean(maes)))

    # Rank by mean MAE (tuple index 2), lowest first.
    sortedResults = sorted(results, key=lambda result: result[2])
    display(sortedResults[:top_n])

In [43]:
# Coarse grid: three scale magnitudes (0.01, 1, 100) for sqft_living and for
# grade, crossed with k in {1, 3, 5}. KNeighborSearch displays the five
# combinations with the lowest mean MAE.
parameters = {
    'scale_sqft_living': [0.01,  1, 100],
    'scale_grade': [0.01, 1, 100],
    'n_neighbors': [1, 3, 5],
}

KNeighborSearch(parameters)

[({'n_neighbors': 5, 'scale_grade': 0.01, 'scale_sqft_living': 0.01},
  208084.7411852095,
  115113.44632286459),
 ({'n_neighbors': 5, 'scale_grade': 1, 'scale_sqft_living': 0.01},
  207461.0481224746,
  115185.52221889887),
 ({'n_neighbors': 5, 'scale_grade': 100, 'scale_sqft_living': 0.01},
  209992.9261807139,
  115521.02982876156),
 ({'n_neighbors': 3, 'scale_grade': 0.01, 'scale_sqft_living': 0.01},
  214082.89396131318,
  117658.7646577934),
 ({'n_neighbors': 3, 'scale_grade': 1, 'scale_sqft_living': 0.01},
  215050.9592366287,
  117978.90204602088)]

In [44]:
# Second grid: smaller sqft_living scales (1e-4 to 1), a wider grade range
# (1e-4 to 1e4) and larger neighbor counts (3 to 15).
parameters = {
    'scale_sqft_living': [0.0001, 0.01,  1],
    'scale_grade': [0.0001, 0.01, 1, 100, 10000],
    'n_neighbors': [3, 5, 7, 10, 15],
}

KNeighborSearch(parameters)

[({'n_neighbors': 10, 'scale_grade': 100, 'scale_sqft_living': 0.0001},
  157395.64252247976,
  81201.83609207255),
 ({'n_neighbors': 10, 'scale_grade': 10000, 'scale_sqft_living': 0.0001},
  157395.64252247976,
  81201.83609207255),
 ({'n_neighbors': 10, 'scale_grade': 1, 'scale_sqft_living': 0.0001},
  157395.28556997684,
  81204.9527135123),
 ({'n_neighbors': 7, 'scale_grade': 1, 'scale_sqft_living': 0.0001},
  157227.8516688523,
  81618.26846369792),
 ({'n_neighbors': 7, 'scale_grade': 100, 'scale_sqft_living': 0.0001},
  157231.33208044223,
  81623.53202215077)]

In [54]:
# Fine-grained grid for KNeighborSearch2. scale_yr_built is fixed at 0, which
# multiplies the yr_built column by zero so it adds nothing to the distances.
# A scale of 0 in the other grids likewise removes that feature.
parameters = {
    'scale_sqft_living': [0, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 6e-5, 7e-5, 8e-5, 9e-5, 10e-5],
    'scale_grade': [0, 0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.09, 0.1, 1],
    'scale_yr_built': [0],
    'n_neighbors': [ 5, 7, 8, 9, 10, 11, 12, 15],
}

KNeighborSearch2(parameters)

[({'n_neighbors': 15,
   'scale_grade': 0.02,
   'scale_sqft_living': 4e-05,
   'scale_yr_built': 0},
  149384.57213736034,
  77334.32921984862),
 ({'n_neighbors': 15,
   'scale_grade': 0.02,
   'scale_sqft_living': 3e-05,
   'scale_yr_built': 0},
  150461.62205734823,
  77347.92962202859),
 ({'n_neighbors': 11,
   'scale_grade': 0.02,
   'scale_sqft_living': 4e-05,
   'scale_yr_built': 0},
  148226.56274713724,
  77486.11715681315),
 ({'n_neighbors': 10,
   'scale_grade': 0.02,
   'scale_sqft_living': 3e-05,
   'scale_yr_built': 0},
  148777.7405383382,
  77499.54748980064),
 ({'n_neighbors': 12,
   'scale_grade': 0.02,
   'scale_sqft_living': 3e-05,
   'scale_yr_built': 0},
  149488.95869312176,
  77525.42608553005)]

In [60]:
# Narrow grid (sqft_living scale 2.7e-5 to 4e-5, grade scale 0.015 to 0.025,
# k from 13 to 20) that also compares 'uniform' against 'distance' weighting.
# yr_built stays disabled via a scale of 0.
parameters = {
    'scale_sqft_living': [2.7e-5, 2.9e-5, 3e-5, 3.3e-5, 3.5e-5, 3.7e-5, 4e-5],
    'scale_grade': [0.015, 0.017, 0.019, 0.02, 0.025],
    'scale_yr_built': [0],
    'n_neighbors': [13, 15, 17, 20],
    'weights': ['uniform', 'distance'],
}

KNeighborSearch2(parameters)

[({'n_neighbors': 15,
   'scale_grade': 0.02,
   'scale_sqft_living': 3e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147324.99534741713,
  75424.4180346212),
 ({'n_neighbors': 15,
   'scale_grade': 0.019,
   'scale_sqft_living': 2.9e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147463.3060266763,
  75456.91355847134),
 ({'n_neighbors': 15,
   'scale_grade': 0.017,
   'scale_sqft_living': 3e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147676.8653392876,
  75471.58697201152),
 ({'n_neighbors': 15,
   'scale_grade': 0.017,
   'scale_sqft_living': 2.9e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147628.84324046047,
  75491.90948397666),
 ({'n_neighbors': 15,
   'scale_grade': 0.02,
   'scale_sqft_living': 2.9e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147420.38960493467,
  75497.7769431003)]

In [63]:
# Same kind of narrow grid, additionally varying the distance metric
# (minkowski, manhattan, chebyshev) together with the neighbor weighting.
# yr_built stays disabled via a scale of 0.
parameters = {
    'scale_sqft_living': [2.7e-5, 2.8e-5, 2.9e-5, 2.95e-5, 3e-5, 3.3e-5],
    'scale_grade': [0.015, 0.017, 0.018,  0.019, 0.02, 0.025],
    'scale_yr_built': [0],
    'n_neighbors': [13, 15, 17, 20],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan', 'chebyshev'],
}

KNeighborSearch2(parameters)

[({'metric': 'manhattan',
   'n_neighbors': 15,
   'scale_grade': 0.015,
   'scale_sqft_living': 2.9e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  146869.81822159316,
  74582.78164572045),
 ({'metric': 'manhattan',
   'n_neighbors': 15,
   'scale_grade': 0.015,
   'scale_sqft_living': 2.95e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  146846.62380358874,
  74592.44597261655),
 ({'metric': 'manhattan',
   'n_neighbors': 17,
   'scale_grade': 0.015,
   'scale_sqft_living': 2.95e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147682.28825085756,
  74596.15004771718),
 ({'metric': 'manhattan',
   'n_neighbors': 17,
   'scale_grade': 0.015,
   'scale_sqft_living': 2.9e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  147811.7443840655,
  74596.84956020981),
 ({'metric': 'manhattan',
   'n_neighbors': 17,
   'scale_grade': 0.015,
   'scale_sqft_living': 2.8e-05,
   'scale_yr_built': 0,
   'weights': 'distance'},
  148098.94463854175,
  74608.06849867