### OLS vs KNN Regression

In [32]:
from sklearn import neighbors
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 999
pd.set_option('display.max_columns', 500)

pd.options.display.float_format = '{:.3f}'.format

In [33]:
milage = pd.read_csv('data/Auto.csv')

In [34]:
milage.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [113]:
milage.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

In [114]:
Y = milage.mpg

X = milage.iloc[:,[1,4,5]]

In [115]:
X.head()

Unnamed: 0,cylinders,weight,acceleration
0,8,3504,12.0
1,8,3693,11.5
2,8,3436,11.0
3,8,3433,12.0
4,8,3449,10.5


In [116]:
Y.head()

0   18.000
1   15.000
2   18.000
3   16.000
4   17.000
Name: mpg, dtype: float64

In [117]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 123)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

The number of observations in training set is 317
The number of observations in test set is 80


### Linear Regression Model

In [118]:
# We fit an OLS model using sklearn
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.6959199745032396
-----Test set statistics-----
R-squared of the model in the test set is: 0.7116940969939705
Mean absolute error of the prediction is: 2.878100691149599
Mean squared error of the prediction is: 17.504474431414174
Root mean squared error of the prediction is: 4.183834895333965
Mean absolute percentage error of the prediction is: 11.955216087449404


In [119]:
score = cross_val_score(lrm, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.30 (+/- 1.08)


In [120]:
score

array([ 0.44953109,  0.34774392,  0.76514528,  0.67346586, -0.73297068])

### KNN Model n_neighbors=20

In [121]:
knn = neighbors.KNeighborsRegressor(n_neighbors=20)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7294056628758515
-----Test set statistics-----
R-squared of the model in the test set is: 0.7140217119660037
Mean absolute error of the prediction is: 2.9376874999999996
Mean squared error of the prediction is: 17.363153437500003
Root mean squared error of the prediction is: 4.166911738626101
Mean absolute percentage error of the prediction is: 11.990349186074903


In [122]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.31 (+/- 1.18)


In [123]:
score

array([ 0.44771673,  0.41546285,  0.78498844,  0.72042151, -0.84336205])

### KNN Model n_neighbors=10

In [124]:
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7424871501812503
-----Test set statistics-----
R-squared of the model in the test set is: 0.7225406979373048
Mean absolute error of the prediction is: 2.8661250000000003
Mean squared error of the prediction is: 16.845923750000004
Root mean squared error of the prediction is: 4.104378607048819
Mean absolute percentage error of the prediction is: 11.80677337183074


In [125]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.30 (+/- 1.24)


In [126]:
score

array([ 0.42481427,  0.47974404,  0.77029552,  0.71624922, -0.91133331])

### KNN Model n_neighbors=3

In [127]:
knn = neighbors.KNeighborsRegressor(n_neighbors=3)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.8120843979520599
-----Test set statistics-----
R-squared of the model in the test set is: 0.7607883134856149
Mean absolute error of the prediction is: 2.7175
Mean squared error of the prediction is: 14.523722222222224
Root mean squared error of the prediction is: 3.811000160354526
Mean absolute percentage error of the prediction is: 10.989400635760767


In [128]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.18 (+/- 1.24)


In [129]:
score

array([ 0.40603588,  0.14922127,  0.69137651,  0.659147  , -0.9981415 ])

### KNN model with weights

In [136]:
knn = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 1.0
-----Test set statistics-----
R-squared of the model in the test set is: 0.9990735393337498
Mean absolute error of the prediction is: 0.0375
Mean squared error of the prediction is: 0.05625
Root mean squared error of the prediction is: 0.23717082451262844
Mean absolute percentage error of the prediction is: 0.24422268907563024


In [131]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.23 (+/- 1.29)


In [132]:
score

array([ 0.42536642,  0.31997815,  0.73463618,  0.70575255, -1.01871641])

### Comparison/Contrast

1. each of the models has a roughly similar difference between the training set and test set
1. as the n_neighbors variable increases for the KNN models, the training and test scores increase, however the cross validation score decreases, and the standard deviation of the cross validation score also increases.
1. I'm not sure there is a situation where I would absolutely choose one over the other.  Overall, the KNN models have better R-squared values, but this comes at at a cost with cross validation scores having a much higher standard deviation than linear regression. 

