### OLS vs KNN Regression

In [32]:
from sklearn import neighbors
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 999
pd.set_option('display.max_columns', 500)

pd.options.display.float_format = '{:.3f}'.format

In [33]:
milage = pd.read_csv('data/Auto.csv')

In [34]:
milage.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [35]:
milage.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

In [94]:
Y = milage.mpg

X = milage.iloc[:,[1,4,5]]

In [95]:
X.head()

Unnamed: 0,cylinders,weight,acceleration
0,8,3504,12.0
1,8,3693,11.5
2,8,3436,11.0
3,8,3433,12.0
4,8,3449,10.5


In [96]:
Y.head()

0   18.000
1   15.000
2   18.000
3   16.000
4   17.000
Name: mpg, dtype: float64

In [97]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 123)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

The number of observations in training set is 317
The number of observations in test set is 80


### Linear Regression Model

In [98]:
# We fit an OLS model using sklearn
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.704951364521176
-----Test set statistics-----
R-squared of the model in the test set is: 0.6777403008136553
Mean absolute error of the prediction is: 3.4542573039121978
Mean squared error of the prediction is: 18.827972571884192
Root mean squared error of the prediction is: 4.339121175063471
Mean absolute percentage error of the prediction is: 14.79865678765668


In [99]:
score = cross_val_score(lrm, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.30 (+/- 1.08)


In [100]:
score

array([ 0.44953109,  0.34774392,  0.76514528,  0.67346586, -0.73297068])

### KNN Model n_neighbors=20

In [101]:
knn = neighbors.KNeighborsRegressor(n_neighbors=20)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7301980260282205
-----Test set statistics-----
R-squared of the model in the test set is: 0.7088887914227413
Mean absolute error of the prediction is: 3.1313750000000002
Mean squared error of the prediction is: 17.008126875
Root mean squared error of the prediction is: 4.124091036216344
Mean absolute percentage error of the prediction is: 13.15786155829295


In [102]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.31 (+/- 1.18)


In [103]:
score

array([ 0.44771673,  0.41546285,  0.78498844,  0.72042151, -0.84336205])

### KNN Model n_neighbors=10

In [104]:
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7412261574829809
-----Test set statistics-----
R-squared of the model in the test set is: 0.7258744273492388
Mean absolute error of the prediction is: 2.979125
Mean squared error of the prediction is: 16.015743750000002
Root mean squared error of the prediction is: 4.001967484875409
Mean absolute percentage error of the prediction is: 12.49706615288129


In [105]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.30 (+/- 1.24)


In [106]:
score

array([ 0.42481427,  0.47974404,  0.77029552,  0.71624922, -0.91133331])

### KNN Model n_neighbors=3

In [107]:
knn = neighbors.KNeighborsRegressor(n_neighbors=3)
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.8101518925609512
-----Test set statistics-----
R-squared of the model in the test set is: 0.7660142080682358
Mean absolute error of the prediction is: 2.6574999999999998
Mean squared error of the prediction is: 13.670583333333335
Root mean squared error of the prediction is: 3.6973751950989953
Mean absolute percentage error of the prediction is: 11.317047628490357


In [108]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.18 (+/- 1.24)


In [109]:
score

array([ 0.40603588,  0.14922127,  0.69137651,  0.659147  , -0.9981415 ])

### KNN model with weights

In [110]:
knn = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn.fit(X, Y)

# Trailing underscores are a common convention for a prediction.
y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.9997697689723876
-----Test set statistics-----
R-squared of the model in the test set is: 1.0
Mean absolute error of the prediction is: 0.0
Mean squared error of the prediction is: 0.0
Root mean squared error of the prediction is: 0.0
Mean absolute percentage error of the prediction is: 0.0


In [111]:
score = cross_val_score(knn, X, Y, cv=5)
print("%0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

0.23 (+/- 1.29)


In [112]:
score

array([ 0.42536642,  0.31997815,  0.73463618,  0.70575255, -1.01871641])

### Comparison/Contrast

1. each of the models has a roughly similar difference between the training set and test set
1. as the n_neighbors variable increases for the KNN models, the training and test scores increase, however the cross validation score decreases, and the standard deviation of the cross validation score also increases.



1. Try to determine whether there is a situation where you would change your mind, or whether one is unambiguously better than the other.
1. Try to note what it is about the data that causes the better model to outperform the weaker model.