In [62]:
import pandas as pd
import numpy as np

columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 
        'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 
        'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 
        'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 
        'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv("imports-85.data", header=0, names=columns)

cars

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250
5,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.40,8.50,110,5500,19,25,17710
6,1,?,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.40,8.50,110,5500,19,25,18920
7,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.40,8.30,140,5500,17,20,23875
8,0,?,audi,gas,turbo,two,hatchback,4wd,front,99.5,...,131,mpfi,3.13,3.40,7.00,160,5500,16,22,?
9,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.50,2.80,8.80,101,5800,23,29,16430


# Data Cleaning

In [61]:
cars = cars.replace("?", np.nan)


In [56]:
#select continuous columns, excepting normalized losses, 
    #which has many missing values

cars = cars[['wheel-base', 'length', 
                   'width', 'height', 'curb-weight', 'bore', 
                   'stroke', 'compression-rate', 'horsepower', 
                   'peak-rpm', 'city-mpg', 'highway-mpg', 'price']]
cars_nums = cars.astype(float)

In [57]:
cars_nums = cars_nums[cars_nums['price'].notnull()]

In [58]:
cars_nums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 203
Data columns (total 13 columns):
wheel-base          200 non-null float64
length              200 non-null float64
width               200 non-null float64
height              200 non-null float64
curb-weight         200 non-null float64
bore                196 non-null float64
stroke              196 non-null float64
compression-rate    200 non-null float64
horsepower          198 non-null float64
peak-rpm            198 non-null float64
city-mpg            200 non-null float64
highway-mpg         200 non-null float64
price               200 non-null float64
dtypes: float64(13)
memory usage: 21.9 KB


In [59]:
cols_convert = ['bore', 'stroke', 'horsepower', 'peak-rpm']
for col in cols_convert:
    cars_nums[col] = cars_nums[col].fillna(cars_nums[col].mean())


In [87]:
cars_nums.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 203
Data columns (total 13 columns):
wheel-base          200 non-null float64
length              200 non-null float64
width               200 non-null float64
height              200 non-null float64
curb-weight         200 non-null float64
bore                200 non-null float64
stroke              200 non-null float64
compression-rate    200 non-null float64
horsepower          200 non-null float64
peak-rpm            200 non-null float64
city-mpg            200 non-null float64
highway-mpg         200 non-null float64
price               200 non-null float64
dtypes: float64(13)
memory usage: 21.9 KB


In [120]:
#normalize values, then add price back into df.
price = cars_nums['price']
cars_nums = (cars_nums-cars_nums.mean())/cars_nums.std()
cars_nums['price'] = price
cars_nums

Unnamed: 0,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,-1.697177,-0.439616,-0.855008,-2.055427,-0.014857,0.521295,-1.845608,-0.291493,0.204013,-0.246603,-0.652936,-0.542680,16500.0
1,-0.720075,-0.245239,-0.189262,-0.572999,0.515422,-2.420296,0.668901,-0.291493,1.352043,-0.246603,-0.963857,-0.689152,16500.0
2,0.157661,0.192109,0.143611,0.209393,-0.421726,-0.521295,0.446096,-0.042375,-0.036272,0.796716,-0.186553,-0.103263,13950.0
3,0.091417,0.192109,0.238717,0.209393,0.517350,-0.521295,0.446096,-0.540611,0.310807,0.796716,-1.119318,-1.275042,17450.0
4,0.157661,0.248803,0.191164,-0.284750,-0.093917,-0.521295,0.446096,-0.416052,0.177315,0.796716,-0.963857,-0.835625,15250.0
5,1.151325,1.496056,2.616381,0.785893,0.555916,-0.521295,0.446096,-0.416052,0.177315,0.796716,-0.963857,-0.835625,17710.0
6,1.151325,1.496056,2.616381,0.785893,0.768028,-0.521295,0.446096,-0.416052,0.177315,0.796716,-0.963857,-0.835625,18920.0
7,1.151325,1.496056,2.616381,0.868250,1.022562,-0.744707,0.446096,-0.465875,0.978266,0.796716,-1.274779,-1.567987,23875.0
9,0.389516,0.208307,-0.522135,0.209393,-0.309886,0.633001,-1.463657,-0.341316,-0.062970,1.422707,-0.342014,-0.249735,16430.0
10,0.389516,0.208307,-0.522135,0.209393,-0.309886,0.633001,-1.463657,-0.341316,-0.062970,1.422707,-0.342014,-0.249735,16925.0


In [156]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

features = ['wheel-base', 'length', 
                   'width', 'height', 'curb-weight', 'bore', 
                   'stroke', 'compression-rate', 'horsepower', 
                   'peak-rpm', 'city-mpg', 'highway-mpg']
def knn_train_test(df, training_cols, target_col, k):
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    
    #split into training and test sets:
    rand_df = df.reindex(shuffled_index)
    train_df = rand_df.iloc[0:int(rand_df.shape[0]/2)]
    test_df = rand_df.iloc[int(rand_df.shape[0]/2):rand_df.shape[0]]
    #instantiate kneighbors
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(train_df[training_cols], train_df[target_col])
    predictions = knn.predict(test_df[training_cols])
    rmse = (mean_squared_error(predictions, 
                               test_df[target_col]))**(1/2)
    return rmse  


# Univariate

In [182]:
rmse_univariate = {}
for i in features:
    rmse_univariate[i] = knn_train_test(cars_nums, [i], 'price', 5)
rmse_univariate = pd.Series(rmse_univariate)
rmse_univariate = rmse_univariate.sort_values()
print (rmse_univariate)

#plt.scatter([i for i,val in enumerate(features)], rmse_univariate)

curb-weight         3439.400941
highway-mpg         3746.037338
width               3772.562362
city-mpg            3927.139968
horsepower          4472.788345
length              5207.079013
wheel-base          5463.571847
bore                6207.328761
peak-rpm            6350.427658
compression-rate    7001.541388
height              7306.648484
stroke              8178.853402
dtype: float64


# Multivariate

In [204]:
import matplotlib.pyplot as plt
%matplotlib inline
rmse_multivar = {}
for i in range (1,10):
    features = rmse_univariate.iloc[0:i]
    features_list = [idx for idx,val in features.items()]
    rmse = knn_train_test(cars_nums, features_list, 'price', 5)
    rmse_multivar["top {!r} feature(s)".format(i)] = rmse
rmse_multivar = pd.Series(rmse_multivar).sort_values()
print (rmse_multivar)

top 5 feature(s)    2641.842360
top 6 feature(s)    2760.527875
top 4 feature(s)    3124.811455
top 7 feature(s)    3174.569047
top 2 feature(s)    3228.966198
top 3 feature(s)    3283.656359
top 1 feature(s)    3439.400941
top 8 feature(s)    3489.122490
top 9 feature(s)    3609.048895
dtype: float64


In [226]:
# we will now investigate k values for the best performing features from previous

rmse_vals = {}
for i in range (4,7):
    features = rmse_univariate.index.values[0:i]
    rmse_list = []
    print(features)
    for k in range (1,20):
        rmse = knn_train_test(cars_nums, features, 'price', k)
        rmse_list.append(rmse)
    rmse_vals["top {!r} features".format(i)] = rmse_list
rmse_vals = pd.DataFrame(rmse_vals)
print (rmse_vals)

['curb-weight' 'highway-mpg' 'width' 'city-mpg']
['curb-weight' 'highway-mpg' 'width' 'city-mpg' 'horsepower']
['curb-weight' 'highway-mpg' 'width' 'city-mpg' 'horsepower' 'length']
    top 4 features  top 5 features  top 6 features
0      3264.727626     3187.078154     3039.740589
1      3118.223536     2923.038647     2673.368296
2      3270.524941     2749.046704     2717.089936
3      2993.334421     2654.273248     2792.057194
4      3124.811455     2641.842360     2760.527875
5      2989.132531     2815.948964     2874.889409
6      3050.669794     2929.925124     3037.095898
7      3104.170875     3007.403852     3129.438687
8      3112.969778     3120.561409     3303.023209
9      3180.324524     3215.793054     3364.852089
10     3251.458311     3370.747636     3490.567831
11     3403.811165     3432.169700     3530.832679
12     3565.458815     3556.624831     3533.488245
13     3565.853692     3561.602661     3592.805870
14     3582.987552     3561.044592     3702.332770
15

In [279]:
#optimal k for numbers of features:
optimal_k = {}
for i in range (4,7):
    optimal = rmse_vals['top {} features'.format(i)][rmse_vals[
        'top {} features'.format(i)]== rmse_vals[
        'top {} features'.format(i)].min()].index.values
    print(type(optimal))
    optimal_k["top {} features".format(i)] = "optimal k is {}".format(optimal)

print(optimal_k)
    
    

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
{'top 5 features': 'optimal k is [4]', 'top 4 features': 'optimal k is [5]', 'top 6 features': 'optimal k is [1]'}
