# KNN to predict cars market price

In [1]:
import pandas as pd
import numpy as np

cars = pd.read_csv('imports-85.data')
cars.head()

Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [2]:
cars.columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

In [3]:
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [4]:
feature_cols = ['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
numeric_cars = cars[feature_cols]
numeric_cars.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,3.47,2.68,9.0,111,5000,21,27,16500
1,?,94.5,171.2,65.5,52.4,2823,2.68,3.47,9.0,154,5000,19,26,16500
2,164,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30,13950
3,164,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22,17450
4,?,99.8,177.3,66.3,53.1,2507,3.19,3.4,8.5,110,5500,19,25,15250


# Cleaning data

In [5]:
numeric_cars = numeric_cars.replace('?', np.nan)
numeric_cars = numeric_cars.astype('float')
numeric_cars.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548.0,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
1,,94.5,171.2,65.5,52.4,2823.0,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
2,164.0,99.8,176.6,66.2,54.3,2337.0,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
3,164.0,99.4,176.6,66.4,54.3,2824.0,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0
4,,99.8,177.3,66.3,53.1,2507.0,3.19,3.4,8.5,110.0,5500.0,19.0,25.0,15250.0


In [6]:
numeric_cars = numeric_cars.dropna(subset=['price'])
numeric_cars.isnull().sum()

normalized-losses    36
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [7]:
numeric_cars = numeric_cars.fillna(numeric_cars.mean())
numeric_cars.isnull().sum()

normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [8]:
#normalize all columns except target
price_col = numeric_cars['price']
numeric_cars = (numeric_cars - numeric_cars.min())/(numeric_cars.max() - numeric_cars.min())
numeric_cars['price'] = price_col

# Univariate KNN models

In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


def knn_train_test(train_col, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    
    #shuffle data
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index) 
    
    last_train_row = int(len(rand_df) * 0.75)
    
    #seperate train and test data
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    
    # Fit a KNN model using default k value.
    knn.fit(train_df[[train_col]], train_df[target_col])
    
    # Make predictions using model.
    predicted_labels = knn.predict(test_df[[train_col]])

    # Calculate and return RMSE.
    mse = mean_squared_error(test_df[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse    

rmse_results = {}
train_cols = numeric_cars.columns.drop('price')

for col in train_cols:
    rmse_val = knn_train_test(col, 'price', numeric_cars)
    rmse_results[col] = rmse_val
    
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()  

curb-weight          2453.159320
highway-mpg          3094.812484
width                3149.468658
city-mpg             3405.269430
length               4419.750519
horsepower           5433.885872
wheel-base           5577.767067
bore                 5700.941366
stroke               6060.993926
normalized-losses    6471.233989
height               6571.989033
peak-rpm             6852.587978
compression-rate     8449.805996
dtype: float64

# Multivariate KNN models

In [10]:
def knn_train_test(train_cols, target_col, df):
    np.random.seed(1)
    
    # Randomize order of rows in data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)

    # Divide number of rows in half and round.
    last_train_row = int(len(rand_df) * 0.75)
    
    # Select the first half and set as training set.
    # Select the second half and set as test set.
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]
    
    # Fit model using k nearest neighbors.
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(train_df[train_cols], train_df[target_col])

    # Make predictions using model.
    test_df['predicted_labels'] = knn.predict(test_df[train_cols])
        
    return test_df


six_best_features = ['horsepower', 'width', 'curb-weight' , 'city-mpg' , 'highway-mpg', 'length']
prediction = knn_train_test(six_best_features, 'price', numeric_cars)
prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price,predicted_labels
55,0.445026,0.253644,0.416418,0.461538,0.15,0.346005,0.564286,0.566594,0.15,0.247664,0.755102,0.111111,0.184211,11845.0,14397.0
27,0.235602,0.48688,0.5,0.367521,1.0,0.406129,0.571429,0.661905,0.09375,0.186916,0.346939,0.305556,0.368421,8921.0,11570.4
79,0.460733,0.282799,0.476119,0.435897,0.133333,0.342126,0.45,0.661905,0.03125,0.317757,0.55102,0.277778,0.368421,9959.0,9327.0
46,0.418848,0.769679,0.873134,0.794872,0.416667,1.0,0.778571,1.0,0.06875,0.598131,0.244898,0.055556,0.078947,32250.0,38594.8
25,0.434555,0.206997,0.241791,0.299145,0.233333,0.194337,0.307143,0.552381,0.15,0.093458,0.55102,0.5,0.578947,7609.0,6453.4
3,0.518325,0.373178,0.529851,0.521368,0.541667,0.518231,0.464286,0.633333,0.0625,0.313084,0.55102,0.138889,0.157895,17450.0,13380.4
173,0.0,0.460641,0.514925,0.529915,0.591667,0.384794,0.521429,0.609524,0.96875,0.116822,0.142857,0.472222,0.447368,10698.0,10265.4
52,0.251309,0.189504,0.383582,0.333333,0.525,0.177269,0.35,0.514286,0.125,0.093458,0.346939,0.5,0.578947,6695.0,6997.6
153,0.08377,0.265306,0.426866,0.282051,0.941667,0.311094,0.364286,0.457143,0.125,0.065421,0.265306,0.388889,0.421053,7898.0,8520.8
135,0.445026,0.364431,0.679104,0.529915,0.691667,0.512025,0.714286,0.47619,0.125,0.523364,0.55102,0.166667,0.263158,18150.0,16085.0


In [14]:
#Calculate mse and rmse
mse = mean_squared_error(prediction['price'], prediction['predicted_labels'])
rmse = np.sqrt(mse)

print("MSE:")
print(mse)
print("RMSE:")
print(rmse)

MSE:
5292630.217600001
RMSE:
2300.5717153785927
