# Predicting Car Prices

In this project I am going to use a machine learning workflow to predict a car's market price from its attributes. The dataset we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle, such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more. You can read more about the dataset [here](https://archive.ics.uci.edu/ml/datasets/automobile).

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import math
pd.options.display.max_columns = 99

In [22]:
# Column names are supplied explicitly through `names` when reading the file.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]

cars = pd.read_csv('data/imports-85.data', names=cols)

# Preview the first five rows of the raw data.
cars.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [23]:
# Keep only the attributes documented as continuous in the dataset description:
# https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
continuous_values_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]
numeric_cars = cars.loc[:, continuous_values_cols]

In [24]:
# Preview the selected columns; missing values still appear as '?' here.
numeric_cars.head(5)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495
1,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
2,?,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
3,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
4,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450


## Data Cleaning

In [25]:
# The raw file marks missing values with '?'; turn them into NaN so pandas
# treats them as missing.
numeric_cars = numeric_cars.replace(to_replace='?', value=np.nan)

In [26]:
# Preview again: the former '?' entries should now show as missing.
numeric_cars.head(5)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495
1,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
2,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450


In [27]:
# Cast every selected column to float64 (object columns become numeric),
# then count the missing values per column.
numeric_cars = numeric_cars.astype(float)
numeric_cars.isna().sum()

normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [28]:
# Drop every row whose price is missing: price is the target variable, so
# those rows cannot be used for training or evaluation.
numeric_cars = numeric_cars.dropna(axis='index', subset=['price'])
numeric_cars.isna().sum()

normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [29]:
# Fill the remaining gaps in each column with that column's mean.
numeric_cars = numeric_cars.fillna(value=numeric_cars.mean())

In [30]:
# Confirm that no missing values remain in any column.
numeric_cars.isna().sum()

normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [31]:
# Rescale every column except the prediction target ('price').
# NOTE: despite its name, `target_cols` lists the feature columns to be
# scaled, not the target.
target_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
]

# NOTE(review): the scaler is fitted on all rows, i.e. before any
# train/test split is made.
scaler = MinMaxScaler().fit(numeric_cars[target_cols])
numeric_cars[target_cols] = scaler.transform(numeric_cars[target_cols])

## Univariate Model
Let's start with univariate k-nearest neighbour models. Starting with a simple model before moving to a more complex model will help us understand the features better.

In [34]:
def knn_train_test(train_col, target_col, df):
    """Fit a single-feature k-nearest-neighbours regressor and return its test RMSE.

    The rows of ``df`` are shuffled with a fixed seed; the first 75% form the
    training set and the remaining rows form the test set.

    Parameters
    ----------
    train_col : str
        Name of the single feature column used to fit the model.
    target_col : str
        Name of the column to predict.
    df : pandas.DataFrame
        Frame containing both ``train_col`` and ``target_col``.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test set.
    """
    # A private generator seeded with 0 yields the same permutation as
    # np.random.seed(0) followed by np.random.permutation, but leaves the
    # global NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(0)
    shuffled_df = df.reindex(rng.permutation(df.index))

    # 75% / 25% train/test split.
    split_at = round(len(df) * 0.75)
    train_set = shuffled_df.iloc[:split_at]
    test_set = shuffled_df.iloc[split_at:]

    # Default KNeighborsRegressor settings are used.
    knn = KNeighborsRegressor()
    knn.fit(train_set[[train_col]], train_set[target_col])
    predictions = knn.predict(test_set[[train_col]])

    mse = mean_squared_error(test_set[target_col], predictions)
    return np.sqrt(mse)

In [36]:
# Fit one single-feature model per column (everything except the target)
# and record its test RMSE, keyed by column name.
train_cols = numeric_cars.columns.drop('price')
rmse_results = {}

for col in train_cols:
    rmse_results[col] = knn_train_test(col, 'price', numeric_cars)

In [38]:
# Rank the features from lowest to highest RMSE.
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values(ascending=True)

curb-weight          2866.041866
engine-size          3149.201099
width                3364.546069
length               3514.981602
highway-mpg          3652.971354
city-mpg             3760.024776
horsepower           3811.502611
bore                 4905.346787
stroke               5279.205097
height               5463.813539
normalized-losses    5673.533374
wheel-base           6053.594839
compression-rate     6620.560435
peak-rpm             6819.618749
dtype: float64

#### Trying Different Values of K

In [39]:
def knn_train_test(train_col, target_col, df, k=5):
    """Fit a single-feature k-nearest-neighbours regressor and return its test RMSE.

    The rows of ``df`` are shuffled with a fixed seed; the first 75% form the
    training set and the remaining rows form the test set.

    Parameters
    ----------
    train_col : str
        Name of the single feature column used to fit the model.
    target_col : str
        Name of the column to predict.
    df : pandas.DataFrame
        Frame containing both ``train_col`` and ``target_col``.
    k : int, optional
        Number of neighbours passed to ``KNeighborsRegressor``. The default
        of 5 matches scikit-learn's own default, so calls that omit ``k``
        behave as before.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test set.
    """
    # A private generator seeded with 0 yields the same permutation as
    # np.random.seed(0) followed by np.random.permutation, but leaves the
    # global NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(0)
    shuffled_df = df.reindex(rng.permutation(df.index))

    # 75% / 25% train/test split.
    split_at = round(len(df) * 0.75)
    train_set = shuffled_df.iloc[:split_at]
    test_set = shuffled_df.iloc[split_at:]

    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(train_set[[train_col]], train_set[target_col])
    predictions = knn.predict(test_set[[train_col]])

    mse = mean_squared_error(test_set[target_col], predictions)
    return np.sqrt(mse)