In [1]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /opt/anaconda3/envs/mlopslab/lib/python3.8/site-packages (1.0.2)


# Used Car Price Prediction: KNN

### Load Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=10-R6GyVWjt_gjWEFD86mKHDvSWD9lp1z" )

In [4]:
cars_df.sample(5)

Unnamed: 0,index,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,age,KM_Driven,make,mileage_new,engine_new,power_new
1863,3650,Ford Fiesta Classic 1.4 Duratorq CLXI,Chennai,2013,123000,Diesel,Manual,Second,17.8 kmpl,1399 CC,67 bhp,5.0,,4.45,6,123,ford,17.8,1399.0,67.0
1870,3659,Honda Jazz 1.2 V CVT i VTEC,Delhi,2015,27000,Petrol,Automatic,First,19.0 kmpl,1199 CC,88.7 bhp,5.0,,5.95,4,27,honda,19.0,1199.0,88.7
1019,2006,Ford Ikon 1.4 TDCi DuraTorq,Bangalore,2009,75000,Diesel,Manual,First,13.8 kmpl,1399 CC,68 bhp,5.0,,1.99,10,75,ford,13.8,1399.0,68.0
692,1395,Maruti Ritz VDi,Pune,2010,71000,Diesel,Manual,First,21.1 kmpl,1248 CC,73.9 bhp,5.0,,3.0,9,71,maruti,21.1,1248.0,73.9
224,470,Maruti Ciaz 1.3 S,Coimbatore,2017,25142,Diesel,Manual,First,28.09 kmpl,1248 CC,88.5 bhp,5.0,,8.26,2,25,maruti,28.09,1248.0,88.5


In [5]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              3092 non-null   int64  
 1   Name               3092 non-null   object 
 2   Location           3092 non-null   object 
 3   Year               3092 non-null   int64  
 4   Kilometers_Driven  3092 non-null   int64  
 5   Fuel_Type          3092 non-null   object 
 6   Transmission       3092 non-null   object 
 7   Owner_Type         3092 non-null   object 
 8   Mileage            3092 non-null   object 
 9   Engine             3092 non-null   object 
 10  Power              3092 non-null   object 
 11  Seats              3091 non-null   float64
 12  New_Price          411 non-null    object 
 13  Price              3092 non-null   float64
 14  age                3092 non-null   int64  
 15  KM_Driven          3092 non-null   int64  
 16  make               3092 

### Feature Set Selection

In [6]:
cars_df.columns

Index(['index', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price', 'age', 'KM_Driven', 'make', 'mileage_new',
       'engine_new', 'power_new'],
      dtype='object')

In [7]:
x_features = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats', 
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']

In [8]:
cat_vars = ['Fuel_Type', 
                'Transmission', 'Owner_Type',
                'make', 'Location']

In [9]:
num_vars = list(set(x_features) - set(cat_vars))

In [10]:
num_vars

['age', 'engine_new', 'mileage_new', 'power_new', 'Seats', 'KM_Driven']

In [11]:
cars_df[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   KM_Driven     3092 non-null   int64  
 1   Fuel_Type     3092 non-null   object 
 2   age           3092 non-null   int64  
 3   Transmission  3092 non-null   object 
 4   Owner_Type    3092 non-null   object 
 5   Seats         3091 non-null   float64
 6   make          3092 non-null   object 
 7   mileage_new   3092 non-null   float64
 8   engine_new    3092 non-null   float64
 9   power_new     3092 non-null   float64
 10  Location      3092 non-null   object 
dtypes: float64(4), int64(2), object(5)
memory usage: 265.8+ KB


### Need for Data Transformation

1. Data imputation for Seats Column
    - Mean imputation 
2. Categorical Encoding for categorical columns
    - OHE Encoding
3. Data scaling
    - Standard scaling

### Setting X and y variables

In [12]:
X = cars_df[x_features]
y = cars_df['Price']

### Data Splitting

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [15]:
X_train.shape

(2473, 11)

In [16]:
X_test.shape

(619, 11)

### Data Imputation

In [17]:
from sklearn.impute import SimpleImputer

In [18]:
imputed_num_vars = ['Seats']

In [19]:
imputed_num_vars

['Seats']

In [20]:
non_imputed_num_vars = list(set(num_vars) - set(imputed_num_vars))

In [21]:
non_imputed_num_vars

['age', 'engine_new', 'mileage_new', 'power_new', 'KM_Driven']

In [22]:
mean_imputer = SimpleImputer(strategy='mean')

### Encode Categorical Variables

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')

### Scaling numerical vars

In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

### Creating Pipelines

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [27]:
imputed_num_transformer = Pipeline( steps = [  
        ('imputation', mean_imputer),
        ('scaler', scaler)])

In [28]:
non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])

In [29]:
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

In [30]:
preprocessor = ColumnTransformer(
    transformers=[  
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_vars)])

### KNN (K-Nearest Neighbor)


In [31]:
from sklearn.neighbors import KNeighborsRegressor

In [32]:
#knn = KNeighborsRegressor(n_neighbors=20)
knn = KNeighborsRegressor(n_neighbors=20, weights='distance')

In [33]:
knn_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', knn)])

In [34]:
knn_v1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_imputed',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Seats']),
                                                 ('num_not_imputed',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'engine_new',
                                                   'mileage_new', 'power_new',
                                                   'KM_Driven']),
                                                 ('catvars

In [35]:
from sklearn import set_config
set_config(display='diagram') 

In [36]:
knn_v1

### Predict on test set

In [37]:
y_pred = knn_v1.predict(X_test)

### K Fold Cross Validation

In [38]:
from sklearn.model_selection import cross_val_score

In [39]:
scores = cross_val_score( knn_v1,
                          X_train,
                          y_train,
                          cv = 10,
                          scoring = 'r2')

In [40]:
scores

array([0.80997331, 0.74171907, 0.81740538, 0.81873032, 0.78002919,
       0.81584538, 0.80065021, 0.77851958, 0.8050088 , 0.81670127])

In [41]:
scores.mean()

0.7984582511564021

In [42]:
scores.std()

0.023541233646453656

## Grid Search

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
knn_params = { "knn__n_neighbors": [5, 10, 15, 20, 25],
               "knn__weights": ['uniform', 'distance'],
               "knn__metric": ['minkowski', 'euclidean']}

In [45]:
knn_grid_v1 = GridSearchCV(knn_v1,
                           param_grid=knn_params,
                           cv = 10,
                           scoring = 'r2')



In [46]:
knn_grid_v1.fit(X_train, y_train)

In [47]:
knn_grid_v1.best_params_

{'knn__metric': 'minkowski',
 'knn__n_neighbors': 10,
 'knn__weights': 'distance'}

In [48]:
knn_grid_v1.best_score_

0.8152355641029411

In [49]:
knn_grid_results = pd.DataFrame( knn_grid_v1.cv_results_ )
knn_grid_results[['param_knn__n_neighbors', 'param_knn__weights', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_knn__n_neighbors,param_knn__weights,mean_test_score,std_test_score
0,5,uniform,0.795796,0.029375
1,5,distance,0.808355,0.026944
2,10,uniform,0.799726,0.024229
3,10,distance,0.815236,0.024537
4,15,uniform,0.787142,0.024172
5,15,distance,0.808183,0.024007
6,20,uniform,0.773485,0.023376
7,20,distance,0.798458,0.023541
8,25,uniform,0.76741,0.02301
9,25,distance,0.794187,0.022885


### Building the final model

In [50]:
final_model = KNeighborsRegressor(n_neighbors = 5, weights = 'distance', metric = 'minkowski')
knn_final = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', final_model)])

In [51]:
knn_final.fit(X_train, y_train)

In [52]:
knn_final.score(X_test, y_test)

0.810485125157115

In [53]:
from sklearn.metrics import mean_squared_error

In [54]:
final_rmse = np.sqrt(mean_squared_error(y_test, knn_final.predict(X_test)))
final_rmse

0.9598310841411447

## Model Persistence

In [55]:
class CarPredictionModel():
    
    def __init__(self, model, features, rmse):
        self.model = model
        self.features = features
        self.rmse = rmse

In [56]:
my_model = CarPredictionModel(knn_final, list(X_train.columns), final_rmse)

In [57]:
from joblib import dump

In [59]:
dump(my_model, './cars_v1.pkl')

['./cars_v1.pkl']