In [1]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-macosx_10_13_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 6.3 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0
    Uninstalling scikit-learn-1.0:
      Successfully uninstalled scikit-learn-1.0
Successfully installed scikit-learn-1.0.2


# Used Car Price Prediction: KNN

### Load Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=10-R6GyVWjt_gjWEFD86mKHDvSWD9lp1z" )

In [4]:
cars_df.sample(5)

Unnamed: 0,index,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,age,KM_Driven,make,mileage_new,engine_new,power_new
1295,2589,Maruti Wagon R VXI AMT,Mumbai,2016,8000,Petrol,Automatic,First,22.5 kmpl,998 CC,67 bhp,5.0,6.14 Lakh,4.15,3,8,maruti,22.5,998.0,67.0
523,1070,Hyundai i20 Asta 1.4 CRDi (Diesel),Kochi,2010,93401,Diesel,Manual,First,23.0 kmpl,1396 CC,90 bhp,5.0,,3.48,9,93,hyundai,23.0,1396.0,90.0
1380,2733,Maruti Swift Dzire ZXI,Jaipur,2017,37282,Petrol,Manual,First,20.85 kmpl,1197 CC,83.14 bhp,5.0,,6.75,2,37,maruti,20.85,1197.0,83.14
1134,2251,Honda City i-DTEC ZX,Hyderabad,2017,46000,Diesel,Manual,First,25.6 kmpl,1498 CC,98.6 bhp,5.0,16.95 Lakh,12.9,2,46,honda,25.6,1498.0,98.6
1346,2674,Tata Indica V2 eLS,Chennai,2017,129000,Diesel,Manual,First,25.0 kmpl,1396 CC,69 bhp,5.0,,3.0,2,129,tata,25.0,1396.0,69.0


In [5]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              3092 non-null   int64  
 1   Name               3092 non-null   object 
 2   Location           3092 non-null   object 
 3   Year               3092 non-null   int64  
 4   Kilometers_Driven  3092 non-null   int64  
 5   Fuel_Type          3092 non-null   object 
 6   Transmission       3092 non-null   object 
 7   Owner_Type         3092 non-null   object 
 8   Mileage            3092 non-null   object 
 9   Engine             3092 non-null   object 
 10  Power              3092 non-null   object 
 11  Seats              3091 non-null   float64
 12  New_Price          411 non-null    object 
 13  Price              3092 non-null   float64
 14  age                3092 non-null   int64  
 15  KM_Driven          3092 non-null   int64  
 16  make               3092 

### Feature Set Selection

In [6]:
cars_df.columns

Index(['index', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price', 'age', 'KM_Driven', 'make', 'mileage_new',
       'engine_new', 'power_new'],
      dtype='object')

In [7]:
x_features = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats', 
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']

In [8]:
cat_vars = ['Fuel_Type', 
                'Transmission', 'Owner_Type',
                'make', 'Location']

In [10]:
num_vars = list(set(x_features) - set(cat_vars))

In [11]:
num_vars

['age', 'KM_Driven', 'mileage_new', 'Seats', 'power_new', 'engine_new']

In [12]:
cars_df[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   KM_Driven     3092 non-null   int64  
 1   Fuel_Type     3092 non-null   object 
 2   age           3092 non-null   int64  
 3   Transmission  3092 non-null   object 
 4   Owner_Type    3092 non-null   object 
 5   Seats         3091 non-null   float64
 6   make          3092 non-null   object 
 7   mileage_new   3092 non-null   float64
 8   engine_new    3092 non-null   float64
 9   power_new     3092 non-null   float64
 10  Location      3092 non-null   object 
dtypes: float64(4), int64(2), object(5)
memory usage: 265.8+ KB


### Need for Data Transformation

1. Data imputation for Seats Column
    - Mean imputation 
2. Categorical Encoding for categorical columns
    - OHE Encoding
3. Data scaling
    - Standard scaling

### Setting X and y variables

In [13]:
X = cars_df[x_features]
y = cars_df['Price']

### Data Splitting

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [16]:
X_train.shape

(2473, 11)

In [17]:
X_test.shape

(619, 11)

### Data Imputation

In [18]:
from sklearn.impute import SimpleImputer

In [19]:
imputed_num_vars = ['Seats']

In [20]:
imputed_num_vars

['Seats']

In [21]:
non_imputed_num_vars = list(set(num_vars) - set(imputed_num_vars))

In [22]:
non_imputed_num_vars

['age', 'KM_Driven', 'mileage_new', 'power_new', 'engine_new']

In [23]:
mean_imputer = SimpleImputer(strategy='mean')

### Encode Categorical Variables

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')

### Scaling numerical vars

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

### Creating Pipelines

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [28]:
imputed_num_transformer = Pipeline( steps = [  
        ('imputation', mean_imputer),
        ('scaler', scaler)])

In [29]:
non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])

In [30]:
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

In [31]:
preprocessor = ColumnTransformer(
    transformers=[  
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_vars)])

### KNN (K-Nearest Neighbor)


In [32]:
from sklearn.neighbors import KNeighborsRegressor

In [33]:
#knn = KNeighborsRegressor(n_neighbors=20)
knn = KNeighborsRegressor(n_neighbors=20, weights='distance')

In [34]:
knn_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', knn)])

In [35]:
knn_v1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_imputed',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Seats']),
                                                 ('num_not_imputed',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'KM_Driven',
                                                   'mileage_new', 'power_new',
                                                   'engine_new']),
                                                 ('catvars

### Predict on test set

In [37]:
y_pred = knn_v1.predict(X_test)

### K Fold Cross Validation

In [40]:
from sklearn.model_selection import cross_val_score

In [41]:
scores = cross_val_score( knn_v1,
                          X_train,
                          y_train,
                          cv = 10,
                          scoring = 'r2')

In [42]:
scores

array([0.80997663, 0.74172349, 0.81740538, 0.81873032, 0.78002919,
       0.8157913 , 0.80065021, 0.77851958, 0.80501421, 0.816698  ])

In [43]:
scores.mean()

0.7984538304475222

In [44]:
scores.std()

0.023536240691094626

## Grid Search

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
knn_params = { "knn__n_neighbors": [5, 10, 15, 20, 25],
               "knn__weights": ['uniform', 'distance'],
               "knn__metric": ['minkowski', 'euclidean']}

In [47]:
knn_grid_v1 = GridSearchCV(knn_v1,
                           param_grid=knn_params,
                           cv = 10,
                           scoring = 'r2')



In [48]:
knn_grid_v1.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num_imputed',
                                                                         Pipeline(steps=[('imputation',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Seats']),
                                                                        ('num_not_imputed',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                         

In [49]:
knn_grid_v1.best_params_

{'knn__metric': 'minkowski',
 'knn__n_neighbors': 10,
 'knn__weights': 'distance'}

In [50]:
knn_grid_v1.best_score_

0.8152181315879734

In [51]:
knn_grid_results = pd.DataFrame( knn_grid_v1.cv_results_ )
knn_grid_results[['param_knn__n_neighbors', 'param_knn__weights', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_knn__n_neighbors,param_knn__weights,mean_test_score,std_test_score
0,5,uniform,0.795797,0.029375
1,5,distance,0.808354,0.026944
2,10,uniform,0.799696,0.024235
3,10,distance,0.815218,0.024549
4,15,uniform,0.787142,0.024113
5,15,distance,0.808191,0.023978
6,20,uniform,0.773489,0.023372
7,20,distance,0.798454,0.023536
8,25,uniform,0.76744,0.022939
9,25,distance,0.794206,0.022837


### Building the final model

In [52]:
final_model = KNeighborsRegressor(n_neighbors = 5, weights = 'distance', metric = 'minkowski')
knn_final = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', final_model)])

In [53]:
knn_final.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_imputed',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Seats']),
                                                 ('num_not_imputed',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'KM_Driven',
                                                   'mileage_new', 'power_new',
                                                   'engine_new']),
                                                 ('catvars

In [54]:
knn_final.score(X_test, y_test)

0.8104851251571208

In [56]:
from sklearn.metrics import mean_squared_error

In [57]:
final_rmse = np.sqrt(mean_squared_error(y_test, knn_final.predict(X_test)))
final_rmse

0.95983108414113

## Model Persistence

In [58]:
class CarPredictionModel():
    
    def __init__(self, model, features, rmse):
        self.model = model
        self.features = features
        self.rmse = rmse

In [59]:
my_model = CarPredictionModel(knn_final, list(X_train.columns), final_rmse)

In [60]:
from joblib import dump

In [61]:
dump(my_model, './cars_v1.pkl')

['./cars_v1.pkl']