# Used Car Price Prediction: Stochastic Regressor

### Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=1FFxZhi8p_cZM2bbSJRvnb3sev6dRCbsb" )

In [3]:
cars_df.sample(5)

Unnamed: 0,index,Name,Location,Year,Fuel_Type,Transmission,Owner_Type,Seats,Price,age,KM_Driven,make,mileage_new,engine_new,power_new
2311,4494,Fiat Grande Punto 1.4 Emotion,Coimbatore,2015,Petrol,Manual,Second,5.0,5.41,4,44,fiat,14.6,1368,88.7
950,1865,Maruti Ritz LDi,Delhi,2014,Diesel,Manual,First,5.0,3.6,5,32,maruti,23.2,1248,73.94
1311,2614,Maruti Swift Dzire VDI,Hyderabad,2013,Diesel,Manual,First,5.0,6.95,6,84,maruti,23.4,1248,74.0
2402,4693,Maruti Swift VDI,Mumbai,2013,Diesel,Manual,First,5.0,4.65,6,35,maruti,22.9,1248,74.0
3078,5995,Renault Duster 85PS Diesel RxL Explore,Pune,2015,Diesel,Manual,First,5.0,7.75,4,30,renault,19.87,1461,83.8


In [4]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         3092 non-null   int64  
 1   Name          3092 non-null   object 
 2   Location      3092 non-null   object 
 3   Year          3092 non-null   int64  
 4   Fuel_Type     3092 non-null   object 
 5   Transmission  3092 non-null   object 
 6   Owner_Type    3092 non-null   object 
 7   Seats         3091 non-null   float64
 8   Price         3092 non-null   float64
 9   age           3092 non-null   int64  
 10  KM_Driven     3092 non-null   int64  
 11  make          3092 non-null   object 
 12  mileage_new   3092 non-null   float64
 13  engine_new    3092 non-null   int64  
 14  power_new     3092 non-null   float64
dtypes: float64(4), int64(5), object(6)
memory usage: 362.5+ KB


### Feature Set Selection

In [5]:
cars_df.columns

Index(['index', 'Name', 'Location', 'Year', 'Fuel_Type', 'Transmission',
       'Owner_Type', 'Seats', 'Price', 'age', 'KM_Driven', 'make',
       'mileage_new', 'engine_new', 'power_new'],
      dtype='object')

In [6]:
x_features = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats', 
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']

In [7]:
cat_features = ['Fuel_Type', 
                'Transmission', 'Owner_Type',
                'make', 'Location']

In [8]:
num_features = list(set(x_features) - set(cat_features))

In [9]:
num_features

['power_new', 'Seats', 'age', 'engine_new', 'KM_Driven', 'mileage_new']

In [10]:
cars_df[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3092 entries, 0 to 3091
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   KM_Driven     3092 non-null   int64  
 1   Fuel_Type     3092 non-null   object 
 2   age           3092 non-null   int64  
 3   Transmission  3092 non-null   object 
 4   Owner_Type    3092 non-null   object 
 5   Seats         3091 non-null   float64
 6   make          3092 non-null   object 
 7   mileage_new   3092 non-null   float64
 8   engine_new    3092 non-null   int64  
 9   power_new     3092 non-null   float64
 10  Location      3092 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 265.8+ KB


### Dropping Null Values

In [11]:
cars_df = cars_df[x_features + ['Price']].dropna()

In [12]:
cars_df.shape

(3091, 12)

In [13]:
cars_df.sample(5)

Unnamed: 0,KM_Driven,Fuel_Type,age,Transmission,Owner_Type,Seats,make,mileage_new,engine_new,power_new,Location,Price
562,8,Petrol,2,Manual,First,5.0,hyundai,18.9,1197,81.86,Delhi,5.0
2877,86,Petrol,13,Manual,Second,5.0,maruti,18.9,998,67.1,Delhi,1.0
2329,68,Petrol,6,Manual,First,5.0,nissan,16.95,1498,97.7,Kochi,3.97
1922,59,Diesel,5,Manual,First,5.0,toyota,23.59,1364,67.1,Hyderabad,5.6
1309,23,Diesel,7,Manual,First,5.0,maruti,23.2,1248,73.94,Pune,3.52


### Setting X and y variables

In [14]:
X = cars_df[x_features]
y = cars_df['Price']

### Data Splitting

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

In [17]:
X_train

Unnamed: 0,KM_Driven,Fuel_Type,age,Transmission,Owner_Type,Seats,make,mileage_new,engine_new,power_new,Location
1458,81,Diesel,5,Manual,First,5.0,volkswagen,20.14,1498,88.80,Hyderabad
3079,70,Petrol,9,Manual,Second,5.0,volkswagen,17.24,1198,73.90,Pune
2276,75,Petrol,4,Manual,First,5.0,hyundai,20.36,1197,78.90,Mumbai
1599,95,Petrol,21,Manual,Third,5.0,maruti,17.30,993,60.00,Jaipur
2356,14,Petrol,2,Manual,First,5.0,hyundai,18.60,1197,81.83,Kochi
...,...,...,...,...,...,...,...,...,...,...,...
522,45,Diesel,5,Manual,First,5.0,maruti,23.40,1248,74.00,Kolkata
2260,16,Petrol,2,Manual,First,5.0,tata,23.84,1199,84.00,Hyderabad
2983,21,Petrol,4,Manual,First,5.0,hyundai,18.90,1197,82.00,Pune
1213,83,Petrol,12,Manual,First,5.0,maruti,14.00,1061,64.00,Pune


In [18]:
X_train[0:10]

Unnamed: 0,KM_Driven,Fuel_Type,age,Transmission,Owner_Type,Seats,make,mileage_new,engine_new,power_new,Location
1458,81,Diesel,5,Manual,First,5.0,volkswagen,20.14,1498,88.8,Hyderabad
3079,70,Petrol,9,Manual,Second,5.0,volkswagen,17.24,1198,73.9,Pune
2276,75,Petrol,4,Manual,First,5.0,hyundai,20.36,1197,78.9,Mumbai
1599,95,Petrol,21,Manual,Third,5.0,maruti,17.3,993,60.0,Jaipur
2356,14,Petrol,2,Manual,First,5.0,hyundai,18.6,1197,81.83,Kochi
737,12,Petrol,3,Manual,First,5.0,maruti,21.4,1197,83.1,Mumbai
265,50,Diesel,4,Manual,First,5.0,volkswagen,20.14,1498,88.8,Hyderabad
713,59,Petrol,12,Manual,First,4.0,maruti,16.1,796,37.0,Jaipur
82,88,Diesel,7,Manual,First,5.0,chevrolet,25.44,936,57.6,Jaipur
2918,27,Petrol,2,Manual,First,5.0,maruti,22.74,796,47.3,Kochi


In [19]:
X_test.shape

(619, 11)

### Defining Transformations

1. Data imputation for Seats Column
    - Mean imputation 
2. Categorical Encoding for categorical columns
    - OHE Encoding
3. Data scaling
    - Standard scaling

In [20]:
from sklearn.preprocessing import OneHotEncoder

ohe_encoder = OneHotEncoder(handle_unknown='ignore')

In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [22]:
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy='mean')

In [23]:
imputed_num_features = ['Seats']
non_imputed_num_features = list(set(num_features) - set(imputed_num_features))

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
imputed_num_transformer = Pipeline( steps = [  
        ('imputation', mean_imputer),
        ('scaler', scaler)])

In [26]:
non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])

In [27]:
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

In [28]:
preprocessor = ColumnTransformer(
    transformers=[  
        ('num_imputed', imputed_num_transformer, imputed_num_features),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_features),
        ('catvars', cat_transformer, cat_features)])

### Linear Models

Assumes linear relationship between features and outcome variable.

In [29]:
from sklearn.linear_model import SGDRegressor

In [30]:
lreg_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', SGDRegressor(max_iter=100, eta0=0.01))])

In [31]:
from sklearn import set_config
set_config(display='diagram') 

In [32]:
lreg_v1.fit(X_train, y_train)

In [33]:
column_names = (imputed_num_features + 
                non_imputed_num_features + 
                list(lreg_v1['preprocessor'].transformers_[2][1].get_feature_names_out()))

In [34]:
column_names

['Seats',
 'power_new',
 'age',
 'engine_new',
 'KM_Driven',
 'mileage_new',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Transmission_Automatic',
 'Transmission_Manual',
 'Owner_Type_First',
 'Owner_Type_Fourth & Above',
 'Owner_Type_Second',
 'Owner_Type_Third',
 'make_chevrolet',
 'make_datsun',
 'make_fiat',
 'make_ford',
 'make_honda',
 'make_hyundai',
 'make_mahindra',
 'make_maruti',
 'make_mitsubishi',
 'make_nissan',
 'make_renault',
 'make_skoda',
 'make_tata',
 'make_toyota',
 'make_volkswagen',
 'Location_Ahmedabad',
 'Location_Bangalore',
 'Location_Chennai',
 'Location_Coimbatore',
 'Location_Delhi',
 'Location_Hyderabad',
 'Location_Jaipur',
 'Location_Kochi',
 'Location_Kolkata',
 'Location_Mumbai',
 'Location_Pune']

### Understanding model parameters

In [35]:
lreg_v1['regressor'].intercept_

array([0.02750958])

In [36]:
lreg_v1['regressor'].coef_

array([ 0.08318122,  0.62098982, -1.08301444,  0.30718744, -0.32234573,
       -0.08460021,  1.83265074,  0.90570177,  1.47730343,  1.26104907,
        1.07639139,  0.03461399,  0.90527879,  0.72206833, -0.28203036,
       -0.12564927, -0.0587081 ,  0.25202933,  0.24480123,  0.48573597,
       -0.22692982,  1.00952366,  0.08702958,  0.11626947,  0.32133233,
        0.06570771, -0.487497  ,  0.90889667,  0.42784111,  0.16158221,
        0.61530639,  0.38195242,  1.09469312, -0.22482504,  0.68411588,
        0.16643074,  0.5197492 , -0.8450688 ,  0.01205206,  0.17236433])

In [37]:
dict(zip(column_names, np.round(lreg_v1['regressor'].coef_, 2)))

{'Fuel_Type_Diesel': 1.83,
 'Fuel_Type_Petrol': 0.91,
 'KM_Driven': -0.32,
 'Location_Ahmedabad': 0.16,
 'Location_Bangalore': 0.62,
 'Location_Chennai': 0.38,
 'Location_Coimbatore': 1.09,
 'Location_Delhi': -0.22,
 'Location_Hyderabad': 0.68,
 'Location_Jaipur': 0.17,
 'Location_Kochi': 0.52,
 'Location_Kolkata': -0.85,
 'Location_Mumbai': 0.01,
 'Location_Pune': 0.17,
 'Owner_Type_First': 1.08,
 'Owner_Type_Fourth & Above': 0.03,
 'Owner_Type_Second': 0.91,
 'Owner_Type_Third': 0.72,
 'Seats': 0.08,
 'Transmission_Automatic': 1.48,
 'Transmission_Manual': 1.26,
 'age': -1.08,
 'engine_new': 0.31,
 'make_chevrolet': -0.28,
 'make_datsun': -0.13,
 'make_fiat': -0.06,
 'make_ford': 0.25,
 'make_honda': 0.24,
 'make_hyundai': 0.49,
 'make_mahindra': -0.23,
 'make_maruti': 1.01,
 'make_mitsubishi': 0.09,
 'make_nissan': 0.12,
 'make_renault': 0.32,
 'make_skoda': 0.07,
 'make_tata': -0.49,
 'make_toyota': 0.91,
 'make_volkswagen': 0.43,
 'mileage_new': -0.08,
 'power_new': 0.62}

### Predict on test set

In [38]:
y_pred = lreg_v1.predict(X_test)

In [39]:
y_df = pd.DataFrame({"actual": y_test,
                     "predicted": y_pred,
                     "residual": y_pred - y_test})

In [40]:
y_df.sample(10, random_state = 100)

Unnamed: 0,actual,predicted,residual
770,7.23,6.107342,-1.122658
1540,6.43,5.400535,-1.029465
2962,6.25,5.803981,-0.446019
2040,4.69,5.348265,0.658265
640,2.65,2.768402,0.118402
81,8.35,6.360655,-1.989345
2773,6.75,5.822801,-0.927199
816,3.0,4.009055,1.009055
305,1.96,0.887004,-1.072996
2907,4.0,4.418604,0.418604


### Measuring Accuracy

In [41]:
from sklearn.metrics import mean_squared_error, r2_score

In [42]:
mse_v1 = mean_squared_error(y_test, y_pred)

In [43]:
mse_v1

0.9940316675353137

In [44]:
rmse_v1 = np.sqrt(mse_v1)

In [45]:
rmse_v1

0.9970113678064627

In [46]:
r2_score(y_test, y_pred)

0.7815227452692964

### K Fold Cross Validation

In [47]:
from sklearn.model_selection import cross_val_score

In [48]:
scores = cross_val_score( lreg_v1,
                          X_train,
                          y_train,
                          cv = 10,
                          scoring = 'r2')

In [49]:
scores

array([0.77234385, 0.75191573, 0.80808045, 0.76659278, 0.78819072,
       0.76274831, 0.77364515, 0.77623006, 0.82199631, 0.77659713])

In [50]:
scores.mean()

0.7798340507233623

In [51]:
scores.std()

0.020024754236287395

### Ridge Regression

In [52]:
# Importing Ridge Regression 
from sklearn.linear_model import Ridge


ridge = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Ridge(alpha = 1, max_iter = 500))])


ridge.fit( X_train, y_train )

In [53]:
y_pred_ridge = ridge.predict(X_test)

In [54]:
mse_v1 = mean_squared_error(y_test, y_pred_ridge)

In [55]:
mse_v1

0.9770219818515282

In [56]:
r2_score(y_test, y_pred_ridge)

0.7852612875646742

### Lasso Regression

In [57]:
# Importing Ridge Regression 
from sklearn.linear_model import Lasso


lasso = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Lasso(alpha = 0.01, max_iter = 500))])


lasso.fit( X_train, y_train )

In [58]:
final_feature_names = num_features + list(lasso['preprocessor'].transformers_[1][1].get_feature_names_out())

In [59]:
final_feature_names

['power_new',
 'Seats',
 'age',
 'engine_new',
 'KM_Driven',
 'mileage_new',
 'power_new',
 'age',
 'engine_new',
 'KM_Driven',
 'mileage_new']

In [60]:
lasso['regressor'].intercept_
lasso['regressor'].coef_
features_df = pd.DataFrame(dict(zip(column_names,
                                               np.round(lasso['regressor'].coef_, 2))).items(),
                                     columns = ['feature', 'coefs'])

In [61]:
features_df[features_df.coefs != 0]

Unnamed: 0,feature,coefs
0,Seats,0.07
1,power_new,0.65
2,age,-1.12
3,engine_new,0.3
4,KM_Driven,-0.29
5,mileage_new,-0.02
6,Fuel_Type_Diesel,0.8
8,Transmission_Automatic,0.1
12,Owner_Type_Second,-0.1
14,make_chevrolet,-0.29


In [62]:
features_df[features_df.coefs == 0]

Unnamed: 0,feature,coefs
7,Fuel_Type_Petrol,-0.0
9,Transmission_Manual,-0.0
10,Owner_Type_First,0.0
11,Owner_Type_Fourth & Above,-0.0
13,Owner_Type_Third,0.0
15,make_datsun,-0.0
16,make_fiat,-0.0
17,make_ford,0.0
18,make_honda,-0.0
20,make_mahindra,-0.0


In [63]:
y_pred_lasso = lasso.predict(X_test)
mse_v1 = mean_squared_error(y_test, y_pred_lasso)
mse_v1

1.0400344769993286

In [64]:
r2_score(y_test, y_pred_lasso)

0.7714118324585226