In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
sns.set_style('whitegrid')
pd.set_option('display.max_column', None)

In [None]:
data = pd.read_csv('Car Price.csv')
data

Unnamed: 0,car_name,yr_mfr,fuel_type,kms_run,sale_price,city,times_viewed,body_type,transmission,variant,assured_buy,registered_city,registered_state,is_hot,rto,source,make,model,car_availability,total_owners,broker_quote,original_price,car_rating,ad_created_on,fitness_certificate,emi_starts_from,booking_down_pymnt,reserved,warranty_avail
0,maruti swift,2015,petrol,8063,386399,noida,18715,hatchback,manual,lxi opt,True,delhi,delhi,True,dl6c,inperson_sale,maruti,swift,in_stock,2,397677,404177.0,great,2021-04-04T07:09:18.583,True,8975,57960,False,False
1,maruti alto 800,2016,petrol,23104,265499,noida,2676,hatchback,manual,lxi,True,noida,uttar pradesh,True,up16,inperson_sale,maruti,alto 800,in_stock,1,272935,354313.0,great,2021-03-22T14:07:32.833,True,6167,39825,False,False
2,hyundai grand i10,2017,petrol,23402,477699,noida,609,hatchback,manual,sports 1.2 vtvt,True,agra,uttar pradesh,True,up80,inperson_sale,hyundai,grand i10,in_stock,1,469605,,great,2021-03-20T05:36:31.311,True,11096,71655,False,False
3,maruti swift,2013,diesel,39124,307999,noida,6511,hatchback,manual,vdi,True,delhi,delhi,True,dl1c,inperson_sale,maruti,swift,in_stock,1,294262,374326.0,great,2021-01-21T12:59:19.299,True,7154,46200,False,False
4,hyundai grand i10,2015,petrol,22116,361499,noida,3225,hatchback,manual,magna 1.2 vtvt,False,new delhi,delhi,True,dl12,inperson_sale,hyundai,grand i10,in_stock,1,360716,367216.0,great,2021-04-01T13:33:40.733,True,8397,54225,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,honda amaze,2018,diesel,53486,604299,ghaziabad,2756,sedan,,1.5 v cvt i-dtec,True,ghaziabad,uttar pradesh,True,up14,inperson_sale,honda,amaze,in_stock,1,630810,787750.0,great,2021-02-07T08:05:30.443,True,14036,90645,True,False
7396,maruti ignis,2018,petrol,8854,562599,chennai,640,hatchback,manual,delta 1.2 k12,True,chennai,tamil nadu,True,tn07,inperson_sale,maruti,ignis,in_stock,1,549440,,great,2021-03-31T10:21:56.289,True,13068,84390,False,False
7397,honda amaze,2015,petrol,46300,400499,pune,795,sedan,manual,1.2 smt i vtec,True,pune,maharashtra,True,mh12,inperson_sale,honda,amaze,in_stock,1,383419,,great,2021-03-04T12:40:38.652,True,9303,60075,True,False
7398,maruti alto k10,2016,petrol,27245,284099,new delhi,1155,hatchback,manual,lxi,True,delhi,delhi,True,dl8c,inperson_sale,maruti,alto k10,in_stock,1,286515,369885.0,great,2021-03-16T13:31:39.766,True,6599,42615,False,False


In [4]:
data.drop_duplicates(inplace=True)

In [5]:
data.drop(columns='original_price', inplace=True)

In [6]:
data.dropna(inplace=True)

In [7]:
X = data[['car_name', 'yr_mfr', 'fuel_type', 'kms_run', 'body_type', 'transmission', 'make', 'model', 'car_rating']]
y = data['sale_price']

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [9]:
X.loc[:,'car_name'] = le.fit_transform(X['car_name'])
X.loc[:,'fuel_type'] = le.fit_transform(X['fuel_type'])
X.loc[:,'body_type'] = le.fit_transform(X['body_type'])
X.loc[:,'transmission'] = le.fit_transform(X['transmission'])
X.loc[:,'make'] = le.fit_transform(X['make'])
X.loc[:,'model'] = le.fit_transform(X['model'])
X.loc[:,'car_rating'] = le.fit_transform(X['car_rating'])

In [10]:
scaler = StandardScaler()

In [11]:
X = scaler.fit_transform(X)

### Untunned ridge model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [15]:
model = Ridge(alpha=1.0)

In [16]:
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [17]:
y_pred = model.predict(X_test)

In [18]:
mean_absolute_error(y_test, y_pred)

116848.09960594121

In [19]:
r2_score(y_test, y_pred)

0.4870973805877622

In [40]:
params = {
    'alpha': [0.001,0.01,0.1,1.0,10.0,100.0,1000.0],
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'solver': ['auto'],
    'positive': [True, False]
}

In [27]:
model = Ridge(random_state=42)

In [41]:
tuned_model = RandomizedSearchCV(estimator=model,
                                 param_distributions=params,
                                 n_iter=20,
                                 n_jobs=2,
                                 cv=5,
                                 verbose=1)

In [42]:
tuned_model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,estimator,Ridge(random_state=42)
,param_distributions,"{'alpha': [0.001, 0.01, ...], 'copy_X': [True, False], 'fit_intercept': [True, False], 'positive': [True, False], ...}"
,n_iter,20
,scoring,
,n_jobs,2
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [43]:
tuned_model.best_estimator_

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [44]:
best_model = tuned_model.best_estimator_

In [45]:
y_pred = best_model.predict(X_test)

In [46]:
mean_absolute_error(y_test, y_pred)

116861.02134545862

In [47]:
r2_score(y_test, y_pred)

0.48708587212609455