In [194]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , PolynomialFeatures
from sklearn.linear_model import LinearRegression , Ridge
from sklearn.metrics import r2_score , mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xg

# Data Collection

In [195]:
df = pd.read_csv('/content/quikr_car.csv')
df.shape

(892, 6)

In [196]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


# Data Assessing

In [197]:
df['name'].duplicated().sum()

367

In [198]:
df['company'].unique()

array(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'I', 'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat',
       'Commercial', 'MARUTI', 'Force', 'Mercedes', 'Land', 'Yamaha',
       'selling', 'URJENT', 'Swift', 'Used', 'Jaguar', 'Jeep', 'tata',
       'Sale', 'very', 'Volvo', 'i', '2012', 'Well', 'all', '7', '9',
       'scratch', 'urgent', 'sell', 'TATA', 'Any', 'Tara'], dtype=object)

In [199]:
df['year'].unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',
       'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',
       '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',
       'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',
       ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',
       't xe', 'EV2', 'r...', 'zest'], dtype=object)

In [200]:
df['Price'].unique()

array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',
       '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',
       '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',
       '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',
       '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',
       '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',
       '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',
       '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',
       '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',
       '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',
       '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',
       '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',
       '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',
       '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',
       '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,5

In [201]:
df['kms_driven'].unique()

array(['45,000 kms', '40 kms', '22,000 kms', '28,000 kms', '36,000 kms',
       '59,000 kms', '41,000 kms', '25,000 kms', '24,530 kms',
       '60,000 kms', '30,000 kms', '32,000 kms', '48,660 kms',
       '4,000 kms', '16,934 kms', '43,000 kms', '35,550 kms',
       '39,522 kms', '39,000 kms', '55,000 kms', '72,000 kms',
       '15,975 kms', '70,000 kms', '23,452 kms', '35,522 kms',
       '48,508 kms', '15,487 kms', '82,000 kms', '20,000 kms',
       '68,000 kms', '38,000 kms', '27,000 kms', '33,000 kms',
       '46,000 kms', '16,000 kms', '47,000 kms', '35,000 kms',
       '30,874 kms', '15,000 kms', '29,685 kms', '1,30,000 kms',
       '19,000 kms', nan, '54,000 kms', '13,000 kms', '38,200 kms',
       '50,000 kms', '13,500 kms', '3,600 kms', '45,863 kms',
       '60,500 kms', '12,500 kms', '18,000 kms', '13,349 kms',
       '29,000 kms', '44,000 kms', '42,000 kms', '14,000 kms',
       '49,000 kms', '36,200 kms', '51,000 kms', '1,04,000 kms',
       '33,333 kms', '33,600 kms', '5,

In [202]:
df['fuel_type'].unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

In [203]:
df.isnull().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

# Issue with Data  
## name column  
- name has 'Ta' and 'Tata' name car , should be remove

## company column
- 'MARUTI' should be replace with 'Maruti'
- 'tata' and 'TATA' should be replace with 'Tata'
- here too many non car campanies , remove it.  **car companies = ['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota','Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',    'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',    'Mercedes', 'Land Rover', 'Jaguar', 'Jeep', 'Volvo']**  

## year column
- non numeric values should be removed  

## price column
- 'Ask For Price' and comma "," should be removed

## kms_driven column
- 'Petrol' , "," and " kms" should be removed
- here NAN value

## fuel_type column
- has NAN



# Data Cleaning

In [204]:
# backup the data set
backup = df.copy()

In [205]:
# replacing 'MARUTI' with 'Maruti'
df['company'] = df['company'].str.replace('MARUTI' , 'Maruti')

In [206]:
# replaceing 'tata' and 'TATA' with 'Tata'
df['company'] = df['company'].str.replace('tata' , 'Tata')
df['company'] = df['company'].str.replace('TATA' , 'Tata')

In [207]:
# removing non company names
car_companies = ['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota','Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen', 'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force', 'Mercedes', 'Land Rover', 'Jaguar', 'Jeep', 'Volvo']

df = df[df['company'].isin(car_companies)]

In [208]:
# checking...
df['company'].unique()

array(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)

In [209]:
# removing non year values
df = df[df['year'].str.isnumeric()]

In [210]:
# removing 'Ask For Price' and comma
df['Price'] = df[df['Price'] != 'Ask For Price' ]['Price'].str.replace(',' , '')

In [211]:
df['kms_driven'] = df[df['kms_driven'] != 'Petrol']['kms_driven'].str.replace(' kms' , '')
df['kms_driven'] = df['kms_driven'].str.replace(',' ,'')

In [212]:
# droping all NAN values
df.dropna(inplace =True)

In [213]:
df.shape

(815, 6)

In [214]:
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,50000,Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,27000,Diesel


In [215]:
def get_short_name(line):
  temp = line.split(' ')

  if len(temp) < 3:
    return line
  else :
    return temp[0] + ' ' + temp[1] + ' ' + temp[2]

In [216]:
df['name'] = df['name'].apply(get_short_name)

In [217]:
convert_dtype = {
    'name' : 'category',
    'company' : 'category',
    'year' : int,
    'Price' : int,
    'kms_driven' : int ,
    'fuel_type' : 'category'
}

df = df.astype(convert_dtype)

In [218]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,815.0,815.0,815.0
mean,2012.441718,409646.1,46295.50184
std,4.004448,471775.1,34313.741453
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,490000.0,56879.0
max,2019.0,8500003.0,400000.0


In [219]:
df = df[df['Price'] < 8e6]

In [220]:
# fixing indexing
df = df.reset_index(drop = True)

In [221]:
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
809,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
810,Tata Indica V2,Tata,2009,110000,30000,Diesel
811,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
812,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [222]:
# saving cleaned data
df.to_csv('cleaned quikr_car.csv' , index = False)

# Model Building

In [223]:
df.head(1)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol


In [224]:
encoder0 = LabelEncoder()
encoder1 = LabelEncoder()
encoder2 = LabelEncoder()

df['name'] = encoder0.fit_transform(df['name'])

df['company'] = encoder1.fit_transform(df['company'])

df['fuel_type'] = encoder2.fit_transform(df['fuel_type'])

In [225]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,91,9,2007,80000,45000,2
1,117,12,2006,425000,40,0
2,88,9,2014,325000,28000,2
3,40,6,2014,575000,36000,0
4,45,6,2012,175000,41000,0


In [226]:
x = df[['name' , 'company' ,'fuel_type', 'year' , 'kms_driven']]
y = df['Price']

x.shape , y.shape

((814, 5), (814,))

In [227]:
x

Unnamed: 0,name,company,fuel_type,year,kms_driven
0,91,9,2,2007,45000
1,117,12,0,2006,40
2,88,9,2,2014,28000
3,40,6,0,2014,36000
4,45,6,0,2012,41000
...,...,...,...,...,...
809,158,13,2,2011,50000
810,202,20,0,2009,30000
811,230,21,2,2009,132000
812,228,20,0,2018,27000


In [228]:
y

0       80000
1      425000
2      325000
3      575000
4      175000
        ...  
809    270000
810    110000
811    300000
812    260000
813    390000
Name: Price, Length: 814, dtype: int64

In [229]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2 , random_state = 2)

In [230]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(651, 5)
(163, 5)
(651,)
(163,)


In [231]:
x_train

Unnamed: 0,name,company,fuel_type,year,kms_driven
370,239,21,0,2012,0
272,16,2,2,2010,43000
800,151,13,0,2016,150000
602,238,21,0,2012,80000
333,88,9,2,2017,6821
...,...,...,...,...,...
534,147,13,2,2014,37000
584,15,2,0,2011,56000
493,55,8,2,2014,6000
527,107,9,2,2010,10750


In [232]:
x_test

Unnamed: 0,name,company,fuel_type,year,kms_driven
197,220,20,0,2012,65000
250,46,6,0,2014,50000
556,204,20,0,2016,104000
465,153,13,0,2016,55000
15,7,0,0,2014,16934
...,...,...,...,...,...
603,162,13,0,2010,74000
231,54,8,0,2013,46000
205,143,12,0,2012,35000
537,147,13,2,2019,9800


# LInear Regression

In [233]:
lr = LinearRegression()
lr.fit(x_train ,y_train)

In [234]:
lr = LinearRegression()
lr.fit(x_train ,y_train)

print(lr.coef_)
print(lr.intercept_)

print('R2 score : ',r2_score(y_test , lr.predict(x_test)))
print('MSE : ', mean_squared_error(y_test , lr.predict(x_test)))
print('RMSE : ', np.sqrt(mean_squared_error(y_test , lr.predict(x_test))))

[ 1.69505535e+03 -2.76065034e+04 -8.05373198e+04  2.69533068e+04
 -1.33272727e+00]
-53583126.82143236
R2 score :  0.13077541794574654
MSE :  132912046549.04947
RMSE :  364571.04458397336


<p color = "green" > Linear Regresion <b>without hyperparameter tuning </b> accuracy : 13 %  

# Ridge Regression

In [235]:
rid = Ridge(alpha = 0.0001)
rid.fit(x_train,y_train)
print('R2 score : ',r2_score(y_test , rid.predict(x_test)))
print('MSE : ', mean_squared_error(y_test , rid.predict(x_test)))
print('RMSE : ', np.sqrt(mean_squared_error(y_test , rid.predict(x_test))))

R2 score :  0.1307754194022962
MSE :  132912046326.33028
RMSE :  364571.0442785196


<p color = "green" > Ridge Regresion <b>without hyperparameter tuning </b>  accuracy : 13 %  

# Decision Tree

In [236]:
dt = DecisionTreeRegressor()
dt.fit(x_train ,y_train)
print('R2 score : ',r2_score(y_test , dt.predict(x_test)))
print('MSE : ', mean_squared_error(y_test , dt.predict(x_test)))
print('RMSE : ', np.sqrt(mean_squared_error(y_test , dt.predict(x_test))))

R2 score :  0.28803665999840034
MSE :  108865426198.454
RMSE :  329947.6112937537


<p color = "red" > Decision Tree Regressor <b>without hyperparameter tuning </b>  accuracy : 28.8 %  

# Polynomial Linear Regression

In [237]:
poly = PolynomialFeatures(degree = 2)

x_train_trans = poly.fit_transform(x_train)
x_test_trans = poly.fit_transform(x_test)

poly_lr = LinearRegression()
poly_lr.fit(x_train_trans , y_train)

print('R2 score : ',r2_score(y_test , poly_lr.predict(x_test_trans)))
print('MSE : ', mean_squared_error(y_test , poly_lr.predict(x_test_trans)))
print('RMSE : ', np.sqrt(mean_squared_error(y_test , poly_lr.predict(x_test_trans))))

R2 score :  0.13994391290690422
MSE :  131510103421.55281
RMSE :  362643.2178071897


<p color = "green" > Polynomial Linear Regresion accuracy : 13.99 %  

# Random Forest

In [241]:
rf_accuracy = []
for i in range(1,21):
  rf = RandomForestRegressor(n_estimators=i , random_state = 10)
  rf.fit(x_train , y_train)

  rf_accuracy.append((i ,r2_score(y_test , rf.predict(x_test) )))

  print('R2 score : ',r2_score(y_test , rf.predict(x_test)))
  print('MSE : ', mean_squared_error(y_test , rf.predict(x_test)))
  print('RMSE : ', np.sqrt(mean_squared_error(y_test , rf.predict(x_test))))

  print("\n\n")

R2 score :  0.39404351821129624
MSE :  92656049744.77301
RMSE :  304394.5626071087



R2 score :  0.3791517863198759
MSE :  94933125891.96472
RMSE :  308112.1969217784



R2 score :  0.4201264256042013
MSE :  88667744911.80641
RMSE :  297771.29631951835



R2 score :  0.41842855936070145
MSE :  88927363521.13843
RMSE :  298206.9139391949



R2 score :  0.43392364810252104
MSE :  86558028830.59485
RMSE :  294207.4588289611



R2 score :  0.43324066479680834
MSE :  86662462955.90062
RMSE :  294384.88914327894



R2 score :  0.43280820766809025
MSE :  86728589435.99487
RMSE :  294497.18069277826



R2 score :  0.43112437719796104
MSE :  86986061852.72935
RMSE :  294933.9957562189



R2 score :  0.46270405734663156
MSE :  82157252354.49289
RMSE :  286630.86427405704



R2 score :  0.45139138364538234
MSE :  83887059178.43382
RMSE :  289632.62795899535



R2 score :  0.457325161696041
MSE :  82979732578.65338
RMSE :  288062.029046963



R2 score :  0.45674436720739486
MSE :  83068541139.414

In [242]:
# n_estimators , accuracy
rf_accuracy

[(1, 0.39404351821129624),
 (2, 0.3791517863198759),
 (3, 0.4201264256042013),
 (4, 0.41842855936070145),
 (5, 0.43392364810252104),
 (6, 0.43324066479680834),
 (7, 0.43280820766809025),
 (8, 0.43112437719796104),
 (9, 0.46270405734663156),
 (10, 0.45139138364538234),
 (11, 0.457325161696041),
 (12, 0.45674436720739486),
 (13, 0.45033797442213896),
 (14, 0.44721581264523935),
 (15, 0.44269535048377995),
 (16, 0.44741723462671),
 (17, 0.445062045874124),
 (18, 0.4439554762963205),
 (19, 0.446179541895),
 (20, 0.451965717506551)]

n_estimators = 9 then it is giving 46 %

In [243]:
rf = RandomForestRegressor(n_estimators = 9)
rf.fit(x_train , y_train)

print('R2 score : ',r2_score(y_test , rf.predict(x_test)))
print('MSE : ', mean_squared_error(y_test , rf.predict(x_test)))
print('RMSE : ', np.sqrt(mean_squared_error(y_test , rf.predict(x_test))))

R2 score :  0.4542105456461255
MSE :  83455984633.59462
RMSE :  288887.49476845586


<p color='green' > Random Forest accuracy is increased 45.42 %

# XG boost

In [244]:
xg_accuracy = []

for i in range (1, 21):
  reg = xg.XGBRegressor()
  reg.fit(x_train , y_train)

  xg_accuracy.append((i , r2_score(y_test , reg.predict(x_test))))
  print('R2 score : ',r2_score(y_test , reg.predict(x_test)))
  print('MSE : ', mean_squared_error(y_test , reg.predict(x_test)))
  print('RMSE : ', np.sqrt(mean_squared_error(y_test , reg.predict(x_test))))
  print("\n\n")

R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.5734
RMSE :  311665.17252265033



R2 score :  0.36475068870752647
MSE :  97135179763.573

In [245]:
xg_accuracy

[(1, 0.36475068870752647),
 (2, 0.36475068870752647),
 (3, 0.36475068870752647),
 (4, 0.36475068870752647),
 (5, 0.36475068870752647),
 (6, 0.36475068870752647),
 (7, 0.36475068870752647),
 (8, 0.36475068870752647),
 (9, 0.36475068870752647),
 (10, 0.36475068870752647),
 (11, 0.36475068870752647),
 (12, 0.36475068870752647),
 (13, 0.36475068870752647),
 (14, 0.36475068870752647),
 (15, 0.36475068870752647),
 (16, 0.36475068870752647),
 (17, 0.36475068870752647),
 (18, 0.36475068870752647),
 (19, 0.36475068870752647),
 (20, 0.36475068870752647)]

<p color = "green" > Xg boost has accuracy = 36.47 %

<p color='green' > Random Forest accuracy is increased 45.42 %

i am trying to find ERROR so,  i can improve the accuracy. working on it....