In [113]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

Read data



In [114]:
data_url = "data_car_price_prediction.csv"

In [115]:

df = pd.read_csv(data_url)
df.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


Exploratory Data Analysis

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [117]:
df.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


Cleaning and fearture Engineering

In [118]:
display(df['fuel'].value_counts())
display(df['seller_type'].value_counts())
display(df['transmission'].value_counts())
display(df['owner'].value_counts())


Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,2153
Petrol,2123
CNG,40
LPG,23
Electric,1


Unnamed: 0_level_0,count
seller_type,Unnamed: 1_level_1
Individual,3244
Dealer,994
Trustmark Dealer,102


Unnamed: 0_level_0,count
transmission,Unnamed: 1_level_1
Manual,3892
Automatic,448


Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,2832
Second Owner,1106
Third Owner,304
Fourth & Above Owner,81
Test Drive Car,17


Encoding

In [119]:
#We take the first data from the name
df['brand'] = df['name'].str.split(' ').str[0]
df = df.drop(columns=['name'])
#Map to  manually
df['transmission'] = df['transmission'].map({'Manual': 0, 'Automatic': 1})
#One-hot encoding to split more than 3 unique type
df_final = pd.get_dummies(df,columns = ['fuel','seller_type','owner',], drop_first=True)

df_final.head()

Unnamed: 0,year,selling_price,km_driven,transmission,brand,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,0,Maruti,False,False,False,True,True,False,False,False,False,False
1,2007,135000,50000,0,Maruti,False,False,False,True,True,False,False,False,False,False
2,2012,600000,100000,0,Hyundai,True,False,False,False,True,False,False,False,False,False
3,2017,250000,46000,0,Datsun,False,False,False,True,True,False,False,False,False,False
4,2014,450000,141000,0,Honda,True,False,False,False,True,False,False,True,False,False


In [120]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   year                          4340 non-null   int64 
 1   selling_price                 4340 non-null   int64 
 2   km_driven                     4340 non-null   int64 
 3   transmission                  4340 non-null   int64 
 4   brand                         4340 non-null   object
 5   fuel_Diesel                   4340 non-null   bool  
 6   fuel_Electric                 4340 non-null   bool  
 7   fuel_LPG                      4340 non-null   bool  
 8   fuel_Petrol                   4340 non-null   bool  
 9   seller_type_Individual        4340 non-null   bool  
 10  seller_type_Trustmark Dealer  4340 non-null   bool  
 11  owner_Fourth & Above Owner    4340 non-null   bool  
 12  owner_Second Owner            4340 non-null   bool  
 13  owner_Test Drive C

In [121]:
df_final = pd.get_dummies(df_final,columns=['brand'],drop_first= True)
df_final.head()

Unnamed: 0,year,selling_price,km_driven,transmission,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,...,brand_Mercedes-Benz,brand_Mitsubishi,brand_Nissan,brand_OpelCorsa,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo
0,2007,60000,70000,0,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2007,135000,50000,0,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,2012,600000,100000,0,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,2017,250000,46000,0,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2014,450000,141000,0,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [122]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   year                          4340 non-null   int64
 1   selling_price                 4340 non-null   int64
 2   km_driven                     4340 non-null   int64
 3   transmission                  4340 non-null   int64
 4   fuel_Diesel                   4340 non-null   bool 
 5   fuel_Electric                 4340 non-null   bool 
 6   fuel_LPG                      4340 non-null   bool 
 7   fuel_Petrol                   4340 non-null   bool 
 8   seller_type_Individual        4340 non-null   bool 
 9   seller_type_Trustmark Dealer  4340 non-null   bool 
 10  owner_Fourth & Above Owner    4340 non-null   bool 
 11  owner_Second Owner            4340 non-null   bool 
 12  owner_Test Drive Car          4340 non-null   bool 
 13  owner_Third Owner             434

In [123]:
df_final.describe()

Unnamed: 0,year,selling_price,km_driven,transmission
count,4340.0,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419,0.103226
std,4.215344,578548.7,46644.102194,0.304289
min,1992.0,20000.0,1.0,0.0
25%,2011.0,208749.8,35000.0,0.0
50%,2014.0,350000.0,60000.0,0.0
75%,2016.0,600000.0,90000.0,0.0
max,2020.0,8900000.0,806599.0,1.0


Splitting data

In [124]:
X = df_final.drop(columns=['selling_price'])
y = df_final['selling_price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

Training

In [125]:
#Linear Regressiom
lr = LinearRegression()
lr.fit(X_train, y_train)
y_prediction = lr.predict(X_test)


In [126]:
r2 = metrics.r2_score(y_test,y_prediction)
mae = metrics.mean_absolute_error(y_test,y_prediction)
mse = np.sqrt(metrics.mean_squared_error(y_test,y_prediction))

print(f"R2: {r2:.4f}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

R2: 0.5299
MAE: 184882.00841644255
MSE: 378760.3688518392


In [127]:
#Random Forest
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
y_prediction_rf = rf.predict(X_test)

r2 = metrics.r2_score(y_test,y_prediction_rf)
mae = metrics.mean_absolute_error(y_test,y_prediction_rf)
mse = np.sqrt(metrics.mean_squared_error(y_test,y_prediction_rf))

print(f"R2: {r2:.4f}")
print(f"MAE: {mae*100:.2f}")
print(f"MSE: {mse*100:.2f}")

R2: 0.7310
MAE: 11725092.81
MSE: 28650132.27


Using Model

In [134]:
all_feature_columns = X_train.columns
new_car = pd.DataFrame(columns=all_feature_columns)
new_car.loc[0]=0

new_car['year'] = 2022
new_car['km_driven'] = 10000
new_car['transmission'] = 0
new_car['fuel_Petrol'] = 1
new_car['seller_type_Individual'] = 1
# Removed the incorrect line for owner_First Owner as it was dropped during one-hot encoding
# To represent a 'First Owner' car, all other owner columns should be 0, which is the default
new_car['brand_Maruti'] = 1
#first owner
display(new_car)
predicted_price = rf.predict(new_car)
print(f"Predicted price: {predicted_price[0]:.2f}")

Unnamed: 0,year,km_driven,transmission,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,owner_Fourth & Above Owner,...,brand_Mercedes-Benz,brand_Mitsubishi,brand_Nissan,brand_OpelCorsa,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo
0,2022,10000,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Predicted price: 494097.50
