# Bikes Price Prediction Modelling

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# importing data
df = pd.read_csv("bikes_data_cleaned.csv")
df.head()

Unnamed: 0,Company,Country of Origin,Horsepower (hp),Torque (nm),Number of Seating,Price (INR),Year,Looks,Body Type,Number of Cylinders,Electric Vehicle,Engine Displacement,Motor Power
0,Aprilia,Italy,100.0,67.0,2,1099000.0,2021,Sport,naked,2.0,0,659.0,0.0
1,Aprilia,Italy,100.0,67.0,2,1199000.0,2021,Sport,naked,2.0,0,659.0,0.0
2,Aprilia,Italy,15.0,12.0,2,449000.0,2022,Sport,racing,1.0,0,124.9,0.0
3,Aprilia,Italy,95.0,90.0,2,1399000.0,2022,Adventure,naked,2.0,0,896.0,0.0
4,Aprilia,Italy,175.0,121.0,2,1999000.0,2022,Adventure,naked,2.0,0,1077.0,0.0


In [2]:
# Sepearting X and Y
X = df.drop("Price (INR)",axis=1)
Y = df["Price (INR)"]

In [3]:
# Creating List for Numerical and Categorical Variables
numerical = ["Horsepower (hp)","Torque (nm)","Number of Seating","Year","Number of Cylinders","Electric Vehicle","Engine Displacement","Motor Power"]
categorical = ["Company","Country of Origin","Looks","Body Type"]

Lets first define a function that can gives us mean absolute error of model which can help us in deciding strategies while cleaning data

In [4]:
def getMeanAbsoluteError(X,Y):
    model = DecisionTreeRegressor()
    scores = -1 * cross_val_score(model,X,Y,cv=5,scoring="neg_mean_absolute_error")
    return scores.mean()

In [5]:
# Lets look at missing values from dataset
df.isnull().sum()

Company                0
Country of Origin      0
Horsepower (hp)        0
Torque (nm)            0
Number of Seating      0
Price (INR)            0
Year                   0
Looks                  0
Body Type              0
Number of Cylinders    0
Electric Vehicle       0
Engine Displacement    0
Motor Power            0
dtype: int64

We can see that dataset does not have missing value so we can start building our model

First we will create our model with only numerical Data and the results that we get will be our baseline score

### Model with only numerical data

In [6]:
print("Error(Only Numerical Data) : ",getMeanAbsoluteError(X[numerical],Y))

Error(Only Numerical Data) :  556846.5893472221


Now lets build a model with categorical variables

As all the categorical variables are nominal i.e they do not have any order in them we will use One Hot Encoding to build model with categorical Variables

### One Hot Encoding

In [7]:
onehot_encoder = OneHotEncoder(handle_unknown="ignore",sparse_output=False)
onehot_columns = pd.DataFrame(onehot_encoder.fit_transform(X[categorical]))

#Adding Index
onehot_columns.index = X.index

# Getting Numerical Column
numerical_columns = X.drop(categorical,axis=1)

# Concatenating DataFrames
X_onehot = pd.concat([numerical_columns,onehot_columns],axis=1)

X_onehot.columns = X_onehot.columns.astype(str)

print("Error (One Hot Encoding) : ",getMeanAbsoluteError(X_onehot,Y))

Error (One Hot Encoding) :  427648.1665277778


We have brought down MAE, there is a significant decrease and we can say that categorical variables are pretty useful to predict our target variable

In [8]:
# Saving OneHotEncoder for Future use
pickle.dump(onehot_encoder,open("onehot_encoder.pkl","wb"))

Now lets see how different model perform compared to each other

In [9]:
def getMeanAbsoluteErrorModel(X,Y,model):
    scores = -1 * cross_val_score(model,X,Y,cv=5,scoring="neg_mean_absolute_error")
    return scores.mean()

print("MAE DecisionTreeRegressor : ",getMeanAbsoluteErrorModel(X_onehot,Y,DecisionTreeRegressor()))
print("MAE RandomForestRegressor : ",getMeanAbsoluteErrorModel(X_onehot,Y,RandomForestRegressor()))
print("MAE Gradient Boosting : ",getMeanAbsoluteErrorModel(X_onehot,Y,XGBRegressor()))

MAE DecisionTreeRegressor :  425817.42002777784
MAE RandomForestRegressor :  388095.2354327646
MAE Gradient Boosting :  438328.7839174533


We can see that RandomForestRegressor is giving us best results so we select it as our model now we will perform hyperparameter tuning with cross validation to further improve th performance of our model

### Hyper Parameter Tuning

For RandomForestRegressor model we have to major hyperparameters that we can tune which are n_estimators and max_depth so we will try a number of values for this parameters and see which one gives better performance

In [10]:
def getError(n_estimators,max_depth,X=X_onehot,Y=Y):
    model = RandomForestRegressor(n_estimators = n_estimators,max_depth=max_depth)
    scores = -1 * cross_val_score(model,X,Y,cv=5,scoring="neg_mean_absolute_error")
    return scores.mean()

bestMae = float("inf")
parameters = None
for estimator in [10,50,100,250]:
    for depth in [10,15,20,30,40,50]:
        error = getError(n_estimators=estimator,max_depth=depth)
        if error < bestMae:
            bestMae = error
            parameters = (estimator,depth)
        print("MAE with n_estimators:{} and max_depth = {} : {}".format(estimator,depth,error))
    
print("------------------------------------------------------------------------------")
print("Best Performane with n_estimators:{} and max_depth:{} , MAE : {}".format(parameters[0],parameters[1],bestMae))

MAE with n_estimators:10 and max_depth = 10 : 390513.5837338763
MAE with n_estimators:10 and max_depth = 15 : 405063.59659042076
MAE with n_estimators:10 and max_depth = 20 : 390035.3839652465
MAE with n_estimators:10 and max_depth = 30 : 397214.8989659188
MAE with n_estimators:10 and max_depth = 40 : 382976.77052037034
MAE with n_estimators:10 and max_depth = 50 : 389019.590862963
MAE with n_estimators:50 and max_depth = 10 : 380739.76030642673
MAE with n_estimators:50 and max_depth = 15 : 388927.55055963853
MAE with n_estimators:50 and max_depth = 20 : 389242.6777442838
MAE with n_estimators:50 and max_depth = 30 : 398682.5417178675
MAE with n_estimators:50 and max_depth = 40 : 383038.8976762963
MAE with n_estimators:50 and max_depth = 50 : 393810.7115766137
MAE with n_estimators:100 and max_depth = 10 : 384904.9715824629
MAE with n_estimators:100 and max_depth = 15 : 383426.92834813026
MAE with n_estimators:100 and max_depth = 20 : 378262.13544619613
MAE with n_estimators:100 and ma

We have got the best performance with parameter n_estimators=100 and max_depth = 20 , so now we will train the model on the entire dataset using theese parameters and save the model

In [12]:
my_model = RandomForestRegressor(n_estimators=100,max_depth=20)
my_model.fit(X_onehot,Y)
pickle.dump(my_model,open("model.pkl","wb"))
print("Model Saved")

Model Saved
