In [1]:
## Importing Libraries
import numpy as np
import pandas as pd
import pickle

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

print("Libraries Successfully Imported")

Libraries Successfully Imported


In [2]:
## Loading Dataset
df = pd.read_csv(r"cardekho_imputated.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

### Handling Missing Values

In [3]:
## Checking Null Values
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [4]:
## Removing Unnecessary Values
df = df.drop(columns= ['car_name', 'brand'])

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [6]:
df['model'].unique()

<StringArray>
[        'Alto',        'Grand',          'i20',     'Ecosport',
      'Wagon R',          'i10',        'Venue',        'Swift',
        'Verna',       'Duster',
 ...
     'Panamera',      'Alturas',       'Altroz',           'NX',
     'Carnival',            'C',           'RX',        'Ghost',
 'Quattroporte',       'Gurkha']
Length: 120, dtype: str

In [7]:
## Getting all Different Types of features
num_features = [feature for feature in df.columns if df[feature].dtype != 'str']
print("Numerical Features: ", len(num_features))

## Categorical Features
cat_features = [i for i in df.columns if df[i].dtype == 'str']
print('Categorical Features: ', len(cat_features))

## Discrete Features
discrete_features = [feature for feature in num_features if len(df[feature].unique()) <= 25]
print('Discrete Features: ', len(discrete_features))

## Continous Features
continous_features = [feature for feature in num_features if feature not in discrete_features]
print("Continous Features: ", len(continous_features))

Numerical Features:  8
Categorical Features:  4
Discrete Features:  2
Continous Features:  6


In [8]:
## Dividing the data into Dependent and Independent Features
x = df.drop(columns = ['selling_price'])
y = df['selling_price']

### Feature Encoding and Scaling

In [9]:
len(df['model'].unique())

120

In [10]:
df['model'].value_counts()

model
i20             906
Swift Dzire     890
Swift           781
Alto            778
City            757
               ... 
Altroz            1
C                 1
Ghost             1
Quattroporte      1
Gurkha            1
Name: count, Length: 120, dtype: int64

In [11]:
le = LabelEncoder()
x['model'] = le.fit_transform(x['model'])

In [12]:
## Create Column Transformer with 3 types of transformers
num_features = x.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first', sparse_output=False)

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
    ], remainder='passthrough'
)

In [13]:
x = preprocessor.fit_transform(x)

In [14]:
features = preprocessor.get_feature_names_out()
pd.DataFrame(x, columns= features)

Unnamed: 0,OneHotEncoder__seller_type_Individual,OneHotEncoder__seller_type_Trustmark Dealer,OneHotEncoder__fuel_type_Diesel,OneHotEncoder__fuel_type_Electric,OneHotEncoder__fuel_type_LPG,OneHotEncoder__fuel_type_Petrol,OneHotEncoder__transmission_type_Manual,StandardScaler__Unnamed: 0,StandardScaler__model,StandardScaler__vehicle_age,StandardScaler__km_driven,StandardScaler__mileage,StandardScaler__engine,StandardScaler__max_power,StandardScaler__seats
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738694,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738516,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738339,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738162,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.737985,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723327,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723859,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724036,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724213,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [15]:
## Seperating Data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape

((12328, 15), (3083, 15))

## Model Training and Model Selection

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [20]:
## Create a function to evaluate Model
def evaluate_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square, mse

In [22]:
models = {
    "Random Forest" : RandomForestRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Linear Regression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "K-Neighbours Regressor" : KNeighborsRegressor(),
    "AdaBoost Regressor" : AdaBoostRegressor(),
    "Gradient Boost" : GradientBoostingRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    ## Make Prediction
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    ## Evaluate Train and Test Dataset
    model_train_mae, model_train_rmse, model_train_r2, model_train_mse = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2, model_test_mse = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("Model Perfromance for Training Set")
    print("- Root mean Squared Error : {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_train_mae))
    print("- R-Squared : {:.4f}".format(model_train_r2))
    print("- Mean Squared Error : {:.4f}".format(model_train_mse))

    print("---------------------------------------------------------------")

    print("Model Perfromance for Test Set")
    print("- Root mean Squared Error : {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_test_mae))
    print("- R-Squared : {:.4f}".format(model_test_r2))
    print("- Mean Squared Error : {:.4f}".format(model_test_mse))

    print('=' * 35)
    print("\n")

Random Forest
Model Perfromance for Training Set
- Root mean Squared Error : 130346.3669
- Mean Absolute Error : 36497.8245
- R-Squared : 0.9791
- Mean Squared Error : 16990175362.8001
---------------------------------------------------------------
Model Perfromance for Test Set
- Root mean Squared Error : 234071.0298
- Mean Absolute Error : 98054.8605
- R-Squared : 0.9272
- Mean Squared Error : 54789246973.8647


Decision Tree
Model Perfromance for Training Set
- Root mean Squared Error : 0.0000
- Mean Absolute Error : 0.0000
- R-Squared : 1.0000
- Mean Squared Error : 0.0000
---------------------------------------------------------------
Model Perfromance for Test Set
- Root mean Squared Error : 326367.7247
- Mean Absolute Error : 129474.1729
- R-Squared : 0.8585
- Mean Squared Error : 106515891750.1216


Linear Regression
Model Perfromance for Training Set
- Root mean Squared Error : 553850.0494
- Mean Absolute Error : 268104.1303
- R-Squared : 0.6218
- Mean Squared Error : 30674987

In [23]:
## Initialize few parameter for HyperParameter Tuning
rf_params = {"max_depth" : [5,8,15,None,10],
             "max_features" : [5,7,"auto", 8],
             "min_samples_split" : [2,8,15,20],
             "n_estimators" : [100,200,500,1000]}

gradient_params = {
    "loss" : ['squared_error', 'huber', 'absolute_error'],
    "criterion" : ['friedman_mse', 'squared_error', 'mse'],
    "min_samples_split" : [2, 8, 15, 20],
    "max_depth" : [5, 8, 15, None, 10],
    "learning_rate" : [0.1, 0.01, 0.001]
}

In [24]:
## Model List for Hyperparameter tuning
randomcv_model = [
                  ('Random Forest', RandomForestRegressor(), rf_params),
                  ('Gradient Boost', GradientBoostingRegressor(), gradient_params)]

In [25]:
from sklearn.model_selection import RandomizedSearchCV
model_params = {}

for name, model, params in randomcv_model:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,         
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42
    )
    
    random.fit(x_train, y_train)
    model_params[name] = random.best_params_

# Print best parameters
for model_name, params in model_params.items():
    print(f"\n------------------- Best Params for {model_name} -------------------")
    print(params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits

------------------- Best Params for Random Forest -------------------
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 7, 'max_depth': 15}

------------------- Best Params for Gradient Boost -------------------
{'min_samples_split': 2, 'max_depth': 5, 'loss': 'squared_error', 'learning_rate': 0.1, 'criterion': 'squared_error'}


In [26]:
## Retrainig the Model with best Parameters
models = {
    "Random Forest Regressor" : RandomForestRegressor(n_estimators=200, min_samples_split=2,
                                                      max_features=7, max_depth=15),
    "Gradient Boost Regressor" : GradientBoostingRegressor(min_samples_split=2, max_depth=5, loss='squared_error',
                                                           learning_rate=0.1, criterion='squared_error')
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    ## Make Prediction
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    ## Evaluate Train and Test Dataset
    model_train_mae, model_train_rmse, model_train_r2, model_train_mse = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2, model_test_mse = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("Model Perfromance for Training Set")
    print("- Root mean Squared Error : {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_train_mae))
    print("- R-Sqaured : {:.4f}".format(model_train_r2))
    print("- Mean Squared Error : {:.4f}".format(model_train_mse))

    print("---------------------------------------------------------------")

    print("Model Perfromance for Test Set")
    print("- Root mean Squared Error : {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_test_mae))
    print("- R-Sqaured : {:.4f}".format(model_test_r2))
    print("- Mean Squared Error : {:.4f}".format(model_test_mse))

    print('=' * 35)
    print("\n")

Random Forest Regressor
Model Perfromance for Training Set
- Root mean Squared Error : 135223.3306
- Mean Absolute Error : 50807.9600
- R-Sqaured : 0.9775
- Mean Squared Error : 18285349148.3317
---------------------------------------------------------------
Model Perfromance for Test Set
- Root mean Squared Error : 220763.5459
- Mean Absolute Error : 96532.9384
- R-Sqaured : 0.9353
- Mean Squared Error : 48736543202.7874


Gradient Boost Regressor
Model Perfromance for Training Set
- Root mean Squared Error : 128009.1680
- Mean Absolute Error : 79662.5868
- R-Sqaured : 0.9798
- Mean Squared Error : 16386347090.7421
---------------------------------------------------------------
Model Perfromance for Test Set
- Root mean Squared Error : 223488.3612
- Mean Absolute Error : 103189.6980
- R-Sqaured : 0.9337
- Mean Squared Error : 49947047585.8994


