# Used Car Price Prediction

## 1) Problem statement.

This dataset comprises used cars sold on cardehko.com in India as well as important features of these cars.
If user can predict the price of the car based on input features.
Prediction results can be used to give new seller the price suggestion based on market condition.

## 2) Data Collection.

The Dataset is collected from scrapping from cardheko webiste
The data consists of 13 column and 15411 rows.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:

df = pd.read_csv(r"D:\Machine Learning\Gradient Boosting\Projects\Regression\cardekho_imputated.csv", index_col=[0])

In [4]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# Data Cleaning

## Handling Missing values

.Handling Missing values

.Handling Duplicates

.Check data type

.Understand the dataset


In [5]:
## Check Null Values
##Check features with nan value
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [6]:
## Remove Unnecessary Columns
df.drop('car_name', axis=1, inplace=True)
df.drop('brand', axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [8]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [9]:
## Getting All Different Types OF Features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

Num of Numerical Features : 7
Num of Categorical Features : 4
Num of Discrete Features : 2
Num of Continuous Features : 5


In [10]:
## Indpendent and dependent features
from sklearn.model_selection import train_test_split
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

# Feature Encoding and Scaling

## One Hot Encoding for Columns which had lesser unique values and not ordinal

One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

In [11]:
len(df['model'].unique())

120

In [12]:
df['model'].value_counts()

model
i20            906
Swift Dzire    890
Swift          781
Alto           778
City           757
              ... 
Ghibli           1
Altroz           1
GTC4Lusso        1
Aura             1
Gurkha           1
Name: count, Length: 120, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [15]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [16]:
len(df['seller_type'].unique()),len(df['fuel_type'].unique()),len(df['transmission_type'].unique())

(3, 5, 2)

In [17]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type','fuel_type','transmission_type']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
        
    ],remainder='passthrough'
    
)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = [0, 1]  # column indices for numeric columns
categorical_features = [2, 3]  # column indices for categorical columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)


In [22]:
X=preprocessor.fit_transform(X)

In [23]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5
0,1.305434,-0.106551,1.0,0.0,1.0,0.0
1,1.305434,-0.106551,1.0,0.0,1.0,0.0
2,1.305434,-0.106551,1.0,0.0,1.0,0.0
3,1.305434,-0.106551,1.0,0.0,1.0,0.0
4,-0.766029,-0.106551,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...
15406,-0.766029,-0.106551,1.0,0.0,1.0,0.0
15407,-0.766029,-0.106551,1.0,0.0,1.0,0.0
15408,-0.766029,-0.106551,0.0,1.0,1.0,0.0
15409,-0.766029,-0.106551,0.0,1.0,1.0,0.0


In [24]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((12328, 6), (3083, 6))

In [25]:
X_train

array([[-0.76602861, -0.10655138,  0.        ,  1.        ,  1.        ,
         0.        ],
       [ 1.30543427, -0.10655138,  1.        ,  0.        ,  1.        ,
         0.        ],
       [-0.76602861, -0.10655138,  0.        ,  1.        ,  1.        ,
         0.        ],
       ...,
       [ 1.30543427, -0.10655138,  1.        ,  0.        ,  1.        ,
         0.        ],
       [-0.76602861, -0.10655138,  1.        ,  0.        ,  1.        ,
         0.        ],
       [ 1.30543427, -0.10655138,  1.        ,  0.        ,  1.        ,
         0.        ]])

# Model Training And Model Selection

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [27]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [28]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor(),
    "Graident BoostRegressor":GradientBoostingRegressor()
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 868292.9324
- Mean Absolute Error: 418716.1343
- R2 Score: 0.0704
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 829556.9756
- Mean Absolute Error: 434497.1797
- R2 Score: 0.0858


Lasso
Model performance for Training set
- Root Mean Squared Error: 866433.7135
- Mean Absolute Error: 418646.1757
- R2 Score: 0.0744
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 827625.0633
- Mean Absolute Error: 433767.5577
- R2 Score: 0.0901


Ridge
Model performance for Training set
- Root Mean Squared Error: 866437.5266
- Mean Absolute Error: 418681.9461
- R2 Score: 0.0744
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 827655.7858
- Mean Absolute Error: 433832.0161
- R2 Score: 0.0900


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 1308723.6742
- Mean

In [29]:
#Initialize few parameter for Hyperparamter tuning

rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

gradient_params={"loss": ['squared_error','huber','absolute_error'],
             "criterion": ['friedman_mse','squared_error','mse'],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500],
              "max_depth": [5, 8, 15, None, 10],
            }

In [30]:
# Models list for Hyperparameter tuning
randomcv_models = [
                   ("RF", RandomForestRegressor(), rf_params),
                   ("GradientBoost",GradientBoostingRegressor(),gradient_params)
                   
                   ]


In [31]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for RF -------------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 5, 'max_depth': 15}
---------------- Best Params for GradientBoost -------------------
{'n_estimators': 200, 'min_samples_split': 20, 'max_depth': 5, 'loss': 'squared_error', 'criterion': 'squared_error'}


In [36]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Corrected models
models = {
    "Random Forest Regressor": RandomForestRegressor(
        n_estimators=100, 
        min_samples_split=2, 
        max_features=None,   # fixed 'auto' → None
        max_depth=None, 
        n_jobs=-1
    ),
    "GradientBoost Regressor": GradientBoostingRegressor(
        n_estimators=200,
        min_samples_split=8, 
        max_depth=10, 
        loss='huber', 
        criterion='squared_error'  # fixed 'mse' → 'squared_error'
    )
}

# Loop to train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate model performance (assuming evaluate_model function exists)
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Print results
    print(f"{name}")
    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")
    print('----------------------------------')
    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")
    print('='*35, '\n')


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 865205.0135
- Mean Absolute Error: 415641.9801
- R2 Score: 0.0770
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 826762.7584
- Mean Absolute Error: 431399.5183
- R2 Score: 0.0920

GradientBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 880788.6731
- Mean Absolute Error: 377435.4537
- R2 Score: 0.0435
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 848059.0574
- Mean Absolute Error: 398899.8588
- R2 Score: 0.0446

