# Used Car Price Prediction

## 1) Problem statement.

* This dataset comprises used cars sold on cardekho.com in India, along with important features of these cars.
* The goal is to predict the selling price of a car from its input features.
* The predictions can be used to suggest a price to new sellers based on current market conditions.

## 2) Data Collection.
* The dataset was collected by scraping the CarDekho website.
* The data consists of 13 columns and 15411 rows.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [5]:
# Load the listings; index_col=[0] uses the CSV's first column as the DataFrame index.
df = pd.read_csv('cardekho_imputated.csv', index_col=[0])

In [6]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [7]:
# car_name repeats the brand and model columns (e.g. "Maruti Alto"), so it adds no information.
# Reassign instead of inplace=True, and ignore a missing column, so the cell can be re-run safely.
df = df.drop(columns=['car_name'], errors='ignore')


In [8]:
# NOTE(review): presumably dropped because `model` already implies the brand - confirm.
# Reassign instead of inplace=True, and ignore a missing column, so the cell can be re-run safely.
df = df.drop(columns=['brand'], errors='ignore')


In [9]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [10]:
df.isnull().sum()

model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [11]:
## get all the numeric features
# Test for numeric dtypes explicitly: `dtype != 'O'` would also count category,
# datetime and pandas string columns as numeric.
num_features = [
    feature for feature in df.columns
    if pd.api.types.is_numeric_dtype(df[feature])
]
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 7


In [12]:
num_features

['vehicle_age',
 'km_driven',
 'mileage',
 'engine',
 'max_power',
 'seats',
 'selling_price']

In [13]:
## get all the categorical features.
# Columns stored with the object dtype are treated as categorical.
cat_features = [column for column, dtype in df.dtypes.items() if dtype == 'O']
print('Num of categorical Features :', len(cat_features))

Num of categorical Features : 4


In [14]:
cat_features

['model', 'seller_type', 'fuel_type', 'transmission_type']

In [15]:
## independent and dependent features.
# y is the target (selling_price); X is every remaining column.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

In [16]:
X.shape,y.shape

((15411, 10), (15411,))

In [14]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


## Feature encoding and scaling.
### Label encoding for the high-cardinality `model` column; one-hot encoding for nominal (non-ordinal) columns with few unique values.

In [17]:
# Number of distinct car models; used to decide how to encode the column.
len(df['model'].unique())

120

In [18]:
from sklearn.preprocessing import LabelEncoder

# Replace each model name with an integer code; le1 keeps the fitted mapping.
# NOTE(review): the codes impose an arbitrary order on the model names and
# LabelEncoder is documented for target values - consider an encoder meant for features.
le1 = LabelEncoder()
le1.fit(X['model'])
X['model'] = le1.transform(X['model'])

In [19]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [20]:
cat_features

['model', 'seller_type', 'fuel_type', 'transmission_type']

In [21]:
X.seller_type.unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [24]:
X.fuel_type.unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)

In [25]:
X.transmission_type.unique()

array(['Manual', 'Automatic'], dtype=object)

### All three categorical features above have only a few unique values, so we can use one-hot encoding for them.

In [22]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode (nominal, few distinct values).
ohc = ['seller_type', 'fuel_type', 'transmission_type']

# Every non-object column of X is standardised.
# NOTE(review): when `model` has already been label-encoded to integers it is
# picked up here too, so its codes get scaled like a continuous feature.
num_features = X.select_dtypes(exclude='object').columns

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Output column order follows the transformer order: one-hot columns first,
# then the scaled numeric columns; any column not listed is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, ohc),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    remainder='passthrough',
)




In [23]:
# Fit the encoders/scaler and replace X with the transformed feature matrix.
# NOTE(review): the preprocessor is fitted on the full dataset before the
# train/test split that follows, so test-set statistics leak into the scaling.
# NOTE(review): X is overwritten with the transformed output, so re-running this
# cell likely requires rebuilding X first - confirm.
X=preprocessor.fit_transform(X)

In [29]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [25]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((10787, 17), (10787,), (4624, 17), (4624,))

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score,f1_score,precision_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import  GradientBoostingRegressor

In [27]:
## create a function to evaluate the model.
def evaluate_model(y_true,y_pred):
    """Score regression predictions against the true values.

    Parameters:
        y_true: observed target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        A tuple ``(mae, mse, r2)``: mean absolute error, mean squared error
        and R² score, in that order.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

In [28]:
## Beginning model training.
# Candidate regressors, keyed by the display name used when reporting metrics.
# The tree-based and ensemble estimators are randomised, so they are seeded
# (same seed as the train/test split) to make the reported scores reproducible.
models={
    "Linear Regression":LinearRegression(),
     "Lasso":Lasso(),
     "Ridge":Ridge(),
     "K-Neighbour":KNeighborsRegressor(),
     "DecisionTree":DecisionTreeRegressor(random_state=42),
     "RandomForest":RandomForestRegressor(random_state=42),
     "Adaboost":AdaBoostRegressor(random_state=42),
     "Gradient-Boosting":GradientBoostingRegressor(random_state=42)
}


In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

for name, model in models.items():
    # Fit on the training split only, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate_model returns (mae, mse, r2); RMSE is the square root of MSE.
    mae_train, mse_train, r2_train = evaluate_model(y_train, y_train_pred)
    rmse_train = mse_train ** 0.5
    mae_test, mse_test, r2_test = evaluate_model(y_test, y_test_pred)
    rmse_test = mse_test ** 0.5

    # Build the whole report first, then emit it in one go.
    report_lines = [
        name,
        "Model performance for Training set:",
        "- R² Score: {:.4f}".format(r2_train),
        "- MSE: {:.4f}".format(mse_train),
        "- RMSE: {:.4f}".format(rmse_train),
        "- MAE: {:.4f}".format(mae_train),
        '----------------------------------',
        "Model performance for Test set:",
        "- R² Score: {:.4f}".format(r2_test),
        "- MSE: {:.4f}".format(mse_test),
        "- RMSE: {:.4f}".format(rmse_test),
        "- MAE: {:.4f}".format(mae_test),
        "=" * 35,
        "\n",
    ]
    print("\n".join(report_lines))

   


Linear Regression
Model performance for Training set:
- R² Score: 0.6183
- MSE: 312836513229.4384
- RMSE: 559317.9000
- MAE: 268133.8322
----------------------------------
Model performance for Test set:
- R² Score: 0.6576
- MSE: 257519230184.6559
- RMSE: 507463.5260
- MAE: 281026.0283


Lasso
Model performance for Training set:
- R² Score: 0.6183
- MSE: 312831832102.4724
- RMSE: 559313.7153
- MAE: 268436.2654
----------------------------------
Model performance for Test set:
- R² Score: 0.6575
- MSE: 257613099821.9339
- RMSE: 507556.0066
- MAE: 281328.0471


Ridge
Model performance for Training set:
- R² Score: 0.6183
- MSE: 312832804775.1844
- RMSE: 559314.5848
- MAE: 268394.1969
----------------------------------
Model performance for Test set:
- R² Score: 0.6575
- MSE: 257596522203.1087
- RMSE: 507539.6755
- MAE: 281279.8973


K-Neighbour
Model performance for Training set:
- R² Score: 0.8600
- MSE: 114737695681.8392
- RMSE: 338729.5318
- MAE: 92597.1447
---------------------------

In [3]:
# NOTE(review): this looks like a leftover scratch cell. It used to assign to
# `models`, which would replace the dict of estimators with a list of column
# names; the result is bound to its own name instead.
numeric_columns = [features for features in df.columns if df[features].dtype != 'O']

NameError: name 'df' is not defined