In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [4]:
df = pd.read_csv(r"C:\Users\nico_\Desktop\Data science bootcamp\Random Forest\cardekho_imputated.csv", index_col=[0])

In [6]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [8]:
df.shape

(15411, 13)

In [10]:
df.dtypes

car_name              object
brand                 object
model                 object
vehicle_age            int64
km_driven              int64
seller_type           object
fuel_type             object
transmission_type     object
mileage              float64
engine                 int64
max_power            float64
seats                  int64
selling_price          int64
dtype: object

# Data Cleaning

In [13]:
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [15]:
df.duplicated().sum()

167

In [17]:
df.drop_duplicates(inplace=True)

In [19]:
df.duplicated().sum()

0

In [21]:
df.columns

Index(['car_name', 'brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats', 'selling_price'],
      dtype='object')

In [23]:
# Remove unnecessary columns
df.drop("car_name", axis=1, inplace=True)
df.drop("brand", axis=1, inplace=True)

In [25]:
df["model"].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [27]:
# Independent and dependent features
X=df.drop("selling_price", axis=1)
y=df["selling_price"]

# Features encoding ans scaling

In [30]:
le=LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [32]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [34]:
X.columns

Index(['model', 'vehicle_age', 'km_driven', 'seller_type', 'fuel_type',
       'transmission_type', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

In [36]:
# Column Transformer
num_features = X.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'fuel_type','transmission_type']
label_encoder_columns = ['model']

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
    ], remainder='passthrough'   # Garde les colonnes non spécifiées sans les transformer et les ajoute à la sortie
)

In [38]:
X=preprocessor.fit_transform(X)

In [40]:
X

array([[ 1.        ,  0.        ,  0.        , ..., -1.32622688,
        -1.26549143, -0.40329882],
       [ 1.        ,  0.        ,  0.        , ..., -0.5556692 ,
        -0.43360049, -0.40329882],
       [ 1.        ,  0.        ,  0.        , ..., -0.5556692 ,
        -0.48020502, -0.40329882],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.02272945,
         0.06786431, -0.40329882],
       [ 0.        ,  0.        ,  1.        , ...,  1.33133238,
         0.91793102,  2.06970488],
       [ 0.        ,  0.        ,  0.        , ...,  0.02080786,
         0.39596023, -0.40329882]])

In [42]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519581,0.981015,1.243329,0.000640,-1.326227,-1.265491,-0.403299
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225821,-0.345188,-0.688493,-0.191245,-0.555669,-0.433600,-0.403299
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.535894,1.644117,0.084236,-0.646971,-0.555669,-0.480205,-0.403299
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519581,0.981015,-0.360084,0.293264,-0.938066,-0.780804,-0.403299
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666250,-0.013637,-0.495311,0.736997,0.022729,-0.047016,-0.403299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15239,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508367,0.981015,-0.867708,0.027024,-0.768966,-0.758667,-0.403299
15240,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556143,-1.339840,-0.727130,-0.527043,-0.217469,-0.221550,2.069705
15241,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407295,-0.013637,0.219463,0.346032,0.022729,0.067864,-0.403299
15242,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.425787,-0.345188,72.334381,-0.886827,1.331332,0.917931,2.069705


# Split the data

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [47]:
print(len(X_train))
print(len(X_test))

12195
3049


# Model training and model selection

In [50]:
# Function to evaluate model
def evaluate_model(true, predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae, mse,rmse,r2_square

In [52]:
# Model training
models = {
    "AdaBoost Regressor": AdaBoostRegressor(),
    "linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Rdge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    model_train_mae, model_train_mse,model_train_rmse,model_train_r2_square=evaluate_model(y_train,y_train_pred)
    model_test_mae, model_test_mse,model_test_rmse,model_test_r2_square=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])

    print("Model performance for training set")

    print("mae :", model_train_mae)
    print("mse :", model_train_mse)
    print("rmse :", model_train_rmse)
    print("r2 :", model_train_r2_square)

    print("Model performance for test set")

    print("mae :", model_test_mae)
    print("mse :", model_test_mse)
    print("rmse :", model_test_rmse)
    print("r2 :", model_test_r2_square)
    

AdaBoost Regressor
Model performance for training set
mae : 378010.70781972917
mse : 223556825484.10468
rmse : 472817.9623111887
r2 : 0.7365237364296146
Model performance for test set
mae : 379036.0810277118
mse : 263943009504.94583
rmse : 513753.8413529828
r2 : 0.5658980756187737
linear Regression
Model performance for training set
mae : 275284.38995814946
mse : 320371249674.76685
rmse : 566013.4712838263
r2 : 0.6224216387180521
Model performance for test set
mae : 262717.1886108069
mse : 199599130834.0928
rmse : 446765.1853424714
r2 : 0.6717231990253705
Lasso
Model performance for training set
mae : 275282.4735057536
mse : 320371256546.31946
rmse : 566013.477353958
r2 : 0.6224216306194804
Model performance for test set
mae : 262719.99864067894
mse : 199598832952.59692
rmse : 446764.8519664421
r2 : 0.671723688945262
Rdge
Model performance for training set
mae : 275246.7337111309
mse : 320371758692.06116
rmse : 566013.9209348664
r2 : 0.6224210388080539
Model performance for test set
ma

## Knn & Random Forest

In [54]:
#Hyperparameters
knn_params = {"n_neighbors" : [2,3,10,20,40,50]}
rf_params = {"max_depth" : [5,8,10,15,None],
             "max_features" : [5,7,8, "auto"],
             "min_samples_split" : [2,8,15,20],
             "n_estimators" : [100,200,500,1000]}
ada_params = {"n_estimators" : [50,60,70,80],
              "loss" : ["linear", "square", "exponential"]}

In [56]:
# Models list for hyperparamter training (tuple)
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ('RF', RandomForestRegressor(), rf_params),
                   ('AB', AdaBoostRegressor(), ada_params)]

In [58]:
# Hyperparameter tuning
model_param = {}

for name, model, params in randomcv_models:
    random=RandomizedSearchCV(estimator=model,
                              param_distributions=params,
                              n_iter=100,
                              cv=3,
                              verbose=2,
                              n_jobs=-1)

    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"Best params for {model_name}")
    print(model_param[model_name])



Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


93 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nico_\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nico_\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\nico_\anaconda3\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\nico_\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParamete

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params for KNN
{'n_neighbors': 3}
Best params for RF
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}
Best params for AB
{'n_estimators': 50, 'loss': 'square'}


In [60]:
model_param

{'KNN': {'n_neighbors': 3},
 'RF': {'n_estimators': 100,
  'min_samples_split': 2,
  'max_features': 7,
  'max_depth': None},
 'AB': {'n_estimators': 50, 'loss': 'square'}}

In [62]:
model_name

'AB'

In [64]:
model_param

{'KNN': {'n_neighbors': 3},
 'RF': {'n_estimators': 100,
  'min_samples_split': 2,
  'max_features': 7,
  'max_depth': None},
 'AB': {'n_estimators': 50, 'loss': 'square'}}

In [207]:
# Retraining models with best parameters
models = {
    "K-Neighbors Regressor" : KNeighborsRegressor(n_neighbors=3),
    "Random Forest Regressor" : RandomForestRegressor(n_estimators=200, min_samples_split=2, max_features=8, max_depth=15),
    "AdaBoost Regressor" : AdaBoostRegressor(n_estimators= 50, loss= 'square'}
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    model_train_mae, model_train_mse,model_train_rmse,model_train_r2_square=evaluate_model(y_train,y_train_pred)
    model_test_mae, model_test_mse,model_test_rmse,model_test_r2_square=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])

    print("Model performance for training set")

    print("mae :", model_train_mae)
    print("mse :", model_train_mse)
    print("rmse :", model_train_rmse)
    print("r2 :", model_train_r2_square)

    print("Model performance for test set")

    print("mae :", model_test_mae)
    print("mse :", model_test_mse)
    print("rmse :", model_test_rmse)
    print("r2 :", model_test_r2_square)

K-Neighbors Regressor
Model performance for training set
mae : 77891.45824791581
mse : 65700722327.85522
rmse : 256321.52139033354
r2 : 0.9225674242093337
Model performance for test set
mae : 110499.18005903575
mse : 86359006377.3186
rmse : 293869.02929250407
r2 : 0.8579670250545426
Random Forest Regressor
Model performance for training set
mae : 54098.01883931636
mse : 17132282688.970398
rmse : 130890.3460495479
r2 : 0.9798084902147509
Model performance for test set
mae : 100747.80911937074
mse : 85308941334.82706
rmse : 292076.9442027683
r2 : 0.8596940465677321
