In [9]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score
from sklearn import metrics
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('fillKNN_data_final_final.csv')
df = df.drop(['curbWeight', 'vEengineDisplacement'], axis=1)

In [11]:
df.isna().sum()

accelerationTime           0
bodyType                   0
brand                      0
cargoVolume                0
driveWheelConfiguration    0
eLabel                     0
emissionsCO2               0
engineCapacity             0
fuelCapacity               0
fuelConsumption            0
fuelType                   0
height                     0
length                     0
modelDate                  0
numberOfDoors              0
numberOfForwardGears       0
payload                    0
price                      0
roofLoad                   0
seatingCapacity            0
speed                      0
torque                     0
vEenginePower              0
vEengineType               0
vehicleTransmission        0
weightTotal                0
wheelbase                  0
width                      0
dtype: int64

In [12]:
# creating X and y variables
X = df.drop('price', axis=1)
# price column
y = np.log(df['price'])

In [13]:
linear_train_score = []
knn_train_score = []
etr_train_score = []
lgbm_train_score = []
rf_train_score = []
xgb_train_score = []
cat_train_score = []

In [14]:
linear_test_pred = []
knn_test_pred = []
etr_test_pred = []
lgbm_test_pred = []
rf_test_pred = []
xgb_test_pred = []
cat_test_pred = []
y_tests = []

In [16]:

for _ in range(10):
    print(f'LOOP: {_+1}\n\n')
    #train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)
    # feature scale the X_train and X_test values

    norm = StandardScaler().fit(X_train)

    # transform training data
    X_train = norm.transform(X_train)

    # transform testing data
    X_test = norm.transform(X_test)

    linear = LinearRegression()
    linear.fit(X_train, y_train)
    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)
    etr = ExtraTreesRegressor(random_state = 123 , max_depth = 45  , n_estimators = 400)
    etr.fit(X_train,y_train)
    lgbm = LGBMRegressor(random_state = 123 ,  num_leaves = 750 , learning_rate = 0.01, max_bin = 1200 , n_estimators = 1000)
    lgbm.fit(X_train,y_train)
    xgb = XGBRegressor(random_state = 123 , max_depth = 7 , learning_rate = 0.2 , n_estimators = 1500)
    xgb.fit(X_train,y_train)
    rf = RandomForestRegressor(random_state = 123 , max_depth = 45 , n_estimators = 600)
    rf.fit(X_train,y_train)
    cat = CatBoostRegressor(random_state = 123  , max_depth = 14 )
    cat.fit(X_train,y_train, verbose = False)

    linear_train_score.append(linear.score(X_train, y_train))
    knn_train_score.append(knn.score(X_train, y_train))
    etr_train_score.append(etr.score(X_train, y_train))
    lgbm_train_score.append(lgbm.score(X_train, y_train))
    xgb_train_score.append(xgb.score(X_train, y_train))
    rf_train_score.append(rf.score(X_train, y_train))
    cat_train_score.append(cat.score(X_train, y_train))

    linear_test_pred.append(np.exp(linear.predict(X_test)))
    knn_test_pred.append(np.exp(knn.predict(X_test)))
    etr_test_pred.append(np.exp(etr.predict(X_test)))
    lgbm_test_pred.append(np.exp(lgbm.predict(X_test)))
    xgb_test_pred.append(np.exp(xgb.predict(X_test)))
    rf_test_pred.append(np.exp(rf.predict(X_test)))
    cat_test_pred.append(np.exp(cat.predict(X_test)))

    y_tests.append(np.exp(y_test))


LOOP: 1


LOOP: 2


LOOP: 3


LOOP: 4


LOOP: 5


LOOP: 6


LOOP: 7


LOOP: 8


LOOP: 9


LOOP: 10




In [19]:
mae_s = []
mse_s = []
rmse_s = []
r2_score_s = []
for linear_pred, knn_pred, etr_pred, lgbm_pred, xgb_pred, rf_pred, cat_pred, y_test in zip(linear_test_pred, knn_test_pred, etr_test_pred, lgbm_test_pred, xgb_test_pred, rf_test_pred, cat_test_pred, y_tests):
    mse_s.append([mean_squared_error(y_test, linear_pred), mean_squared_error(y_test, knn_pred), mean_squared_error(y_test, etr_pred),
                  mean_squared_error(y_test, lgbm_pred), mean_squared_error(y_test, xgb_pred), mean_squared_error(y_test, rf_pred),
                  mean_squared_error(y_test, cat_pred)])
    rmse_s.append([np.sqrt(mean_squared_error(y_test, linear_pred)),
             np.sqrt(mean_squared_error(y_test, knn_pred)),
             np.sqrt(mean_squared_error(y_test, etr_pred)),
             np.sqrt(mean_squared_error(y_test, lgbm_pred)),
             np.sqrt(mean_squared_error(y_test, xgb_pred)),
             np.sqrt(mean_squared_error(y_test, rf_pred)),
             np.sqrt(mean_squared_error(y_test, cat_pred))])
    mae_s.append([mean_absolute_error(y_test, linear_pred), mean_absolute_error(y_test, knn_pred),
            mean_absolute_error(y_test, etr_pred), mean_absolute_error(y_test, lgbm_pred),
            mean_absolute_error(y_test, xgb_pred), mean_absolute_error(y_test, rf_pred),
            mean_absolute_error(y_test, cat_pred)])
    r2_score_s.append([r2_score(y_test, linear_pred), r2_score(y_test, knn_pred),
                 r2_score(y_test, etr_pred), r2_score(y_test, lgbm_pred),
                 r2_score(y_test, xgb_pred), r2_score(y_test, rf_pred),
                 r2_score(y_test, cat_pred)])
mae_s = np.swapaxes(np.array(mae_s), 0, 1)
mse_s = np.swapaxes(np.array(mse_s), 0, 1)
rmse_s = np.swapaxes(np.array(rmse_s), 0, 1)
r2_score_s = np.swapaxes(np.array(r2_score_s), 0, 1)

In [21]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['LinearRegressor', 'KNeighborsRegressor', 'ExtraTreesRegressor', 'LGBMRegressor', 'XGBRegressor', 'RandomForestRegressor', 'CatBoostRegressor'],

    'mae': np.mean(mae_s, axis=1),

    'mse': np.mean(mse_s, axis=1),

    'rmse': np.mean(rmse_s, axis=1),
    'r2_score': np.mean(r2_score_s, axis=1)
})
best_model

Unnamed: 0,model,mae,mse,rmse,r2_score
0,LinearRegressor,4.985909,127.931424,11.310677,0.866144
1,KNeighborsRegressor,2.958373,62.410658,7.900042,0.934699
2,ExtraTreesRegressor,2.50842,40.376127,6.354221,0.957754
3,LGBMRegressor,2.303172,36.166327,6.013845,0.962159
4,XGBRegressor,2.334482,36.669758,6.055556,0.961632
5,RandomForestRegressor,2.462064,38.316124,6.190002,0.959909
6,CatBoostRegressor,2.2725,51.727216,7.192164,0.945877
