# ML Models

In [1]:
import pandas as pd
import numpy as np

#### Helper functions

In [2]:
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt

from pickle import dump

def evaluate_model(y_test, y_pred, return_=False, res_plot=False):
    
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    residuals = y_test - y_pred
    res_mean = np.mean(residuals)
    res_std = np.std(residuals)

    if return_ == False:
        print(f'Mean Absolute Error: {mae:,.2f}')
        print(f'R-squared: {r2:.3%}')
        print(f'Residuals Mean: {res_mean:,.2f}, Residuals Std: {res_std:,.2f}')
    else:    
        return r2, mae, res_mean, res_std
    
    if res_plot:
        fig, ax = plt.subplots(1, 2, figsize=(15, 6))
        
        ax[0].scatter(y_test, y_pred, alpha=0.2, s=5, label='Predicted vs Target data')
        ax[0].axline((0,0), slope=1, lw=1, ls=':', c='red', label='y=x')
        ax[0].legend()
        ax[0].set_title('Residuals Plot')

        ax[1].hist(residuals, density=True, bins=100)
        ax[1].set_title('Residuals Distribution')
        
        plt.show()

def save_model(model, filename):
    MODEL_SAVE_PATH = r'C:/Users/Nick/Documents/Data Science/Personal projects/car-price-prediction/models/'
    with open(MODEL_SAVE_PATH + filename + '.pkl', 'wb') as f:
        dump(model, f, protocol=5)
    print(f'model saved at {MODEL_SAVE_PATH}')

#### Load & Process data

In [3]:
DATA_PATH = r'C:/Users/Nick/Documents/Data Science/Personal projects/car-price-prediction/data/clean/'
FILE_NAME = 'data_clean_20240811.csv'

data = pd.read_csv(DATA_PATH+FILE_NAME, sep=';')

data.drop(['FullName', 'ProductionDate'], axis=1, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split

data_processed = pd.get_dummies(data, prefix_sep = '_')

X = data_processed.drop('Price', axis=1)
y = data_processed['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

## Models building

[*State of Competitive ML report 2023*](https://mlcontests.com/state-of-competitive-machine-learning-2023/)

### LightGBM

In [5]:
from lightgbm import LGBMRegressor
import optuna

In [6]:
lgbm_reg = LGBMRegressor(verbose=-1, n_jobs=-1)
lgbm_reg.fit(X_train, y_train)
y_pred = lgbm_reg.predict(X_test)
evaluate_model(y_test, y_pred)

Mean Absolute Error: 2,584.07
R-squared: 88.219%
Residuals Mean: -34.23, Residuals Std: 5,548.64


In [7]:
# optuna tuning
def objective(trial):
    params = {'n_estimators' : trial.suggest_int('n_estimators', 100, 5_000),
              'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5),
              'max_depth' : trial.suggest_int('max_depth', 1, 15)
            }
    
    lgbm_reg = LGBMRegressor(verbose=-1, n_jobs=-1, **params)
    lgbm_reg.fit(X_train, y_train)
    y_pred = lgbm_reg.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    
    return mae

study = optuna.create_study(direction='minimize', study_name='lgbm_opt')
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2024-08-20 00:47:29,690] A new study created in memory with name: lgbm_opt
[I 2024-08-20 00:47:31,674] Trial 0 finished with value: 2622.308620700378 and parameters: {'n_estimators': 164, 'learning_rate': 0.45515013708498286, 'max_depth': 10}. Best is trial 0 with value: 2622.308620700378.
[I 2024-08-20 00:47:32,401] Trial 2 finished with value: 2568.6973373564433 and parameters: {'n_estimators': 247, 'learning_rate': 0.3750173000206759, 'max_depth': 10}. Best is trial 2 with value: 2568.6973373564433.
[I 2024-08-20 00:47:33,196] Trial 1 finished with value: 2661.2224334156795 and parameters: {'n_estimators': 2029, 'learning_rate': 0.28553903578620177, 'max_depth': 2}. Best is trial 2 with value: 2568.6973373564433.
[I 2024-08-20 00:47:35,913] Trial 4 finished with value: 2467.121110405737 and parameters: {'n_estimators': 816, 'learning_rate': 0.16167423689239363, 'max_depth': 7}. Best is trial 4 with value: 2467.121110405737.
[I 2024-08-20 00:47:36,660] Trial 6 finished with value:

In [8]:
# best_params = study.best_params
# print(f'Best Parameters: {best_params}')
best_params = {'n_estimators': 4317, 'learning_rate': 0.07676003542347755, 'max_depth': 4}
tuned_lgbm_reg = LGBMRegressor(verbose=-1, n_jobs=-1, **best_params)
tuned_lgbm_reg.fit(X_train, y_train)
y_pred = tuned_lgbm_reg.predict(X_test)
evaluate_model(y_test, y_pred)

Mean Absolute Error: 2,444.79
R-squared: 89.104%
Residuals Mean: -33.97, Residuals Std: 5,336.35


In [9]:
save_model(tuned_lgbm_reg, 'tuned_lgbm_20240820_2120')

model saved at C:/Users/Nick/Documents/Data Science/Personal projects/car-price-prediction/models/


### XGBoost [TO DO]

In [5]:
from xgboost import XGBRegressor

Trees to dataframe function

In [124]:
def xgb_trees_to_dataframe(xgbregressor):

    trees_dump = xgbregressor.get_booster().get_dump(with_stats=True) # each line is a tree

    trees_df = pd.DataFrame(columns=['Tree_Id', 'Node', 'Feature', 'Condition','Yes', 'No', 'Missing', 'Gain', 'Leaf', 'Cover'])

    for tree_id, tree in enumerate(trees_dump):
        for line in tree.splitlines():
            stripped_line = line.strip()
            stripped_line = stripped_line.replace(' ', ',')
            split = stripped_line.split(',')
            split_ = split[1:]
            node = split[0][0]
            feature_leaf = split[0][2:]
            final_split = [node, feature_leaf] + split_
            
            node_ = node
            
            if final_split[1][0] == '[':
                feature_split = final_split[1].split('<')
                feature_ = feature_split[0].strip('[').strip(']')
                if len(feature_split) > 1:
                    condition_ = f'<{feature_split[1]}'.strip(']')
                else:
                    condition_ = np.nan
                yes_ = final_split[2]
                no_ = final_split[3]
                if final_split[4].startswith('missing'):
                    missing_ = final_split[4]
                    gain_ = final_split[5][5:]
                    cover_ = final_split[6][6:]
                else:
                    missing_ = np.nan
                    gain_ = final_split[4][5:]
                    cover_ = final_split[5][6:]
                
                leaf_ = np.nan
            
            else:
                feature_ = np.nan
                condition_ = np.nan
                yes_ = np.nan
                no_ = np.nan
                missing_ = np.nan
                gain_ = np.nan
                leaf_ = final_split[1][5:]
                cover_ = final_split[2][6:]
            
            # print(final_split)
            trees_df.loc[len(trees_df)] = (tree_id, node_, feature_, condition_, yes_, no_, missing_, gain_, leaf_, cover_)
    
    return trees_df

In [126]:
xgb_reg = XGBRegressor(max_depth=2)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
evaluate_model(y_test, y_pred)

trees_structure = xgb_trees_to_dataframe(xgb_reg)
trees_structure

Mean Absolute Error: 3,051.20
R-squared: 84.480%
Residuals Mean: -2.24, Residuals Std: 6,368.81


Unnamed: 0,Tree_Id,Node,Feature,Condition,Yes,No,Missing,Gain,Leaf,Cover
0,0,0,Horsepower,<306,yes=1,no=2,missing=2,1.10603928e+12,,14164
1,0,1,GearBox_Manual,,yes=3,no=4,,3.09877211e+11,,13496
2,0,3,,,,,,,1856.77527,3462
3,0,4,,,,,,,-1434.5332,10034
4,0,2,Age,<12,yes=5,no=6,missing=6,4.02776064e+11,,668
...,...,...,...,...,...,...,...,...,...,...
683,99,3,,,,,,,-0.562374294,14099
684,99,4,,,,,,,1357.11743,25
685,99,2,CubicCapacity,<1994,yes=5,no=6,missing=6,1.53136627e+09,,40
686,99,5,,,,,,,909.307068,24


In [14]:
save_model(xgb_reg, 'xgb_reg_20240820_2123')

model saved at C:/Users/Nick/Documents/Data Science/Personal projects/car-price-prediction/models/


In [35]:
# optuna tuning
def objective(trial):
    params = {'n_estimators' : trial.suggest_int('n_estimators', 100, 5_000),
              'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5),
              'max_depth' : trial.suggest_int('max_depth', 1, 15)
            }
    
    xgb_reg = XGBRegressor(**params)
    xgb_reg.fit(X_train, y_train)
    y_pred = xgb_reg.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    
    return mae

study = optuna.create_study(direction='minimize', study_name='xgb_opt')
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2024-08-20 00:34:51,272] A new study created in memory with name: xgb_opt
[I 2024-08-20 00:35:03,879] Trial 3 finished with value: 4355.758518610051 and parameters: {'n_estimators': 893, 'learning_rate': 0.1551686214737238, 'max_depth': 1}. Best is trial 3 with value: 4355.758518610051.
[I 2024-08-20 00:35:24,021] Trial 1 finished with value: 4383.768679261914 and parameters: {'n_estimators': 2911, 'learning_rate': 0.16529716099808972, 'max_depth': 1}. Best is trial 3 with value: 4355.758518610051.
[I 2024-08-20 00:35:24,292] Trial 0 finished with value: 4393.196522384941 and parameters: {'n_estimators': 2921, 'learning_rate': 0.2552310267893954, 'max_depth': 1}. Best is trial 3 with value: 4355.758518610051.
[I 2024-08-20 00:36:21,450] Trial 5 finished with value: 2596.3189754513014 and parameters: {'n_estimators': 2952, 'learning_rate': 0.3798868538293556, 'max_depth': 4}. Best is trial 5 with value: 2596.3189754513014.
[I 2024-08-20 00:36:58,758] Trial 2 finished with value: 2455

In [33]:
best_params = study.best_params
print(f'Best Parameters: {best_params}')
lgbm_reg = LGBMRegressor(verbose=-1, n_jobs=-1, **best_params)
lgbm_reg.fit(X_train, y_train)
y_pred = lgbm_reg.predict(X_test)
evaluate_model(y_test, y_pred)

Best Parameters: {'n_estimators': 2761, 'learning_rate': 0.2221187740532873, 'max_depth': 8}
Mean Absolute Error: 2,595.74
R-squared: 83.889%
Residuals Mean: -26.76, Residuals Std: 6,488.82


### CatBoost [TO DO]

### SHAP values [TO DO]