In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
# Read the dataset CSV into a DataFrame. The path is relative, so the file is
# looked up in the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='finalCarDataset.csv')

In [3]:
# Target: the `logPrice` column.
y = df['logPrice']

# Features: every remaining column except the target itself, the price column
# it would leak from, and 'Registration' / 'ModelFull'
# (presumably identifier-like columns -- TODO confirm).
non_feature_columns = ['Registration', 'Price(£)', 'ModelFull', 'logPrice']
X = df.drop(non_feature_columns, axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
def pipe_model(model, categorical_features=None, numerical_features=None):
    """Wrap ``model`` in a preprocessing + regression ``Pipeline``.

    The returned (unfitted) pipeline has two steps:

    * ``'preprocessing'`` -- a ``ColumnTransformer`` that one-hot encodes the
      categorical columns (``handle_unknown='ignore'``, so categories not seen
      during fitting do not raise at prediction time) and standardises the
      numerical columns with ``StandardScaler``.
    * ``'regressor'`` -- the estimator passed in as ``model``.

    Columns that appear in neither list are not forwarded to the regressor,
    because ``ColumnTransformer`` is created with its default ``remainder``.

    Parameters
    ----------
    model : estimator
        The final regressor to place at the end of the pipeline.
    categorical_features : list of str, optional
        Columns to one-hot encode. Defaults to
        ``['Brand', 'Model', 'Control', 'Fuel', 'Location', 'Emission']``.
    numerical_features : list of str, optional
        Columns to standardise. Defaults to
        ``['Year', 'Miles(k miles)', 'EngineSize', 'Doors']``.

    Returns
    -------
    Pipeline
        Unfitted pipeline combining the preprocessing and ``model``.
    """
    # ``None`` sentinels instead of list defaults: avoids sharing one mutable
    # list object between calls.
    if categorical_features is None:
        categorical_features = ['Brand', 'Model', 'Control', 'Fuel', 'Location', 'Emission']
    if numerical_features is None:
        numerical_features = ['Year', 'Miles(k miles)', 'EngineSize', 'Doors']

    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    numerical_transformer = StandardScaler()

    preprocessor = ColumnTransformer([
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
    ])

    regression_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('regressor', model),
    ])

    return regression_model


In [5]:
def calculate_metrics(model_name, y_actual_train, y_prediction_train, y_actual_test, y_prediction_test):
    """Print train/test regression metrics and return the test-set ones.

    Computes MAE, MSE and R2 for both the training and the test predictions,
    prints them under a header naming ``model_name`` (MAE/MSE with two
    decimals, R2 with three), and returns only the test-set values.

    Parameters
    ----------
    model_name : str
        Label used in the printed header.
    y_actual_train, y_prediction_train : array-like
        True and predicted targets for the training split.
    y_actual_test, y_prediction_test : array-like
        True and predicted targets for the test split.

    Returns
    -------
    tuple
        ``(mae_test, mse_test, r2_test)``.
    """
    scorers = (mean_absolute_error, mean_squared_error, r2_score)

    # Apply the same three scorers to each split.
    mae_train, mse_train, r2_train = [
        score(y_actual_train, y_prediction_train) for score in scorers
    ]
    mae_test, mse_test, r2_test = [
        score(y_actual_test, y_prediction_test) for score in scorers
    ]

    report_lines = (
        f'Metrics for {model_name}: ',
        f'MAE train = {mae_train:.2f}\nMAE test = {mae_test:.2f}',
        f'MSE train = {mse_train:.2f}\nMSE test = {mse_test:.2f}',
        f'R2 Score train = {r2_train:.3f}\nR2 Score test = {r2_test:.3f}',
    )
    for line in report_lines:
        print(line)

    return  mae_test, mse_test, r2_test

In [6]:
# ==== Fit ==== #
rf_model = pipe_model(
    RandomForestRegressor(
        n_estimators=300,
        max_depth=32,
        random_state=42,  # fixed seed so the fitted forest is reproducible
    )
)
rf_model.fit(X_train, y_train)

# ==== Predict ==== #
# The pipeline is fitted on `logPrice`, so predictions and targets are both
# passed through np.exp before scoring (assumes logPrice is the natural log of
# the price -- TODO confirm against the dataset preparation step).
y_actual_train, y_actual_test = np.exp(y_train), np.exp(y_test)
y_prediction_train_rf, y_prediction_test_rf = (
    np.exp(rf_model.predict(features)) for features in (X_train, X_test)
)

# ==== Evaluate ==== #
rf_test_mae, rf_test_mse, rf_test_r2 = calculate_metrics(
    model_name='Random Forest Regression',
    y_actual_train=y_actual_train,
    y_prediction_train=y_prediction_train_rf,
    y_actual_test=y_actual_test,
    y_prediction_test=y_prediction_test_rf,
)

Metrics for Random Forest Regression: 
MAE train = 1547.09
MAE test = 3172.53
MSE train = 39953941.75
MSE test = 63570496.85
R2 Score train = 0.933
R2 Score test = 0.836


In [7]:
# Persist the fitted pipeline (preprocessing + regressor) to disk. The call is
# the cell's last expression, so the list of written filenames is displayed.
joblib.dump(value=rf_model, filename="random_forest_carPrice.pkl")

['random_forest_carPrice.pkl']