In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge

In [2]:
df = pd.read_csv('final_data_set_ready_for_model.csv')

# Feature matrix: every column except the target. It is taken before the
# target column is overwritten below, so it is unaffected by the transform.
x = df.drop(columns='price')

# The target is modelled on the log1p scale. The transformed values replace
# the raw 'price' column inside df as well, so df['price'] is log-scaled from
# this point on.
df['price'] = np.log1p(df['price'])
y = df['price']

# First cut: 60 % train, 40 % held out, stratified on engine_type.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
    stratify=x['engine_type'],
)

# Second cut: the held-out 40 % is halved into validation and test
# (20 % of the data each), again stratified on engine_type.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=x_temp['engine_type'],
)

In [3]:
# Categorical columns that get expanded into indicator columns.
categorical_columns = ['brand', 'engine_type']

# Learn the category vocabulary from the training split only, so that nothing
# about the validation/test rows feeds into preprocessing. A category that
# shows up only in validation/test is covered by handle_unknown='ignore',
# which encodes it as all zeros instead of raising.
one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
one_hot_encoder.fit(x_train[categorical_columns])

# Names of the generated indicator columns (first level of each feature dropped).
columns = one_hot_encoder.get_feature_names_out(categorical_columns)


def _one_hot(frame):
    """Return the indicator columns for ``frame`` as a DataFrame on ``frame``'s index.

    Keeping the original index is what lets the result be concatenated
    column-wise with the remaining columns of the same split.
    """
    return pd.DataFrame(
        one_hot_encoder.transform(frame[categorical_columns]).toarray(),
        columns=columns,
        index=frame.index,
    )


encoded_train = _one_hot(x_train)
encoded_val = _one_hot(x_val)
encoded_test = _one_hot(x_test)

# Swap the raw categorical columns for their indicator columns.
# NOTE: the split variables are rebound here, so re-running this cell without
# first re-running the split cell fails (the raw columns are already gone).
x_train = pd.concat([x_train.drop(columns=categorical_columns), encoded_train], axis=1)
x_val = pd.concat([x_val.drop(columns=categorical_columns), encoded_val], axis=1)
x_test = pd.concat([x_test.drop(columns=categorical_columns), encoded_test], axis=1)

In [4]:
def baseline():
    """Print validation MSE and MAE for two constant predictors.

    The constants are the mean and the median of the training target
    (``y_train``); each one is predicted for every row of ``y_val`` and
    scored against it. Nothing is returned.
    """
    n_val = len(y_val)

    # The statistics are passed as bound methods so each one is computed
    # right before it is scored, in the order mean -> median.
    for label, statistic in (("Mean", y_train.mean), ("Median", y_train.median)):
        constant_predictions = [statistic()] * n_val

        mse = mean_squared_error(y_val, constant_predictions)
        mae = mean_absolute_error(y_val, constant_predictions)
        print(f"Baseline ({label}) -> MSE: {mse}, MAE: {mae}")

In [5]:
def linear_model():
    """Fit a plain linear regression on the training split and print validation MSE/MAE.

    Reads the notebook-level ``x_train``/``y_train`` for fitting and
    ``x_val``/``y_val`` for scoring. Nothing is returned.
    """
    fitted = LinearRegression().fit(x_train, y_train)
    predicted = fitted.predict(x_val)

    print(
        f"Validation MSE: {mean_squared_error(y_val, predicted)}, "
        f"Validation MAE: {mean_absolute_error(y_val, predicted)}"
    )

In [6]:
def grid_search():
    """Tune a StandardScaler + Ridge pipeline with 5-fold CV and report its scores.

    The search runs on the notebook-level ``x_train``/``y_train`` and selects
    by (negated) mean squared error. The best parameters, the best
    cross-validated MSE and the validation-split MSE of the best pipeline are
    printed.

    Returns:
        The best pipeline found by the search (``best_estimator_``), so it
        can be reused without repeating the search. Earlier versions returned
        ``None``; callers that ignore the return value are unaffected.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge())
    ])

    param_grid = {
        'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__fit_intercept': [True, False]
    }

    # Named `search` rather than `grid_search` so the local does not shadow
    # this function's own name.
    search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
    search.fit(x_train, y_train)

    print(f"Best parameters: {search.best_params_}")
    # The scorer is *negative* MSE, so flip the sign to report a plain MSE.
    print(f"Best CV MSE: {-search.best_score_}")

    best_model = search.best_estimator_
    val_predictions_gs = best_model.predict(x_val)
    mse_gs = mean_squared_error(y_val, val_predictions_gs)
    print(f"Validation (GridSearch) MSE: {mse_gs}")

    return best_model

In [7]:
def evaluate_on_test(best_model=None):
    """Print MSE and MAE of the tuned Ridge pipeline on the test split.

    Args:
        best_model: An already-fitted estimator exposing ``predict``. When
            omitted (the default), the StandardScaler + Ridge pipeline is
            tuned from scratch with a 5-fold grid search on
            ``x_train``/``y_train`` and the best estimator is used, which is
            the original behaviour. Passing a model avoids repeating that
            search.

    Nothing is returned; the metrics are only printed.
    """
    if best_model is None:
        best_model = GridSearchCV(
            Pipeline([('scaler', StandardScaler()), ('model', Ridge())]),
            {'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100], 'model__fit_intercept': [True, False]},
            scoring='neg_mean_squared_error',
            cv=5
        ).fit(x_train, y_train).best_estimator_

    test_predictions = best_model.predict(x_test)
    mse_test = mean_squared_error(y_test, test_predictions)
    mae_test = mean_absolute_error(y_test, test_predictions)

    print(f"Test MSE: {mse_test}, Test MAE: {mae_test}")

In [8]:
# Run the whole comparison in order: constant baselines, plain linear
# regression, tuned Ridge on validation, then the final test-set score.
for step in (baseline, linear_model, grid_search, evaluate_on_test):
    step()

Baseline (Mean) -> MSE: 0.8131543996071204, MAE: 0.6971718741229858
Baseline (Median) -> MSE: 0.8221096629435507, MAE: 0.6963094041481983
Validation MSE: 0.1529364753735695, Validation MAE: 0.2680316449441729
Best parameters: {'model__alpha': 10, 'model__fit_intercept': True}
Best CV MSE: 0.15497540254998526
Validation (GridSearch) MSE: 0.12659523427313948
Test MSE: 0.11844389087995234, Test MAE: 0.25126200846354585
