In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import os 

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.decomposition import PCA

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

warnings.filterwarnings("ignore")



In [2]:
# Location of the cleaned dataset. The original absolute path is kept as the
# default so existing setups keep working; set the CLEAN_DF_PATH environment
# variable to point at the file on another machine.
link = os.environ.get("CLEAN_DF_PATH", r"D:/clean_df.csv")

df = pd.read_csv(link)

### Create man_period, stk_period

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,stk_year,transmission,door_count,seat_count,fuel_type,date_created,date_last_seen,price_eur
0,ford,galaxy,151000.0,2011.0,2000.0,138.12506,compact,,man,5.0,7.0,diesel,2015-11-14 18:10:06.838319+00,2016-01-27 20:40:15.46361+00,10584.75
1,skoda,octavia,143476.0,2012.0,2000.0,108.62262,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.853411+00,2016-01-27 20:40:15.46361+00,8882.31
2,bmw,,97676.0,2010.0,1995.0,113.9867,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.861792+00,2016-01-27 20:40:15.46361+00,12065.06
3,skoda,fabia,111970.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.872313+00,2016-01-27 20:40:15.46361+00,2960.77
4,skoda,fabia,128886.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.880335+00,2016-01-27 20:40:15.46361+00,2738.71


In [4]:
# Parse the listing-creation timestamp once and keep its calendar year.
created_at = pd.to_datetime(df['date_created'])
df['date_created'] = created_at
df['year_created'] = created_at.dt.year

# Each period is the number of years between the listing year and the
# corresponding year column (NaN wherever that year is missing).
for period_col, year_col in (('man_period', 'manufacture_year'),
                             ('stk_period', 'stk_year')):
    df[period_col] = df['year_created'] - df[year_col]

In [5]:
# The raw year / timestamp columns are superseded by man_period and stk_period.
df = df.drop(columns=['manufacture_year', 'stk_year', 'date_created', 'date_last_seen', 'year_created'])

# Fill missing categorical values with an explicit 'unknown' level.
# These are the columns that sat at positions [0, 1, 5, 6, 9] after the drop
# above; addressing them by name keeps the cell correct if the column order
# of the CSV ever changes.
categorical_cols = ['maker', 'model', 'body_type', 'transmission', 'fuel_type']
df[categorical_cols] = df[categorical_cols].fillna('unknown')

## Feature processing

In [6]:
# Separate the regression target (price_eur) from the predictors and hand
# both to scikit-learn as plain NumPy arrays.
features_df = df.drop(columns=['price_eur'])
X = features_df.to_numpy()
y = df.loc[:, 'price_eur'].to_numpy()

In [7]:
class Indicator(BaseEstimator, TransformerMixin):
    """Zero-fill numeric columns and append binary missing-value flags.

    ``fit`` learns which input columns contain missing values; ``transform``
    returns the input with NaN replaced by 0 (cast to int), followed by one
    0/1 flag column for every column that had missing values at fit time.

    Learning the flag columns in ``fit`` (rather than re-fitting a
    ``MissingIndicator`` inside ``transform``) keeps the output width
    identical for every dataset passed to ``transform``. When the same data
    is used for ``fit`` and ``transform`` the result matches the previous
    implementation.
    """

    def fit(self, X, y=None):
        """Record which columns of ``X`` contain missing values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Values convertible to float; NaN marks a missing entry.
        y : ignored

        Returns
        -------
        self
        """
        # error_on_new=False: a column that is complete at fit time but has
        # NaN at transform time is zero-filled without raising (it simply
        # gets no flag column).
        self.missing_indicator_ = MissingIndicator(error_on_new=False)
        self.missing_indicator_.fit(np.asarray(X, dtype=float))
        return self

    def transform(self, X, y=None):
        """Return ``[zero-filled values | missing flags]`` as an int array.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Same column layout as the data passed to ``fit``.
        y : ignored

        Returns
        -------
        numpy.ndarray of int
            The zero-filled values followed by the flag columns learned in
            ``fit``.
        """
        X_float = np.asarray(X, dtype=float)

        # NOTE(review): the int cast truncates fractional values (e.g. a
        # fractional engine_power); kept to preserve the original output.
        nonnull_X = np.nan_to_num(X_float, nan=0).astype(int)

        indicator_values = self.missing_indicator_.transform(X_float).astype(int)

        return np.c_[nonnull_X, indicator_values]

In [8]:
# Positional column indices into X (X is a NumPy array, so names are gone).
# Going by the frame built above, the layout should be:
#   0 maker, 1 model, 2 mileage, 3 engine_displacement, 4 engine_power,
#   5 body_type, 6 transmission, 7 door_count, 8 seat_count, 9 fuel_type,
#   10 man_period, 11 stk_period
# NOTE(review): index 1 (model) is in neither list, so the ColumnTransformer
# drops it (its default is remainder='drop') — confirm that is intended.
num_cols = [2,3,4,7,8,10,11]
cat_cols = [0,5,6,9]

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# name was later removed; fall back to the old keyword on older versions.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
indicator = Indicator()

# Numeric columns: zero-fill + missing flags; categorical columns: dense one-hot.
full_pipeline = ColumnTransformer([
    ("num", indicator, num_cols),
    ("cat", one_hot_encoder , cat_cols),
])

In [9]:
# Fit the column transformer on the full feature matrix, then encode that
# same matrix into the numeric design matrix used by every model below.
X_trans = full_pipeline.fit(X).transform(X)

## Model

### Linear regression

In [10]:
# Empty results table; each model cell below adds one row of CV metrics.
metric_columns = ["Model", "RMSE", "MAE", "R2"]
eva_df = pd.DataFrame(columns=metric_columns)

In [11]:
lin_reg = LinearRegression(fit_intercept=True)

# 5-fold cross-validation (shuffled with a fixed seed so the folds are reproducible).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(lin_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error','r2'] )

# scikit-learn reports errors as negated scores: flip the sign, then average over folds.
mse = -scores['test_neg_mean_squared_error'].mean()
mae = -scores['test_neg_mean_absolute_error'].mean()
R2 = scores['test_r2'].mean()

# DataFrame.append was removed in pandas 2.0, so the row is built as a one-row
# frame and concatenated. Any earlier row for this model is dropped first so
# re-running the cell does not duplicate it. RMSE is the square root of the
# fold-averaged MSE.
row = pd.DataFrame([{"Model": "Linear Regression", "RMSE" : mse**0.5, "MAE":mae ,"R2" : R2}])
eva_df = pd.concat([eva_df[eva_df["Model"] != "Linear Regression"], row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6697.777555,4215.4448,0.722627


### Lasso

In [12]:
# Disabled hyper-parameter search for the Lasso penalty strength (alpha),
# kept for reference only; nothing in this cell runs.
# NOTE(review): the alpha hard-coded in the next cell is presumably the value
# this search selected — confirm, or re-enable the search, before relying on it.

# # Create a model instance
# lasso_reg = Lasso()

# # Define the alpha values to be tested
# alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# # GridSearchCV
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# lasso_grid = GridSearchCV(estimator=lasso_reg, param_grid={'alpha': alphas}, cv=kfold, return_train_score=True)

# # Fit
# lasso_grid.fit(X_trans,y)

# # Alpha
# alpha = lasso_grid.best_params_['alpha']
# alpha

In [13]:
# L1 penalty strength for the Lasso model.
alpha = 0.1
lasso_reg = Lasso(alpha=alpha)

# 5-fold cross-validation (shuffled with a fixed seed so the folds are reproducible).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(lasso_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error','neg_mean_absolute_error', 'r2'] )

# scikit-learn reports errors as negated scores: flip the sign, then average over folds.
mse = -scores['test_neg_mean_squared_error'].mean()
mae = -scores['test_neg_mean_absolute_error'].mean()
R2 = scores['test_r2'].mean()

# DataFrame.append was removed in pandas 2.0, so the row is built as a one-row
# frame and concatenated. Any earlier row for this model is dropped first so
# re-running the cell does not duplicate it.
row = pd.DataFrame([{"Model": "LASSO regression", "RMSE" : mse**0.5, "MAE":mae, "R2" : R2}])
eva_df = pd.concat([eva_df[eva_df["Model"] != "LASSO regression"], row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6697.777555,4215.4448,0.722627
1,LASSO regression,6697.431707,4215.383759,0.722656


In [14]:
# Disabled hyper-parameter search for the Ridge penalty strength (alpha),
# kept for reference only; nothing in this cell runs.
# NOTE(review): the alpha hard-coded in the next cell is presumably the value
# this search selected — confirm, or re-enable the search, before relying on it.

# # Create a model instance
# ridge_reg = Ridge()

# # Define the alpha values to be tested
# alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# # GridSearchCV for alpha
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# ridge_grid = GridSearchCV(estimator=ridge_reg, param_grid={'alpha': alphas}, cv=kfold, return_train_score=True)

# # Fit
# ridge_grid.fit(X_trans,y)

# # Best alpha
# alpha = ridge_grid.best_params_['alpha']
# alpha

In [15]:
# L2 penalty strength for the Ridge model.
alpha=10
ridge_reg = Ridge(alpha=alpha)

# 5-fold cross-validation (shuffled with a fixed seed so the folds are reproducible).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(ridge_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error','neg_mean_absolute_error', 'r2'] )

# scikit-learn reports errors as negated scores: flip the sign, then average over folds.
mse = -scores['test_neg_mean_squared_error'].mean()
mae = -scores['test_neg_mean_absolute_error'].mean()
R2 = scores['test_r2'].mean()

# DataFrame.append was removed in pandas 2.0, so the row is built as a one-row
# frame and concatenated. Any earlier row for this model is dropped first so
# re-running the cell does not duplicate it.
row = pd.DataFrame([{"Model": "Ridge regression", "RMSE" : mse**0.5, "MAE":mae, "R2" : R2}])
eva_df = pd.concat([eva_df[eva_df["Model"] != "Ridge regression"], row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6697.777555,4215.4448,0.722627
1,LASSO regression,6697.431707,4215.383759,0.722656
2,Ridge regression,6697.496975,4215.592141,0.722651


### PCA

In [16]:
# Project the encoded features onto their first 10 principal components.
# random_state is fixed because PCA's default svd_solver='auto' may select
# the randomized solver for this kind of input, which would otherwise make
# the components (and every downstream metric) vary between runs.
# NOTE(review): the inputs are not standardised, so the components are
# presumably dominated by the largest-scale columns — confirm this is intended.
pca = PCA(n_components=10, random_state=42)
pca.fit(X_trans)
X_pca = pca.transform(X_trans)

In [17]:
pca_lin_reg = LinearRegression(fit_intercept=True)

# 5-fold cross-validation (shuffled with a fixed seed so the folds are reproducible).
# NOTE(review): X_pca comes from a PCA fitted on the full dataset, so each
# validation fold has already influenced the projection; wrapping PCA and the
# regressor in a Pipeline would refit PCA per fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(pca_lin_reg, X_pca, y, cv=kfold, scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error','r2'] )

# scikit-learn reports errors as negated scores: flip the sign, then average over folds.
mse = -scores['test_neg_mean_squared_error'].mean()
mae = -scores['test_neg_mean_absolute_error'].mean()
R2 = scores['test_r2'].mean()

# DataFrame.append was removed in pandas 2.0, so the row is built as a one-row
# frame and concatenated. Any earlier row for this model is dropped first so
# re-running the cell does not duplicate it.
row = pd.DataFrame([{"Model": "PCA Linear Regression", "RMSE" : mse**0.5, "MAE":mae ,"R2" : R2}])
eva_df = pd.concat([eva_df[eva_df["Model"] != "PCA Linear Regression"], row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6697.777555,4215.4448,0.722627
1,LASSO regression,6697.431707,4215.383759,0.722656
2,Ridge regression,6697.496975,4215.592141,0.722651
3,PCA Linear Regression,7371.57394,4665.662816,0.66401


### Neural network

In [18]:
import tensorflow.keras as keras




In [20]:
# Seed Python, NumPy and TensorFlow in one call so weight initialisation and
# batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(42)

# Hold out 20% of the rows for validation / early stopping.
X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, random_state=42)

# Small fully connected regressor: two ReLU hidden layers (40 and 10 units)
# with He-normal initialisation, and a single linear output for the price.
a_model = keras.models.Sequential([
    keras.layers.Dense(units = 40, activation="relu", kernel_initializer = "he_normal"),
    keras.layers.Dense(units = 10, activation="relu", kernel_initializer = "he_normal"),
    keras.layers.Dense(1)
])

a_model.compile(loss="mean_squared_error",
                optimizer='adam')

# Stop after 10 epochs without improvement in the monitored metric (Keras
# monitors val_loss by default) and roll back to the best weights seen.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Keep the History object so the learning curves can be inspected later;
# assigning it also avoids leaving a bare object repr as the cell output.
history = a_model.fit(X_train, y_train,
                      epochs = 500,
                      validation_data=(X_val, y_val),
                      callbacks = [early_stopping_cb])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500


<keras.src.callbacks.History at 0x2a3562a9340>