In [17]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import joblib

import os 

import warnings
warnings.filterwarnings("ignore")

### Import data

In [18]:
# Location of the pre-cleaned listings CSV (absolute, Windows-style path).
link = r"D:/clean_df.csv"
df = pd.read_csv(filepath_or_buffer=link)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,stk_year,transmission,door_count,seat_count,fuel_type,date_created,date_last_seen,price_eur
0,ford,galaxy,151000.0,2011.0,2000.0,138.12506,compact,,man,5.0,7.0,diesel,2015-11-14 18:10:06.838319+00,2016-01-27 20:40:15.46361+00,10584.75
1,skoda,octavia,143476.0,2012.0,2000.0,108.62262,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.853411+00,2016-01-27 20:40:15.46361+00,8882.31
2,bmw,,97676.0,2010.0,1995.0,113.9867,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.861792+00,2016-01-27 20:40:15.46361+00,12065.06
3,skoda,fabia,111970.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.872313+00,2016-01-27 20:40:15.46361+00,2960.77
4,skoda,fabia,128886.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.880335+00,2016-01-27 20:40:15.46361+00,2738.71


### Create man_period, stk_period

In [19]:
# Parse the listing timestamp and keep its calendar year as a helper column.
df['date_created'] = pd.to_datetime(df['date_created'])
df['year_created'] = df['date_created'].dt.year

# Express both year columns as "years before the listing was created".
listing_year = df['year_created']
df['man_period'] = listing_year - df['manufacture_year']
df['stk_period'] = listing_year - df['stk_year']

### Drop unused columns and impute categorical columns for faster one-hot encoding

In [20]:
# Remove the raw year/date columns now that the period features exist.
df = df.drop(
    columns=['manufacture_year', 'stk_year', 'date_created', 'date_last_seen', 'year_created']
)

# Positional indices of the columns to fill with the literal 'unknown'.
# NOTE(review): presumably maker, model, body_type, transmission and fuel_type,
# judging by the preview of the loaded frame; confirm if the CSV layout changes.
categorical_positions = [0, 1, 5, 6, 9]
df.iloc[:, categorical_positions] = df.iloc[:, categorical_positions].fillna('unknown')

### Split features and target

In [21]:
# Target vector: the listing price in EUR.
y = df['price_eur'].to_numpy()

# Feature matrix: every remaining column, as a plain NumPy array
# (downstream transformers address columns by position).
X = df.drop(columns='price_eur').to_numpy()

### Feature processing

In [22]:
class Indicator(BaseEstimator, TransformerMixin):
    """Zero-fill numeric features and append binary missing-value flags.

    ``fit`` learns which input columns contain missing values. ``transform``
    returns the input with NaNs replaced by 0 and cast to ``int``, followed by
    one 0/1 column per column that had missing values during ``fit``.

    The indicator is learned in ``fit`` (not rebuilt inside ``transform``) so
    that the number and meaning of the output columns stay the same for every
    batch passed to ``transform``, regardless of which columns happen to
    contain NaNs in that batch.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record which columns of ``X`` contain missing values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric features; missing entries are NaN.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # error_on_new=False: a column that was complete during fit but has
        # NaNs later is still zero-filled, it just gets no indicator column.
        self.missing_indicator_ = MissingIndicator(error_on_new=False)
        self.missing_indicator_.fit(X)
        return self

    def transform(self, X, y=None):
        """Return zero-filled integer features plus missing-value flags.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric features; missing entries are NaN.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            The filled features followed by the indicator columns.
        """
        # NOTE(review): the final astype(int) truncates fractional values;
        # kept as-is so the produced features do not change.
        nonnull_X = np.nan_to_num(np.asarray(X).astype(float), nan=0).astype(int)

        fitted_indicator = getattr(self, "missing_indicator_", None)
        if fitted_indicator is None:
            # Backward-compatible fallback for callers that invoke transform()
            # without fit(): derive the indicator from this batch.
            indicator_values = MissingIndicator().fit_transform(X).astype(int)
        else:
            indicator_values = fitted_indicator.transform(X).astype(int)

        return np.c_[nonnull_X, indicator_values]

In [23]:
# Positional column indices within X, grouped by how they are preprocessed.
cat_cols = [0, 1, 5, 6, 9]
num_cols = [2, 3, 4, 7, 8, 10, 11]

# Numeric columns go through Indicator.
indicator = Indicator()
# Categorical columns are one-hot encoded; categories unseen during fit are ignored.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

full_pipeline = ColumnTransformer(
    transformers=[
        ("num", indicator, num_cols),
        ("cat", one_hot_encoder, cat_cols),
    ]
)

In [24]:
# Fit the preprocessing on the full feature matrix and encode it in one call.
# NOTE(review): the transformers are fitted before the train/test split below,
# so they see the test rows as well.
X_trans = full_pipeline.fit_transform(X)

In [25]:
# Hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.2,
    random_state=42,
)

# Split a further 20% of the remaining rows off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)

## Model

In [26]:
# Empty results table; each evaluated model contributes one row.
eva_df = pd.DataFrame(
    columns=[
        "Model",
        "RMSE",
        "MAE",
        "R2",
    ]
)

### Linear regression

In [27]:
# lin_reg = LinearRegression(fit_intercept=True)

# # K-fold cross validation with k=5
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_validate(lin_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error','r2'] )

# mse = -scores['test_neg_mean_squared_error'].mean()
# mae = -scores['test_neg_mean_absolute_error'].mean()
# R2 = scores['test_r2'].mean()

# eva_df = eva_df.append({"Model": "Linear Regression", "RMSE" : mse**0.5, "MAE":mae ,"R2" : R2},ignore_index=True)
# eva_df

In [28]:
# Fit ordinary least squares on the train+validation rows and score it once
# on the held-out test rows.
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train_val, y_train_val)

y_pred = lin_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
result_row = pd.DataFrame(
    [{"Model": "Linear Regression", "RMSE": mse**0.5, "MAE": mae, "R2": R2}]
)
eva_df = pd.concat([eva_df, result_row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6418.692939,4005.508958,0.743824


In [29]:
# joblib.dump does not create missing folders, so make sure the output
# directory exists before writing the fitted model.
os.makedirs('./model', exist_ok=True)
joblib.dump(lin_reg, './model/linear_regression.joblib')

['linear_regression.joblib']

### Lasso

In [30]:
# # Create a model instance
# lasso_reg = Lasso()

# # Define the alpha values to be tested
# alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# # GridSearchCV
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# lasso_grid = GridSearchCV(estimator=lasso_reg, param_grid={'alpha': alphas}, cv=kfold, return_train_score=True)

# # Fit
# lasso_grid.fit(X_trans,y)

# # Alpha
# alpha = lasso_grid.best_params_['alpha']
# alpha = 0.1

In [31]:
# alpha = 0.1
# # K-fold cross validation with k=5
# lasso_reg = Lasso(alpha=alpha)

# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_validate(lasso_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error','neg_mean_absolute_error', 'r2'] )

# # Adding metrics to the evaluation dataframe
# mse = -scores['test_neg_mean_squared_error'].mean()
# mae = -scores['test_neg_mean_absolute_error'].mean()
# R2 = scores['test_r2'].mean()

# eva_df = eva_df.append({"Model": "LASSO regression", "RMSE" : mse**0.5, "MAE":mae, "R2" : R2},ignore_index=True)
# eva_df

In [32]:
# Fit Lasso with a fixed regularisation strength (alpha=0.1) on the
# train+validation rows and score it once on the held-out test rows.
lasso_reg = Lasso(fit_intercept=True, alpha=0.1)
lasso_reg.fit(X_train_val, y_train_val)

y_pred = lasso_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
result_row = pd.DataFrame(
    [{"Model": "LASSO regression", "RMSE": mse**0.5, "MAE": mae, "R2": R2}]
)
eva_df = pd.concat([eva_df, result_row], ignore_index=True)
eva_df

In [None]:
# joblib.dump does not create missing folders, so make sure the output
# directory exists before writing the fitted model.
os.makedirs('./model', exist_ok=True)
joblib.dump(lasso_reg, './model/LASSO.joblib')

### Ridge

In [None]:
# # Create a model instance
# ridge_reg = Ridge()

# # Define the alpha values to be tested
# alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# # GridSearchCV for alpha
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# ridge_grid = GridSearchCV(estimator=ridge_reg, param_grid={'alpha': alphas}, cv=kfold, return_train_score=True)

# # Fit
# ridge_grid.fit(X_trans,y)

# # Best alpha
# alpha = ridge_grid.best_params_['alpha']
# alpha

In [None]:
# alpha=10
# # K-fold cross validation with k=5
# ridge_reg = Ridge(alpha=alpha)

# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_validate(ridge_reg, X_trans, y, cv=kfold, scoring = ['neg_mean_squared_error','neg_mean_absolute_error', 'r2'] )
# # Adding metrics to the evaluation dataframe
# mse = -scores['test_neg_mean_squared_error'].mean()
# mae = -scores['test_neg_mean_absolute_error'].mean()
# R2 = scores['test_r2'].mean()

# eva_df = eva_df.append({"Model": "Ridge regression", "RMSE" : mse**0.5, "MAE":mae, "R2" : R2},ignore_index=True)
# eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6697.777555,4215.4448,0.722627
1,LASSO regression,6697.431707,4215.383759,0.722656
2,Ridge regression,6697.496975,4215.592141,0.722651


In [None]:
# Fit Ridge with a fixed regularisation strength (alpha=10) on the
# train+validation rows and score it once on the held-out test rows.
ridge_reg = Ridge(fit_intercept=True, alpha=10)
ridge_reg.fit(X_train_val, y_train_val)

y_pred = ridge_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
result_row = pd.DataFrame(
    [{"Model": "Ridge regression", "RMSE": mse**0.5, "MAE": mae, "R2": R2}]
)
eva_df = pd.concat([eva_df, result_row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6691.190191,4213.63101,0.721611
1,LASSO regression,6690.413671,4213.44705,0.721676
2,Ridge regression,6690.620707,4213.699945,0.721659


In [None]:
# joblib.dump does not create missing folders, so make sure the output
# directory exists before writing the fitted model.
os.makedirs('./model', exist_ok=True)
joblib.dump(ridge_reg, './model/ridge_regression.joblib')

### PCA

In [None]:
# Learn a 10-component projection from the train+validation rows only,
# then project those same rows onto it.
pca = PCA(n_components=10).fit(X_train_val)
X_train_pca = pca.transform(X_train_val)

In [None]:
# Sanity check: dimensions of the test rows after projection.
np.shape(pca.transform(X_test))

(449643, 10)

In [None]:
# Fit ordinary least squares on the PCA-projected train+validation rows.
pca_lin_reg = LinearRegression(fit_intercept=True)
pca_lin_reg.fit(X_train_pca, y_train_val)

# The test rows are projected with the same fitted PCA before predicting.
X_test_pca = pca.transform(X_test)
y_pred = pca_lin_reg.predict(X_test_pca)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
result_row = pd.DataFrame(
    [{"Model": "Linear regression with PCA", "RMSE": mse**0.5, "MAE": mae, "R2": R2}]
)
eva_df = pd.concat([eva_df, result_row], ignore_index=True)
eva_df

Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,6691.190191,4213.63101,0.721611
1,LASSO regression,6690.413671,4213.44705,0.721676
2,Ridge regression,6690.620707,4213.699945,0.721659
3,Linear regression with PCA,7369.2715,4661.315633,0.662329


In [None]:
# joblib.dump does not create missing folders, so make sure the output
# directory exists before writing the fitted model.
os.makedirs('./model', exist_ok=True)
joblib.dump(pca_lin_reg, './model/pca_linear_regression.joblib')