In [2]:
import os
import sys
from pathlib import Path

PATH_BASE = Path().resolve().parent.parent
sys.path.append(PATH_BASE)

import pandas as pd
import numpy as np
import json

from datetime import datetime
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import joblib
from feature_engineering import CustomFeatureEngineering

RANDOM_STATE = 31415

In [3]:
with open(os.path.join(PATH_BASE, "config/db_credentials.json"), "r") as f:
    db_credentials = json.load(f)

In [4]:
def create_engine_connection(db_credentials: dict):
    return create_engine(
        f"postgresql+psycopg2://{db_credentials['user']}:{db_credentials['password']}@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['dbname']}"
    )

In [5]:
query = "SELECT * FROM public.cars_scraped"
engine = create_engine_connection(db_credentials)

In [6]:
df = pd.read_sql(query, engine)
# df = pd.read_parquet("/home/ubuntu/car_price_checker_2/data/feature_engineering/df.parquet")

In [7]:
df

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438961,438981,2025-03-23 11:50:18.954672,renault,r5,R5 E-Tech Iconic Cinq 110 kW (150 CV) Autonomí...,10,2024,10000,e,a,150,5,azul,prof,35400.0,31700.00,https://www.coches.com/coches-segunda-mano/oca...
438962,438982,2025-03-23 11:50:18.954672,renault,r5,R5 E-Tech Iconic Cinq 110 kW (150 CV) Autonomí...,10,2024,10000,e,a,150,5,blanco,prof,35700.0,32000.00,https://www.coches.com/coches-segunda-mano/oca...
438963,438983,2025-03-23 11:50:18.954672,renault,r5,1.1 TL,6,1984,154000,g,m,46,5,rojo,prof,135000.0,,https://www.coches.com/coches-segunda-mano/oca...
438964,438984,2025-03-23 11:50:18.954672,renault,r5,Supercinco 1.1 Five,1,1988,170000,g,m,49,3,blanco,part,4550.0,,https://www.coches.com/coches-segunda-mano/oca...


# Preprocessing

In [8]:
class PreprocessingPipeline:
    
	def __init__(self):
		self._pipeline = None

	def create_pipeline(self):

		def _create_feature_engineering_pipeline(self):
			return Pipeline(steps=[
				("feature_engineering", CustomFeatureEngineering())
			])

		def _create_preprocessing_transformers(self, y_train=None):
			"""
			Create the preprocessing pipeline

			Parameters:
			- y_train (pd.Series): Target variable.
			"""

			# Numeric features
			numeric_transformer = Pipeline(steps=[
				("scaler", StandardScaler())
			])
			
			# One hot encoding for low cardinality features
			low_cardinality_transformer = Pipeline(steps=[
				("onehot", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"))
			])

			# Target encoding for high cardinality features
			high_cardinality_transformer = Pipeline(steps=[
				("target", TargetEncoder(smoothing=10))
			])

			# Define the transformers
			transformers = [
				("num", numeric_transformer, self.numeric_features),
				("low_card", low_cardinality_transformer, self.low_cardinality_features),
				("high_card", high_cardinality_transformer, self.high_cardinality_features)
			]

			return ColumnTransformer(
				transformers=transformers,
				remainder="passthrough"
			)
		
		self._pipeline = Pipeline(steps=[
			("feature_engineering", _create_feature_engineering_pipeline(self)),
			("preprocessing", _create_preprocessing_transformers(self))
		])

		return self._pipeline

In [8]:
df.drop(columns=["price_cash"])

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438961,438981,2025-03-23 11:50:18.954672,renault,r5,R5 E-Tech Iconic Cinq 110 kW (150 CV) Autonomí...,10,2024,10000,e,a,150,5,azul,prof,31700.00,https://www.coches.com/coches-segunda-mano/oca...
438962,438982,2025-03-23 11:50:18.954672,renault,r5,R5 E-Tech Iconic Cinq 110 kW (150 CV) Autonomí...,10,2024,10000,e,a,150,5,blanco,prof,32000.00,https://www.coches.com/coches-segunda-mano/oca...
438963,438983,2025-03-23 11:50:18.954672,renault,r5,1.1 TL,6,1984,154000,g,m,46,5,rojo,prof,,https://www.coches.com/coches-segunda-mano/oca...
438964,438984,2025-03-23 11:50:18.954672,renault,r5,Supercinco 1.1 Five,1,1988,170000,g,m,49,3,blanco,part,,https://www.coches.com/coches-segunda-mano/oca...


In [9]:
preprocessor = PreprocessingPipeline()
X_train, X_test, y_train, y_test = preprocessor.get_preprocessed_data(df)



In [15]:
pd.DataFrame(X_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,...,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0,351172.0
mean,9.218368e-17,-7.817241e-15,1.425245e-16,2.767938e-17,1.180825e-16,0.000362,0.387511,0.026383,0.361823,1.4e-05,...,0.00817,0.00856,0.134678,0.033912,0.074118,0.007936,0.978085,21845.580122,21820.457991,21663.008311
std,1.000001,1.000001,1.000001,1.000001,1.000001,0.019014,0.487183,0.160272,0.480529,0.003773,...,0.090017,0.092123,0.341379,0.181003,0.261962,0.088732,0.146407,8266.82701,10623.396764,9012.796938
min,-1.274419,-13.81071,-1.38346,-2.260296,-7.211179,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9149.445605,4834.807382,3139.720641
25%,-0.9984382,-0.2526068,-0.7834014,-0.5282137,0.4296203,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15030.649745,15502.596234,16204.534231
50%,-0.1704961,0.02408912,-0.1706466,-0.2104004,0.4296203,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19706.80532,19671.032632,20495.656997
75%,0.9334267,0.577481,0.5629883,0.1074128,0.4296203,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,28239.920224,25515.991543,24356.980685
max,1.761369,2.791049,13.62336,32.68327,0.4296203,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,264462.079471,236607.330753,201259.831336


In [12]:
y_train

23919     17599.0
375712    27850.0
214480    35995.0
166621    33000.0
111847    10799.0
           ...   
3685      11000.0
288781    37900.0
14051     38890.0
399527    18290.0
286270    15599.0
Name: price_cash, Length: 351172, dtype: float64

In [11]:
X = df.drop(columns=["id", "created_at", "price_cash", "price_financed", "link"]).copy()
y = df["price_cash"].copy()

In [None]:
results = {}
PATH_MODELS_SCORES = "results/models_scores.parquet"
if os.path.exists(PATH_MODELS_SCORES):
    scores = pd.read_parquet(PATH_MODELS_SCORES)
    if scores["test_score_new"].max() > scores["test_score_old"].max():
        scores["test_score_old"] = scores["test_score_new"]
        scores["train_score_old"] = scores["train_score_new"]
else:
    scores = pd.DataFrame(index=models.keys(), columns=["test_score_new", "test_score_old", "diff_test", "train_score_new", "train_score_old", "diff_train"])
    scores["test_score_old"] = 0
    scores["train_score_old"] = 0
for model in models:
    print(f"Training model: {model}...")
    results[model] = {}
    results[model]["model"] = models["model"]
    results[model]["pipeline"] = CarPriceTrainingPipeline(models[model])
    results[model]["results"], results[model]["datasets"] = results[model]["pipeline"].train(X, y)
    scores.loc[model, "test_score_new"] = results[model]["results"]["test_score"]
    scores.loc[model, "train_score_new"] = results[model]["results"]["train_score"]
scores["diff_test"] = scores["test_score_new"] - scores["test_score_old"]
scores["diff_train"] = scores["train_score_new"] - scores["train_score_old"]
scores.sort_values(by="test_score_new", ascending=False, inplace=True)
scores.to_parquet(PATH_MODELS_SCORES)
best_model = scores.index[0]
scores

Training model: Linear Regression...
Training model: Ridge...
Training model: Lasso...


  model = cd_fast.enet_coordinate_descent(


Training model: Random Forest...
Training model: XGBoost...
Training model: GradientBoosting...
Training model: LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2260
[LightGBM] [Info] Number of data points in the train set: 186452, number of used features: 94
[LightGBM] [Info] Start training from score 21996.935453




Unnamed: 0,test_score_new,test_score_old,diff_test,train_score_new,train_score_old,diff_train
GradientBoosting,0.748079,0.751419,-0.003341,0.936414,0.934166,0.002248
XGBoost,0.694211,0.54483,0.149381,0.914676,0.858957,0.055718
Random Forest,0.648122,0.711332,-0.06321,0.873951,0.891645,-0.017694
LightGBM,0.434798,0.428168,0.00663,0.758297,0.741212,0.017086
Lasso,0.392805,0.371672,0.021133,0.593617,0.552379,0.041238
Linear Regression,0.3928,0.371672,0.021128,0.593618,0.552379,0.041239
Ridge,0.392791,0.371672,0.02112,0.593598,0.552379,0.041219


In [11]:
scores

Unnamed: 0,test_score_new,test_score_old,diff_test,train_score_new,train_score_old,diff_train
GradientBoosting,0.748079,0.751419,-0.003341,0.936414,0.934166,0.002248
XGBoost,0.694211,0.54483,0.149381,0.914676,0.858957,0.055718
Random Forest,0.648122,0.711332,-0.06321,0.873951,0.891645,-0.017694
LightGBM,0.434798,0.428168,0.00663,0.758297,0.741212,0.017086
Lasso,0.392805,0.371672,0.021133,0.593617,0.552379,0.041238
Linear Regression,0.3928,0.371672,0.021128,0.593618,0.552379,0.041239
Ridge,0.392791,0.371672,0.02112,0.593598,0.552379,0.041219


In [13]:
results[best_model]["model"].save_model(f"models/{best_model}.joblib")

AttributeError: 'str' object has no attribute 'save_model'

In [15]:
X_predict = pd.DataFrame({
    "manufacturer": ["audi"],
    "model": ["a4"],
    "version": [None],
    "month": [11],
    "year": [2017],
    "fuel": ["d"],
    "transmission": ["m"],
    "color": ["negro"],
    "kms": [128000],
    "power_hp": [150],
    "no_doors": [5],
    "seller": ["part"]
})

In [16]:
results[best_model]["pipeline"].predict(X_predict)

ValueError: columns are missing: {'age', 'brand_group', 'brand_country', 'kms_per_year', 'model_rel_freq', 'kms_per_year_bins', 'brand_rel_freq', 'model_segment', 'brand_exclusivity', 'age_bins', 'avg_model_price'}