In [3]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import joblib
from feature_engineering import CustomFeatureEngineering

RANDOM_STATE = 31415

In [4]:
with open("config/db_credentials.json", "r") as f:
    db_credentials = json.load(f)

In [5]:
def create_engine_connection(db_credentials: dict):
    return create_engine(
        f"postgresql+psycopg2://{db_credentials['user']}:{db_credentials['password']}@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['dbname']}"
    )

In [6]:
query = "SELECT * FROM public.cars_scraped"
engine = create_engine_connection(db_credentials)

In [7]:
# df = pd.read_sql(query, engine)
df = pd.read_parquet("/home/ubuntu/car_price_checker_2/data/feature_engineering/df.parquet")

In [8]:
df

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,...,age_bins,kms_per_year,kms_per_year_bins,avg_model_price,model_segment,brand_rel_freq,brand_country,brand_exclusivity,brand_group,model_rel_freq
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,...,[6-9),19284.144578,medium,23708.674474,d,0.064398,germany,premium,volkswagen_group,0.007144
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,...,[0-3),19779.310345,medium,23708.674474,d,0.064398,germany,premium,volkswagen_group,0.007144
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,...,[0-3),16726.153846,medium,23708.674474,d,0.064398,germany,premium,volkswagen_group,0.007144
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,...,[12-15),14818.105263,medium,23708.674474,d,0.064398,germany,premium,volkswagen_group,0.007144
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,...,[3-6),18708.352941,medium,23708.674474,d,0.064398,germany,premium,volkswagen_group,0.007144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233061,233081,2025-03-02 00:56:46.320460,opel,kaddet comercial,Combi 9 1.6CDTi Biturbo S-S 29 L2 125,5,2018,87000,d,m,...,[6-9),12578.313253,medium,17114.382166,unknown,0.058331,germany,mainstream,stellantis,0.002695
233062,233082,2025-03-02 00:56:46.320460,opel,kaddet comercial,"Cargo L1H1 2,2t",11,2018,119005,d,m,...,[6-9),18546.233766,medium,17114.382166,unknown,0.058331,germany,mainstream,stellantis,0.002695
233063,233083,2025-03-02 00:56:46.320460,opel,kaddet comercial,Tour 1.3CDTI Expression L1H1 95,8,2017,56330,d,m,...,[6-9),7347.391304,low,17114.382166,unknown,0.058331,germany,mainstream,stellantis,0.002695
233064,233084,2025-03-02 00:56:46.320460,opel,kaddet comercial,CARGO L 650KG DIESEL 1.5 100HPS&S MT E6,3,2023,40064,d,m,...,[0-3),19230.720000,medium,17114.382166,unknown,0.058331,germany,mainstream,stellantis,0.002695


# Preprocessing

In [None]:
class CarPriceTrainingPipeline:

    def __init__(self, model=None):
        """
        Initialize the pipeline

        Parameters:
        - model: Model to use
        """
        # Define columns regarding their type
        cols_to_exclude = ["id", "created_at", "link", "price_cash", "price_financed"]
        self.numeric_features = [col for col in df.columns if (df[col].dtype in [np.int64, np.float64]) and col not in cols_to_exclude]
        # Separating high and low cardinality features
        self.high_cardinality_features = ["manufacturer", "model", "version"]
        self.low_cardinality_features = [col for col in df.columns if col not in self.numeric_features and col not in self.high_cardinality_features and col not in cols_to_exclude]

        # Initialize the pipeline
        self.preprocessor = None
        # Initialize the model
        if model is None:
            self.model = XGBRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=31415
            )
        else:
            self.model = model

    def create_feature_engineering_pipeline(self):
        self.feature_engineering = Pipeline(steps=[
            ("feature_engineering", CustomFeatureEngineering())
        ])


    def create_preprocessing_pipeline(self, y_train=None):
        """
        Create the preprocessing pipeline

        Parameters:
        - y_train (pd.Series): Target variable.
        """

        # Numeric features
        numeric_transformer = Pipeline(steps=[
            ("scaler", StandardScaler())
        ])
        
        # One hot encoding for low cardinality features
        low_cardinality_transformer = Pipeline(steps=[
            ("onehot", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"))
        ])

        # Target encoding for high cardinality features
        high_cardinality_transformer = Pipeline(steps=[
            ("target", TargetEncoder(smoothing=10))
        ])

        # Define the transformers
        transformers = [
            ("num", numeric_transformer, self.numeric_features),
            ("low_card", low_cardinality_transformer, self.low_cardinality_features),
        ]
        # Add high cardinality transformer if target variable is provided
        if y_train is not None and len(self.high_cardinality_features) > 0:
            transformers.append(("high_card", high_cardinality_transformer, self.high_cardinality_features))

        self.preprocessor = ColumnTransformer(transformers=transformers)

    def get_preprocessed_data(self, data, target_column='price_cash'):
        """
        Apply the preprocessing pipeline to the data and return the transformed data
        """
        X = data.drop(columns=[target_column])
        y = data[target_column]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31415)

        # Fit the preprocessor if it has not been fitted yet
        if self.preprocessor is None:
            self.create_preprocessing_pipeline(y_train)
            self.preprocessor.fit(X_train, y_train)

        # Transform the data
        X_train_transformed = self.preprocessor.transform(X_train)
        X_test_transformed = self.preprocessor.transform(X_test)
        
        return X_train_transformed, X_test_transformed, y_train, y_test

    def create_full_pipeline(self, y_train=None):
        """
        Create the full pipeline

        Parameters:
        - y_train (pd.Series): Target variable.
        """

        # Create the feature engineering pipeline
        self.create_feature_engineering_pipeline()

        # Create the preprocessing pipeline
        self.create_preprocessing_pipeline(y_train=y_train)

        # Create the full pipeline
        self.model = Pipeline(steps=[
            ("feature_engineering", self.feature_engineering),
            ("preprocessor", self.preprocessor),
            ("model", self.model)
        ])

    def train(self, X, y, verbose=False):
        """
        Train the model

        Parameters:
        - X (pd.DataFrame): Features.
        - y (pd.Series): Target variable.
        """

        # Divide the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31415)

        # Create and train the full pipeline
        self.create_full_pipeline(y_train=y_train)
        self.model.fit(X_train, y_train)

        # Evaluate the model
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)

        if verbose:
            print(f"\tR² train score: {train_score:.4f}")
            print(f"\tR² test score: {test_score:.4f}")

        return (
            ({"train_score": train_score, "test_score": test_score}),
            ({"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test})
        )
    
    def save_model(self, path: str = "car_price_model.joblib"):
        """
        Save the model

        Parameters:
        - path (str): Path to save the model.
        """

        if self.model is None:
            raise Exception("Model has not been trained yet")
        joblib.dump(self.model, path)

    def load_model(self, path: str = "car_price_model.joblib"):
        """
        Load the model

        Parameters:
        - path (str): Path to load the model.
        """

        self.model = joblib.load(path)
        self.preprocessor = self.model.named_steps["preprocessor"]

    def predict(self, X):
        """
        Predict the target variable

        Parameters:
        - X (pd.DataFrame): Features.
        """

        if self.model is None:
            raise Exception("Model has not been trained yet")
        return self.model.predict(X)
    
    def predict_proba(self, X):
        """
        Predict the target variable probabilities

        Parameters:
        - X (pd.DataFrame): Features.
        """

        if self.model is None:
            raise Exception("Model has not been trained yet")
        return self.model.predict_proba(X)

In [10]:

models = {
    "Linear Regression": LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01, max_iter=10000),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=RANDOM_STATE),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=RANDOM_STATE),
    'LightGBM': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=RANDOM_STATE),
    # 'CatBoost': CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_seed=42, verbose=0),
}

In [11]:
X = df.drop(columns=["id", "created_at", "price_cash", "price_financed", "link"]).copy()
y = df["price_cash"].copy()

In [None]:
results = {}
PATH_MODELS_SCORES = "results/models_scores.parquet"
if os.path.exists(PATH_MODELS_SCORES):
    scores = pd.read_parquet(PATH_MODELS_SCORES)
    if scores["test_score_new"].max() > scores["test_score_old"].max():
        scores["test_score_old"] = scores["test_score_new"]
        scores["train_score_old"] = scores["train_score_new"]
else:
    scores = pd.DataFrame(index=models.keys(), columns=["test_score_new", "test_score_old", "diff_test", "train_score_new", "train_score_old", "diff_train"])
    scores["test_score_old"] = 0
    scores["train_score_old"] = 0
for model in models:
    print(f"Training model: {model}...")
    results[model] = {}
    results[model]["model"] = models["model"]
    results[model]["pipeline"] = CarPriceTrainingPipeline(models[model])
    results[model]["results"], results[model]["datasets"] = results[model]["pipeline"].train(X, y)
    scores.loc[model, "test_score_new"] = results[model]["results"]["test_score"]
    scores.loc[model, "train_score_new"] = results[model]["results"]["train_score"]
scores["diff_test"] = scores["test_score_new"] - scores["test_score_old"]
scores["diff_train"] = scores["train_score_new"] - scores["train_score_old"]
scores.sort_values(by="test_score_new", ascending=False, inplace=True)
scores.to_parquet(PATH_MODELS_SCORES)
best_model = scores.index[0]
scores

Training model: Linear Regression...
Training model: Ridge...
Training model: Lasso...


  model = cd_fast.enet_coordinate_descent(


Training model: Random Forest...
Training model: XGBoost...
Training model: GradientBoosting...
Training model: LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2260
[LightGBM] [Info] Number of data points in the train set: 186452, number of used features: 94
[LightGBM] [Info] Start training from score 21996.935453




Unnamed: 0,test_score_new,test_score_old,diff_test,train_score_new,train_score_old,diff_train
GradientBoosting,0.748079,0.751419,-0.003341,0.936414,0.934166,0.002248
XGBoost,0.694211,0.54483,0.149381,0.914676,0.858957,0.055718
Random Forest,0.648122,0.711332,-0.06321,0.873951,0.891645,-0.017694
LightGBM,0.434798,0.428168,0.00663,0.758297,0.741212,0.017086
Lasso,0.392805,0.371672,0.021133,0.593617,0.552379,0.041238
Linear Regression,0.3928,0.371672,0.021128,0.593618,0.552379,0.041239
Ridge,0.392791,0.371672,0.02112,0.593598,0.552379,0.041219


In [11]:
scores

Unnamed: 0,test_score_new,test_score_old,diff_test,train_score_new,train_score_old,diff_train
GradientBoosting,0.748079,0.751419,-0.003341,0.936414,0.934166,0.002248
XGBoost,0.694211,0.54483,0.149381,0.914676,0.858957,0.055718
Random Forest,0.648122,0.711332,-0.06321,0.873951,0.891645,-0.017694
LightGBM,0.434798,0.428168,0.00663,0.758297,0.741212,0.017086
Lasso,0.392805,0.371672,0.021133,0.593617,0.552379,0.041238
Linear Regression,0.3928,0.371672,0.021128,0.593618,0.552379,0.041239
Ridge,0.392791,0.371672,0.02112,0.593598,0.552379,0.041219


In [13]:
results[best_model]["model"].save_model(f"models/{best_model}.joblib")

AttributeError: 'str' object has no attribute 'save_model'

In [15]:
X_predict = pd.DataFrame({
    "manufacturer": ["audi"],
    "model": ["a4"],
    "version": [None],
    "month": [11],
    "year": [2017],
    "fuel": ["d"],
    "transmission": ["m"],
    "color": ["negro"],
    "kms": [128000],
    "power_hp": [150],
    "no_doors": [5],
    "seller": ["part"]
})

In [16]:
results[best_model]["pipeline"].predict(X_predict)

ValueError: columns are missing: {'age', 'brand_group', 'brand_country', 'kms_per_year', 'model_rel_freq', 'kms_per_year_bins', 'brand_rel_freq', 'model_segment', 'brand_exclusivity', 'age_bins', 'avg_model_price'}