In [38]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
import joblib

In [14]:
# Read the database connection settings from the local JSON config file.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which differs between operating systems).
with open("config/db_credentials.json", "r", encoding="utf-8") as f:
    db_credentials = json.load(f)

In [15]:
def create_engine_connection(db_credentials: dict):
    """
    Build a SQLAlchemy engine for a PostgreSQL database (psycopg2 driver).

    Parameters:
    - db_credentials (dict): Must contain the keys "user", "password", "host",
      "port" and "dbname".

    Returns:
    - The object returned by `create_engine` for the assembled URL.

    Raises:
    - KeyError: If one of the required keys is missing.
    """
    # Local import keeps this cell self-contained (standard library only).
    from urllib.parse import quote

    # Percent-encode the credentials: characters such as "@", ":", "/" or "%"
    # in a user name or password would otherwise be read as URL delimiters.
    # safe="" also encodes "/" and avoids the "+"-for-space form.
    user = quote(str(db_credentials["user"]), safe="")
    password = quote(str(db_credentials["password"]), safe="")

    return create_engine(
        f"postgresql+psycopg2://{user}:{password}"
        f"@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['dbname']}"
    )

In [16]:
query = "SELECT * FROM public.cars_scraped"
engine = create_engine_connection(db_credentials)

In [17]:
df = pd.read_sql(query, engine)

In [18]:
df

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154653,154673,2025-02-20 21:29:38.344217,tesla,model y,Tracción Trasera RWD 188 kW (256 CV),3,2024,11421,e,a,256,5,blanco,prof,39990.0,36355.00,https://www.coches.com/coches-segunda-mano/oca...
154654,154674,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomía 4WD 378 kW (514 CV),11,2021,84299,e,a,514,5,negro,prof,39490.0,35900.00,https://www.coches.com/coches-segunda-mano/oca...
154655,154675,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomia AWD,1,2022,45000,e,a,351,5,negro,part,37850.0,489.69,https://www.coches.com/coches-segunda-mano/oca...
154656,154676,2025-02-20 21:29:38.344217,tesla,model y,RWD,1,2024,2280,e,a,255,5,gris,part,44000.0,485.60,https://www.coches.com/coches-segunda-mano/oca...


In [33]:
# Subset of the scraped listings restricted to a single manufacturer.
df_audi = df[df["manufacturer"].eq("audi")]
df_audi

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14770,6431,2025-02-12 06:18:08.863357,audi,q3,45 TFSI e 180kW (245CV) S tronic,7,2021,25000,Híbrido Enchufable Gasolina,a,244,5,gris,prof,32850.0,,https://www.coches.com/coches-segunda-mano/oca...
14771,6449,2025-02-12 06:18:08.863357,audi,q3,Advanced 45 TFSIe 180 kW (245 CV) S tronic,10,2021,43377,Híbrido Enchufable Gasolina,a,245,5,blanco,prof,31300.0,,https://www.coches.com/coches-segunda-mano/oca...
14772,6451,2025-02-12 06:18:08.863357,audi,q3,45 TFSI e 180kW (245CV) S tronic,7,2021,25000,Híbrido Enchufable Gasolina,a,244,5,gris,prof,32850.0,,https://www.coches.com/coches-segunda-mano/oca...
14773,6469,2025-02-12 06:18:08.863357,audi,q3,Advanced 45 TFSIe 180 kW (245 CV) S tronic,10,2021,43377,Híbrido Enchufable Gasolina,a,245,5,blanco,prof,31300.0,,https://www.coches.com/coches-segunda-mano/oca...


In [34]:
# Narrow further to one manufacturer/model pair.
df_audi_a4 = df[df["manufacturer"].eq("audi") & df["model"].eq("a4")]
df_audi_a4

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,823,2025-02-11 23:50:19.307619,audi,a4,a4 2.0 tdi 110kw150cv advanced edition,2,2016,145000,d,m,1,4,blanco,prof,15900.0,13900.00,https://www.coches.com/coches-segunda-mano/oca...
823,824,2025-02-11 23:50:19.307619,audi,a4,Advanced 35TFSI 150cv MHEV,1,2021,37150,hg,m,150,5,negro,prof,27490.0,,https://www.coches.com/coches-segunda-mano/oca...
824,825,2025-02-11 23:50:19.307619,audi,a4,2.0TDI DPF 143,3,2008,148841,d,m,143,4,gris,prof,7900.0,7800.00,https://www.coches.com/coches-segunda-mano/oca...
825,826,2025-02-11 23:50:19.307619,audi,a4,3.0 TFSI quattro 245 kW (333 CV) S tronic,12,2014,195100,g,a,333,5,blanco,prof,22900.0,592.26,https://www.coches.com/coches-segunda-mano/oca...


# Preprocessing

In [24]:
class CarPriceTrainingPipeline:
    """
    Preprocessing + regression pipeline for car price prediction.

    Numeric columns are standardized, low-cardinality categorical columns are
    one-hot encoded and high-cardinality categorical columns are target
    encoded. The transformed features feed the estimator given to the
    constructor.
    """

    def __init__(self, model=None, test_size=0.2, random_state=31415):
        """
        Initialize the pipeline

        Parameters:
        - model: Estimator to train. When None, an XGBRegressor with
          100 estimators and max depth 10 is used.
        - test_size (float): Fraction of the rows held out for evaluation.
        - random_state (int): Seed used for the train/test split and for the
          default model.
        """
        # Define columns regarding their type
        self.numeric_features = ["month", "year", "kms", "power_hp", "no_doors"]
        # Separating high and low cardinality features
        # NOTE(review): "version" looks like near-unique free text; target
        # encoding it may memorize the training target — worth verifying
        # against the train/test gap.
        self.high_cardinality_features = ["manufacturer", "model", "version"]
        self.low_cardinality_features = ["fuel", "transmission", "color", "seller"]

        # Split configuration shared by get_preprocessed_data() and train()
        self.test_size = test_size
        self.random_state = random_state

        # The preprocessor is built lazily, once a target is available
        self.preprocessor = None
        # Becomes True after train() or load_model(); `self.model` is never
        # None after construction, so it cannot signal "trained" on its own.
        self._is_fitted = False

        # Initialize the model
        if model is None:
            self.model = XGBRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=random_state
            )
        else:
            self.model = model

    def _require_trained(self):
        """Raise if no trained (or loaded) model is available."""
        if self.model is None or not getattr(self, "_is_fitted", False):
            raise Exception("Model has not been trained yet")

    def create_preprocessing_pipeline(self, y_train=None):
        """
        Create the preprocessing pipeline and store it in `self.preprocessor`

        Parameters:
        - y_train (pd.Series): Target variable. The target-encoding branch for
          high-cardinality features is only added when this is provided.
        """

        # Numeric features
        numeric_transformer = Pipeline(steps=[
            ("scaler", StandardScaler())
        ])

        # One hot encoding for low cardinality features
        low_cardinality_transformer = Pipeline(steps=[
            ("onehot", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"))
        ])

        # Target encoding for high cardinality features
        high_cardinality_transformer = Pipeline(steps=[
            ("target", TargetEncoder(smoothing=10))
        ])

        # Define the transformers
        transformers = [
            ("num", numeric_transformer, self.numeric_features),
            ("low_card", low_cardinality_transformer, self.low_cardinality_features),
        ]

        # Add high cardinality transformer if target variable is provided
        if y_train is not None and len(self.high_cardinality_features) > 0:
            transformers.append(("high_card", high_cardinality_transformer, self.high_cardinality_features))

        self.preprocessor = ColumnTransformer(transformers=transformers)

    def get_preprocessed_data(self, data, target_column='price_cash'):
        """
        Split the data, apply the preprocessing pipeline and return the
        transformed splits

        Parameters:
        - data (pd.DataFrame): Raw data including the target column.
        - target_column (str): Name of the target column.

        Returns:
        - Tuple (X_train_transformed, X_test_transformed, y_train, y_test).
        """
        X = data.drop(columns=[target_column])
        y = data[target_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        # Fit the preprocessor if it has not been created yet; an existing
        # preprocessor is reused as-is.
        if self.preprocessor is None:
            self.create_preprocessing_pipeline(y_train)
            self.preprocessor.fit(X_train, y_train)

        # Transform the data
        X_train_transformed = self.preprocessor.transform(X_train)
        X_test_transformed = self.preprocessor.transform(X_test)

        return X_train_transformed, X_test_transformed, y_train, y_test

    def create_full_pipeline(self, y_train=None):
        """
        Create the full pipeline (preprocessor + estimator) in `self.model`

        Parameters:
        - y_train (pd.Series): Target variable.
        """

        self.create_preprocessing_pipeline(y_train=y_train)

        # If a previous call already wrapped the estimator, unwrap it first.
        # Otherwise a second train() would nest a Pipeline inside a Pipeline
        # and run the column-based preprocessor on already-transformed data.
        estimator = self.model
        if (
            isinstance(estimator, Pipeline)
            and "preprocessor" in estimator.named_steps
            and "model" in estimator.named_steps
        ):
            estimator = estimator.named_steps["model"]

        # Add the model to the pipeline
        self.model = Pipeline(steps=[
            ("preprocessor", self.preprocessor),
            ("model", estimator)
        ])
        # The freshly assembled pipeline has not been fitted yet
        self._is_fitted = False

    def train(self, X, y):
        """
        Train the model and report R² on the train and test splits

        Parameters:
        - X (pd.DataFrame): Features.
        - y (pd.Series): Target variable.

        Returns:
        - dict with "train_score", "test_score" and the four split objects
          ("X_train", "X_test", "y_train", "y_test").
        """

        # Divide the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        # Create and train the full pipeline
        self.create_full_pipeline(y_train=y_train)
        self.model.fit(X_train, y_train)
        self._is_fitted = True

        # Evaluate the model
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)

        print(f"R² train score: {train_score:.4f}")
        print(f"R² test score: {test_score:.4f}")

        return {
            "train_score": train_score,
            "test_score": test_score,
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }

    def save_model(self, path: str = "car_price_model.joblib"):
        """
        Save the model

        Parameters:
        - path (str): Path to save the model.

        Raises:
        - Exception: If the model has not been trained or loaded yet.
        """

        self._require_trained()
        joblib.dump(self.model, path)

    def load_model(self, path: str = "car_price_model.joblib"):
        """
        Load a previously saved pipeline

        Parameters:
        - path (str): Path to load the model.
        """

        # NOTE: joblib files are pickle-based; only load files from a trusted
        # source, since unpickling can execute arbitrary code.
        self.model = joblib.load(path)
        self.preprocessor = self.model.named_steps["preprocessor"]
        self._is_fitted = True

    def predict(self, X):
        """
        Predict the target variable

        Parameters:
        - X (pd.DataFrame): Features.

        Raises:
        - Exception: If the model has not been trained or loaded yet.
        """

        self._require_trained()
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Predict the target variable probabilities

        Only meaningful when the wrapped estimator exposes `predict_proba`.

        Parameters:
        - X (pd.DataFrame): Features.

        Raises:
        - Exception: If the model has not been trained or loaded yet.
        - AttributeError: If the wrapped estimator has no `predict_proba`.
        """

        self._require_trained()
        if not hasattr(self.model, "predict_proba"):
            raise AttributeError(
                "The underlying model does not support predict_proba; use predict() instead"
            )
        return self.model.predict_proba(X)

In [15]:
# Baseline estimators compared by the training loops below, keyed by a short label.
models = dict(
    linear_regression=LinearRegression(),
    random_forest=RandomForestRegressor(n_estimators=100, max_depth=10, random_state=31415),
    xgboost=XGBRegressor(n_estimators=100, max_depth=10, random_state=31415),
)

In [20]:
# Features: everything except identifiers, timestamps, both price columns and the listing URL.
X = df.drop(["id", "created_at", "price_cash", "price_financed", "link"], axis=1).copy()
# Target: the cash price.
y = df.loc[:, "price_cash"].copy()

In [28]:
# Train each candidate on the current X / y and keep what train() returns per model.
results = {}
for model_name, estimator in models.items():
    print(model_name)
    pipeline = CarPriceTrainingPipeline(estimator)
    results[model_name] = pipeline.train(X, y)
    print("\n")

linear_regression
R² train score: 0.4878
R² test score: 0.6700


random_forest
R² train score: 0.9024
R² test score: 0.2786


xgboost
R² train score: 0.9979
R² test score: -0.1481




In [31]:
# Same comparison restricted to the Audi listings.
X = df_audi.drop(["id", "created_at", "price_cash", "price_financed", "link"], axis=1).copy()
y = df_audi.loc[:, "price_cash"].copy()

results = {}
for model_name, estimator in models.items():
    print(model_name)
    pipeline = CarPriceTrainingPipeline(estimator)
    results[model_name] = pipeline.train(X, y)
    print("\n")

linear_regression
R² train score: 0.8330
R² test score: 0.0393


random_forest
R² train score: 0.9855
R² test score: 0.0411


xgboost
R² train score: 0.9999
R² test score: 0.0419




In [35]:
# Same comparison restricted to the Audi A4 listings.
X = df_audi_a4.drop(["id", "created_at", "price_cash", "price_financed", "link"], axis=1).copy()
y = df_audi_a4.loc[:, "price_cash"].copy()

results = {}
for model_name, estimator in models.items():
    print(model_name)
    pipeline = CarPriceTrainingPipeline(estimator)
    results[model_name] = pipeline.train(X, y)
    print("\n")

linear_regression
R² train score: 0.8378
R² test score: 0.7641


random_forest
R² train score: 0.9946
R² test score: 0.8182


xgboost
R² train score: 0.9995
R² test score: 0.8197




In [11]:
pipeline.save_model()

In [21]:
# One hand-written listing (a single-row frame) used to try out predict().
X_predict = pd.DataFrame(
    [
        {
            "manufacturer": "audi",
            "model": "a4",
            "version": None,
            "month": 11,
            "year": 2017,
            "fuel": "d",
            "transmission": "m",
            "color": "negro",
            "kms": 128000,
            "power_hp": 150,
            "no_doors": 5,
            "seller": "part",
        }
    ]
)

In [36]:
pipeline.predict(X_predict)

array([22638.822], dtype=float32)

In [30]:
def test_models(pipeline, data, target_column='price_cash'):
    """
    Prueba diferentes modelos de regresión con datos preprocesados.
    
    Args:
        pipeline: Instancia de CarPriceTrainingPipeline con el preprocesador configurado.
        data: DataFrame con los datos crudos.
        target_column: Nombre de la columna objetivo.
    
    Returns:
        DataFrame con los resultados de cada modelo.
    """
    # Obtener datos preprocesados
    X_train, X_test, y_train, y_test = pipeline.get_preprocessed_data(data, target_column)

    # Modelos a evaluar
    models = {
        'LinearRegression': LinearRegression(),
        'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
        'LightGBM': LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
        # 'CatBoost': CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_seed=42, verbose=0),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=0.01)
    }

    results = []
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        results.append({'Model': name, 'R²': r2, 'RMSE': rmse})
    
    df_results = pd.DataFrame(results).sort_values(by='R²', ascending=False)
    return df_results


In [31]:
pipeline = CarPriceTrainingPipeline()
df_results = test_models(pipeline, df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1155
[LightGBM] [Info] Number of data points in the train set: 123726, number of used features: 28
[LightGBM] [Info] Start training from score 21589.477992


  model = cd_fast.enet_coordinate_descent(


In [34]:
df_results

Unnamed: 0,Model,R²,RMSE
4,LightGBM,0.73215,7877.095117
0,LinearRegression,0.670012,8743.171957
6,Lasso,0.67001,8743.208477
5,Ridge,0.670009,8743.212624
1,RandomForest,0.22781,13374.639167
2,GradientBoosting,-0.034768,15482.518124
3,XGBoost,-0.125125,16144.34768


In [33]:
X_train, X_test, y_train, y_test = pipeline.get_preprocessed_data(df)

In [36]:
# Exhaustive search over 3 x 3 x 3 random-forest settings, scored by R² with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
)

grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=1,
)
grid.fit(X_train, y_train)

print("Mejores hiperparámetros:", grid.best_params_)
print("Mejor R² en validación:", grid.best_score_)

# Keep the refitted best estimator and show it as the cell output.
best_model = grid.best_estimator_

best_model

Mejores hiperparámetros: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
Mejor R² en validación: 0.6448094253832631


In [None]:
# 🔹 Models and the hyperparameter space to sample for each one.
# An optional "max_train_samples" entry caps the number of training rows used
# for that model.
models = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}  # No hyperparameters worth tuning
    },
    "Ridge": {
        "model": Ridge(),
        "params": {"alpha": np.logspace(-3, 3, 10)}
    },
    "RandomForest": {
        # Seeded so that repeated runs give the same result
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200, 300],
            "max_depth": [5, 10, 20, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "C": np.logspace(-3, 3, 10),
            "gamma": ["scale", "auto"],
            "kernel": ["linear", "rbf", "poly"]
        },
        # Kernel SVR fit time grows more than quadratically with the number of
        # samples, so searching on the full training set does not finish in a
        # reasonable time. Tune it on a fixed-size random subsample instead.
        "max_train_samples": 10_000
    },
    "XGBoost": {
        "model": XGBRegressor(objective="reg:squarederror", random_state=42),
        "params": {
            "n_estimators": [50, 100, 200, 300],
            "max_depth": [3, 5, 10],
            "learning_rate": np.logspace(-3, 0, 10),
            "subsample": [0.6, 0.8, 1.0]
        }
    }
}

# 🔹 Dictionary that collects the results per model
results = {}

# Seeded generator so the subsample (when one is drawn) is reproducible
subsample_rng = np.random.default_rng(42)

# 🔍 Try each model
for name, config in models.items():
    print(f"🔎 Probando modelo: {name}...")

    # Use the full training split unless this model declares a row cap
    X_fit, y_fit = X_train, y_train
    max_rows = config.get("max_train_samples")
    if max_rows is not None and len(y_train) > max_rows:
        keep = subsample_rng.choice(len(y_train), size=max_rows, replace=False)
        # assumes X_train is array-like with rows aligned to y_train
        X_fit = np.asarray(X_train)[keep]
        y_fit = np.asarray(y_train)[keep]

    # A model without a search space is simply fitted once
    if not config["params"]:
        model = config["model"].fit(X_fit, y_fit)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        results[name] = {"best_params": "Default", "r2_score": r2}
        continue

    # Randomized hyperparameter search
    search = RandomizedSearchCV(
        config["model"], config["params"],
        n_iter=10, cv=5, scoring="r2", n_jobs=-1, random_state=42
    )
    search.fit(X_fit, y_fit)

    # Evaluate the best model on the held-out test set
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Store the results
    results[name] = {"best_params": search.best_params_, "r2_score": r2}

# 📊 Show the results ordered by test R² (best first)
results_df = pd.DataFrame(results).T.sort_values(by="r2_score", ascending=False)
print("\n🔹 Resultados finales:")
print(results_df)


🔎 Probando modelo: LinearRegression...
🔎 Probando modelo: Ridge...
🔎 Probando modelo: RandomForest...
🔎 Probando modelo: SVR...
