In [20]:
import pandas as pd
import json
from datetime import datetime
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
# Load the database connection settings from the JSON config file.
with open("config/db_credentials.json", "r") as credentials_file:
    db_credentials = json.loads(credentials_file.read())

In [8]:
def create_engine_connection(db_credentials: dict):
    """Build a SQLAlchemy engine for a PostgreSQL database via psycopg2.

    Args:
        db_credentials: Mapping with the keys ``user``, ``password``,
            ``host``, ``port`` and ``dbname``. ``user`` and ``password``
            must be the raw values (not already percent-encoded), because
            they are encoded here.

    Returns:
        Whatever ``create_engine`` returns for the assembled connection URL.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Function-scope import so the cell stays self-contained.
    from urllib.parse import quote

    # Percent-encode user and password: characters such as "@", ":" or "/"
    # would otherwise be parsed as URL delimiters and corrupt the URL.
    # safe="" also encodes "/", and spaces become %20 rather than "+".
    user = quote(str(db_credentials["user"]), safe="")
    password = quote(str(db_credentials["password"]), safe="")
    host = db_credentials["host"]
    port = db_credentials["port"]
    dbname = db_credentials["dbname"]

    return create_engine(
        f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}"
    )

In [9]:
# Open the engine from the loaded credentials, then define the statement that
# selects every row and column of public.cars_scraped.
engine = create_engine_connection(db_credentials)
query = "SELECT * FROM public.cars_scraped"

In [11]:
# Run the query through the engine and load the result into a DataFrame.
df = pd.read_sql(sql=query, con=engine)

In [12]:
# Display the loaded table as the cell output.
df

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154653,154673,2025-02-20 21:29:38.344217,tesla,model y,Tracción Trasera RWD 188 kW (256 CV),3,2024,11421,e,a,256,5,blanco,prof,39990.0,36355.00,https://www.coches.com/coches-segunda-mano/oca...
154654,154674,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomía 4WD 378 kW (514 CV),11,2021,84299,e,a,514,5,negro,prof,39490.0,35900.00,https://www.coches.com/coches-segunda-mano/oca...
154655,154675,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomia AWD,1,2022,45000,e,a,351,5,negro,part,37850.0,489.69,https://www.coches.com/coches-segunda-mano/oca...
154656,154676,2025-02-20 21:29:38.344217,tesla,model y,RWD,1,2024,2280,e,a,255,5,gris,part,44000.0,485.60,https://www.coches.com/coches-segunda-mano/oca...


# Feature engineering

## Age

In [21]:
# Read the clock once so the year and the month come from the same instant;
# two separate datetime.now() calls could straddle a month or year rollover.
today = datetime.now()
# Current date as a fractional year: the month contributes month/12.
current_year = today.year + today.month / 12
current_year

2025.1666666666667

In [22]:
# Convert each car's year/month columns to a fractional year (year + month/12),
# the same scale used for current_year, then take the difference as the age.
car_year_fraction = df["year"] + df["month"] / 12
df["age"] = current_year - car_year_fraction

## Kms per year

In [24]:
# The denominator is the age plus one month (1/12 of a year), so an age of 0
# divides by 1/12 rather than by zero.
years_in_use = df["age"] + 1/12
df["kms_per_year"] = df["kms"] / years_in_use

## High kms per year

In [26]:
# Boolean flag: True where the yearly mileage is strictly above 20000.
df["high_kms"] = df["kms_per_year"].gt(20000)

In [25]:
# Display the frame again to inspect the engineered columns.
df

Unnamed: 0,id,created_at,manufacturer,model,version,month,year,kms,fuel,transmission,power_hp,no_doors,color,seller,price_cash,price_financed,link,age,kms_per_year
0,1,2025-02-11 23:50:19.307619,audi,a4,AVANT ADVANCED EDITION 2.0 TDI 190 CV 5P,5,2018,133382,d,a,190,5,gris,prof,17200.0,248.24,https://www.coches.com/coches-segunda-mano/oca...,6.750000,19519.317073
1,2,2025-02-11 23:50:19.307619,audi,a4,s-line,11,2022,47800,hg,a,136,5,gris,prof,27128.0,25772.00,https://www.coches.com/coches-segunda-mano/oca...,2.250000,20485.714286
2,3,2025-02-11 23:50:19.307619,audi,a4,S line 35 TDI 120kW (163CV) S tronic,3,2024,18120,hd,a,163,4,blanco,prof,35900.0,34900.00,https://www.coches.com/coches-segunda-mano/oca...,0.916667,18120.000000
3,4,2025-02-11 23:50:19.307619,audi,a4,2.0 TDI 143cv DPF,1,2011,211158,d,a,143,4,negro,prof,9800.0,,https://www.coches.com/coches-segunda-mano/oca...,14.083333,14905.270588
4,5,2025-02-11 23:50:19.307619,audi,a4,s-line,8,2019,106014,hg,a,150,5,negro,prof,19289.0,18324.00,https://www.coches.com/coches-segunda-mano/oca...,5.500000,18987.582090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154653,154673,2025-02-20 21:29:38.344217,tesla,model y,Tracción Trasera RWD 188 kW (256 CV),3,2024,11421,e,a,256,5,blanco,prof,39990.0,36355.00,https://www.coches.com/coches-segunda-mano/oca...,0.916667,11421.000000
154654,154674,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomía 4WD 378 kW (514 CV),11,2021,84299,e,a,514,5,negro,prof,39490.0,35900.00,https://www.coches.com/coches-segunda-mano/oca...,3.250000,25289.700000
154655,154675,2025-02-20 21:29:38.344217,tesla,model y,Gran Autonomia AWD,1,2022,45000,e,a,351,5,negro,part,37850.0,489.69,https://www.coches.com/coches-segunda-mano/oca...,3.083333,14210.526316
154656,154676,2025-02-20 21:29:38.344217,tesla,model y,RWD,1,2024,2280,e,a,255,5,gris,part,44000.0,485.60,https://www.coches.com/coches-segunda-mano/oca...,1.083333,1954.285714


# Preprocessing

In [None]:
def create_preprocessor():
    numeric_features = ["age", "month", "year", "kms", "power_hp", "no_doors", "price_cash"]
    categorical_features = ["manufacturer", "model", "fuel", "transmission", "color", "seller"]