In [1]:
# ===============================
# Still Lost in ML - Pipelines I
# ===============================

# Importando Librerías
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importar librerías para los modelos de regresión
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import re

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import Lasso, Ridge


# librearías para los pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Ignorar warnings
import warnings
warnings.filterwarnings('ignore')

# Importar librerías para guardar modelos
import joblib
import os

# Crear directorios necesarios
os.makedirs('../data', exist_ok=True)
os.makedirs('../models', exist_ok=True)

In [None]:
# ====================================
# 1. Carga de Datos y split Train/test
# ====================================

# Importando Conjunto de Datos
df_car = pd.read_csv("../data/car_price_dataset.csv")

# Dividir en features y target
target = "Price"
features = [col for col in df_car if col != target]

# División de variables numéricas y categóricas
one_hot_columns = ["Fuel_Type", "Transmission"]   # Variables categóricas → OneHotEncoding
ordinal_columns = ["Brand", "Model"]              # Variables categóricas → OrdinalEncoder (LabelEncoder es peor para pipelines)

# Identificar columnas numéricas excluyendo las categóricas
features_num = [col for col in features if df_car[col].dtype != object]

X = df_car[features]
y = df_car[target]

# División en train y test (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Guardar datasets en src/data
train_df = pd.concat([X_train, y_train], axis=1)
train_df.to_csv('../data/car_price_dataset_train.csv', index=False)

test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv('../data/car_price_dataset_test.csv', index=False)

X_train.head(5)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count
9254,Volkswagen,Golf,2007,3.2,Hybrid,Semi-Automatic,33948,5,4
1561,BMW,5 Series,2010,4.8,Electric,Semi-Automatic,224853,2,1
1670,Hyundai,Sonata,2006,1.9,Hybrid,Semi-Automatic,13435,3,5
6087,Honda,CR-V,2019,1.7,Diesel,Semi-Automatic,199846,2,3
6669,Mercedes,E-Class,2005,3.6,Petrol,Automatic,261,3,2


In [3]:
# ================================================
# 2. Construcción del Pipeline de Preprocesamiento
# ================================================

# Variables categóricas
one_hot_columns = ["Fuel_Type", "Transmission"]
ordinal_columns = ["Brand", "Model"] 

# Definición de categorias para el OrdinalEncoder
brand_categories = ['Volkswagen', 'BMW', 'Hyundai', 'Honda', 'Mercedes', 'Audi',
                    'Ford', 'Kia', 'Chevrolet', 'Toyota']

model_categories = ['Golf', '5 Series', 'Sonata', 'CR-V', 'E-Class', 'GLA', 'Tiguan',
                    'A3', 'Focus', 'Civic', 'Sportage', 'A4', 'Fiesta', 'X5', 'Malibu',
                    'Explorer', '3 Series', 'Q5', 'Elantra', 'Camry', 'C-Class',
                    'Passat', 'Impala', 'Accord', 'Equinox', 'Optima', 'Rio', 'Tucson',
                    'Corolla', 'RAV4']

# CATEGORICAS
# PIPE_1: Preprocesamiento para variables categóricas One-Hot (imputa moda y codifica)
onehot_pipeline = Pipeline(steps=[
            ('Impute_Mode',                 SimpleImputer(strategy='most_frequent')),   # Imputar valores faltantes con la moda
            ('OHEncoder',                   OneHotEncoder(sparse_output=False, 
                                                          drop="first", 
                                                          handle_unknown='ignore'))
])

# PIPE_2: Preprocesamiento para variables categóricas Ordinales (imputa moda y codifica)
ordinal_pipeline = Pipeline(steps=[
            ('Impute_Mode',                 SimpleImputer(strategy='most_frequent')),   # Imputar valores faltantes con la moda
            ('ordinal',                     OrdinalEncoder(categories = [brand_categories, model_categories],
                                                           handle_unknown='use_encoded_value', 
                                                           unknown_value=-1)),
            ('Scaler',                      StandardScaler())                           # Escalado para las columnas ordinales                                                           
])

# NUMERICAS
# PIPE_3: Preprocesamiento para variables numéricas (imputa media y escala)
num_pipeline = Pipeline(steps=[
            ('Impute_Numeric',              SimpleImputer(strategy='mean')),    # Imputar valores faltantes con la media
#           ('log_transform',               log_function)                       # Transformación logarítmica
            ('Scaler',                      StandardScaler())
])

preprocessing = ColumnTransformer([
            ("Process_NUM",                 num_pipeline,           features_num),       # Preprocesa numéricas (sin las categóricas)
            ("Process_CAT_OneHot",          onehot_pipeline,        one_hot_columns),    # Preprocesa con One-Hot
            ("Process_CAT_Ordinal",         ordinal_pipeline,       ordinal_columns)     # Preprocesa con Label/Ordinal
#           ("Exclude",                    "drop",                  columns_to_exclude)  # Borrar columnas no deseadas
], remainder = "passthrough")  

pipe_preprocessed = preprocessing.fit_transform(X_train)
df_check = pd.DataFrame(pipe_preprocessed, columns= preprocessing.get_feature_names_out())
df_check.head(5)

Unnamed: 0,Process_NUM__Year,Process_NUM__Engine_Size,Process_NUM__Mileage,Process_NUM__Doors,Process_NUM__Owner_Count,Process_CAT_OneHot__Fuel_Type_Electric,Process_CAT_OneHot__Fuel_Type_Hybrid,Process_CAT_OneHot__Fuel_Type_Petrol,Process_CAT_OneHot__Transmission_Manual,Process_CAT_OneHot__Transmission_Semi-Automatic,Process_CAT_Ordinal__Brand,Process_CAT_Ordinal__Model
0,-0.655377,0.176135,-1.333932,1.346373,0.709567,0.0,1.0,0.0,0.0,1.0,-1.557873,-1.683686
1,-0.220429,1.563621,0.870313,-1.352896,-1.401979,1.0,0.0,0.0,0.0,1.0,-1.209463,-1.567657
2,-0.80036,-0.951198,-1.570781,-0.45314,1.413416,0.0,1.0,0.0,0.0,1.0,-0.861052,-1.451629
3,1.084418,-1.124633,0.581575,-1.352896,0.005719,0.0,0.0,0.0,0.0,1.0,-0.512642,-1.335601
4,-0.945343,0.523006,-1.722892,-0.45314,-0.69813,0.0,0.0,1.0,0.0,0.0,-0.164232,-1.219573


In [4]:
preprocessing

In [5]:
# Aplicar transformaciones a los datos de prueba
X_train_processed = preprocessing.fit_transform(X_train)
X_test_processed = preprocessing.transform(X_test)

In [None]:
# ================================================
# 3. Construcción del Pipeline de Modelos
# ================================================

# PIPE_4
elastic_net_pipeline =  Pipeline(
    [("Preprocesado",           preprocessing),         
     ("Modelo",                 ElasticNet(alpha = 0, l1_ratio = 0, random_state=42))
    ])
rf_reg_pipeline =       Pipeline(
    [("Preprocesado",           preprocessing),
     ("Modelo",                 RandomForestRegressor(random_state=42))
    ])
xg_reg_pipeline =       Pipeline(
    [("Preprocesado",           preprocessing),
     ("Modelo",                 xgb.XGBRegressor(random_state=42))
    ])

# Cross Validation
best_score = float('inf')
best_model_name = ""

for name, pipe in zip(["ElasticNet", "RandomForest", "XGBoost"], [elastic_net_pipeline, rf_reg_pipeline, xg_reg_pipeline]):
    resultado = cross_val_score(pipe, 
                                X_train, 
                                y_train, 
                                cv=5, 
                                scoring='neg_root_mean_squared_error', 
                                n_jobs=-1)
    
    mean_score = abs(np.mean(resultado))
    print(f"🔹 {name}")
    print(f"   CROSS-VAL    RMSE: {round(mean_score, 5)}")
    print("")
    
    if mean_score < best_score:
        best_score = mean_score
        best_model_name = name

print(f"🏆 MEJOR MODELO BASE (RMSE): {best_model_name}")

🔹 ElasticNet
   CROSS-VAL    RMSE: 88.88269

🔹 RandomForest
   CROSS-VAL    RMSE: 590.71817

🔹 XGBoost
   CROSS-VAL    RMSE: 262.24483

🏆 MEJOR MODELO BASE (RMSE): ElasticNet


In [None]:
# ===================================
# 4. Optimización de Hiperparámetros
# ===================================

# PIPE_5
# Definir el espacio de búsqueda para cada modelo
param_grid_elastic = {
    "Modelo__alpha": np.linspace(0.001, 100, 100),  # Regularización
    "Modelo__l1_ratio": np.linspace(0.0, 1.0, 11)   # Mezcla entre L1 (Lasso) y L2 (Ridge)
}
param_grid_rf = {
    "Modelo__n_estimators": [100, 300, 500],
    "Modelo__max_depth": [None, 10, 20],
    "Modelo__min_samples_split": [2, 5, 10]
}
param_grid_xgb = {
    "Modelo__n_estimators": [100, 300, 500],
    "Modelo__learning_rate": [0.01, 0.1, 0.3],
    "Modelo__max_depth": [3, 6, 10]
}

# ElasticNet
grid_search_elastic = GridSearchCV(
                        elastic_net_pipeline,
                        param_grid=param_grid_elastic,
                        scoring='neg_root_mean_squared_error',
                        cv=5,
                        n_jobs=-1)
# Random Forest
grid_search_rf = GridSearchCV(
                        rf_reg_pipeline,
                        param_grid=param_grid_rf,
                        scoring='neg_root_mean_squared_error',
                        cv=5,
                        n_jobs=-1)
# XGBoost
grid_search_xgb = GridSearchCV(
                        xg_reg_pipeline,
                        param_grid=param_grid_xgb,
                        scoring='neg_root_mean_squared_error',
                        cv=5,
                        n_jobs=-1)

pipe_grids = {  "gs_Elastic_Net":grid_search_elastic,
                "gs_Random_Forest":grid_search_rf,
                "gs_XGBoost":grid_search_xgb}

# Entrenar los modelos con GridSearchCV
grid_search_elastic.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_xgb.fit(X_train, y_train)

# Obtener los mejores parámetros
print("Mejores hiperparámetros\n")
print(f"🏅 ElasticNet:      {grid_search_elastic.best_params_}")
print(f"🏅 RandomForest:    {grid_search_rf.best_params_}")
print(f"🏅 XGBoost:         {grid_search_xgb.best_params_}")
print("")

# Mejores modelos optimizados
best_grids = [(i, abs(j.best_score_)) for i, j in pipe_grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=True)
best_grids

Mejores hiperparámetros

🏅 ElasticNet:      {'Modelo__alpha': 0.001, 'Modelo__l1_ratio': 1.0}
🏅 RandomForest:    {'Modelo__max_depth': 20, 'Modelo__min_samples_split': 2, 'Modelo__n_estimators': 500}
🏅 XGBoost:         {'Modelo__learning_rate': 0.1, 'Modelo__max_depth': 3, 'Modelo__n_estimators': 500}



Unnamed: 0,Grid,Best score
0,gs_ela_log,88.882631
2,gs_xgb,142.967768
1,gs_rand_forest,585.505924


In [18]:
best_model = pipe_grids[best_grids.iloc[0,0]]
best_model

In [None]:
# ====================
# 5. Guardar el modelo
# ====================

# Guardar el modelo entrenado usando joblib
joblib.dump(best_model, '../models/best_model.pkl')
print("¡Modelo guardado con éxito!")

¡Modelo guardado con éxito!
