In [1]:
# ===============================
# Still Lost in ML - Pipelines I
# ===============================

# Importando Librerías
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import joblib
import os

# Crear directorios necesarios
os.makedirs('../data', exist_ok=True)
os.makedirs('../models', exist_ok=True)

In [2]:
# ==================
# 1. Carga de Datos
# ==================

# Importando Conjunto de Datos
df_car = pd.read_csv("../data/car_price_dataset.csv")

# Dividir en features y target
target = "Price"
features = [col for col in df_car.columns if col != target]

# División de features numéricas y categóricas
features_num = [col for col in features if df_car[col].dtype != object]
features_cat = [col for col in features if df_car[col].dtype == object]

X = df_car[features]
y = df_car[target]

# División en train y test (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Guardar datasets en src/data
X_train.to_csv('../data/car_price_train.csv', index=False)
X_test.to_csv('../data/car_price_test.csv', index=False)
pd.DataFrame(y_train).to_csv('../data/car_price_train_target.csv', index=False)
pd.DataFrame(y_test).to_csv('../data/car_price_test_target.csv', index=False)

In [3]:
# =============================
# 2. Construcción del Pipeline
# =============================

# Preprocesamiento para variables numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocesamiento para variables categóricas
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combinar transformaciones en un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features_num),
        ('cat', categorical_transformer, features_cat)
    ]
)

# Crear el pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])


In [4]:
# ===================================
# 3. Optimización de Hiperparámetros
# ===================================

# Definir espacio de búsqueda para RandomizedSearchCV
param_dist = {
    'regressor__alpha': np.logspace(-5, 5, 11),
    'regressor__l1_ratio': np.linspace(0, 1, 11)
}

random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_root_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1
)

# Entrenar el pipeline con optimización
random_search.fit(X_train, y_train)

# Mejor modelo encontrado
best_model = random_search.best_estimator_

In [5]:
# =========================
# 4. Evaluación del Modelo
# =========================

# Predicciones en train
y_train_pred = best_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print("Entrenamiento:")
print(f"RMSE: {rmse_train}, MAPE: {mape_train}, R²: {r2_train}")

# Guardar el modelo entrenado
joblib.dump(best_model, '../models/best_model.pkl')
print("¡Modelo guardado con éxito!")

Entrenamiento:
RMSE: 90.13247728926791, MAPE: 0.005994319589858227, R²: 0.9991718140859525
¡Modelo guardado con éxito!
