In [5]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import logging
import time
from datetime import datetime

# 1. Configuración de modelos

In [15]:
model_configurations = {
    'linear_regression': {
        'model': LinearRegression(),
        'param_grid': {
            'fit_intercept': [True, False]
        }
    },
    
    'random_forest': {
        'model': RandomForestRegressor(random_state=2026, n_jobs=-1),
        'param_grid': {
            'n_estimators': [25, 50, 100],
            'max_depth': [None, 5, 10]
        }
    },
    
    'gradient_boosting': {
        'model': GradientBoostingRegressor(random_state=2024),
        'param_grid': {
            'learning_rate': [0.01, 0.1, 0.5],
            'n_estimators': [25, 50, 100]
        }
    },
    
    'svr': {
        'model': SVR(),
        'param_grid': {
            'kernel': ['rbf', 'linear', 'poly'],
            'C': [1, 10, 100],
            'epsilon': [0.01, 0.1, 0.2]
        }
    },
    
    'xgboost': {
        'model': XGBRegressor(random_state=2026, n_jobs=-1, enable_categorical=True),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7]
        }
    }
}

In [16]:
logging.basicConfig(filename="ml_system.log", encoding="utf-8", filemode="a", level=logging.INFO,
                    format="{asctime}, {levelname}, {message}",style="{",datefmt="%Y-%m-%d %H:%M")

# 3. Entrenamiento y seleccion del modelo ganador

In [18]:
dataset = pd.read_csv('../data/raw/stores_sales_forecasting_updated_v3.1.csv', 
                      sep=';', 
                      encoding='utf-8')

# Seleccionar solo columnas numéricas
X = dataset.select_dtypes(include=['int64', 'float64', 'bool']).copy()

X = dataset.drop("Sales", axis=1)
y = dataset['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=2025, test_size=0.3)

results = {}
logging.info(f"Iniciando Entrenamiento, ENTRENAMIENTO")
start = time.time()

current_datetime = datetime.now()
formatted_time = current_datetime.strftime("%Y-%m-%d %H:%M:%S")


for model_name, model in model_configurations.items():
    base_model = model['model']
    param_grid = model["param_grid"]

    grid_search_config = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error"
    )
grid_search_config.fit(X_train, y_train)
best_model = grid_search_config.best_estimator_
preds = best_model.predict(X_test)
params = grid_search_config.best_params_
rmse_mean = np.round(np.sqrt(mean_squared_error(y_test, preds)), 2)
results[model_name] = {

"rmse": rmse_mean, 
"best_param": params, 
"best_model": best_model, 
"run_id": mlflow.active_run().info.run_id}
print(f"RMSE del modelo {model_name}: {rmse_mean}, ENTRENAMIENTO")
logging.info(f"RMSE del modelo {model_name}: {rmse_mean}, ENTRENAMIENTO")


finish = time.time()
logging.info(f"Tiempo de entrenamiento {finish-start}, ENTRENAMIENTO")

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 705, in inner_f
    return func(**kwargs)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 705, in inner_f
    return func(**kwargs)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 1590, in __init__
    self._init(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 1654, in _init
    it.reraise()
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 548, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 529, in _handle_exception
    return fn()
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 616, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 705, in inner_f
    return func(**kwargs)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\core.py", line 596, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
  File "c:\Users\mitch\anaconda3\envs\venv_papd_a\lib\site-packages\xgboost\data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Order ID: object, Order Date: object, Ship Date: object, Ship Mode: object, Customer ID: object, Customer Name: object, Segment: object, Country: object, City: object, State: object, Branch: object, Region: object, Product ID: object, Category: object, Sub-Category: object, Product Name: object
