# Notebook destinada al entrenamiento del modelo

## 1. Importar librerías y raw data
Todas las librerías (Pandas, NumPy, scikit-learn, CatBoost, Matplotlib...) y el propio código modular de `src/`.

In [2]:
import sys
# Put the project's ../src/ directory first on the import path so the local
# modules imported below can be found from this notebook.
sys.path.insert(0, '../src/')
import pandas as pd
import importlib
import feature_engineering 
import preprocesing
# Reload the local modules so that edits made to their source are picked up
# when this cell is re-run, without restarting the kernel.
importlib.reload(feature_engineering)
importlib.reload(preprocesing)
import numpy as np
# Helpers from the local modules (imported after the reload above).
from feature_engineering import get_cat_num_features, log_transform
from preprocesing import pipeline_preprocesamiento_rdmforest, pipeline_preprocesamiento_catboost
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [26]:
# Load the feature-engineered dataset and declare the prediction target.
# (Alternative input, currently unused: '../data/processed/df_clean.csv'
# with the 'cliente_id' and 'fecha' columns dropped.)
target = 'minutos_visitados_mes'

# Columns checked for negative values below; the same list is later handed
# to log_transform.
feat = [
    'promedio_productos_cliente',
    'freq_compra',
    'venta_negocio1_mes',
    'cantidad_heladeras',
    'promedio_venta_total',
    'venta_total_negocios_mes',
    'venta_negocio2_mes',
    'venta_negocio3_mes',
    'venta_por_heladera',
    'ratio_neg3',
    'venta_promedio_x_compra',
]

df = pd.read_csv('../data/processed/df_feat.csv')

# Discard every row that holds a negative value in at least one of the
# columns listed in `feat`.
has_negative = df[feat].lt(0).any(axis=1)
df = df[~has_negative]

In [27]:
# Separate the target series from the feature matrix.
y = df[target]
X = df.drop(columns=[target])

Split temporal en train, validación y test, y separación de variables numéricas y categóricas

In [28]:
# Temporal train / validation / test split driven by the 'aniomes' column:
# everything before 202504 is train, 202504 is validation, 202505 is test.
periodo = X['aniomes']
train_mask = periodo < 202504
valid_mask = periodo == 202504
test_mask = periodo == 202505

X_train, y_train = X[train_mask], y[train_mask]
X_valid, y_valid = X[valid_mask], y[valid_mask]
X_test, y_test = X[test_mask], y[test_mask]

for split_report in (
    f"Train shape: {X_train.shape}",
    f"Validation shape: {X_valid.shape}",
    f"Test shape: {X_test.shape}",
):
    print(split_report)

Train shape: (168015, 26)
Validation shape: (14018, 26)
Test shape: (14008, 26)


In [6]:
# Obtain the categorical and numeric feature name lists for the training
# frame (unpacked in that order). Judging by this cell's output, the helper
# also prints how many columns of each kind it found.
cat_features, num_features = get_cat_num_features(X_train)

3 variables categóricas
23 variables numéricas
26 variables en total


In [7]:
# Keep two columns out of the model inputs: 'aniomes' (the column the
# temporal split is based on) and 'cliente_id' (the key the baseline
# groups by).
num_features = [name for name in num_features if name != 'aniomes']
cat_features = [name for name in cat_features if name != 'cliente_id']

In [29]:
# Run log_transform over the `feat` columns of train and validation, then
# drop the `feat` columns themselves.
# NOTE(review): presumably log_transform adds new columns and the originals
# are removed here — confirm against feature_engineering.log_transform.
# NOTE(review): X_train / X_valid are overwritten in place, so re-running
# this cell without re-running the split likely fails because `feat` is
# already gone — confirm.
# NOTE(review): X_test does not receive this treatment in this cell.
X_train = log_transform(X_train, feat).drop(columns=feat)
X_valid = log_transform(X_valid, feat).drop(columns=feat)

# Rebuild the feature name lists from the transformed training frame,
# again leaving out 'aniomes' and 'cliente_id'.
cat_features, num_features = get_cat_num_features(X_train)
num_features = [name for name in num_features if name != 'aniomes']
cat_features = [name for name in cat_features if name != 'cliente_id']

3 variables categóricas
23 variables numéricas
26 variables en total


Preprocesar variables categóricas y numéricas
- A las variables numéricas las voy a normalizar con RobustScaler debido a la presencia de outliers en todas las features
- Las variables categóricas las codifico con One-Hot Encoding dado que son pocas categorías

### Modelo Baseline

In [9]:
# Baseline: predict, for every row, the mean target of its client as observed
# in the training split. Despite its name, `mean_by_region` is keyed by
# 'cliente_id'.
mean_target = np.mean(y_train)
mean_by_region = y_train.groupby(X_train['cliente_id']).mean()


def _client_mean_prediction(client_ids):
    """Map client ids to their train mean; ids missing from train get the overall train mean."""
    return client_ids.map(mean_by_region).fillna(mean_target).values


y_train_baseline = _client_mean_prediction(X_train['cliente_id'])
y_pred_baseline = _client_mean_prediction(X_valid['cliente_id'])

# Baseline metrics on both splits.
mae_baseline_train, rmse_baseline_train = (
    mean_absolute_error(y_train, y_train_baseline),
    root_mean_squared_error(y_train, y_train_baseline),
)
mae_baseline_valid, rmse_baseline_valid = (
    mean_absolute_error(y_valid, y_pred_baseline),
    root_mean_squared_error(y_valid, y_pred_baseline),
)

for baseline_report in (
    f"Train Baseline (mean por cliente) MAE: {mae_baseline_train:.4f}",
    f"Train Baseline (mean por cliente) RMSE: {rmse_baseline_train:.4f}",
    f"Validation Baseline (mean por cliente) MAE: {mae_baseline_valid:.4f}",
    f"Validation Baseline (mean por cliente) RMSE: {rmse_baseline_valid:.4f}",
):
    print(baseline_report)

Train Baseline (mean por cliente) MAE: 21.1708
Train Baseline (mean por cliente) RMSE: 37.5081
Validation Baseline (mean por cliente) MAE: 23.4385
Validation Baseline (mean por cliente) RMSE: 38.9750


### RandomForest

In [219]:
# Random Forest inputs: numeric columns first, then categorical ones, taken
# as independent copies of the train / validation frames.
X_train_rf = X_train[num_features + cat_features].copy()
X_valid_rf = X_valid[num_features + cat_features].copy()

# Pipeline: project preprocessing (one-hot strategy requested for the
# categorical columns) followed by the forest regressor.
preproc_rf = pipeline_preprocesamiento_rdmforest(num_features, cat_features, categorical_strategy='onehot')
rf = RandomForestRegressor(random_state=42, n_estimators=50, n_jobs=-1)
pipe_rf = Pipeline([
    ('preproc', preproc_rf),
    ('model', rf)
])
pipe_rf.fit(X_train_rf, y_train)

# Predict each split exactly once and reuse the arrays for both metrics
# (previously the training set was predicted twice, once per metric).
pred_rf_train = pipe_rf.predict(X_train_rf)
pred_rf_valid = pipe_rf.predict(X_valid_rf)

mae_rf_train = mean_absolute_error(y_train, pred_rf_train)
rmse_rf_train = root_mean_squared_error(y_train, pred_rf_train)
print(f"Train MAE Random Forest: {mae_rf_train:.4f}")
print(f"Train RMSE Random Forest: {rmse_rf_train:.4f}")

mae_rf = mean_absolute_error(y_valid, pred_rf_valid)
rmse_rf = root_mean_squared_error(y_valid, pred_rf_valid)

print(f"Validation MAE Random Forest: {mae_rf:.4f}")
print(f"Validation RMSE Random Forest: {rmse_rf:.4f}")



Train MAE Random Forest: 11.3212
Train RMSE Random Forest: 19.5967
Validation MAE Random Forest: 21.5538
Validation RMSE Random Forest: 34.1304


### CatBoost

In [None]:
# CatBoost inputs: numeric columns first, then categorical ones, taken as
# independent copies of the train / validation frames.
X_train_cb = X_train[num_features + cat_features].copy()
X_valid_cb = X_valid[num_features + cat_features].copy()

preproc_cb = pipeline_preprocesamiento_catboost(num_features, cat_features)

# Positions 0 .. len(cat_features) - 1 are declared categorical to CatBoost.
# NOTE(review): this assumes the preprocessor emits the categorical columns
# first — confirm against pipeline_preprocesamiento_catboost.
cat_feature_indices = [position for position, _ in enumerate(cat_features)]

cb_params = {
    'iterations': 500,
    'max_depth': 6,
    'learning_rate': 0.01,
    'random_seed': 42,
    'verbose': False,
    'cat_features': cat_feature_indices,
}
cb = CatBoostRegressor(**cb_params)

# Preprocessing and model chained so both are fitted together on train.
pipeline_cb = Pipeline(steps=[('preprocessor', preproc_cb), ('catboost', cb)])

pipeline_cb.fit(X_train_cb, y_train)




In [45]:
# Score the fitted CatBoost pipeline on the train and validation splits.
pred_train = pipeline_cb.predict(X_train_cb)
pred_valid = pipeline_cb.predict(X_valid_cb)

mae_cb_train, rmse_cb_train = (
    mean_absolute_error(y_train, pred_train),
    root_mean_squared_error(y_train, pred_train),
)
mae_cb_valid, rmse_cb_valid = (
    mean_absolute_error(y_valid, pred_valid),
    root_mean_squared_error(y_valid, pred_valid),
)

for cb_report in (
    f"Train MAE CatBoost: {mae_cb_train:.4f}",
    f"Train RMSE CatBoost: {rmse_cb_train:.4f}",
    f"Validation MAE CatBoost: {mae_cb_valid:.4f}",
    f"Validation RMSE CatBoost: {rmse_cb_valid:.4f}",
):
    print(cb_report)

Train MAE CatBoost: 27.4657
Train RMSE CatBoost: 48.4673
Validation MAE CatBoost: 23.4676
Validation RMSE CatBoost: 36.5010


In [206]:
from xgboost import XGBRegressor

# XGBoost inputs: numeric columns first, then categorical ones; features and
# targets are independent copies of the train / validation data.
xgb_columns = num_features + cat_features
X_train_xgb = X_train[xgb_columns].copy()
X_valid_xgb = X_valid[xgb_columns].copy()
y_train_xgb = y_train.copy()
y_valid_xgb = y_valid.copy()

# Same preprocessing builder as the Random Forest, called without
# categorical_strategy so the helper's default applies.
preproc_xgb = pipeline_preprocesamiento_rdmforest(num_features, cat_features)

# XGBoost with library-default hyperparameters apart from seed, threads and
# verbosity (hyperparameters can be tuned later).
xgb = XGBRegressor(random_state=42, n_jobs=-1, verbosity=1)

# Full pipeline: preprocessing followed by the model.
pipe_xgb = Pipeline(steps=[('preproc', preproc_xgb), ('model', xgb)])

# Training
pipe_xgb.fit(X_train_xgb, y_train_xgb)

# Predictions on both splits
pred_train_xgb = pipe_xgb.predict(X_train_xgb)
pred_valid_xgb = pipe_xgb.predict(X_valid_xgb)

# Metrics
mae_train_xgb, rmse_train_xgb = (
    mean_absolute_error(y_train_xgb, pred_train_xgb),
    root_mean_squared_error(y_train_xgb, pred_train_xgb),
)
mae_valid_xgb, rmse_valid_xgb = (
    mean_absolute_error(y_valid_xgb, pred_valid_xgb),
    root_mean_squared_error(y_valid_xgb, pred_valid_xgb),
)

for xgb_report in (
    f"Train MAE XGBoost: {mae_train_xgb:.4f}",
    f"Train RMSE XGBoost: {rmse_train_xgb:.4f}",
    f"Validation MAE XGBoost: {mae_valid_xgb:.4f}",
    f"Validation RMSE XGBoost: {rmse_valid_xgb:.4f}",
):
    print(xgb_report)

Train MAE XGBoost: 24.8552
Train RMSE XGBoost: 41.3263
Validation MAE XGBoost: 22.7298
Validation RMSE XGBoost: 35.2598


In [30]:
from lightgbm import LGBMRegressor
# LightGBM reuses the preprocessing builder used for Random Forest and
# XGBoost; categorical_strategy is not passed, so the helper's default applies.
preproc_lgbm = pipeline_preprocesamiento_rdmforest(num_features, cat_features)

In [34]:
# LightGBM regressor trained with the Huber objective.
# NOTE(review): for objective='huber', `alpha` is the loss threshold rather
# than a quantile; with residuals in the tens of minutes a value of 0.9 makes
# the loss behave close to L1 — confirm this is intended.
lgbm_params = {
    'objective': 'huber',
    'alpha': 0.9,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'random_state': 42,
    'n_estimators': 2000,
}
model = LGBMRegressor(**lgbm_params)

# Preprocessing and model chained into a single estimator.
pipe_lgb = Pipeline(steps=[('preproc', preproc_lgbm), ('model', model)])

In [35]:
# Fit preprocessing + LightGBM on the training split.
# NOTE(review): unlike the other models, the full X_train is passed here
# rather than X_train[num_features + cat_features]; this relies on the
# preprocessor selecting its own columns — confirm.
pipe_lgb.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2606
[LightGBM] [Info] Number of data points in the train set: 168015, number of used features: 30
[LightGBM] [Info] Start training from score 56.697297


In [36]:
# Predicciones
pred_train_lgb = pipe_lgb.predict(X_train)
pred_valid_lgb = pipe_lgb.predict(X_valid)

# Métricas
mae_train_lgb = mean_absolute_error(y_train, pred_train_lgb)
rmse_train_lgb = root_mean_squared_error(y_train, pred_train_lgb)
mae_valid_lgb = mean_absolute_error(y_valid, pred_valid_lgb)
rmse_valid_lgb = root_mean_squared_error(y_valid, pred_valid_lgb)

print(f"Train MAE LGBM: {mae_train_lgb:.4f}")
print(f"Train RMSE LGBM: {rmse_train_lgb:.4f}")
print(f"Validation MAE LGBM: {mae_valid_lgb:.4f}")
print(f"Validation RMSE LGBM: {rmse_valid_lgb:.4f}")

Train MAE LGBM: 29.1989
Train RMSE LGBM: 59.5457
Validation MAE LGBM: 25.1871
Validation RMSE LGBM: 45.7734


In [None]:
# Randomized hyperparameter search for CatBoost, scored by MAE with 3-fold CV.
from sklearn.model_selection import RandomizedSearchCV, KFold
from catboost import CatBoostRegressor

# NOTE(review): preproc_cb is the same object that sits inside pipeline_cb,
# so it is refit here; X_train_cb / X_valid_cb are also overwritten with the
# transformed output, replacing the raw-column frames the CatBoost cells
# above were run with. Re-running those cells after this one will see
# different inputs.
X_train_cb = preproc_cb.fit_transform(X_train)
X_valid_cb = preproc_cb.transform(X_valid)
# NOTE(review): `model` was previously bound to the LGBMRegressor; the name
# is rebound here.
model = CatBoostRegressor(verbose=0, random_state=42,cat_features=cat_feature_indices)

# NOTE(review): shuffled K-fold mixes periods inside the training data,
# whereas the train/validation/test split is temporal — confirm this is
# acceptable.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Candidate values sampled by the randomized search.
cat_params = {
    'iterations': [500, 1000, 2000],           # early stopping is used
    'learning_rate': [0.01, 0.03, 0.05],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 7],
    'bagging_temperature': [0, 1, 3],      # Bayesian regularization
    'random_strength': [0.0, 1.0, 3.0],
    'border_count': [32, 64, 128]
}

search = RandomizedSearchCV(
    estimator=model,
    param_distributions=cat_params,
    n_iter=60,                # adjust according to the available time
    scoring='neg_mean_absolute_error',
    cv=cv,
    verbose=2,
    n_jobs=4,
    random_state=42
)

# Fit with eval_set and early stopping: these keyword arguments are forwarded
# by the search to every underlying fit call.
# NOTE(review): the same validation split is the eval_set for every fold and
# candidate, so early stopping is tuned on X_valid; metrics later reported on
# that split will be optimistic.
fit_params = {
    'eval_set': (X_valid_cb, y_valid),
    'early_stopping_rounds': 50,
    'use_best_model': True
}

search.fit(X_train_cb, y_train, **fit_params)
# Best parameter combination and its cross-validated MAE (sign flipped back).
print(search.best_params_, -search.best_score_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
