Para esta sección vamos a crear una función para cada modelo.

Cada función de modelado se construye con GridSearchCV para la selección de hiperparámetros y usa TimeSeriesSplit para la validación cruzada. Cada función devuelve el modelo entrenado junto con un diccionario con las métricas MAE, RMSE y MAPE (calculadas sobre validación y prueba), y además imprime esas métricas. Cada función recibe 6 parámetros: X_train, X_validation, X_test, y_train, y_validation, y_test.

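Habrá una función que divida el dataset de la manera que se estipula en el PDF: train 2017, validation 1er semestre de 2018 y test 2do semestre de 2018. Nota: en la implementación (`dividir_dataset`) la validación abarca enero–marzo de 2018 y el test abril de 2018 en adelante.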

Habrá una 'función maestra' que reciba el df (junto con el nombre de la columna objetivo) como parámetro. Aquí se van a ejecutar todas las funciones: primero se divide el dataset y después se sigue este orden: Regresiones (Lineal, Estocástica y Ridge), Modelos avanzados (Random Forest y Gradient Boosting (XGBoost, LightGBM y CatBoost)) y Ensambladores (stacking y blending).

In [32]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import clone

In [33]:
RANDOM_STATE = 42
# Fixed seed shared by the estimators below so runs are reproducible

In [34]:
# Load the modeling dataset
df = pd.read_csv('../data/processed/modeling_dataset.csv')
# Parse the 'fecha_primera_sesion' column as datetime so it can be split by year/month
df['fecha_primera_sesion'] = pd.to_datetime(df['fecha_primera_sesion'])

In [35]:
# Chronological split of the dataset.
# NOTE: per the original author this was built for data covering 6 months of
# 2017 and two quarters of 2018.
def dividir_dataset(df, fecha_col='fecha_primera_sesion'):
    """Split ``df`` into train / validation / test frames by calendar date.

    Rows are assigned as follows, based on ``fecha_col``:

    * train: year 2017
    * validation: year 2018, months 1-3
    * test: year 2018, months 4-12

    Rows from any other year end up in none of the three frames.

    Args:
        df: Input DataFrame. It is NOT modified; the function works on a copy.
        fecha_col: Name of the date column; values are parsed with
            ``pd.to_datetime``.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames in which ``fecha_col`` has
        been converted to datetime.
    """
    # Work on a copy so the caller's frame is never mutated (the previous
    # version overwrote the date column of the argument in place).
    df = df.copy()
    df[fecha_col] = pd.to_datetime(df[fecha_col])

    # Compute the year/month components once and reuse them in every mask
    anio = df[fecha_col].dt.year
    mes = df[fecha_col].dt.month
    es_2018 = anio == 2018

    train = df[anio == 2017]
    val = df[es_2018 & (mes <= 3)]
    test = df[es_2018 & (mes > 3)]
    return train, val, test

In [36]:
def to_df(X_like, ref_df):
    """Wrap a NumPy array in a DataFrame that reuses ``ref_df``'s column names.

    Args:
        X_like: Feature container. Only ``np.ndarray`` instances are converted.
        ref_df: DataFrame whose ``columns`` label the converted array.

    Returns:
        A new DataFrame when ``X_like`` is an ndarray; otherwise ``X_like``
        itself, unchanged.
    """
    # Anything that is not a raw ndarray is passed straight through
    if not isinstance(X_like, np.ndarray):
        return X_like

    column_names = ref_df.columns
    return pd.DataFrame(data=X_like, columns=column_names)

In [37]:
def metricas(y_true, y_pred):
    """Compute MAE, RMSE and MAPE for a set of predictions.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted values, same length and order as ``y_true``.

    Returns:
        Dict with keys ``"MAE"``, ``"RMSE"`` and ``"MAPE"``. MAPE is expressed
        as a percentage and is computed only over samples whose true value is
        non-zero; it is ``nan`` when every true value is zero.
    """
    # Compare positionally: plain arrays avoid pandas index alignment between
    # y_true and y_pred and also let list inputs work.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Zero targets are excluded because the relative error is undefined there
    no_cero = y_true != 0
    if no_cero.any():
        mape = np.mean(np.abs((y_true - y_pred)[no_cero] / y_true[no_cero])) * 100
    else:
        # Explicit nan instead of the "mean of empty slice" warning
        mape = np.nan
    return {"MAE": mae, "RMSE": rmse, "MAPE": mape}

In [38]:
def modelo_con_gridsearch(modelo_base, param_grid, X_train, X_val, X_test, y_train, y_val, y_test, numeric_cols=None):
    """Tune, evaluate and refit one estimator inside a scaling pipeline.

    Steps:
      1. Build a pipeline that standard-scales ``numeric_cols`` (every other
         column is dropped) and then applies ``modelo_base``.
      2. Run ``GridSearchCV`` over ``param_grid`` on the training data with a
         5-split ``TimeSeriesSplit``, scored by negative RMSE.
      3. Score the best pipeline (fitted on train only) on the validation set.
      4. Refit that same pipeline on train + validation and score it on test.

    The best parameters and both sets of metrics are printed.

    Args:
        modelo_base: Unfitted scikit-learn compatible regressor.
        param_grid: Dict of estimator parameter name -> candidate values. Keys
            are given without prefix; ``"modelo__"`` is added here.
        X_train: Training features as a DataFrame (its columns are used).
        X_val, X_test: Validation / test features, DataFrame or ndarray.
        y_train, y_val, y_test: Matching targets.
        numeric_cols: Columns to scale and keep. Defaults to all columns of
            ``X_train``.

    Returns:
        Tuple ``(best_model, scores)`` where ``best_model`` is the pipeline
        refitted on train + validation and ``scores`` is
        ``{"val": {...}, "test": {...}}`` with MAE / RMSE / MAPE.
    """
    if numeric_cols is None:
        numeric_cols = X_train.columns

    # Normalise once: both predict() and the train+val concatenation below need
    # DataFrames carrying the training column names.
    X_val = to_df(X_val, X_train)
    X_test = to_df(X_test, X_train)

    preproc = ColumnTransformer(
        [("scale", StandardScaler(), numeric_cols)],
        remainder="drop"
    ).set_output(transform="pandas")

    pipeline = Pipeline([
        ("pre", preproc),
        ("modelo", modelo_base)
    ])

    # GridSearchCV addresses pipeline steps as "<step>__<param>"
    param_grid_pipeline = {f"modelo__{k}": v for k, v in param_grid.items()}

    tscv = TimeSeriesSplit(n_splits=5)
    grid = GridSearchCV(
        pipeline,
        param_grid_pipeline,
        cv=tscv,
        scoring="neg_root_mean_squared_error",
        refit=True
    )
    grid.fit(X_train, y_train)
    print("Mejores parámetros:", grid.best_params_)

    best_model = grid.best_estimator_

    # Validation metrics: model fitted on the training split only
    y_pred_val = best_model.predict(X_val)
    val_scores = metricas(y_val, y_pred_val)
    print(f"[VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")

    # Refit on train + validation before the final test evaluation.
    # Targets are joined as arrays so Series and ndarray inputs both work.
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = np.concatenate([np.asarray(y_train), np.asarray(y_val)])
    best_model.fit(X_train_val, y_train_val)

    # Final evaluation on the held-out test split
    y_pred_test = best_model.predict(X_test)
    test_scores = metricas(y_test, y_pred_test)
    print(f"[TEST] MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    return best_model, {"val": val_scores, "test": test_scores}

In [39]:
# Linear regression
def modelo_lineal(*args, **kwargs):
    """Fit and evaluate a ``LinearRegression`` through ``modelo_con_gridsearch``.

    No hyperparameters are tuned, so the parameter grid is empty.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    return modelo_con_gridsearch(LinearRegression(), {}, *args, **kwargs)

In [40]:
# Stochastic gradient descent regression
def modelo_estocastico(*args, **kwargs):
    """Tune and evaluate an ``SGDRegressor`` through ``modelo_con_gridsearch``.

    The grid searches the regularisation strength ``alpha`` and the
    ``penalty`` type (l2 or elasticnet).

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l2', 'elasticnet'],
    }
    estimador = SGDRegressor(max_iter=1000, tol=1e-3, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(estimador, param_grid, *args, **kwargs)

In [41]:
# Ridge regression
def modelo_ridge(*args, **kwargs):
    """Tune and evaluate a ``Ridge`` regressor through ``modelo_con_gridsearch``.

    The grid searches the regularisation strength ``alpha``.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {'alpha': [0.1, 1.0, 10.0]}
    return modelo_con_gridsearch(Ridge(), param_grid, *args, **kwargs)

In [42]:
# Random forest
def modelo_rf(*args, **kwargs):
    """Tune and evaluate a ``RandomForestRegressor`` through ``modelo_con_gridsearch``.

    The grid searches the number of trees and the maximum tree depth.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {'n_estimators': [100, 200], 'max_depth': [3, 5, 10]}
    estimador = RandomForestRegressor(random_state=RANDOM_STATE)
    return modelo_con_gridsearch(estimador, param_grid, *args, **kwargs)

In [43]:
# XGBoost
def modelo_xgb(*args, **kwargs):
    """Tune and evaluate an ``XGBRegressor`` through ``modelo_con_gridsearch``.

    Only ``max_depth`` has more than one candidate; ``n_estimators`` and
    ``learning_rate`` are fixed to a single value in the grid.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {'n_estimators': [100], 'max_depth': [3, 5], 'learning_rate': [0.1]}
    estimador = XGBRegressor(verbosity=0, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(estimador, param_grid, *args, **kwargs)

In [44]:
# LightGBM
def modelo_lgbm(*args, **kwargs):
    """Tune and evaluate an ``LGBMRegressor`` through ``modelo_con_gridsearch``.

    Only ``max_depth`` has more than one candidate; ``n_estimators`` and
    ``learning_rate`` are fixed to a single value in the grid.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {'n_estimators': [100], 'max_depth': [3, 5], 'learning_rate': [0.1]}
    # If something looks off during training, change the verbose setting
    estimador = LGBMRegressor(verbose=-1, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(estimador, param_grid, *args, **kwargs)

In [45]:
# CatBoost
def modelo_catboost(*args, **kwargs):
    """Tune and evaluate a ``CatBoostRegressor`` through ``modelo_con_gridsearch``.

    Only ``depth`` has more than one candidate; ``iterations`` and
    ``learning_rate`` are fixed to a single value in the grid.

    Args:
        *args: Positional arguments forwarded after the estimator and grid
            (X_train, X_val, X_test, y_train, y_val, y_test).
        **kwargs: Keyword arguments forwarded as-is (e.g. ``numeric_cols``).

    Returns:
        The ``(model, scores)`` tuple returned by ``modelo_con_gridsearch``.
    """
    param_grid = {'iterations': [100], 'depth': [3, 5], 'learning_rate': [0.1]}
    # If something looks off during training, change the verbose setting
    estimador = CatBoostRegressor(verbose=0, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(estimador, param_grid, *args, **kwargs)

In [46]:
from sklearn.ensemble import StackingRegressor
from sklearn.base import clone
import pandas as pd

def ensamblador_stacking_gridsearch(
    base_estimators: dict,
    param_grids: dict,
    final_estimator,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    numeric_cols=None
):
    """Tune each base model, then build and evaluate a ``StackingRegressor``.

    Every base estimator is tuned with ``modelo_con_gridsearch``; an unfitted
    clone of each tuned pipeline becomes a member of the stack. The stack is
    fitted on the training split to score validation, then refitted on
    train + validation to score test. Both sets of metrics are printed.

    Args:
        base_estimators: Mapping name -> unfitted base regressor.
        param_grids: Mapping name -> parameter grid for that regressor; must
            contain every key of ``base_estimators``.
        final_estimator: Meta-regressor combining the base predictions.
        X_train, X_val, X_test: Feature sets for each split.
        y_train, y_val, y_test: Targets for each split.
        numeric_cols: Forwarded to ``modelo_con_gridsearch``.

    Returns:
        Tuple ``(stacker, scores)`` where ``stacker`` is fitted on
        train + validation and ``scores`` is ``{"val": {...}, "test": {...}}``.
    """
    best_models = []

    # Grid search for each base model
    for name, est in base_estimators.items():
        print(f"\n====== Optimizando {name} ======")
        best_model, _ = modelo_con_gridsearch(
            modelo_base=est,
            param_grid=param_grids[name],
            X_train=X_train, X_val=X_val, X_test=X_test,
            y_train=y_train, y_val=y_val, y_test=y_test,
            numeric_cols=numeric_cols
        )
        # clone() returns an *unfitted* copy suitable for StackingRegressor
        best_models.append((name, clone(best_model)))

    # Define the stack
    stacker = StackingRegressor(
        estimators=best_models,
        final_estimator=final_estimator,
        n_jobs=-1,
        passthrough=False        # only the base predictions are used, not the original X
    )

    # Validation must be scored by a model that has NOT seen the validation
    # rows; fitting on train+val first would make these metrics in-sample.
    stacker.fit(X_train, y_train)
    y_pred_val  = stacker.predict(to_df(X_val,  X_train))
    val_scores  = metricas(y_val,  y_pred_val)
    print(f"[STACKING VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")

    # Refit on train + validation before the final test evaluation
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])
    stacker.fit(X_train_val, y_train_val)

    y_pred_test = stacker.predict(to_df(X_test, X_train))
    test_scores = metricas(y_test, y_pred_test)
    print(f"[STACKING TEST]       MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    return stacker, {"val": val_scores, "test": test_scores}


In [47]:
def modelo_stacking(X_train, X_val, X_test,
                    y_train, y_val, y_test,
                    numeric_cols=None):
    """Run the stacking ensemble with Ridge, random forest and XGBoost members.

    Each member is paired with the hyperparameter grid it is tuned over, and a
    ``LinearRegression`` is used as the final estimator.

    Args:
        X_train, X_val, X_test: Feature sets for each split.
        y_train, y_val, y_test: Targets for each split.
        numeric_cols: Forwarded to ``ensamblador_stacking_gridsearch``.

    Returns:
        The ``(stack_model, scores)`` pair produced by
        ``ensamblador_stacking_gridsearch``.
    """
    # name -> (unfitted estimator, grid to search for it)
    configuracion = {
        "ridge": (
            Ridge(),
            {"alpha": [0.1, 1.0, 10]},
        ),
        "rf": (
            RandomForestRegressor(random_state=RANDOM_STATE),
            {"n_estimators": [100, 200], "max_depth": [3, 5, 10]},
        ),
        "xgb": (
            XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
            {"n_estimators": [200, 400], "learning_rate": [0.05, 0.1]},
        ),
    }
    estimadores = {nombre: est for nombre, (est, _) in configuracion.items()}
    rejillas = {nombre: grid for nombre, (_, grid) in configuracion.items()}

    stack_model, scores = ensamblador_stacking_gridsearch(
        base_estimators=estimadores,
        param_grids=rejillas,
        final_estimator=LinearRegression(),
        X_train=X_train, X_val=X_val, X_test=X_test,
        y_train=y_train, y_val=y_val, y_test=y_test,
        numeric_cols=numeric_cols,
    )
    return stack_model, scores

In [48]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LinearRegression  # meta‑modelo por defecto

def ensamblador_blending_gridsearch(
    base_estimators: dict,
    param_grids: dict,
    final_estimator,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    numeric_cols=None
):
    """Tune each base model, then build and evaluate a blending ensemble.

    Procedure:
      1. Tune every base estimator with ``modelo_con_gridsearch`` and keep an
         unfitted clone of the tuned pipeline.
      2. Fit the clones on the training split and predict validation; those
         predictions are the meta-features the meta-model is fitted on.
      3. Refit the clones on train + validation, predict test, and feed those
         predictions to the meta-model for the final test metrics.

    NOTE: the validation metrics are computed on the same predictions the
    meta-model was fitted on, so they are optimistic.

    Args:
        base_estimators: Mapping name -> unfitted base regressor.
        param_grids: Mapping name -> parameter grid for that regressor; must
            contain every key of ``base_estimators``.
        final_estimator: Meta-regressor; a clone of it is fitted.
        X_train, X_val, X_test: Feature sets for each split.
        y_train, y_val, y_test: Targets for each split.
        numeric_cols: Forwarded to ``modelo_con_gridsearch``.

    Returns:
        Tuple ``(meta, scores)``. ``meta`` is the fitted meta-model; the base
        pipelines (fitted on train + validation) are attached to it as
        ``meta.base_models_`` (dict name -> pipeline, in meta-feature column
        order) so the returned object carries everything needed to predict
        from raw features. ``scores`` is ``{"val": {...}, "test": {...}}``.
    """
    best_models = {}

    # Grid search for each base model
    for name, est in base_estimators.items():
        print(f"\n====== Optimizando {name} ======")
        best_model, _ = modelo_con_gridsearch(
            modelo_base=est,
            param_grid=param_grids[name],
            X_train=X_train, X_val=X_val, X_test=X_test,
            y_train=y_train, y_val=y_val, y_test=y_test,
            numeric_cols=numeric_cols
        )
        best_models[name] = clone(best_model)

    # Convert once so every predict() call sees the training column names
    X_val_df = to_df(X_val, X_train)
    X_test_df = to_df(X_test, X_train)

    # Base models fitted on train produce the validation meta-features
    val_meta_X = []
    for mdl in best_models.values():
        mdl.fit(X_train, y_train)
        val_meta_X.append(mdl.predict(X_val_df))
    val_meta_X = np.column_stack(val_meta_X)

    # Meta-model is trained on the validation predictions
    meta = clone(final_estimator)
    meta.fit(val_meta_X, y_val)

    # Refit base models on train + validation for the test predictions
    X_train_val = pd.concat([X_train, X_val_df])
    y_train_val = pd.concat([y_train, y_val])

    test_meta_X = []
    for mdl in best_models.values():
        mdl.fit(X_train_val, y_train_val)
        test_meta_X.append(mdl.predict(X_test_df))
    test_meta_X = np.column_stack(test_meta_X)
    y_pred_test = meta.predict(test_meta_X)

    # Metrics
    y_pred_val = meta.predict(val_meta_X)
    val_scores  = metricas(y_val,  y_pred_val)
    test_scores = metricas(y_test, y_pred_test)

    print(f"[BLENDING VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")
    print(f"[BLENDING TEST]       MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    # The meta-model alone only accepts base-model predictions as input; keep
    # the fitted base models with it so a persisted copy remains usable.
    meta.base_models_ = best_models

    return meta, {"val": val_scores, "test": test_scores}


In [49]:
#blending
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def modelo_blending(X_train, X_val, X_test,
                    y_train, y_val, y_test,
                    numeric_cols=None):
    """Run the blending ensemble with Ridge, random forest and XGBoost members.

    Builds the base estimators and their search grids, then delegates to
    ``ensamblador_blending_gridsearch`` with a ``LinearRegression`` meta-model.

    Args:
        X_train, X_val, X_test: Feature sets for each split.
        y_train, y_val, y_test: Targets for each split.
        numeric_cols: Forwarded to ``ensamblador_blending_gridsearch``.

    Returns:
        The ``(blending_model, scores)`` pair produced by
        ``ensamblador_blending_gridsearch``.
    """
    # Unfitted members of the blend, in the order they are tuned
    miembros = dict(
        ridge=Ridge(),
        rf=RandomForestRegressor(random_state=RANDOM_STATE),
        xgb=XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    )

    # Candidate hyperparameters for each member
    busquedas = dict(
        ridge=dict(alpha=[0.1, 1.0, 10]),
        rf=dict(n_estimators=[100, 200], max_depth=[3, 5, 10]),
        xgb=dict(n_estimators=[200, 400], learning_rate=[0.05, 0.1]),
    )

    particiones = dict(
        X_train=X_train, X_val=X_val, X_test=X_test,
        y_train=y_train, y_val=y_val, y_test=y_test,
    )
    blending_model, scores = ensamblador_blending_gridsearch(
        base_estimators=miembros,
        param_grids=busquedas,
        final_estimator=LinearRegression(),
        numeric_cols=numeric_cols,
        **particiones,
    )
    return blending_model, scores


In [50]:
# Main entry point that trains and evaluates every model
def ejecutar_modelos(df, target_col, fecha_col='fecha_primera_sesion'):
    """Split the data by date and run every model for one target column.

    Rows containing any null are dropped, the remaining rows are ordered
    chronologically, split with ``dividir_dataset`` and passed to each model
    function in this order: linear, SGD, ridge, random forest, XGBoost,
    LightGBM, CatBoost, stacking, blending.

    Args:
        df: Modeling DataFrame; must contain ``target_col``, ``fecha_col`` and
            the identifier/date columns listed in ``columnas_excluir``.
        target_col: Name of the column to predict.
        fecha_col: Name of the date column used for ordering and splitting.

    Returns:
        Tuple ``(modelos, evaluaciones)``: two dicts keyed by model name with
        the fitted model and its ``{"val": ..., "test": ...}`` metrics.
    """
    # Columns that must not be used as features
    columnas_excluir = [
        target_col,
        fecha_col,
        'uid',
        'fecha_primera_compra',
        'fecha_ultima_compra',
        'fecha_primera_sesion',
        'primer_source'
        ]
    # NOTE(review): only target_col is removed from the features; any other
    # target-like column present in df (this function is called for more than
    # one target on the same frame) stays in X — confirm that is intended.

    # Dropping rows with a null in ANY column also covers a null target
    df = df.dropna()

    # TimeSeriesSplit assumes rows are in chronological order, so order them
    # by date before splitting (stable sort keeps ties in their original order).
    df = df.sort_values(fecha_col, key=pd.to_datetime, kind="stable")

    # Split the dataset by date
    train, val, test = dividir_dataset(df, fecha_col)
    particiones = (train, val, test)

    # Separate features and target for each split
    X_train, X_val, X_test = (p.drop(columns=columnas_excluir) for p in particiones)
    y_train, y_val, y_test = (p[target_col] for p in particiones)
    datos = (X_train, X_val, X_test, y_train, y_val, y_test)

    # (group header, [(printed title, result key, training function), ...])
    plan = [
        ("Modelos de regresión:", [
            ("Lineal", "lineal", modelo_lineal),
            ("Estocastico", "estocastico", modelo_estocastico),
            ("Ridge", "ridge", modelo_ridge),
        ]),
        ("\n\n\nModelos avanzados:", [
            ("Random Forest", "rf", modelo_rf),
            ("XGBoost", "xgb", modelo_xgb),
            ("LightGBM", "lgbm", modelo_lgbm),
            ("Catboost", "catboost", modelo_catboost),
        ]),
        ("\n\n\nModelos ensambladores:", [
            ("Stacking", "stacking", modelo_stacking),
            ("Blending", "blending", modelo_blending),
        ]),
    ]

    modelos = {}
    evaluaciones = {}
    for encabezado, grupo in plan:
        print(encabezado)
        for titulo, clave, entrenar in grupo:
            print(f'\n\n{titulo}\n\n')
            modelos[clave], evaluaciones[clave] = entrenar(*datos)

    return modelos, evaluaciones


# Exportación (guardado) de los modelos

In [51]:
import os
import pickle
import pandas as pd

def guardar_modelos_y_resultados(modelos, evaluaciones, target_name):
    """Persist the trained models and their evaluation metrics to disk.

    Everything is written under ``../models/<target_name>/`` (relative to the
    current working directory), which is created if missing:

    * one ``<name>.pkl`` pickle per entry of ``modelos``
    * ``evaluaciones.json`` with the contents of ``evaluaciones``

    Args:
        modelos: Mapping model name -> fitted model object (must be picklable).
        evaluaciones: Mapping model name -> metrics dict; values must be
            JSON-serialisable or convertible with ``float``.
        target_name: Name of the target, used as the sub-directory name.

    Returns:
        None. A confirmation line is printed for each file written.
    """
    import json  # local import: only this function needs it

    # Create the output folder if it does not exist
    models_dir = f"../models/{target_name}"
    os.makedirs(models_dir, exist_ok=True)

    # Save the models
    for nombre, modelo in modelos.items():
        ruta = os.path.join(models_dir, f"{nombre}.pkl")
        with open(ruta, "wb") as f:
            pickle.dump(modelo, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Modelo «{nombre}» guardado en {ruta}")

    # Save the metrics (previously this argument was accepted but never stored).
    # default=float converts NumPy scalars that json cannot encode natively.
    ruta_eval = os.path.join(models_dir, "evaluaciones.json")
    with open(ruta_eval, "w", encoding="utf-8") as f:
        json.dump(evaluaciones, f, indent=2, ensure_ascii=False, default=float)
    print(f"Evaluaciones guardadas en {ruta_eval}")


In [52]:
def mostrar_resultados(evaluaciones, target_name):
    """Tabulate the test metrics of every model and display them.

    Args:
        evaluaciones: Mapping model name -> dict with at least a ``"test"``
            entry holding that model's metrics (including ``"MAPE"``).
        target_name: Target name, shown in the table's index label.

    Returns:
        DataFrame with one row per model and one column per metric, in the
        original (unsorted) model order. The table passed to ``display`` is a
        copy sorted by ascending MAPE.
    """
    # Keep only the test-split metrics of each model
    metricas_test = {}
    for modelo, scores in evaluaciones.items():
        metricas_test[modelo] = scores["test"]

    # Models come in as columns; transpose so each model is a row
    resultados_df = pd.DataFrame(metricas_test).transpose()
    resultados_df.index.name = f"Modelos ({target_name})"

    ordenado = resultados_df.sort_values(by='MAPE')
    display(ordenado)
    return resultados_df


### Implementación

In [53]:
# Train every model for the LTV_180 target and save the results
print("\n\nENTRENAMIENTO Y EVALUACION DE LTV\n\n")
modelos_ltv, evaluaciones_ltv = ejecutar_modelos(df, target_col='LTV_180')
guardar_modelos_y_resultados(modelos_ltv, evaluaciones_ltv, target_name='LTV_180')

# Train every model for the CAC_source_30 target and save the results
print("\n\nENTRENAMIENTO Y EVALUACION DE CAC\n\n")
modelos_cac, evaluaciones_cac = ejecutar_modelos(df, target_col='CAC_source_30')
guardar_modelos_y_resultados(modelos_cac, evaluaciones_cac, target_name='CAC_source_30')


ENTRENAMIENTO Y EVALUACION DE LTV
Modelos de regresión:


Lineal


Mejores parámetros: {}
[VALIDACIÓN] MAE=1.61, RMSE=4.49, MAPE=90.45%
[TEST] MAE=0.85, RMSE=2.01, MAPE=48.90%


Estocastico


Mejores parámetros: {'modelo__alpha': 1, 'modelo__penalty': 'elasticnet'}
[VALIDACIÓN] MAE=4.83, RMSE=13.60, MAPE=279.02%
[TEST] MAE=1.82, RMSE=11.96, MAPE=111.80%


Ridge


Mejores parámetros: {'modelo__alpha': 0.1}
[VALIDACIÓN] MAE=1.61, RMSE=4.49, MAPE=90.46%
[TEST] MAE=0.85, RMSE=2.01, MAPE=48.89%



Modelos avanzados:


Random Forest


Mejores parámetros: {'modelo__max_depth': 10, 'modelo__n_estimators': 200}
[VALIDACIÓN] MAE=0.13, RMSE=3.43, MAPE=1.23%
[TEST] MAE=0.07, RMSE=2.59, MAPE=0.88%


XGBoost


Mejores parámetros: {'modelo__learning_rate': 0.1, 'modelo__max_depth': 3, 'modelo__n_estimators': 100}
[VALIDACIÓN] MAE=0.68, RMSE=4.55, MAPE=39.49%
[TEST] MAE=0.73, RMSE=6.55, MAPE=45.04%


LightGBM


Mejores parámetros: {'modelo__learning_rate': 0.1, 'modelo__max_depth': 5, 'modelo__n_estim

In [54]:
# Test-metric table for the LTV_180 models (displayed sorted by MAPE)
res_ltv = mostrar_resultados(evaluaciones_ltv, "LTV_180")

# Test-metric table for the CAC_source_30 models (displayed sorted by MAPE)
res_cac = mostrar_resultados(evaluaciones_cac, "CAC_source_30")


Unnamed: 0_level_0,MAE,RMSE,MAPE
Modelos (LTV_180),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rf,0.074981,2.587866,0.87684
lgbm,1.663339,22.229173,39.041494
stacking,0.807663,1.896112,42.84039
xgb,0.727004,6.553983,45.041997
ridge,0.850515,2.006956,48.893232
lineal,0.850514,2.006953,48.896834
blending,1.002366,3.03587,52.964587
catboost,0.899823,2.859969,71.012933
estocastico,1.817499,11.961274,111.802887


Unnamed: 0_level_0,MAE,RMSE,MAPE
Modelos (CAC_source_30),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
blending,0.124954,0.149089,40.897014
lineal,0.126369,0.148892,42.006907
ridge,0.126372,0.148892,42.008931
xgb,0.126638,0.149092,42.035539
catboost,0.126711,0.148817,42.263526
stacking,0.126845,0.148837,42.315078
lgbm,0.12712,0.148697,42.507954
rf,0.127333,0.148788,42.661471
estocastico,0.128409,0.149039,43.325694
