# --- Model Training ---

Para esta sección vamos a hacer una función para cada modelo.

Cada función de modelaje usará GridSearchCV para la selección de hiperparámetros, con TimeSeriesSplit para la validación cruzada. Cada función devolverá el modelo entrenado junto con un diccionario con las métricas MAE, RMSE y MAPE, y además imprimirá dichas métricas. Cada función tomará 6 parámetros: X_train, X_validation, X_test, y_train, y_validation, y_test.

Habrá una función que divida el dataset de la manera que se estipula en el pdf: train = 2017, validation = enero–marzo de 2018, test = abril de 2018 en adelante (así lo implementa `dividir_dataset`).

Habrá una `función maestra` que como parámetro solo tenga el df. Aquí se van a ejecutar todas las funciones: primero se divide el dataset y luego se sigue este orden: Regresiones (Lineal, Estocástica y Ridge), Modelos avanzados (Random Forest y Gradient Boosting (XGBoost, LightGBM y CatBoost)) y Ensambladores (stacking y blending).

In [126]:
# Importaciones de la librerias necesarias
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import clone

In [127]:
# Global seed shared by every estimator that accepts `random_state`,
# so that repeated runs are reproducible.
RANDOM_STATE = 42

In [128]:
# Load the modelling dataset and parse the first-session date, which is the
# column the chronological train/validation/test split is based on.
df = pd.read_csv('../data/processed/modeling_dataset.csv')
df['fecha_primera_sesion'] = pd.to_datetime(df['fecha_primera_sesion'])

In [129]:
# Chronological split: all of 2017 -> train, Jan-Mar 2018 -> validation,
# Apr-Dec 2018 -> test.
def dividir_dataset(df, fecha_col='fecha_primera_sesion'):
    """Split ``df`` chronologically into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``fecha_col``. It is NOT modified.
    fecha_col : str
        Name of the date column; anything ``pd.to_datetime`` can parse.

    Returns
    -------
    tuple of (train, val, test) DataFrames
        ``train`` holds rows dated in 2017, ``val`` rows from months 1-3 of
        2018 and ``test`` rows from months 4-12 of 2018. Rows from any other
        year are left out of all three. In the returned frames ``fecha_col``
        is datetime-typed.
    """
    # ``assign`` builds a new frame, so the caller's DataFrame is never
    # written to (no hidden side effect, no SettingWithCopyWarning when the
    # caller passes a filtered frame).
    df = df.assign(**{fecha_col: pd.to_datetime(df[fecha_col])})

    anio = df[fecha_col].dt.year
    mes = df[fecha_col].dt.month
    es_2018 = anio == 2018

    train = df[anio == 2017]
    val = df[es_2018 & (mes <= 3)]
    test = df[es_2018 & (mes > 3)]
    return train, val, test

In [130]:
def to_df(X_like, ref_df):
    """Return ``X_like`` as a DataFrame labelled with ``ref_df``'s columns.

    Only NumPy arrays are wrapped; any other object (typically an existing
    DataFrame) is handed back untouched.
    """
    if not isinstance(X_like, np.ndarray):
        return X_like
    return pd.DataFrame(X_like, columns=ref_df.columns)

In [131]:
def metricas(y_true, y_pred):
    """Compute MAE, RMSE and MAPE for a set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (list, ndarray or pandas Series).

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "MAPE": ...}``. MAPE is a percentage and
        is computed only over observations whose true value is non-zero; it
        is NaN when every true value is zero.
    """
    # Work on flat float arrays: this accepts plain lists and prevents two
    # pandas objects with different indexes from being aligned by label
    # instead of by position.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Zero targets would divide by zero, so they are excluded from MAPE.
    no_cero = y_true != 0
    if no_cero.any():
        error_relativo = (y_true[no_cero] - y_pred[no_cero]) / y_true[no_cero]
        mape = np.mean(np.abs(error_relativo)) * 100
    else:
        # Nothing to average: report NaN explicitly instead of letting
        # np.mean emit a RuntimeWarning on an empty slice.
        mape = float("nan")

    return {"MAE": mae, "RMSE": rmse, "MAPE": mape}

In [132]:
def modelo_con_gridsearch(modelo_base, param_grid,X_train, X_val, X_test, y_train, y_val, y_test, numeric_cols=None):
    """Tune, validate and test one estimator inside a scaling pipeline.

    Steps:
      1. Build ``Pipeline(StandardScaler on numeric_cols -> modelo_base)``.
      2. Grid-search ``param_grid`` on the training split with a 5-fold
         ``TimeSeriesSplit``, scored by negative RMSE.
      3. Score the best pipeline on the validation split.
      4. Refit that pipeline on train+validation and score it on test.

    Parameters
    ----------
    modelo_base : estimator
        Unfitted regressor placed as the last pipeline step.
    param_grid : dict
        Hyper-parameter grid for ``modelo_base`` (keys WITHOUT the
        ``modelo__`` prefix; it is added here). May be empty.
    X_train : pandas.DataFrame
        Training features.
    X_val, X_test : pandas.DataFrame or numpy.ndarray
        Validation / test features; arrays are labelled with the columns of
        ``X_train``.
    y_train, y_val, y_test : pandas.Series
        Targets (``y_train`` and ``y_val`` are concatenated with
        ``pd.concat``).
    numeric_cols : sequence of column labels, optional
        Columns to scale and keep; every other column is dropped. Defaults
        to all columns of ``X_train``.

    Returns
    -------
    (best_model, scores)
        ``best_model`` is the pipeline fitted on train+validation;
        ``scores`` is ``{"val": {...}, "test": {...}}`` with MAE/RMSE/MAPE.
    """
    if numeric_cols is None:
        numeric_cols = X_train.columns

    # Normalise the hold-out inputs once. Previously this was only done at
    # predict time, so an ndarray X_val still broke the pd.concat below.
    X_val = to_df(X_val, X_train)
    X_test = to_df(X_test, X_train)

    preproc = ColumnTransformer(
        [("scale", StandardScaler(), numeric_cols)],
        remainder="drop"
    ).set_output(transform="pandas")

    pipeline = Pipeline([
        ("pre", preproc),
        ("modelo", modelo_base)
    ])

    # GridSearchCV addresses pipeline steps as "<step>__<param>".
    param_grid_pipeline = {f"modelo__{k}": v for k, v in param_grid.items()}

    # NOTE(review): TimeSeriesSplit folds by row position, so X_train is
    # presumably expected to be sorted chronologically - confirm upstream.
    tscv = TimeSeriesSplit(n_splits=5)
    grid = GridSearchCV(
        pipeline,
        param_grid_pipeline,
        cv=tscv,
        scoring="neg_root_mean_squared_error",
        refit=True
    )
    grid.fit(X_train, y_train)
    print("Mejores parámetros:", grid.best_params_)

    best_model = grid.best_estimator_

    # Validation: model fitted on train only.
    y_pred_val = best_model.predict(X_val)
    val_scores = metricas(y_val, y_pred_val)
    print(f"[VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")

    # Refit the chosen configuration on train+validation before testing.
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)
    best_model.fit(X_train_val, y_train_val)

    # Final evaluation on the untouched test split.
    y_pred_test = best_model.predict(X_test)
    test_scores = metricas(y_test, y_pred_test)
    print(f"[TEST] MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    return best_model, {"val": val_scores, "test": test_scores}

In [133]:
# Ordinary least-squares linear regression
def modelo_lineal(*args):
    """Train and evaluate a plain ``LinearRegression``.

    The positional arguments are forwarded unchanged to
    ``modelo_con_gridsearch``; the hyper-parameter grid is empty.
    """
    estimador = LinearRegression()
    sin_rejilla = {}
    return modelo_con_gridsearch(estimador, sin_rejilla, *args)

In [134]:
# Stochastic gradient descent regression
def modelo_estocastico(*args):
    """Train and evaluate an ``SGDRegressor`` over ``alpha`` and ``penalty``.

    The positional arguments are forwarded unchanged to
    ``modelo_con_gridsearch``.
    """
    rejilla = {
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l2', 'elasticnet'],
    }
    sgd = SGDRegressor(max_iter=1000, tol=1e-3, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(sgd, rejilla, *args)

In [135]:
# Ridge regression
def modelo_ridge(*args):
    """Train and evaluate a ``Ridge`` regressor over three ``alpha`` values.

    The positional arguments are forwarded unchanged to
    ``modelo_con_gridsearch``.
    """
    rejilla = {'alpha': [0.1, 1.0, 10.0]}
    ridge = Ridge()
    return modelo_con_gridsearch(ridge, rejilla, *args)

In [136]:
# Random forest
def modelo_rf(*args):
    """Train and evaluate a seeded ``RandomForestRegressor``.

    The grid covers ``n_estimators`` and ``max_depth``; the positional
    arguments are forwarded unchanged to ``modelo_con_gridsearch``.
    """
    rejilla = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 10],
    }
    bosque = RandomForestRegressor(random_state=RANDOM_STATE)
    return modelo_con_gridsearch(bosque, rejilla, *args)

In [137]:
# XGBoost
def modelo_xgb(*args):
    """Train and evaluate a seeded, silent ``XGBRegressor``.

    Only ``max_depth`` actually varies in the grid; the positional arguments
    are forwarded unchanged to ``modelo_con_gridsearch``.
    """
    rejilla = {
        'n_estimators': [100],
        'max_depth': [3, 5],
        'learning_rate': [0.1],
    }
    xgb = XGBRegressor(verbosity=0, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(xgb, rejilla, *args)

In [138]:
# LightGBM
def modelo_lgbm(*args):
    """Train and evaluate a seeded ``LGBMRegressor``.

    Only ``max_depth`` actually varies in the grid; the positional arguments
    are forwarded unchanged to ``modelo_con_gridsearch``.
    """
    rejilla = {
        'n_estimators': [100],
        'max_depth': [3, 5],
        'learning_rate': [0.1],
    }
    # If the results look odd, change `verbose` to see LightGBM's output.
    lgbm = LGBMRegressor(verbose=-1, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(lgbm, rejilla, *args)

In [139]:
# CatBoost
def modelo_catboost(*args):
    """Train and evaluate a seeded ``CatBoostRegressor``.

    Only ``depth`` actually varies in the grid; the positional arguments are
    forwarded unchanged to ``modelo_con_gridsearch``.
    """
    rejilla = {
        'iterations': [100],
        'depth': [3, 5],
        'learning_rate': [0.1],
    }
    # If the results look odd, change `verbose` to see CatBoost's output.
    catboost = CatBoostRegressor(verbose=0, random_state=RANDOM_STATE)
    return modelo_con_gridsearch(catboost, rejilla, *args)

In [140]:
from sklearn.ensemble import StackingRegressor
from sklearn.base import clone
import pandas as pd

def ensamblador_stacking_gridsearch(
    base_estimators: dict,
    param_grids: dict,
    final_estimator,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    numeric_cols=None
):
    """Tune each base model, stack them and evaluate the stack.

    Every base estimator is tuned with ``modelo_con_gridsearch``; an
    unfitted clone of each winning pipeline is then placed in a
    ``StackingRegressor`` topped by ``final_estimator``.

    The stack is fitted on the training split to produce the validation
    metrics, and refitted on train+validation to produce the test metrics,
    so validation is always out-of-sample (the same protocol as
    ``modelo_con_gridsearch``).

    Parameters
    ----------
    base_estimators : dict
        ``{name: unfitted estimator}``.
    param_grids : dict
        ``{name: param_grid}``; must contain every key of
        ``base_estimators``.
    final_estimator : estimator
        Meta-learner combining the base predictions.
    X_train, X_val, X_test, y_train, y_val, y_test
        Data splits, as for ``modelo_con_gridsearch``.
    numeric_cols : optional
        Forwarded to ``modelo_con_gridsearch``.

    Returns
    -------
    (stacker, scores)
        ``stacker`` is fitted on train+validation; ``scores`` is
        ``{"val": {...}, "test": {...}}``.
    """
    X_val = to_df(X_val, X_train)
    X_test = to_df(X_test, X_train)

    best_models = []

    # Grid search per base model.
    for name, est in base_estimators.items():
        print(f"\n====== Optimizando {name} ======")
        best_model, _ = modelo_con_gridsearch(
            modelo_base=est,
            param_grid=param_grids[name],
            X_train=X_train, X_val=X_val, X_test=X_test,
            y_train=y_train, y_val=y_val, y_test=y_test,
            numeric_cols=numeric_cols
        )
        # clone() yields an *unfitted* copy with the tuned hyper-parameters,
        # which is what StackingRegressor expects.
        best_models.append((name, clone(best_model)))

    # No `cv` is passed, so StackingRegressor uses its own default scheme to
    # build the meta-features.
    stacker = StackingRegressor(
        estimators=best_models,
        final_estimator=final_estimator,
        n_jobs=-1,
        passthrough=False        # meta-learner sees predictions only, not X
    )

    # Validation: fit on train only, otherwise the validation rows would be
    # part of the training data and the score would be in-sample.
    stacker.fit(X_train, y_train)
    y_pred_val  = stacker.predict(X_val)
    val_scores  = metricas(y_val,  y_pred_val)
    print(f"[STACKING VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")

    # Test: refit on train+validation, then score on the untouched test set.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])
    stacker.fit(X_train_val, y_train_val)

    y_pred_test = stacker.predict(X_test)
    test_scores = metricas(y_test, y_pred_test)
    print(f"[STACKING TEST]       MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    return stacker, {"val": val_scores, "test": test_scores}


In [141]:
def modelo_stacking(X_train, X_val, X_test,
                    y_train, y_val, y_test,
                    numeric_cols=None):
    """Build and evaluate the Ridge + Random Forest + XGBoost stack.

    Each base model gets its own hyper-parameter grid and the meta-learner
    is a ``LinearRegression``. All the work is delegated to
    ``ensamblador_stacking_gridsearch``, whose ``(model, scores)`` result is
    returned as is.
    """
    # Insertion order (ridge, rf, xgb) is the order of the stacked estimators.
    estimadores = {
        "ridge": Ridge(),
        "rf": RandomForestRegressor(random_state=RANDOM_STATE),
        "xgb": XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    }
    rejillas = {
        "ridge": {"alpha": [0.1, 1.0, 10]},
        "rf": {"n_estimators": [100, 200], "max_depth": [3, 5, 10]},
        "xgb": {"n_estimators": [200, 400], "learning_rate": [0.05, 0.1]},
    }

    return ensamblador_stacking_gridsearch(
        base_estimators=estimadores,
        param_grids=rejillas,
        final_estimator=LinearRegression(),
        X_train=X_train, X_val=X_val, X_test=X_test,
        y_train=y_train, y_val=y_val, y_test=y_test,
        numeric_cols=numeric_cols,
    )

In [142]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LinearRegression  # meta‑modelo por defecto

def ensamblador_blending_gridsearch(
    base_estimators: dict,
    param_grids: dict,
    final_estimator,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    numeric_cols=None
):
    """Tune each base model and blend them with a hold-out meta-learner.

    Protocol:
      1. Tune every base estimator with ``modelo_con_gridsearch``.
      2. Fit the tuned base models on train; their predictions on the
         validation split are the meta-features.
      3. Fit a clone of ``final_estimator`` on those meta-features.
      4. Refit the base models on train+validation, predict test, and let
         the meta-learner combine those predictions.

    Parameters
    ----------
    base_estimators : dict
        ``{name: unfitted estimator}``.
    param_grids : dict
        ``{name: param_grid}``; must contain every key of
        ``base_estimators``.
    final_estimator : estimator
        Template for the meta-learner (it is cloned, not modified).
    X_train, X_val, X_test, y_train, y_val, y_test
        Data splits, as for ``modelo_con_gridsearch``.
    numeric_cols : optional
        Forwarded to ``modelo_con_gridsearch``.

    Returns
    -------
    (meta, scores)
        ``meta`` is the fitted meta-learner only; its inputs are the base
        model predictions, one column per base model name (exposed through
        ``meta.feature_names_in_``). The fitted base models are not
        returned. ``scores`` is ``{"val": {...}, "test": {...}}``.

    Notes
    -----
    The validation metrics are in-sample for the meta-learner, because it is
    trained on those same validation predictions; only the test metrics are
    a true hold-out estimate.
    """
    X_val = to_df(X_val, X_train)
    X_test = to_df(X_test, X_train)

    best_models = {}

    # Grid search per base model; keep an unfitted clone of each winner.
    for name, est in base_estimators.items():
        print(f"\n====== Optimizando {name} ======")
        best_model, _ = modelo_con_gridsearch(
            modelo_base=est,
            param_grid=param_grids[name],
            X_train=X_train, X_val=X_val, X_test=X_test,
            y_train=y_train, y_val=y_val, y_test=y_test,
            numeric_cols=numeric_cols
        )
        best_models[name] = clone(best_model)

    names = list(best_models)

    # Level 0 on train only -> predictions on validation are meta-features.
    val_preds = []
    for mdl in best_models.values():
        mdl.fit(X_train, y_train)
        val_preds.append(mdl.predict(X_val))

    # Named columns let the meta-learner record its own feature names at fit
    # time, instead of having the attribute patched on afterwards.
    val_meta_X = pd.DataFrame(np.column_stack(val_preds), columns=names)

    # Train the meta-learner on the validation predictions.
    meta = clone(final_estimator)
    meta.fit(val_meta_X, y_val)

    # Fallback for meta-learners that do not record feature names themselves.
    if not hasattr(meta, "feature_names_in_"):
        meta.feature_names_in_ = np.array(names, dtype=object)

    # Level 0 refitted on train+validation -> predictions on test.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    test_preds = []
    for mdl in best_models.values():
        mdl.fit(X_train_val, y_train_val)
        test_preds.append(mdl.predict(X_test))

    test_meta_X = pd.DataFrame(np.column_stack(test_preds), columns=names)
    y_pred_test = meta.predict(test_meta_X)

    # Metrics.
    y_pred_val = meta.predict(val_meta_X)
    val_scores  = metricas(y_val,  y_pred_val)
    test_scores = metricas(y_test, y_pred_test)

    print(f"[BLENDING VALIDACIÓN] MAE={val_scores['MAE']:.2f}, "
          f"RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")
    print(f"[BLENDING TEST]       MAE={test_scores['MAE']:.2f}, "
          f"RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

    return meta, {"val": val_scores, "test": test_scores}


In [143]:
#blending
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def modelo_blending(X_train, X_val, X_test,
                    y_train, y_val, y_test,
                    numeric_cols=None):
    """Build and evaluate the Ridge + Random Forest + XGBoost blend.

    Each base model gets its own hyper-parameter grid and the meta-learner
    is a ``LinearRegression``. All the work is delegated to
    ``ensamblador_blending_gridsearch``, whose ``(model, scores)`` result is
    returned as is.
    """
    # Insertion order (ridge, rf, xgb) fixes the meta-feature column order.
    estimadores = {
        "ridge": Ridge(),
        "rf": RandomForestRegressor(random_state=RANDOM_STATE),
        "xgb": XGBRegressor(verbosity=0, random_state=RANDOM_STATE),
    }
    rejillas = {
        "ridge": {"alpha": [0.1, 1.0, 10]},
        "rf": {"n_estimators": [100, 200], "max_depth": [3, 5, 10]},
        "xgb": {"n_estimators": [200, 400], "learning_rate": [0.05, 0.1]},
    }

    return ensamblador_blending_gridsearch(
        base_estimators=estimadores,
        param_grids=rejillas,
        final_estimator=LinearRegression(),
        X_train=X_train, X_val=X_val, X_test=X_test,
        y_train=y_train, y_val=y_val, y_test=y_test,
        numeric_cols=numeric_cols,
    )


In [144]:
def ejecutar_modelos(df, target_col, fecha_col='fecha_primera_sesion'):
    """Train and evaluate every model of the notebook for one target.

    Rows with any missing value are dropped, the frame is split
    chronologically with ``dividir_dataset``, and the regression, advanced
    and ensemble models are trained in that order. Finally the model with
    the lowest validation MAE is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling dataset.
    target_col : str
        Column to predict.
    fecha_col : str
        Date column used for the chronological split.

    Returns
    -------
    (modelos, evaluaciones)
        ``modelos`` maps model key -> fitted model; ``evaluaciones`` maps
        model key -> ``{"val": {...}, "test": {...}}`` metric dicts.
    """
    # Columns excluded from the features. dict.fromkeys removes the
    # duplicate that appears when fecha_col is 'fecha_primera_sesion' while
    # keeping the order.
    # NOTE(review): any other target column present in df (e.g.
    # 'CAC_source_30' while predicting 'LTV_180') stays in the features -
    # confirm this is intended and not leakage.
    columnas_excluir = list(dict.fromkeys([
        target_col,
        fecha_col,
        'uid',
        'fecha_primera_compra',
        'fecha_ultima_compra',
        'fecha_primera_sesion',
        'primer_source'
    ]))

    # Drop every row with a missing value in any column (target included).
    df = df.dropna()

    # Chronological partition.
    train, val, test = dividir_dataset(df, fecha_col)

    # X / y
    X_train, y_train = train.drop(columns=columnas_excluir), train[target_col]
    X_val,   y_val   = val.drop(columns=columnas_excluir),   val[target_col]
    X_test,  y_test  = test.drop(columns=columnas_excluir),  test[target_col]
    particiones = (X_train, X_val, X_test, y_train, y_val, y_test)

    # (section header, [(printed title, result key, training function), ...])
    secciones = [
        ("Modelos de regresión:", [
            ("Lineal", "lineal", modelo_lineal),
            ("Estocástico", "estocastico", modelo_estocastico),
            ("Ridge", "ridge", modelo_ridge),
        ]),
        ("\n\nModelos avanzados:", [
            ("Random Forest", "rf", modelo_rf),
            ("XGBoost", "xgb", modelo_xgb),
            ("LightGBM", "lgbm", modelo_lgbm),
            ("CatBoost", "catboost", modelo_catboost),
        ]),
        ("\n\nModelos ensambladores:", [
            ("Stacking", "stacking", modelo_stacking),
            ("Blending", "blending", modelo_blending),
        ]),
    ]

    modelos = {}
    evaluaciones = {}
    for encabezado, entradas in secciones:
        print(encabezado)
        for titulo, clave, entrenar in entradas:
            print(f'\n{titulo}\n')
            modelos[clave], evaluaciones[clave] = entrenar(*particiones)

    # Pick the best model by validation MAE. Every training function returns
    # {"val": {...}, "test": {...}}, so the metric is read directly.
    def mae_validacion(nombre):
        mae = evaluaciones[nombre]['val']['MAE']
        # NaN compares False with everything and would make min() depend on
        # iteration order; rank it last instead.
        return float('inf') if np.isnan(mae) else mae

    best_model = min(evaluaciones, key=mae_validacion)
    best_val = evaluaciones[best_model]['val']

    print(f"\n>>> Mejor modelo en VALIDACIÓN: {best_model.upper()}")
    print(f"    MAE  = {best_val['MAE']:.2f}")
    print(f"    RMSE = {best_val['RMSE']:.2f}")
    print(f"    MAPE = {best_val['MAPE']:.2f}%")

    return modelos, evaluaciones


# Exportación de los modelos

In [145]:
import os
import pickle
import json

def guardar_modelos_y_resultados(
    modelos,
    evaluaciones,
    target_name,
    features_dict=None   # optional
):
    """Pickle every model and, when known, store its feature list as JSON.

    Files are written under ``../models/<target_name>/`` (created if
    missing): ``<name>.pkl`` for each model and ``<name>_features.json``
    when a feature list is available.

    Parameters
    ----------
    modelos : dict
        ``{model_name: model_object}``.
    evaluaciones : dict
        Metrics dict; accepted for a uniform call signature but not used.
    target_name : str
        Name of the sub-directory, e.g. 'LTV_180' or 'CAC_source_30'.
    features_dict : dict, optional
        ``{model_name: [columns used]}``. An entry here takes precedence
        over the model's own ``feature_names_in_`` attribute.
    """
    destino = f"../models/{target_name}"
    os.makedirs(destino, exist_ok=True)

    for nombre, modelo in modelos.items():
        # Serialise the model itself.
        with open(os.path.join(destino, f"{nombre}.pkl"), "wb") as salida:
            pickle.dump(modelo, salida, protocol=pickle.HIGHEST_PROTOCOL)

        # Resolve the feature list: explicit mapping first, then whatever
        # the fitted object exposes as `feature_names_in_`.
        columnas = None
        if features_dict and nombre in features_dict:
            columnas = features_dict[nombre]
        elif hasattr(modelo, "feature_names_in_"):
            columnas = modelo.feature_names_in_.tolist()

        if columnas is None:
            continue  # nothing known about this model's inputs

        ruta_json = os.path.join(destino, f"{nombre}_features.json")
        with open(ruta_json, "w", encoding="utf-8") as salida:
            json.dump(columnas, salida, ensure_ascii=False, indent=2)



In [146]:
def mostrar_resultados(evaluaciones, target_name):
    """Tabulate the test metrics of every model and show them sorted by MAPE.

    Parameters
    ----------
    evaluaciones : dict
        ``{model_name: {"test": {"MAE": ..., "RMSE": ..., "MAPE": ...}}}``
        (other keys such as "val" are ignored).
    target_name : str
        Target label used in the index name.

    Returns
    -------
    pandas.DataFrame
        One row per model (in the original, unsorted order) and one column
        per metric.
    """
    resultados_df = pd.DataFrame({
        modelo: scores["test"] for modelo, scores in evaluaciones.items()
    }).T
    resultados_df.index.name = f"Modelos ({target_name})"

    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        mostrar = display
    except NameError:
        mostrar = print
    mostrar(resultados_df.sort_values('MAPE'))

    return resultados_df


### Implementación

In [147]:
# Train every model for the LTV target and persist the fitted models.
print("\n\nENTRENAMIENTO Y EVALUACION DE LTV\n\n")
modelos_ltv, evaluaciones_ltv = ejecutar_modelos(df, target_col='LTV_180')
guardar_modelos_y_resultados(modelos_ltv, evaluaciones_ltv, target_name='LTV_180')

# Train every model for the CAC target and persist the fitted models.
print("\n\nENTRENAMIENTO Y EVALUACION DE CAC\n\n")
modelos_cac, evaluaciones_cac = ejecutar_modelos(df, target_col='CAC_source_30')
guardar_modelos_y_resultados(modelos_cac, evaluaciones_cac, target_name='CAC_source_30')




ENTRENAMIENTO Y EVALUACION DE LTV


Modelos de regresión:

Lineal

Mejores parámetros: {}
[VALIDACIÓN] MAE=3255361398621.24, RMSE=3255361398621.24, MAPE=88334845598783.39%
[TEST] MAE=5.47, RMSE=9.25, MAPE=106.13%

Estocástico

Mejores parámetros: {'modelo__alpha': 0.1, 'modelo__penalty': 'l2'}
[VALIDACIÓN] MAE=44.55, RMSE=63.99, MAPE=1278.18%
[TEST] MAE=14.50, RMSE=21.52, MAPE=477.25%

Ridge

Mejores parámetros: {'modelo__alpha': 0.1}
[VALIDACIÓN] MAE=6.85, RMSE=16.13, MAPE=132.01%
[TEST] MAE=5.48, RMSE=9.25, MAPE=106.42%


Modelos avanzados:

Random Forest

Mejores parámetros: {'modelo__max_depth': 10, 'modelo__n_estimators': 100}
[VALIDACIÓN] MAE=0.74, RMSE=6.86, MAPE=2.60%
[TEST] MAE=1.36, RMSE=14.48, MAPE=2.17%

XGBoost

Mejores parámetros: {'modelo__learning_rate': 0.1, 'modelo__max_depth': 5, 'modelo__n_estimators': 100}
[VALIDACIÓN] MAE=1.09, RMSE=5.95, MAPE=15.59%
[TEST] MAE=1.38, RMSE=6.27, MAPE=24.80%

LightGBM

Mejores parámetros: {'modelo__learning_rate': 0.1, 'modelo__ma

In [148]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def eval_baseline(df, target_col, fecha_col='fecha_primera_sesion'):
    """Print the metrics of two constant-prediction baselines.

    The training mean and the training median of ``target_col`` are each
    used as the single predicted value for every validation and test row,
    and MAE / RMSE / MAPE are printed for both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling dataset.
    target_col : str
        Column to predict.
    fecha_col : str
        Date column used by ``dividir_dataset`` for the split.
    """
    # Partition once; targets do not depend on the strategy.
    train, val, test = dividir_dataset(df, fecha_col)
    y_train = train[target_col]
    y_val = val[target_col].values
    y_test = test[target_col].values

    # Constant predicted by each baseline strategy.
    valores = {'mean': y_train.mean(), 'median': y_train.median()}

    for strategy, pred_value in valores.items():
        y_pred_val = np.full(y_val.shape, pred_value, dtype=float)
        y_pred_test = np.full(y_test.shape, pred_value, dtype=float)

        # Reuse the shared metric helper: it takes the square root of the
        # MSE itself instead of relying on mean_squared_error's `squared`
        # argument, which recent scikit-learn versions no longer accept,
        # and keeps the MAPE definition identical to the models'.
        val_scores = metricas(y_val, y_pred_val)
        test_scores = metricas(y_test, y_pred_test)

        print(f"\n--- Baseline ({strategy}) para {target_col} ---")
        print(f"[VAL]  MAE={val_scores['MAE']:.2f}, RMSE={val_scores['RMSE']:.2f}, MAPE={val_scores['MAPE']:.2f}%")
        print(f"[TEST] MAE={test_scores['MAE']:.2f}, RMSE={test_scores['RMSE']:.2f}, MAPE={test_scores['MAPE']:.2f}%")

# Run the baselines for both targets:
eval_baseline(df, target_col='LTV_180')
eval_baseline(df, target_col='CAC_source_30')



--- Baseline (mean) para LTV_180 ---
[VAL]  MAE=5.34, RMSE=13.80, MAPE=380.22%
[TEST] MAE=5.56, RMSE=13.81, MAPE=430.84%

--- Baseline (median) para LTV_180 ---
[VAL]  MAE=3.27, RMSE=13.65, MAPE=134.28%
[TEST] MAE=3.52, RMSE=13.68, MAPE=155.10%

--- Baseline (mean) para CAC_source_30 ---
[VAL]  MAE=0.13, RMSE=0.15, MAPE=45.41%
[TEST] MAE=0.13, RMSE=0.15, MAPE=42.56%

--- Baseline (median) para CAC_source_30 ---
[VAL]  MAE=0.12, RMSE=0.17, MAPE=29.85%
[TEST] MAE=0.11, RMSE=0.17, MAPE=28.29%




In [149]:
# Test-metric summary table for LTV
print('Resumen de las métricas de LTV')
res_ltv = mostrar_resultados(evaluaciones_ltv, "LTV_180")

# Test-metric summary table for CAC
print('\nResumen de las métricas de CAC')
res_cac = mostrar_resultados(evaluaciones_cac, "CAC_source_30")


Resumen de las métricas de LTV


Unnamed: 0_level_0,MAE,RMSE,MAPE
Modelos (LTV_180),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rf,1.359281,14.478625,2.16518
blending,1.421337,8.118607,20.49438
xgb,1.380891,6.273534,24.795161
stacking,3.630236,5.383634,58.023771
catboost,18.693288,177.972063,84.472639
lgbm,5.82401,15.079647,102.567909
lineal,5.473496,9.252063,106.132157
ridge,5.482115,9.252748,106.417487
estocastico,14.496829,21.517275,477.250075



Resumen de las métricas de CAC


Unnamed: 0_level_0,MAE,RMSE,MAPE
Modelos (CAC_source_30),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
xgb,0.125745,0.149223,42.142213
lgbm,0.125865,0.149211,42.283318
catboost,0.126794,0.150225,42.444528
estocastico,0.12756,0.150329,43.028965
lineal,0.127554,0.149881,43.128816
rf,0.128037,0.150661,43.200952
stacking,0.127981,0.150269,43.266239
ridge,0.128066,0.150006,43.435056
blending,0.13132,0.150454,45.607072


In [150]:
# =============================================================================
# Expected thresholds for the evaluation metrics
# =============================================================================
def imprimir_thresholds(df, target_cols, mae_frac=0.20, rmse_frac=0.25, mape_thr=30.0):
    """Print summary statistics and metric thresholds for each target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target columns.
    target_cols : iterable of str
        Columns to describe; missing values are dropped per column.
    mae_frac : float
        MAE threshold as a fraction of the target mean (default 20 %).
    rmse_frac : float
        RMSE threshold as a fraction of the target mean (default 25 %).
    mape_thr : float
        MAPE threshold, already expressed as a percentage.
    """
    for target in target_cols:
        y = df[target].dropna()
        mu, med, sd = y.mean(), y.median(), y.std()
        mae_thr = mae_frac * mu
        rmse_thr = rmse_frac * mu

        print(f"\n== Estadísticas de '{target}' ==")
        print(f"  Count     = {len(y)}")
        print(f"  Mean (μ)  = {mu:,.2f}")
        print(f"  Median    = {med:,.2f}")
        print(f"  Std (σ)   = {sd:,.2f}")
        print("== Umbrales de métrica ==")
        # The fraction is printed directly rather than recomputed as
        # threshold / mean, which breaks down when the mean is 0 or NaN.
        print(f"  MAE  ≤ {mae_thr:,.2f}    ({mae_frac:.0%} de μ)")
        print(f"  RMSE ≤ {rmse_thr:,.2f}    ({rmse_frac:.0%} de μ)")
        print(f"  MAPE < {mape_thr:.0f}%")
        
# Apply to the loaded df:
imprimir_thresholds(
    df,
    target_cols=['LTV_180','CAC_source_30']
)



== Estadísticas de 'LTV_180' ==
  Count     = 35636
  Mean (μ)  = 6.42
  Median    = 3.00
  Std (σ)   = 85.57
== Umbrales de métrica ==
  MAE  ≤ 1.28    (20% de μ)
  RMSE ≤ 1.60    (25% de μ)
  MAPE < 30%

== Estadísticas de 'CAC_source_30' ==
  Count     = 36522
  Mean (μ)  = 0.34
  Median    = 0.26
  Std (σ)   = 0.15
== Umbrales de métrica ==
  MAE  ≤ 0.07    (20% de μ)
  RMSE ≤ 0.08    (25% de μ)
  MAPE < 30%
