In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per (CUSTOMER_ID, PRODUCT_ID) table that carries both the historical
# median (TN_median) and the LightGBM prediction median (lgbm_pred_median).
df_medianas = pd.read_csv('df_medianas_lightgbm_e_historicas.csv')

In [3]:
# Quick look at the first rows to confirm the expected columns were loaded.
print(df_medianas.head())

   CUSTOMER_ID  PRODUCT_ID  TN_median  lgbm_pred_median
0        10400       20004        0.0           0.29227
1        10400       20005        0.0           0.28392
2        10406       20003        0.0           0.11807
3        10379       20010        0.0           0.20038
4        10379       20021        0.0           0.27023


In [4]:
# Load the mean TN (TN_MEAN) per (PRODUCT_ID, CUSTOMER_ID) pair.
# NOTE(review): the file name suggests these are the low-impact pairs that are
# filled with a plain average instead of the ensemble — confirm with the producer.
df_promedios = pd.read_csv('promedios_tn_no_mueven_aguja.csv')

In [5]:
# Order the pairs from highest to lowest mean TN. The original row labels are
# kept (no reset_index), so the preview shows where each row sat in the file.
df_promedios = df_promedios.sort_values("TN_MEAN", ascending=False)
print(df_promedios.head())

       PRODUCT_ID  CUSTOMER_ID    TN_MEAN
8624        20032        10013  71.558814
34510       20127        10013  33.455072
8646        20032        10045  24.859385
8627        20032        10017  23.109227
8626        20032        10015  22.222603


In [7]:
def ensamble_historico_modelo(df, w_hist=0.5, tipo='media', clip_min=0.0, clip_max=None):
    """
    Blend the historical median with the LightGBM median into one prediction.

    Parameters
    ----------
    df : DataFrame containing the columns 'TN_median' and 'lgbm_pred_median'.
    w_hist : weight given to 'TN_median' (0 = model only, 1 = history only).
        Only used by the 'media' and 'geometrica' modes; 'max' and 'min'
        ignore it.
    tipo : how the two columns are combined:
        'media'      -> weighted arithmetic mean,
        'geometrica' -> weighted geometric mean (inputs floored at 1e-9),
        'max'        -> row-wise maximum,
        'min'        -> row-wise minimum.
    clip_min, clip_max : bounds applied to the combined value; the upper bound
        is only applied when clip_max is not None.

    Returns
    -------
    A copy of `df` with an extra 'pred_ensamble' column. The input frame is
    not modified.

    Raises
    ------
    ValueError if `tipo` is not one of the supported modes.
    """
    # Reject unknown modes before touching any column.
    if tipo not in ('media', 'geometrica', 'max', 'min'):
        raise ValueError("Tipo no soportado")

    if tipo in ('max', 'min'):
        ambas = df[['TN_median', 'lgbm_pred_median']]
        combinada = ambas.max(axis=1) if tipo == 'max' else ambas.min(axis=1)
    else:
        historico = df['TN_median']
        modelo = df['lgbm_pred_median']
        w_modelo = 1 - w_hist
        if tipo == 'media':
            combinada = w_hist * historico + w_modelo * modelo
        else:
            # Floor both inputs at a tiny positive value so zeros do not
            # collapse the geometric mean to exactly zero.
            piso = 1e-9
            combinada = (historico.clip(lower=piso) ** w_hist) * (modelo.clip(lower=piso) ** w_modelo)

    # upper=None means "no upper bound", so a single call covers both cases.
    combinada = combinada.clip(lower=clip_min, upper=clip_max)

    resultado = df.copy()
    resultado['pred_ensamble'] = combinada
    return resultado


In [57]:
# Take the row-wise maximum of the historical and model medians.
# w_hist is left at its default: the 'max' mode does not use it.
df_test = ensamble_historico_modelo(df_medianas, tipo='max')


In [58]:
# 1. Bring both sources to the same schema: CUSTOMER_ID, PRODUCT_ID, TN
df_test_out = (
    df_test
    .loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'pred_ensamble']]
    .rename(columns={'pred_ensamble': 'TN'})
)
df_promedios_out = (
    df_promedios
    .loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'TN_MEAN']]
    .rename(columns={'TN_MEAN': 'TN'})
)

# 2. Stack them into one long table with a fresh 0..n-1 index.
# NOTE(review): assumes the two sources share no (CUSTOMER_ID, PRODUCT_ID) pair;
# an overlapping pair would be counted twice in the later per-product sum.
df_concat = pd.concat([df_test_out, df_promedios_out], ignore_index=True)

print(df_concat.head())


   CUSTOMER_ID  PRODUCT_ID       TN
0        10400       20004  0.29227
1        10400       20005  0.28392
2        10406       20003  0.11807
3        10379       20010  0.20038
4        10379       20021  0.27023


In [95]:
# Total TN per product. The output must have the columns product_id,tn, e.g.:
# 20001,1504.68856
# 20002,1087.30855
df_suma_TN = (
    df_concat
    .groupby('PRODUCT_ID', as_index=False)['TN']
    .sum()
    .rename(columns={'PRODUCT_ID': 'product_id', 'TN': 'tn'})
)


In [96]:

# Scale the per-product totals by a fixed adjustment factor.
# The totals are recomputed from df_concat instead of being multiplied in place
# (`*=`), so re-running this cell does not compound the factor
# (0.95 -> 0.9025 -> ...) and silently shrink the exported values.
FACTOR_AJUSTE = 0.95
totales_por_producto = df_concat.groupby('PRODUCT_ID')['TN'].sum()
df_suma_TN['tn'] = df_suma_TN['product_id'].map(totales_por_producto) * FACTOR_AJUSTE

In [97]:

# Preview the per-product totals, then write them to CSV without the row index
# so the file contains only the product_id and tn columns.
print(df_suma_TN.head())
df_suma_TN.to_csv('suma_tn_por_producto.csv', index=False)

   product_id           tn
0       20001  1302.542947
1       20002   847.024079
2       20003   731.166324
3       20004   545.169686
4       20005   532.160269
