In [1]:
import pandas as pd

In [2]:
# Read the tab-separated sell-in history and preview its first rows.
sell_in_path = '../data/sell-in.txt'
df = pd.read_csv(sell_in_path, sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [3]:
# Actual tonnage ('tn') per product for period 201912.
is_201912 = df['periodo'].eq(201912)
df_201912 = df.loc[is_201912]
tn_by_product = df_201912.groupby('product_id', as_index=False)['tn']
df_sum_tn = tn_by_product.sum()
df_sum_tn.head()

Unnamed: 0,product_id,tn
0,20001,1504.68856
1,20002,1087.30855
2,20003,892.50129
3,20004,637.90002
4,20005,593.24443


In [18]:
# Spot-check the 201912 total for a single product.
df_sum_tn.loc[df_sum_tn['product_id'].eq(20937)]

Unnamed: 0,product_id,tn
742,20937,1.03081


In [4]:
# Read the tab-separated product attribute table.
productos_path = '../data/tb_productos.txt'
df_caract = pd.read_csv(productos_path, sep='\t', encoding='utf-8')

In [5]:
# Preview the first five rows of the product attribute table.
df_caract.head(n=5)

Unnamed: 0,cat1,cat2,cat3,brand,sku_size,product_id
0,HC,ROPA LAVADO,Liquido,LIMPIEX,900,20280
1,HC,ROPA LAVADO,Liquido,LIMPIEX,450,20180
2,HC,ROPA LAVADO,Liquido,LIMPIEX,120,20332
3,HC,ROPA LAVADO,Liquido,LIMPIEX,450,20222
4,HC,ROPA LAVADO,Liquido,LIMPIEX,900,20288


In [None]:
# Backtest: for every product, train one LightGBM model (via MLForecast) on the
# monthly (product, customer) series up to FECHA_CORTE, tune it with Optuna, and
# keep the forecast for 2019-12-01 so it can be compared with the actuals.
#
# NOTE(review): this cell uses np, optuna, MLForecast, LGBMRegressor,
# TimeSeriesSplit and mean_absolute_error. Only pandas is imported in the visible
# import cell, so those names must be imported before this cell can run.
FECHA_CORTE = '2019-10-01'
horizonte_prediccion = 2  # months forecast after the cutoff (Nov and Dec 2019)

product_ids = df['product_id'].unique()
predicciones = []  # one frame per product; concatenated once after the loop

for pid in product_ids:
    df_prod = df[df['product_id'] == pid].copy()
    if df_prod.empty:
        continue

    df_prod['unique_id'] = df_prod['product_id'].astype(str) + "_" + df_prod['customer_id'].astype(str)
    df_prod['ds'] = pd.to_datetime(df_prod['periodo'], format='%Y%m')
    # The sell-in frame has no 'y' column; the forecasting target is the tonnage 'tn'.
    df_prod['y'] = df_prod['tn'].fillna(0)

    # Aggregate repeated (series, month) rows. After this group-by the key pair is
    # unique by construction, so no further duplicate check is needed.
    df_final_prod = df_prod.groupby(['unique_id', 'ds'], as_index=False)['y'].sum()
    df_final_prod = df_final_prod.sort_values(by=['unique_id', 'ds']).reset_index(drop=True)

    # Training data: everything up to and including the cutoff month.
    df_entrenamiento_prod = df_final_prod[df_final_prod['ds'] <= FECHA_CORTE].reset_index(drop=True)
    if df_entrenamiento_prod.empty:
        # Product has no history on or before the cutoff; nothing to train on.
        continue

    # Sorted distinct months: cross-validation folds are built over these dates.
    fechas_entrenamiento = df_entrenamiento_prod['ds'].drop_duplicates().sort_values().to_numpy()

    def objective(trial):
        """Optuna objective: mean MAE over time-ordered folds for one parameter set."""
        # NOTE(review): LightGBM only applies 'subsample' when 'subsample_freq' > 0;
        # confirm whether bagging is meant to be active here.
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'random_state': 42
        }

        tscv = TimeSeriesSplit(n_splits=3)
        maes = []

        # Split on dates, not on rows: the frame is sorted by (unique_id, ds), so a
        # row-wise split would separate series instead of past from future.
        # NOTE(review): with lags up to 24, early folds may hold too little history
        # to yield training rows; such folds end up with the 1000 penalty below.
        for train_idx, val_idx in tscv.split(fechas_entrenamiento):
            en_train = df_entrenamiento_prod['ds'].isin(fechas_entrenamiento[train_idx])
            en_val = df_entrenamiento_prod['ds'].isin(fechas_entrenamiento[val_idx])
            train_data = df_entrenamiento_prod[en_train]
            val_data = df_entrenamiento_prod[en_val]

            try:
                fcst = MLForecast(
                    models=LGBMRegressor(**params),
                    freq='MS',
                    lags=list(range(1, 25)),
                    date_features=['month', 'year'],
                )
                fcst.fit(train_data, static_features=[])

                # Forecast as many months as the validation fold contains.
                preds = fcst.predict(h=len(val_idx))

                # Inner merge keeps only (series, month) pairs present on both sides.
                comparison_df = pd.merge(
                    val_data,
                    preds,
                    on=['unique_id', 'ds'],
                    how='inner'
                )

                if len(comparison_df) > 0:
                    maes.append(mean_absolute_error(comparison_df['y'], comparison_df['LGBMRegressor']))
                else:
                    # Nothing to compare: assign a high MAE.
                    maes.append(1000)

            except Exception as e:
                # Deliberate best-effort: penalise parameter sets that fail to fit.
                print(f"Error en validación cruzada: {e}")
                maes.append(1000)

        return float(np.mean(maes))

    try:
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, show_progress_bar=False)
        best_params = study.best_params
        best_params['random_state'] = 42

        # Refit on the full training window with the best parameters.
        fcst_prod = MLForecast(
            models=LGBMRegressor(**best_params),
            freq='MS',
            lags=list(range(1, 25)),
            date_features=['month', 'year'],
        )
        fcst_prod.fit(df_entrenamiento_prod, static_features=[])

        pred_prod = fcst_prod.predict(h=horizonte_prediccion)
        pred_prod['product_id'] = pid

        # Keep only the December 2019 forecasts.
        pred_prod_201912 = pred_prod[pred_prod['ds'] == '2019-12-01'].copy()

        if not pred_prod_201912.empty:
            # unique_id is "<product_id>_<customer_id>"; recover the customer part.
            pred_prod_201912['customer_id'] = pred_prod_201912['unique_id'].str.split('_').str[1].astype(int)
            pred_prod_201912 = pred_prod_201912.rename(columns={'LGBMRegressor': 'tn'})
            predicciones.append(pred_prod_201912[['product_id', 'customer_id', 'tn']])
        else:
            print(f"No se encontraron predicciones para diciembre 2019 en product_id {pid}")

    except Exception as e:
        # Best-effort per product: report the failure and move on to the next one.
        print(f"Error procesando product_id {pid}: {e}")

# Build the result once instead of growing a DataFrame inside the loop.
df_pred_final = pd.concat(predicciones, ignore_index=True) if predicciones else pd.DataFrame()

# Final summary: predicted tonnage per product.
if not df_pred_final.empty:
    df_pred_sum = df_pred_final.groupby('product_id', as_index=False)['tn'].sum()
    print(df_pred_sum)
else:
    print("No se generaron predicciones")

In [None]:
# Optimizado

FECHA_CORTE = '2019-10-01'
horizonte_prediccion = 2

product_ids = df['product_id'].unique()
df_pred_final = pd.DataFrame()

for pid in product_ids:
    df_prod = df[df['product_id'] == pid].copy()
    if df_prod.empty:
        continue

    df_prod['unique_id'] = df_prod['product_id'].astype(str) + "_" + df_prod['customer_id'].astype(str)
    df_prod['ds'] = pd.to_datetime(df_prod['periodo'], format='%Y%m')
    df_prod['y'] = df_prod['y'].fillna(0)
    
    # SOLUCIÓN 1: Eliminar duplicados agregando valores por fecha
    df_final_prod = df_prod.groupby(['unique_id', 'ds'], as_index=False)['y'].sum()
    df_final_prod = df_final_prod.sort_values(by=['unique_id', 'ds']).reset_index(drop=True)
    
    # Filtrar datos de entrenamiento
    df_entrenamiento_prod = df_final_prod[df_final_prod['ds'] <= FECHA_CORTE]
    
    # Verificar que no hay duplicados antes de continuar
    duplicates = df_entrenamiento_prod.duplicated(subset=['unique_id', 'ds']).sum()
    if duplicates > 0:
        print(f"Advertencia: {duplicates} duplicados encontrados para product_id {pid}")
        # Eliminar duplicados manteniendo el último valor
        df_entrenamiento_prod = df_entrenamiento_prod.drop_duplicates(
            subset=['unique_id', 'ds'], keep='last'
        ).reset_index(drop=True)

    def objective(trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'random_state': 42
        }

        tscv = TimeSeriesSplit(n_splits=3)
        maes = []
        
        for train_idx, val_idx in tscv.split(df_entrenamiento_prod):
            train_data = df_entrenamiento_prod.iloc[train_idx].copy()
            val_data = df_entrenamiento_prod.iloc[val_idx].copy()
            
            # Verificar que no hay duplicados en los datos de validación
            val_data = val_data.drop_duplicates(subset=['unique_id', 'ds'], keep='last')
            
            try:
                fcst = MLForecast(
                    models=LGBMRegressor(**params),
                    freq='MS',
                    lags=list(range(1, 25)),
                    date_features=['month', 'year'],
                )
                fcst.fit(train_data, static_features=[])
                
                # El horizonte es el número de fechas únicas en el conjunto de validación
                h = val_data['ds'].nunique()
                preds = fcst.predict(h=h)
                
                # Asegurar que no hay duplicados en las predicciones
                preds = preds.drop_duplicates(subset=['unique_id', 'ds'], keep='last')
                
                # Merge con datos de validación para alinear predicciones y valores reales
                comparison_df = pd.merge(
                    val_data, 
                    preds, 
                    on=['unique_id', 'ds'], 
                    how='inner'  # Solo mantener registros que coincidan
                )
                
                if len(comparison_df) > 0:
                    # Calcular MAE en los datos alineados
                    maes.append(mean_absolute_error(comparison_df['y'], comparison_df['LGBMRegressor']))
                else:
                    # Si no hay datos para comparar, asignar un MAE alto
                    maes.append(1000)
                    
            except Exception as e:
                print(f"Error en validación cruzada: {e}")
                maes.append(1000)  # Penalizar parámetros que causan errores

        return np.mean(maes)

    try:
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, show_progress_bar=False)
        best_params = study.best_params
        best_params['random_state'] = 42

        # Entrenar modelo final con mejores parámetros
        fcst_prod = MLForecast(
            models=LGBMRegressor(**best_params),
            freq='MS',
            lags=list(range(1, 25)),
            date_features=['month', 'year'],
        )
        fcst_prod.fit(df_entrenamiento_prod, static_features=[])

        pred_prod = fcst_prod.predict(h=horizonte_prediccion)
        pred_prod['product_id'] = pid

        # Filtrar predicciones para diciembre 2019
        pred_prod_201912 = pred_prod[pred_prod['ds'] == '2019-12-01'].copy()
        
        if not pred_prod_201912.empty:
            pred_prod_201912['customer_id'] = pred_prod_201912['unique_id'].str.split('_').str[1].astype(int)
            pred_prod_201912.rename(columns={'LGBMRegressor': 'tn'}, inplace=True)

            df_pred_final = pd.concat([
                df_pred_final, 
                pred_prod_201912[['product_id', 'customer_id', 'tn']]
            ], ignore_index=True)
        else:
            print(f"No se encontraron predicciones para diciembre 2019 en product_id {pid}")
            
    except Exception as e:
        print(f"Error procesando product_id {pid}: {e}")
        continue

# Resumen final
if not df_pred_final.empty:
    df_pred_sum = df_pred_final.groupby('product_id', as_index=False)['tn'].sum()
    print(df_pred_sum)
else:
    print("No se generaron predicciones")

In [None]:
# Production forecast: per product, fit a fixed-parameter LightGBM model
# (MLForecast) on all data up to FECHA_CORTE and keep the forecast for 2020-02-01.
#
# NOTE(review): MLForecast and LGBMRegressor are used here but are not imported
# in the visible import cell; they must be imported before this cell can run.
FECHA_CORTE = '2019-12-01'
horizonte_prediccion = 2  # January and February 2020

product_ids = df['product_id'].unique()
predicciones_202002 = []  # one frame per product; concatenated once after the loop

for pid in product_ids:
    df_prod = df[df['product_id'] == pid].copy()
    if df_prod.empty:
        continue

    df_prod['unique_id'] = df_prod['product_id'].astype(str) + "_" + df_prod['customer_id'].astype(str)
    df_prod['ds'] = pd.to_datetime(df_prod['periodo'], format='%Y%m')
    # The sell-in frame has no 'y' column; the forecasting target is the tonnage 'tn'.
    df_prod['y'] = df_prod['tn'].fillna(0)

    # Sum repeated (series, month) rows so every series has one value per month,
    # matching how the backtest cells prepare their data.
    df_final_prod = (
        df_prod.groupby(['unique_id', 'ds'], as_index=False)['y'].sum()
        .sort_values(by=['unique_id', 'ds'])
        .reset_index(drop=True)
    )
    df_entrenamiento_prod = df_final_prod[df_final_prod['ds'] <= FECHA_CORTE]
    if df_entrenamiento_prod.empty:
        # No history on or before the cutoff; nothing to train on.
        continue

    fcst_prod = MLForecast(
        models=LGBMRegressor(random_state=42, n_estimators=100),
        freq='MS',
        lags=list(range(1, 25)),
        date_features=['month', 'year'],
    )
    fcst_prod.fit(df_entrenamiento_prod, static_features=[])

    pred_prod = fcst_prod.predict(h=horizonte_prediccion)
    pred_prod['product_id'] = pid

    # Keep only the February 2020 forecasts.
    pred_prod_202002 = pred_prod[pred_prod['ds'] == '2020-02-01'].copy()
    # unique_id is "<product_id>_<customer_id>"; recover the customer part.
    pred_prod_202002['customer_id'] = pred_prod_202002['unique_id'].str.split('_').str[1].astype(int)
    pred_prod_202002 = pred_prod_202002.rename(columns={'LGBMRegressor': 'tn'})

    predicciones_202002.append(pred_prod_202002[['product_id', 'customer_id', 'tn']])

# Build the result once instead of growing a DataFrame inside the loop; fall back
# to an empty frame with the expected columns so the summary below cannot fail.
if predicciones_202002:
    df_pred_final = pd.concat(predicciones_202002, ignore_index=True)
else:
    df_pred_final = pd.DataFrame(columns=['product_id', 'customer_id', 'tn'])

df_pred_sum = df_pred_final.groupby('product_id', as_index=False)['tn'].sum()
print(df_pred_sum)