In [8]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import numpy as np
import gc
import os
import optuna
import sqlite3
import ray
import matplotlib.pyplot as plt
import polars as pl
from optuna.integration import LightGBMPruningCallback
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [9]:
# Run a garbage-collection pass before loading the data.
gc.collect()

# Open the training parquet file and load it into a pandas DataFrame.
# (Another input mentioned by the author: data/l_vm_completa_train_pendientes.parquet)
df_full = pd.read_parquet(
    './data/l_vm_completa_train.parquet',
    engine='fastparquet',
)


In [10]:
# Print the column index of the loaded frame to see which columns are available.
print(df_full.columns)

Index(['PERIODO', 'ANIO', 'MES', 'MES_SIN', 'MES_COS', 'TRIMESTRE', 'ID_CAT1',
       'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE', 'CUSTOMER_ID',
       'PRODUCT_ID', 'PLAN_PRECIOS_CUIDADOS', 'CUST_REQUEST_QTY',
       'CUST_REQUEST_TN', 'TN', 'STOCK_FINAL', 'MEDIA_MOVIL_3M_CLI_PROD',
       'MEDIA_MOVIL_6M_CLI_PROD', 'MEDIA_MOVIL_12M_CLI_PROD',
       'DESVIO_MOVIL_3M_CLI_PROD', 'DESVIO_MOVIL_6M_CLI_PROD',
       'DESVIO_MOVIL_12M_CLI_PROD', 'MEDIA_MOVIL_3M_PROD',
       'MEDIA_MOVIL_6M_PROD', 'MEDIA_MOVIL_12M_PROD', 'DESVIO_MOVIL_3M_PROD',
       'DESVIO_MOVIL_6M_PROD', 'DESVIO_MOVIL_12M_PROD', 'MEDIA_MOVIL_3M_CLI',
       'MEDIA_MOVIL_6M_CLI', 'MEDIA_MOVIL_12M_CLI', 'DESVIO_MOVIL_3M_CLI',
       'DESVIO_MOVIL_6M_CLI', 'DESVIO_MOVIL_12M_CLI', 'TN_LAG_01', 'TN_LAG_02',
       'TN_LAG_03', 'TN_LAG_04', 'TN_LAG_05', 'TN_LAG_06', 'TN_LAG_07',
       'TN_LAG_08', 'TN_LAG_09', 'TN_LAG_10', 'TN_LAG_11', 'TN_LAG_12',
       'TN_LAG_13', 'TN_LAG_14', 'TN_LAG_15', 'CLASE', 'CLASE_DELTA',
    

In [11]:
# Columns to retain, in the order they should appear in the filtered frame.
# The TN_LAG_01..TN_LAG_15 and TN_DELTA_01..TN_DELTA_15 names are generated
# instead of being spelled out one by one.
columns_to_keep = [
    'PERIODO', 'ANIO', 'MES', 'MES_SIN', 'MES_COS', 'TRIMESTRE',
    'ID_CAT1', 'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE',
    'CUSTOMER_ID', 'PRODUCT_ID', 'PLAN_PRECIOS_CUIDADOS',
    'CUST_REQUEST_QTY', 'CUST_REQUEST_TN', 'TN', 'STOCK_FINAL',
    'MEDIA_MOVIL_3M_CLI_PROD', 'MEDIA_MOVIL_6M_CLI_PROD', 'MEDIA_MOVIL_12M_CLI_PROD',
    *[f'TN_LAG_{step:02d}' for step in range(1, 16)],
    'CLASE', 'CLASE_DELTA', 'ORDINAL',
    *[f'TN_DELTA_{step:02d}' for step in range(1, 16)],
    'ANTIG_CLIENTE', 'ANTIG_PRODUCTO', 'CANT_PROD_CLI_PER', 'A_PREDECIR',
]

# Keep only the selected columns and only the rows whose PERIODO is at most 201910,
# in a single selection.
df_full = df_full.loc[df_full['PERIODO'] <= 201910, columns_to_keep]

In [12]:
# Convert the pandas DataFrame into a Polars DataFrame.
# NOTE(review): this rebinds df_full to a different type, so the cell is presumably
# not safe to execute twice in a row — confirm before running cells out of order.
df_full = pl.from_pandas(df_full)


In [13]:
# Per (CUSTOMER_ID, PRODUCT_ID) mean and standard deviation, computed from TN only.
# NOTE(review): the statistics use every row of the pair present in df_full, i.e. also
# periods later than the row being normalized — confirm this is intended.
group_stats = (
    df_full.group_by(["CUSTOMER_ID", "PRODUCT_ID"])
    .agg([
        pl.col("TN").mean().alias("TN_MEAN"),
        pl.col("TN").std().alias("TN_STD"),
    ])
)

# Columns that get a "<name>_ZSCORE" companion. Every one of them is scaled with the
# pair's TN_MEAN / TN_STD (not with its own statistics). The order here is the order
# in which the new columns are appended to the frame.
zscore_source_columns = [
    "TN", "CUST_REQUEST_TN",
    "MEDIA_MOVIL_3M_CLI_PROD", "MEDIA_MOVIL_6M_CLI_PROD", "MEDIA_MOVIL_12M_CLI_PROD",
    *[f"TN_LAG_{step:02d}" for step in range(1, 16)],
    "CLASE", "CLASE_DELTA",
    *[f"TN_DELTA_{step:02d}" for step in range(1, 16)],
]

# Names of all generated Z-score columns, derived from the source list so the two
# can never drift apart.
zscore_columns = [f"{name}_ZSCORE" for name in zscore_source_columns]


def _zscore_vs_tn(column):
    """Build the Polars expression for ``<column>_ZSCORE``.

    The value is ``(column - TN_MEAN) / TN_STD``. It is replaced by 0 whenever
    TN_STD is not greater than 0 (or is null), or when the quotient itself is
    null, NaN or infinite.

    Parameters
    ----------
    column : str
        Name of the column to normalize.

    Returns
    -------
    pl.Expr
        Expression aliased to ``f"{column}_ZSCORE"``.
    """
    z = (pl.col(column) - pl.col("TN_MEAN")) / pl.col("TN_STD")
    # Same three conditions the clean-up pass used to test column by column.
    unusable = z.is_null() | z.is_nan() | z.is_infinite()
    return (
        pl.when((pl.col("TN_STD") > 0) & ~unusable)
        .then(z)
        .otherwise(pl.lit(0))
        .alias(f"{column}_ZSCORE")
    )


# Attach the group statistics, add every Z-score column in one pass, and go back to pandas.
df_norm = (
    df_full
    .join(group_stats, on=["CUSTOMER_ID", "PRODUCT_ID"], how="left")
    .with_columns([_zscore_vs_tn(name) for name in zscore_source_columns])
    .to_pandas()
)

In [14]:
# Show PERIODO, TN, the CLASE columns, their Z-scores and the group statistics
# for CUSTOMER_ID = 10003 and PRODUCT_ID = 20001, ordered by PERIODO.
filtered_data = df_norm.loc[
    (df_norm["CUSTOMER_ID"] == 10003) & (df_norm["PRODUCT_ID"] == 20001)
]
result = filtered_data.loc[
    :,
    [
        "PERIODO", "TN", "CLASE", "CLASE_DELTA",
        "TN_ZSCORE", "CLASE_ZSCORE", "CLASE_DELTA_ZSCORE",
        "TN_MEAN", "TN_STD",
    ],
].sort_values("PERIODO")
print(result)

       PERIODO         TN      CLASE  CLASE_DELTA  TN_ZSCORE  CLASE_ZSCORE  \
69      201701  143.49426  137.87537     -5.61889   0.373357      0.294495   
480     201702   20.48319   68.89292     48.40973  -1.353125     -0.673686   
1357    201703  137.87537  135.12190     -2.75347   0.294495      0.255849   
1933    201704   68.89292  171.01785    102.12493  -0.673686      0.759655   
2329    201705  135.12190   64.66196    -70.45994   0.255849     -0.733068   
2941    201706  171.01785   83.63410    -87.38375   0.759655     -0.466791   
2968    201707   64.66196   70.61664      5.95468  -0.733068     -0.649494   
3587    201708   83.63410   62.23308    -21.40102  -0.466791     -0.767158   
4525    201709   70.61664  125.76455     55.14791  -0.649494      0.124517   
4738    201710   62.23308  134.49509     72.26201  -0.767158      0.247052   
5051    201711  125.76455   42.49982    -83.26473   0.124517     -1.044118   
5605    201712  134.49509   77.45132    -57.04377   0.247052    