In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import numpy as np
import gc
import os
import optuna
import sqlite3
import ray
import matplotlib.pyplot as plt
import polars as pl
from optuna.integration import LightGBMPruningCallback
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
from more_itertools import chunked


In [2]:
# Run the garbage collector before loading the training data.
gc.collect()

# Open the parquet file and load it into a pandas DataFrame.
# (Alternative input mentioned by the author: data/l_vm_completa_train_pendientes.parquet)
df_full = pd.read_parquet(
    './data/l_vm_completa_train.parquet',
    engine='fastparquet',
)

In [None]:
# Discard the rows whose A_PREDECIR value is 'N', then remove the A_PREDECIR
# column itself, since it is not needed after the filter.
df_full = (
    df_full
    .loc[df_full['A_PREDECIR'] != 'N']
    .drop(columns=['A_PREDECIR'])
)



In [None]:
# Columns to keep for the feature-engineering steps.
# 'A_PREDECIR' is intentionally NOT listed: the previous cell already dropped
# it, and selecting a missing label with df[list] raises KeyError.
columns_to_keep = ['PERIODO', 'ANIO', 'MES', 'MES_SIN', 'MES_COS', 'TRIMESTRE', 'ID_CAT1',
       'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE', 'CUSTOMER_ID',
       'PRODUCT_ID', 'PLAN_PRECIOS_CUIDADOS', 'CUST_REQUEST_QTY',
       'CUST_REQUEST_TN', 'TN', 'STOCK_FINAL', 'TN_LAG_01', 'TN_LAG_02',
       'TN_LAG_03', 'TN_LAG_04', 'TN_LAG_05', 'TN_LAG_06', 'TN_LAG_07',
       'TN_LAG_08', 'TN_LAG_09', 'TN_LAG_10', 'TN_LAG_11', 'TN_LAG_12',
       'TN_LAG_13', 'TN_LAG_14', 'TN_LAG_15', 'CLASE', 'CLASE_DELTA',
       'ORDINAL', 'TN_DELTA_01', 'TN_DELTA_02', 'TN_DELTA_03', 'TN_DELTA_04',
       'TN_DELTA_05', 'TN_DELTA_06', 'TN_DELTA_07', 'TN_DELTA_08',
       'TN_DELTA_09', 'TN_DELTA_10', 'TN_DELTA_11', 'TN_DELTA_12',
       'TN_DELTA_13', 'TN_DELTA_14', 'TN_DELTA_15', 'ANTIG_CLIENTE',
       'ANTIG_PRODUCTO', 'CANT_PROD_CLI_PER']
# Restrict the DataFrame to the selected columns (no row/period filter is applied here).
df_full = df_full[columns_to_keep]

In [None]:
# Convert the pandas DataFrame into a Polars DataFrame (the name df_full is rebound).
df_full = pl.from_pandas(df_full)

In [None]:
# Keys that identify one customer/product series.
claves_cliente_producto = ["CUSTOMER_ID", "PRODUCT_ID"]

# Mean and standard deviation of TN for every (CUSTOMER_ID, PRODUCT_ID) pair.
group_stats = df_full.group_by(claves_cliente_producto).agg(
    [
        pl.col("TN").mean().alias("TN_MEAN"),
        pl.col("TN").std().alias("TN_STD"),
    ]
)

# Mean and standard deviation of CLASE_DELTA for the same grouping.
group_stats_clase = df_full.group_by(claves_cliente_producto).agg(
    [
        pl.col("CLASE_DELTA").mean().alias("CLASE_DELTA_MEAN"),
        pl.col("CLASE_DELTA").std().alias("CLASE_DELTA_STD"),
    ]
)


In [None]:

# Standardise CLASE_DELTA with its own per-(CUSTOMER_ID, PRODUCT_ID) statistics
# (CLASE_DELTA_MEAN / CLASE_DELTA_STD from group_stats_clase).
# When the group's standard deviation is not > 0 the z-score is set to 0.
clase_delta_std = pl.col("CLASE_DELTA_STD")
clase_delta_centrada = pl.col("CLASE_DELTA") - pl.col("CLASE_DELTA_MEAN")

df_norm = df_full.join(
    group_stats_clase,
    on=["CUSTOMER_ID", "PRODUCT_ID"],
    how="left",
).with_columns([
    pl.when(clase_delta_std > 0)
    .then(clase_delta_centrada / clase_delta_std)
    .otherwise(pl.lit(0))
    .alias("CLASE_DELTA_ZSCORE"),
])

In [None]:


# Columns standardised with the per-(CUSTOMER_ID, PRODUCT_ID) TN statistics
# (TN_MEAN / TN_STD), listed in the order the *_ZSCORE columns are appended:
# TN, CUST_REQUEST_TN, TN_LAG_01..15, CLASE, TN_DELTA_01..15.
# CLASE_DELTA is not in this list: its z-score is computed separately from
# CLASE_DELTA_MEAN / CLASE_DELTA_STD.
columnas_zscore_tn = (
    ["TN", "CUST_REQUEST_TN"]
    + [f"TN_LAG_{i:02d}" for i in range(1, 16)]
    + ["CLASE"]
    + [f"TN_DELTA_{i:02d}" for i in range(1, 16)]
)

# Join the TN statistics and add one <column>_ZSCORE per listed column.
# When the group's TN_STD is not > 0 (including null) the z-score is set to 0
# instead of dividing by zero.
df_norm = (df_norm
    .join(group_stats, on=["CUSTOMER_ID", "PRODUCT_ID"], how="left")
    .with_columns([
        pl.when(pl.col("TN_STD") > 0)
        .then((pl.col(nombre) - pl.col("TN_MEAN")) / pl.col("TN_STD"))
        .otherwise(pl.lit(0))
        .alias(f"{nombre}_ZSCORE")
        for nombre in columnas_zscore_tn
    ])
)

In [None]:

# Every z-score column that needs the null / NaN / infinity clean-up below.
zscore_columns = (
    ["TN_ZSCORE", "CUST_REQUEST_TN_ZSCORE"]
    + [f"TN_LAG_{i:02d}_ZSCORE" for i in range(1, 16)]
    + ["CLASE_ZSCORE", "CLASE_DELTA_ZSCORE"]
    + [f"TN_DELTA_{i:02d}_ZSCORE" for i in range(1, 16)]
)

# Replace null, NaN and +/-infinity with 0 in each z-score column; every
# expression only reads and rewrites its own column.
df_norm = df_norm.with_columns([
    pl.when(
        pl.col(nombre).is_null()
        | pl.col(nombre).is_nan()
        | pl.col(nombre).is_infinite()
    )
    .then(0)
    .otherwise(pl.col(nombre))
    .alias(nombre)
    for nombre in zscore_columns
])


In [None]:

# Raw columns that now have a *_ZSCORE counterpart and are therefore removed.
columnas_originales = (
    ["TN", "CUST_REQUEST_TN"]
    + [f"TN_LAG_{i:02d}" for i in range(1, 16)]
    + ["CLASE", "CLASE_DELTA"]
    + [f"TN_DELTA_{i:02d}" for i in range(1, 16)]
)

# Drop them and convert the result back to a pandas DataFrame.
df_norm = df_norm.drop(columnas_originales).to_pandas()

In [None]:
# Release the intermediate frames and run the garbage collector.
del df_full, group_stats, group_stats_clase
gc.collect()

In [None]:
# --- Per-group feature computation ---
def calcular_pendientes_grupo(group, periodos_list):
    """Append rolling-window features of ``TN_ZSCORE`` to one group of rows.

    The group is sorted by ``PERIODO`` and, for every window length ``w`` in
    ``periodos_list``, these columns are added (in this order):

    * ``TN_MEAN_ZSCORE_<w>``, ``TN_STD_ZSCORE_<w>``: rolling mean / std.
    * ``TN_MEDIAN_ZSCORE_<ww>``, ``TN_MIN_ZSCORE_<ww>``, ``TN_MAX_ZSCORE_<ww>``:
      rolling median / min / max.
    * ``TN_EWMA_ZSCORE_<ww>``: exponentially weighted mean
      (``span=w``, ``adjust=False``).
    * ``PENDIENTE_TENDENCIA_ZSCORE_<w>``: least-squares slope of each window
      against 0..w-1; NaN for the first ``w - 1`` rows, and for every row when
      the group has fewer than ``w`` rows.
    * ``TN_ABS_DIFF_MEAN_ZSCORE_<w>``: absolute difference to the rolling mean.
    * ``TN_RESIDUAL_STD_ZSCORE_<w>``: rolling std of (value - rolling mean).
    * ``TN_CV_ZSCORE_<w>``: rolling std / rolling mean, NaN where the mean is 0.

    ``<w>`` is the plain window length; ``<ww>`` is zero-padded to two digits.
    The padding differs between column families; the names are kept as-is.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one series; must contain ``PERIODO`` and ``TN_ZSCORE``.
    periodos_list : iterable of int
        Window lengths.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``group`` (original index labels preserved) with the
        feature columns appended. The input frame is not modified.
    """
    ordenado = group.sort_values(by='PERIODO').copy()
    total = len(ordenado)
    # Positional (0..n-1) series so the rolling results align by position.
    serie = pd.Series(ordenado['TN_ZSCORE'].values)

    features = {}
    for ventana in periodos_list:
        sufijo = str(ventana).zfill(2)
        movil = serie.rolling(window=ventana)

        # Basic rolling statistics.
        media = movil.mean().values
        desvio = movil.std().values
        features[f'TN_MEAN_ZSCORE_{ventana}'] = media
        features[f'TN_STD_ZSCORE_{ventana}'] = desvio
        features[f'TN_MEDIAN_ZSCORE_{sufijo}'] = movil.median().values
        features[f'TN_MIN_ZSCORE_{sufijo}'] = movil.min().values
        features[f'TN_MAX_ZSCORE_{sufijo}'] = movil.max().values
        features[f'TN_EWMA_ZSCORE_{sufijo}'] = serie.ewm(span=ventana, adjust=False).mean().values

        # Linear-regression slope per window, solved for all windows at once
        # through the pseudo-inverse of the [x, 1] design matrix.
        pendientes = np.full(total, np.nan)
        if total >= ventana:
            diseno = np.vstack([np.arange(ventana), np.ones(ventana)]).T
            ventanas = np.lib.stride_tricks.sliding_window_view(serie.values, window_shape=ventana)
            coeficientes = np.linalg.pinv(diseno) @ ventanas.T
            pendientes[ventana - 1:] = coeficientes[0]
        features[f'PENDIENTE_TENDENCIA_ZSCORE_{ventana}'] = pendientes

        # Variability of the series around its rolling mean.
        residuos = serie.values - media
        features[f'TN_ABS_DIFF_MEAN_ZSCORE_{ventana}'] = np.abs(serie.values - media)
        features[f'TN_RESIDUAL_STD_ZSCORE_{ventana}'] = (
            pd.Series(residuos).rolling(window=ventana).std().values
        )
        features[f'TN_CV_ZSCORE_{ventana}'] = desvio / np.where(media == 0, np.nan, media)

    nuevas_columnas = pd.DataFrame(features, index=ordenado.index)
    return pd.concat([ordenado, nuevas_columnas], axis=1)

# --- Process one chunk of groups ---
def procesar_chunk(chunk, periodos_list):
    """Compute the features for every group in ``chunk`` and stack the results.

    Parameters
    ----------
    chunk : iterable of pandas.DataFrame
        Groups to process; each one is passed to ``calcular_pendientes_grupo``.
    periodos_list : iterable of int
        Window lengths forwarded to ``calcular_pendientes_grupo``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the processed groups with a fresh 0..n-1 index.
    """
    procesados = []
    for grupo in chunk:
        procesados.append(calcular_pendientes_grupo(grupo, periodos_list))
    return pd.concat(procesados, ignore_index=True)

# --- Parallel driver ---
def calcular_pendientes_parallel_optimizado(df, periodos_list, n_jobs=28, chunk_size=100):
    """Compute the per-group features in parallel.

    ``df`` is split by (``PRODUCT_ID``, ``CUSTOMER_ID``), the groups are batched
    into chunks of ``chunk_size`` groups, and each chunk is handed to
    ``procesar_chunk`` in a joblib worker (``loky`` backend).

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; all original columns are preserved in the output.
    periodos_list : iterable of int
        Window lengths forwarded to ``procesar_chunk``.
    n_jobs : int, default 28
        Number of joblib workers.
    chunk_size : int, default 100
        Number of groups sent to a worker per task.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all processed chunks with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` yields no groups (``pd.concat`` receives an empty list).
    """
    # No defensive df.copy() here: groupby does not modify df and each group is
    # copied inside calcular_pendientes_grupo, so copying the whole frame would
    # only raise peak memory.
    grupos = [group for _, group in df.groupby(['PRODUCT_ID', 'CUSTOMER_ID'])]

    resultados = Parallel(n_jobs=n_jobs, backend='loky', verbose=10)(
        delayed(procesar_chunk)(chunk, periodos_list)
        for chunk in chunked(grupos, chunk_size)
    )

    return pd.concat(resultados, ignore_index=True)

# --- Main script ---
if __name__ == "__main__":
    import time

    # Window lengths used for the rolling features.
    ventanas = [2, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 34]

    start = time.time()
    df_resultado = calcular_pendientes_parallel_optimizado(
        df_norm, periodos_list=ventanas, n_jobs=28, chunk_size=200
    )
    elapsed = time.time() - start

    # Report the wall-clock time of the feature computation.
    print(f"Tiempo total: {elapsed:.2f} segundos")


In [None]:
# Release the normalised frame and run the garbage collector.
del df_norm
gc.collect()

In [None]:
# Add the boolean column MES_PROBLEMATICO: True when PERIODO is 201906 or
# 201908, False otherwise (vectorised with isin instead of a row-wise apply).
# NOTE(review): the original comment also listed 201910 and described the flag
# as 1/0, but the code never included 201910 - confirm the intended periods.
df_resultado['MES_PROBLEMATICO'] = df_resultado['PERIODO'].isin([201906, 201908])
# Recode PLAN_PRECIOS_CUIDADOS from 1/0 to True/False; values other than 1 or 0
# are not in the mapping and become NaN.
df_resultado['PLAN_PRECIOS_CUIDADOS'] = df_resultado['PLAN_PRECIOS_CUIDADOS'].map({1 : True, 0: False})

In [None]:
# Shrink numeric dtypes: int64 columns are downcast to the smallest integer
# type that fits, then float64 columns to the smallest float type.
for dtype_origen, destino in (('int64', 'integer'), ('float64', 'float')):
    columnas = df_resultado.select_dtypes(include=[dtype_origen]).columns
    for col in columnas:
        df_resultado[col] = pd.to_numeric(df_resultado[col], downcast=destino)

# No categorical features are declared at this point.
categorical_features = []

In [None]:
# Save the resulting DataFrame to a parquet file (index not written).
df_resultado.to_parquet('./data/l_vm_completa_normalizada_fe.parquet', engine='fastparquet', index=False)
