# DSMarket - Forecasting

## Instalacion de paquetes

In [86]:
#pip install statsmodels
#pip install xgboost

## Importamos librerias necesarias

In [87]:
import pandas as pd
import os
import numpy as np
import statsmodels
import re
import pickle

from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typing import Callable, List, Dict, Union

## 1. Cargamos fichero de datos

In [88]:
df = pd.read_parquet("df_prediction_1", engine="fastparquet")
print(df.shape)
print(df.head(3))

(8476220, 14)
                       id  item  category  department  store_code  region  \
0  ACCESORIES_1_001_BOS_1     0         0           0           0       0   
1  ACCESORIES_1_001_BOS_1     0         0           0           0       0   
2  ACCESORIES_1_001_BOS_1     0         0           0           0       0   

   yearweek  avg_sell_price  event  cluster_store  cluster_item  n_sales  \
0    201104             NaN      0              2             0        0   
1    201105             NaN      1              2             0        0   
2    201106             NaN      0              2             0        0   

        date  revenue  
0 2011-01-24      0.0  
1 2011-01-31      0.0  
2 2011-02-07      0.0  


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8476220 entries, 0 to 8476219
Data columns (total 14 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   item            int64         
 2   category        int64         
 3   department      int64         
 4   store_code      int64         
 5   region          int64         
 6   yearweek        int64         
 7   avg_sell_price  float64       
 8   event           int64         
 9   cluster_store   int32         
 10  cluster_item    int32         
 11  n_sales         int64         
 12  date            datetime64[ns]
 13  revenue         float64       
dtypes: datetime64[ns](1), float64(2), int32(2), int64(8), object(1)
memory usage: 840.7+ MB


In [90]:
df["revenue"][df["yearweek"]==201617] = df["n_sales"] * df["avg_sell_price"]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["revenue"][df["yearweek"]==201617] = df["n_sales"] * df["avg_sell_price"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [91]:
series = df["yearweek"].nunique()
print("Cantidad de series temporales: ", series)

Cantidad de series temporales:  278


In [92]:
series = df["id"].nunique()
print("Cantidad de productos: ", series)

Cantidad de productos:  30490


In [93]:
first_week = df["yearweek"].min()
last_week = df["yearweek"].max()
print(f"Semana inicial es {first_week}\nSemana final es {last_week}")

Semana inicial es 201104
Semana final es 201620


## 3. Validacion de nulos

In [94]:
def contar_nulos(*dataframes):
    """
    Cuenta los valores nulos, la cantidad de registros por columna y el porcentaje
    de nulos para cada DataFrame proporcionado.
    """
    for i, df in enumerate(dataframes):
        print(f"\nAnálisis del DataFrame {i+1}:")
        null_counts = df.isnull().sum()
        total_counts = df.shape[0]
        null_percentage = (null_counts / total_counts) * 100

        null_info_df = pd.DataFrame({
            'Cantidad Registros': total_counts,
            'Cantidad Nulos': null_counts,
            '% Nulos': null_percentage
        })

        # Mostrar la información para cada columna
        print(null_info_df[null_info_df['Cantidad Nulos'] > 0])

In [95]:
contar_nulos(df)


Análisis del DataFrame 1:
                Cantidad Registros  Cantidad Nulos    % Nulos
avg_sell_price             8476220         1775233  20.943687


Hacemos copia para usar al fianal

In [96]:
df_copy = df.copy()

Creación diferencia - eliminar estacionalidad

In [97]:
df_ventas_origen = df[["date", "id", "n_sales", "yearweek", "avg_sell_price"]]

In [98]:
df = df.sort_values(['id', 'date'])

# Diferencia mensual por producto: y_t - y_{t-1}
df["monthly_sales_diff"] = df.groupby("id")["n_sales"].diff(1)

In [99]:
df = df.drop(columns="n_sales")

In [100]:
df["n_sales"] = df["monthly_sales_diff"]

In [101]:
df = df.drop(columns="monthly_sales_diff")

In [102]:
df = df.dropna(subset="n_sales")

Creación semana sin y  cos

In [103]:
df["yearweek"]

1          201105
2          201106
3          201107
4          201108
5          201109
            ...  
8354259    201616
8476216    201617
8476217    201618
8476218    201619
8476219    201620
Name: yearweek, Length: 8445730, dtype: int64

In [104]:
# Asegurar que sea string
df["yearweek"] = df["yearweek"].astype(str)

# Separar: 4 primeros dígitos = año, el resto = semana
df["year"] = df["yearweek"].str[:4].astype(int)
df["week"] = df["yearweek"].str[4:].astype(int)


In [105]:
df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52)
df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52)

In [106]:
df.isnull().sum()

id                      0
item                    0
category                0
department              0
store_code              0
region                  0
yearweek                0
avg_sell_price    1753491
event                   0
cluster_store           0
cluster_item            0
date                    0
revenue                 0
n_sales                 0
year                    0
week                    0
week_sin                0
week_cos                0
dtype: int64

## 7. Creacion de features para time series

In [107]:
def build_ts_vars(df, gb_list, target_column, agg_func, agg_func_name, nr_lags):

    assert "date" in df.columns.tolist(), "Date must be in df columns"

    new_name = "_".join(gb_list + [target_column, agg_func_name])

    # 1) Agregar a nivel semanal (date ya corresponde al lunes ISO)
    gb_df_ = (
        df.groupby(gb_list + ["date"], as_index=False)[target_column]
          .agg(agg_func)
          .rename(columns={target_column: new_name})
          .sort_values(gb_list + ["date"])
    )

    # 2) Crear lags semanales por cada grupo
    for lag in range(1, nr_lags + 1):
        gb_df_[f"{new_name}_lag{lag}"] = (
            gb_df_.groupby(gb_list, sort=False)[new_name].shift(lag)
        )

    # 4) Evitar target leakage
    print(f"Dropping columns that might cause target leakage {new_name}")
    gb_df_.drop(columns=[new_name], inplace=True)

    return gb_df_

In [108]:
def _resolve_agg_func(agg_func: Union[str, Callable]) -> Union[str, Callable]:
    if isinstance(agg_func, str):
        return agg_func.lower()  # "sum", "mean", etc.
    if callable(agg_func):
        # 🔽 Mapea np.sum → "sum", np.mean → "mean", etc.
        lut = {"sum": "sum", "mean": "mean", "max": "max", "min": "min", "median": "median"}
        name = getattr(agg_func, "__name__", "")
        if name in lut:
            return lut[name]
        return agg_func  # callables raros se devuelven tal cual
    raise ValueError(f"agg_func inválido: {agg_func}")

def _resolve_agg_name(agg_func: Union[str, Callable], agg_func_name: str) -> str:
    if agg_func_name:
        return agg_func_name
    if isinstance(agg_func, str):
        return agg_func.lower()
    if callable(agg_func) and getattr(agg_func, "__name__", None):
        return agg_func.__name__.lower()
    return "agg"

def generate_and_merge_features(
    df: pd.DataFrame,
    specs: List[Dict],
    *,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    specs: lista de diccionarios con llaves:
      - gb_list: List[str]
      - target_column: str
      - agg_func: str | Callable
      - agg_func_name: str (opcional; si vacío se infiere)
      - nr_lags: int
    Retorna df base + todas las columnas de lags generadas en specs.
    """
    base = df.copy()

    for i, spec in enumerate(specs, start=1):
        gb_list        = spec["gb_list"]
        target_column  = spec["target_column"]
        agg_func       = _resolve_agg_func(spec["agg_func"])
        agg_func_name  = _resolve_agg_name(spec["agg_func"], spec.get("agg_func_name", ""))
        nr_lags        = int(spec["nr_lags"])

        # construir feature table con tu función base (sin resample)
        feats = build_ts_vars(
            df=base,
            gb_list=gb_list,
            target_column=target_column,
            agg_func=agg_func,
            agg_func_name=agg_func_name,
            nr_lags=nr_lags,
        )
        # claves de unión para este feature set
        on_keys = gb_list + ["date"]

        if verbose:
            created_cols = [c for c in feats.columns if c.endswith(tuple([f"_lag{k}" for k in range(1, nr_lags+1)]))]
            print(f"[{i}/{len(specs)}] Merge on {on_keys} | new cols: {created_cols[:5]}{' ...' if len(created_cols)>5 else ''}")

        # merge left al df base
        base = base.merge(feats, on=on_keys, how="left")

    return base


#### Configuracion de features

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8445730 entries, 1 to 8476219
Data columns (total 18 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   item            int64         
 2   category        int64         
 3   department      int64         
 4   store_code      int64         
 5   region          int64         
 6   yearweek        object        
 7   avg_sell_price  float64       
 8   event           int64         
 9   cluster_store   int32         
 10  cluster_item    int32         
 11  date            datetime64[ns]
 12  revenue         float64       
 13  n_sales         float64       
 14  year            int64         
 15  week            int64         
 16  week_sin        float64       
 17  week_cos        float64       
dtypes: datetime64[ns](1), float64(5), int32(2), int64(8), object(2)
memory usage: 1.1+ GB


Benchmark:
1. Store_code/Revenue/sum/l=5 - store_code,department/n_sales/mean/l=4 --> RMSE 18.94

In [110]:
df

Unnamed: 0,id,item,category,department,store_code,region,yearweek,avg_sell_price,event,cluster_store,cluster_item,date,revenue,n_sales,year,week,week_sin,week_cos
1,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201105,,1,2,0,2011-01-31,0.0,0.0,2011,5,0.568065,0.822984
2,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201106,,0,2,0,2011-02-07,0.0,0.0,2011,6,0.663123,0.748511
3,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201107,,1,2,0,2011-02-14,0.0,0.0,2011,7,0.748511,0.663123
4,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201108,,0,2,0,2011-02-21,0.0,0.0,2011,8,0.822984,0.568065
5,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201109,,0,2,0,2011-02-28,0.0,0.0,2011,9,0.885456,0.464723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8354259,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201616,1.2,0,2,0,2016-04-18,0.0,0.0,2016,16,0.935016,-0.354605
8476216,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201617,1.2,0,2,0,2016-04-25,13.2,11.0,2016,17,0.885456,-0.464723
8476217,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201618,1.2,0,2,0,2016-05-02,0.0,-11.0,2016,18,0.822984,-0.568065
8476218,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201619,1.2,0,2,0,2016-05-09,0.0,0.0,2016,19,0.748511,-0.663123


In [111]:
features = [
    {
        "gb_list": ["item"],
        "target_column": "n_sales",
        "agg_func": "sum",
        "agg_func_name": "",  # se inferirá el mismo nombre de la funcion si no se diligencia
        "nr_lags": 4,
    },
    {
        "gb_list": ["item"],
        "target_column": "n_sales",
        "agg_func": "mean",
        "agg_func_name": "",  # se inferirá el mismo nombre de la funcion si no se diligencia
        "nr_lags": 4,
    },
    {
        "gb_list": ["region"],
        "target_column": "n_sales",
        "agg_func": "mean",
        "agg_func_name": "",  # se inferirá el mismo nombre de la funcion si no se diligencia
        "nr_lags": 4,
    },
    {
        "gb_list": ["item", "store_code"],
        "target_column": "revenue",
        "agg_func": "mean",
        "agg_func_name": "",  # se inferirá el mismo nombre de la funcion si no se diligencia
        "nr_lags": 4,
    },
    {
        "gb_list": ["store_code", "item"],
        "target_column": "n_sales",
        "agg_func": "sum",
        "agg_func_name": "",
        "nr_lags": 12,
    },
    {
        "gb_list": ["department", "store_code"],
        "target_column": "n_sales",
        "agg_func": "max",
        "agg_func_name": "",
        "nr_lags": 4,
    },
    {
        "gb_list": ["region", "store_code"],
        "target_column": "n_sales",
        "agg_func": "sum",
        "agg_func_name": "",
        "nr_lags": 4,
    }
]

In [112]:
df_features = generate_and_merge_features(df, features, verbose=True)
df_col = df.shape[1]
df_feat_col = df_features.shape[1]
dif_cols = df_feat_col-df_col
print(f"Se han creado {dif_cols} nuevas variables")

Dropping columns that might cause target leakage item_n_sales_sum
[1/7] Merge on ['item', 'date'] | new cols: ['item_n_sales_sum_lag1', 'item_n_sales_sum_lag2', 'item_n_sales_sum_lag3', 'item_n_sales_sum_lag4']
Dropping columns that might cause target leakage item_n_sales_mean
[2/7] Merge on ['item', 'date'] | new cols: ['item_n_sales_mean_lag1', 'item_n_sales_mean_lag2', 'item_n_sales_mean_lag3', 'item_n_sales_mean_lag4']
Dropping columns that might cause target leakage region_n_sales_mean
[3/7] Merge on ['region', 'date'] | new cols: ['region_n_sales_mean_lag1', 'region_n_sales_mean_lag2', 'region_n_sales_mean_lag3', 'region_n_sales_mean_lag4']
Dropping columns that might cause target leakage item_store_code_revenue_mean
[4/7] Merge on ['item', 'store_code', 'date'] | new cols: ['item_store_code_revenue_mean_lag1', 'item_store_code_revenue_mean_lag2', 'item_store_code_revenue_mean_lag3', 'item_store_code_revenue_mean_lag4']
Dropping columns that might cause target leakage store_code_

# Nuevas funciones

In [113]:
def add_shift_ratio_features_with_shifts(
    df: pd.DataFrame,
    target_column: str,
    group_col: str = "unique_id"
) -> None:
    """
    Agrega al DataFrame:
    - columnas con shift(52), shift(53), shift(54)
    - ratios entre target y esos shifts

    Todo agrupado por group_col (por defecto: "unique_id"), y modificado in-place.
    """

    df.sort_values([group_col, "date"], inplace=True)

    # Crear columnas intermedias de shift
    df[f"{target_column}_shift52"] = (
        df.groupby(group_col)[target_column].transform(lambda x: x.shift(52))
    )
    df[f"{target_column}_shift26"] = (
        df.groupby(group_col)[target_column].transform(lambda x: x.shift(26))
    )
    
    # Crear ratios
    df[f"{target_column}_vs_shift52"] = (
        df[target_column] / df[f"{target_column}_shift52"]
    )

    df[f"{target_column}_vs_shift26"] = (
        df[target_column] / df[f"{target_column}_shift26"]
    )

In [114]:
import pandas as pd

def add_rolling_feature_inplace(
    df: pd.DataFrame,
    target_column: str,
    groupby_cols,
    window: int = 52,
    agg_func: str = "mean",
    n_lags: int = 0,
    new_column_name: str = None
) -> None:
    """
    Añade una feature de ventana móvil (rolling) agrupada y sus lags, modificando el DataFrame in-place.

    Parámetros:
    - df: DataFrame de entrada
    - target_column: columna sobre la que se aplica el rolling
    - groupby_cols: str o lista de columnas por las que agrupar
    - window: tamaño de la ventana rolling
    - agg_func: función a aplicar: "mean", "sum", "std", "min", "max"
    - n_lags: número de lags a crear a partir del rolling
    - new_column_name: nombre base de la nueva columna (opcional)
    """

    if isinstance(groupby_cols, str):
        groupby_cols = [groupby_cols]

    if new_column_name is None:
        new_column_name = f"{'_'.join(groupby_cols)}_{target_column}_roll{window}_{agg_func}"

    # Ordenar para el cálculo correcto
    df.sort_values(groupby_cols + ["date"], inplace=True)

    # Calcular rolling
    df[new_column_name] = (
        df.groupby(groupby_cols)[target_column]
          .transform(lambda x: x.rolling(window=window, min_periods=window)
                                .__getattribute__(agg_func)())
    )

    # Crear lags si se solicita
    if n_lags > 0:
        for lag in range(1, n_lags + 1):
            lag_col = f"{new_column_name}_lag{lag}"
            df[lag_col] = (
                df.groupby(groupby_cols)[new_column_name]
                  .shift(lag)
            )

        # Eliminar la columna rolling original
        df.drop(columns=[new_column_name], inplace=True)

In [115]:
from typing import Union, Sequence, Optional
import pandas as pd

def add_rolling_quantile_inplace(
    df: pd.DataFrame,
    target_column: str,
    groupby_cols: Union[str, Sequence[str]],
    window: int = 3,
    q: Union[float, Sequence[float]] = 0.5,
    interpolation: str = "linear",
    min_periods: Optional[int] = None,
    n_lags: int = 0,
    new_column_name: Optional[str] = None,
) -> None:
    """
    Añade cuantiles rolling agrupados sobre `target_column`, genera sus lags y
    modifica `df` in-place. Las columnas de cuantiles intermedias se eliminan al final.

    Parámetros:
    - df: DataFrame de entrada (debe tener columna 'date' ordenable).
    - target_column: columna numérica sobre la que calcular el rolling quantile.
    - groupby_cols: columna(s) por las que agrupar (str o lista).
    - window: tamaño de la ventana rolling.
    - q: cuantíl o lista de cuantiles en [0, 1] (p.ej. 0.25, [0.25, 0.5, 0.75]).
    - interpolation: método de interpolación para quantile (p.ej. "linear").
    - min_periods: mínimos periodos para calcular (por defecto = window).
    - n_lags: número de lags a generar para cada cuantíl calculado.
    - new_column_name: nombre de la nueva columna (solo válido si q es escalar).
    """

    if isinstance(groupby_cols, str):
        groupby_cols = [groupby_cols]

    if min_periods is None:
        min_periods = window

    # Normaliza q a lista
    q_list = list(q) if isinstance(q, (list, tuple)) else [q]

    # Validaciones
    for qi in q_list:
        if not (0.0 <= float(qi) <= 1.0):
            raise ValueError(f"Cada q debe estar entre 0 y 1. Recibido: {qi}")

    if new_column_name is not None and len(q_list) != 1:
        raise ValueError("`new_column_name` solo se permite cuando `q` es escalar.")

    # Orden por grupo y fecha
    df.sort_values(groupby_cols + ["date"], inplace=True)

    # Definición del nombre base para los cuantiles
    def _default_col_name(qi: float) -> str:
        q_str = f"{qi:.3f}".rstrip("0").rstrip(".")
        return f"{'_'.join(groupby_cols)}_{target_column}_roll{window}_q{q_str}"

    grp = df.groupby(groupby_cols, sort=False)[target_column]
    temp_cols = []  # Para luego eliminar las columnas temporales

    # 1️⃣ Crear columnas de cuantiles rolling
    for qi in q_list:
        col = new_column_name if (new_column_name and len(q_list) == 1) else _default_col_name(qi)
        df[col] = grp.transform(
            lambda s: pd.to_numeric(s, errors="coerce")
                        .rolling(window=window, min_periods=min_periods)
                        .quantile(q=qi, interpolation=interpolation)
        )
        temp_cols.append(col)

    # 2️⃣ Crear columnas de lags
    if n_lags > 0:
        for col in temp_cols:
            for lag in range(1, n_lags + 1):
                lag_col = f"{col}_lag{lag}"
                df[lag_col] = df.groupby(groupby_cols, sort=False)[col].shift(lag)

    # 3️⃣ Eliminar columnas temporales de cuantiles (si se crearon)
    df.drop(columns=temp_cols, inplace=True)

# Nuevas Features

In [116]:
quantiles = [0.1, 0.5, 0.9]

## ventas últimos 26 semanas

### Por unique_id

In [117]:
add_rolling_feature_inplace(
    df_features,
    target_column = "n_sales",
    groupby_cols = "id",
    window = 4,
    agg_func = "mean",
    n_lags = 1
)

In [118]:
add_rolling_quantile_inplace(
    df_features,
    target_column= "n_sales",
    groupby_cols = "id",
    window = 4,
    q= quantiles,
    n_lags = 1
)

In [119]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "id",
#    window = 4,
#    agg_func = "max",
#    n_lags = 1
# )

In [120]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "id",
#    window = 4,
#    agg_func = "min",
#    n_lags = 1
# )

In [121]:
add_rolling_feature_inplace(
   df_features,
   target_column = "n_sales",
   groupby_cols = "id",
   window = 4,
   agg_func = "std",
   n_lags = 1
)

In [122]:
# add_rolling_feature_inplace(
#     df_features,
#     target_column = "n_sales",
#     groupby_cols = "id",
#     window = 3,
#     agg_func = "mean",
#     n_lags = 1
# )

In [123]:
# add_rolling_quantile_inplace(
#     df_features,
#     target_column= "n_sales",
#     groupby_cols = "id",
#     window = 3,
#     q= quantiles,
#     n_lags = 1
# )

In [124]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "id",
#    window = 3,
#    agg_func = "max",
#    n_lags = 1
# )

In [125]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "id",
#    window = 3,
#    agg_func = "min",
#    n_lags = 1
# )

In [126]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "id",
#    window = 3,
#    agg_func = "std",
#    n_lags = 1
# )

Por cluster

In [127]:
add_rolling_feature_inplace(
    df_features,
    target_column = "n_sales",
    groupby_cols = "cluster_item",
    window = 4,
    agg_func = "mean",
    n_lags = 1
)

In [128]:
# add_rolling_quantile_inplace(
#     df_features,
#     target_column= "n_sales",
#     groupby_cols = "cluster_item",
#     window = 4,
#     q= quantiles,
#     n_lags = 1
# )

In [129]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "cluster_item",
#    window = 4,
#    agg_func = "max",
#    n_lags = 1
# )

In [130]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "cluster_item",
#    window = 4,
#    agg_func = "min",
#    n_lags = 1
# )

In [131]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "cluster_item",
#    window = 4,
#    agg_func = "std",
#    n_lags = 1
# )

### Por ventas tienda

In [132]:
# add_rolling_feature_inplace(
#     df_features,
#     target_column = "n_sales",
#     groupby_cols = ["department","store_code"],
#     window = 4,
#     agg_func = "mean",
#     n_lags = 1
# )

In [133]:
# add_rolling_feature_inplace(
#     df_features,
#     target_column = "n_sales",
#     groupby_cols = ["department","store_code"],
#     window = 4,
#     agg_func = "max",
#     n_lags = 1
# )

In [134]:
# add_rolling_quantile_inplace(
#     df_features,
#     target_column= "n_sales",
#     groupby_cols = "store_code",
#     window = 4,
#     q= quantiles,
#     n_lags = 1
# )

In [135]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "store_code",
#    window = 4,
#    agg_func = "max",
#    n_lags = 1
# )

In [136]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "store_code",
#    window = 4,
#    agg_func = "min",
#    n_lags = 1
# )

In [137]:
# add_rolling_feature_inplace(
#    df_features,
#    target_column = "n_sales",
#    groupby_cols = "store_code",
#    window = 4,
#    agg_func = "std",
#    n_lags = 1
# )

## 8. Split Train, Test y Predict

In [138]:
# df_features['random_feature'] = np.random.rand(len(df_features))

In [139]:
from sklearn.preprocessing import OrdinalEncoder

# Asegurarte de que sea string
df_features["yearweek"] = df_features["yearweek"].astype(str)

# Crear el encoder
encoder = OrdinalEncoder()

# Ajustar y transformar
df_features["yearweek_encoded"] = encoder.fit_transform(df_features[["yearweek"]])

# Verificar
print(df_features[["yearweek", "yearweek_encoded"]].head())

     yearweek  yearweek_encoded
0      201105               0.0
277    201105               0.0
554    201105               0.0
831    201105               0.0
1108   201105               0.0


In [140]:
df_features.set_index(["yearweek", "id"], inplace=True)

In [141]:
# df_features.drop(columns=["year"], inplace=True)

# df_features.drop(columns=[     "store_code",
#     "store_code_item_n_sales_sum_lag7",
#     "event",
#     "store_code_item_n_sales_sum_lag5",
#     "id_n_sales_roll4_q0.5_lag1",
#     "store_code_item_n_sales_sum_lag6",
#     "store_code_item_n_sales_sum_lag12",
#     "store_code_item_n_sales_sum_lag11",
#     "store_code_item_n_sales_sum_lag8",
#     "store_code_item_n_sales_sum_lag3",
#     "department",
#     "store_code_item_n_sales_sum_lag10"], inplace=True)

In [142]:
# df_features.drop(columns=[ 'department', 'store_code', 'year', 'item_n_sales_sum_lag2', 'item_n_sales_sum_lag3', 'item_n_sales_mean_lag1', 'item_n_sales_mean_lag2', 'item_n_sales_mean_lag3', 'item_n_sales_mean_lag4', 'region_n_sales_mean_lag1', 'region_n_sales_mean_lag2', 'region_n_sales_mean_lag3', 'region_n_sales_mean_lag4', 'store_code_item_n_sales_sum_lag5', 'store_code_item_n_sales_sum_lag6', 'store_code_item_n_sales_sum_lag7', 'store_code_item_n_sales_sum_lag8', 'store_code_item_n_sales_sum_lag9', 'store_code_item_n_sales_sum_lag10', 'store_code_item_n_sales_sum_lag11', 'store_code_item_n_sales_sum_lag12', 'department_store_code_n_sales_max_lag3', 'region_store_code_n_sales_sum_lag1', 'region_store_code_n_sales_sum_lag3', 'region_store_code_n_sales_sum_lag4', 'id_n_sales_roll4_q0.5_lag1', 'id_n_sales_roll3_q0.5_lag1', 'id_n_sales_roll3_min_lag1', 'store_code_n_sales_roll4_max_lag1', 'random_feature' 
# ], inplace=True)

In [143]:
df_features = df_features.replace([np.inf, -np.inf], 0)

In [144]:
# Asegurar orden por fecha
df_features = df_features.sort_values("date")

# Obtener las últimas 4 fechas únicas
ultimas_fechas = df_features["date"].drop_duplicates().sort_values().iloc[-3:]

# Separar las filas de las últimas 4 fechas
mask_ultimas = df_features["date"].isin(ultimas_fechas)

# Dividir en dos partes: anteriores y últimas
df_anteriores = df_features.loc[~mask_ultimas].dropna(subset=["avg_sell_price"])
df_ultimas = df_features.loc[mask_ultimas]

# Unirlas nuevamente (manteniendo el orden)
df_features = pd.concat([df_anteriores, df_ultimas]).sort_values("date")

In [145]:
df_features.columns.tolist()

['item',
 'category',
 'department',
 'store_code',
 'region',
 'avg_sell_price',
 'event',
 'cluster_store',
 'cluster_item',
 'date',
 'revenue',
 'n_sales',
 'year',
 'week',
 'week_sin',
 'week_cos',
 'item_n_sales_sum_lag1',
 'item_n_sales_sum_lag2',
 'item_n_sales_sum_lag3',
 'item_n_sales_sum_lag4',
 'item_n_sales_mean_lag1',
 'item_n_sales_mean_lag2',
 'item_n_sales_mean_lag3',
 'item_n_sales_mean_lag4',
 'region_n_sales_mean_lag1',
 'region_n_sales_mean_lag2',
 'region_n_sales_mean_lag3',
 'region_n_sales_mean_lag4',
 'item_store_code_revenue_mean_lag1',
 'item_store_code_revenue_mean_lag2',
 'item_store_code_revenue_mean_lag3',
 'item_store_code_revenue_mean_lag4',
 'store_code_item_n_sales_sum_lag1',
 'store_code_item_n_sales_sum_lag2',
 'store_code_item_n_sales_sum_lag3',
 'store_code_item_n_sales_sum_lag4',
 'store_code_item_n_sales_sum_lag5',
 'store_code_item_n_sales_sum_lag6',
 'store_code_item_n_sales_sum_lag7',
 'store_code_item_n_sales_sum_lag8',
 'store_code_item_n_

In [146]:
weeks = sorted(df_features["date"].unique())
weeks

[Timestamp('2011-01-31 00:00:00'),
 Timestamp('2011-02-07 00:00:00'),
 Timestamp('2011-02-14 00:00:00'),
 Timestamp('2011-02-21 00:00:00'),
 Timestamp('2011-02-28 00:00:00'),
 Timestamp('2011-03-07 00:00:00'),
 Timestamp('2011-03-14 00:00:00'),
 Timestamp('2011-03-21 00:00:00'),
 Timestamp('2011-03-28 00:00:00'),
 Timestamp('2011-04-04 00:00:00'),
 Timestamp('2011-04-11 00:00:00'),
 Timestamp('2011-04-18 00:00:00'),
 Timestamp('2011-04-25 00:00:00'),
 Timestamp('2011-05-02 00:00:00'),
 Timestamp('2011-05-09 00:00:00'),
 Timestamp('2011-05-16 00:00:00'),
 Timestamp('2011-05-23 00:00:00'),
 Timestamp('2011-05-30 00:00:00'),
 Timestamp('2011-06-06 00:00:00'),
 Timestamp('2011-06-13 00:00:00'),
 Timestamp('2011-06-20 00:00:00'),
 Timestamp('2011-06-27 00:00:00'),
 Timestamp('2011-07-04 00:00:00'),
 Timestamp('2011-07-11 00:00:00'),
 Timestamp('2011-07-18 00:00:00'),
 Timestamp('2011-07-25 00:00:00'),
 Timestamp('2011-08-01 00:00:00'),
 Timestamp('2011-08-08 00:00:00'),
 Timestamp('2011-08-

In [147]:

train_pred_weeks = weeks[:-4]
predict_weeks = weeks[-3]

recuperacion_week = weeks[-4]


In [148]:
predict_weeks

Timestamp('2016-05-02 00:00:00')

In [149]:
COLS_DROP = ["n_sales"]

In [150]:
df_features.reset_index(inplace=True)

In [151]:

X_train_pred = df_features[df_features["date"].isin(train_pred_weeks)].drop(columns = COLS_DROP, axis = 1).set_index(["yearweek", "id", "date", "revenue", "avg_sell_price"])
X_pred = df_features[df_features["date"]==predict_weeks].drop(columns = COLS_DROP, axis = 1).set_index(["yearweek", "id", "date", "revenue", "avg_sell_price"])


In [152]:

y_train_pred = df_features[df_features["date"].isin(train_pred_weeks)]["n_sales"]
y_pred = df_features[df_features["date"]==predict_weeks]["n_sales"]


## 9. Predicción

In [153]:
X_train_pred.reset_index(inplace=True)

In [154]:
X_train_pred.set_index(["id","date","yearweek", "avg_sell_price", "revenue"], inplace=True)

In [155]:
# Elige la clase correcta según tu problema
model = xgb.XGBRegressor()   # o xgb.XGBRegressor()

# Cargar directamente desde el .json guardado
model.load_model("xgb_model_ds_market.json")

In [156]:
model.fit(
    X_train_pred, y_train_pred
)

0,1,2
,objective,'reg:squarederror'
,base_score,'3.3620946E-2'
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [157]:
Y_pred_predict = model.predict(X_pred)
Y_pred_predict

array([ 0.8688316,  1.4244406,  2.4607024, ..., -0.4264788, -0.4862673,
       -1.4369506], shape=(30490,), dtype=float32)

In [158]:
df_ventas_predict_lag1 = df_ventas_origen[df_ventas_origen["date"]==recuperacion_week]

In [159]:
df_ventas_predict_lag1

Unnamed: 0,date,id,n_sales,yearweek,avg_sell_price
8354260,2016-04-25,ACCESORIES_1_001_BOS_1,3,201617,10.9858
8354264,2016-04-25,ACCESORIES_1_001_BOS_2,2,201617,10.9858
8354268,2016-04-25,ACCESORIES_1_001_BOS_3,3,201617,10.9858
8354272,2016-04-25,ACCESORIES_1_001_NYC_1,7,201617,11.1454
8354276,2016-04-25,ACCESORIES_1_001_NYC_2,5,201617,11.1454
...,...,...,...,...,...
8476200,2016-04-25,SUPERMARKET_3_827_NYC_3,33,201617,1.2000
8476204,2016-04-25,SUPERMARKET_3_827_NYC_4,8,201617,1.2000
8476208,2016-04-25,SUPERMARKET_3_827_PHI_1,13,201617,1.2000
8476212,2016-04-25,SUPERMARKET_3_827_PHI_2,15,201617,1.2000


In [160]:
df_ventas_predict_lag1 = df_ventas_predict_lag1[["id", "n_sales", "avg_sell_price"]]

In [161]:
X_pred.reset_index(inplace=True)

In [162]:
df_prediction = pd.DataFrame({
    'id': X_pred['id'],          # toma el id original
    'Y_pred_predict': Y_pred_predict  # agrega las predicciones
})

In [163]:
df_prediction.reset_index()

Unnamed: 0,index,id,Y_pred_predict
0,0,SUPERMARKET_3_444_PHI_3,0.868832
1,1,SUPERMARKET_3_446_BOS_2,1.424441
2,2,SUPERMARKET_3_446_BOS_1,2.460702
3,3,SUPERMARKET_3_444_PHI_2,1.167160
4,4,SUPERMARKET_3_455_BOS_1,4.911260
...,...,...,...
30485,30485,SUPERMARKET_3_096_BOS_2,0.353778
30486,30486,SUPERMARKET_3_096_BOS_3,-0.812378
30487,30487,SUPERMARKET_3_096_NYC_1,-0.426479
30488,30488,SUPERMARKET_3_097_PHI_2,-0.486267


In [164]:
df_prediction = df_prediction.merge(
    df_ventas_predict_lag1[['id', 'n_sales', "avg_sell_price"]],
    on='id',
    how='left'  # usa 'left' para mantener todos los ids de df_prediction
)

In [165]:
df_prediction["ventas"] = df_prediction["Y_pred_predict"] + df_prediction["n_sales"]

In [166]:
df_prediction

Unnamed: 0,id,Y_pred_predict,n_sales,avg_sell_price,ventas
0,SUPERMARKET_3_444_PHI_3,0.868832,0,8.376,0.868832
1,SUPERMARKET_3_446_BOS_2,1.424441,22,1.416,23.424441
2,SUPERMARKET_3_446_BOS_1,2.460702,17,1.416,19.460702
3,SUPERMARKET_3_444_PHI_2,1.167160,0,8.376,1.167160
4,SUPERMARKET_3_455_BOS_1,4.911260,70,2.976,74.911260
...,...,...,...,...,...
30485,SUPERMARKET_3_096_BOS_2,0.353778,4,5.376,4.353778
30486,SUPERMARKET_3_096_BOS_3,-0.812378,3,5.376,2.187622
30487,SUPERMARKET_3_096_NYC_1,-0.426479,7,5.616,6.573521
30488,SUPERMARKET_3_097_PHI_2,-0.486267,9,2.688,8.513733


In [167]:
df_prediction = df_prediction[["id", "ventas", "avg_sell_price"]]

In [168]:
df_prediction["n_sales"] = df_prediction["ventas"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction["n_sales"] = df_prediction["ventas"]


In [169]:
df_prediction.drop(columns="ventas", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction.drop(columns="ventas", inplace=True)


In [170]:
df_prediction["n_sales"] = df_prediction["n_sales"].clip(lower=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction["n_sales"] = df_prediction["n_sales"].clip(lower=0)


In [171]:
df_prediction["n_sales"].describe()

count    30490.000000
mean         7.726647
std         16.402132
min          0.000000
25%          1.842849
50%          3.918632
75%          8.014355
max        668.410557
Name: n_sales, dtype: float64

In [172]:
df_prediction["yearweek"] = "201618"
df_prediction["yearweek"] = df_prediction["yearweek"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction["yearweek"] = "201618"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prediction["yearweek"] = df_prediction["yearweek"].astype(int)


In [173]:
df_nuevo = df_copy.merge(
    df_prediction[['id', 'yearweek', 'n_sales']],
    on=['id', 'yearweek'],
    how='left',
    suffixes=('', '_predict')
)

In [174]:
df_nuevo

Unnamed: 0,id,item,category,department,store_code,region,yearweek,avg_sell_price,event,cluster_store,cluster_item,n_sales,date,revenue,n_sales_predict
0,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201104,,0,2,0,0,2011-01-24,0.0,
1,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201105,,1,2,0,0,2011-01-31,0.0,
2,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201106,,0,2,0,0,2011-02-07,0.0,
3,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201107,,1,2,0,0,2011-02-14,0.0,
4,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201108,,0,2,0,0,2011-02-21,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476215,SUPERMARKET_3_827_PHI_2,3048,2,6,8,2,201620,1.2,0,0,0,0,2016-05-16,0.0,
8476216,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201617,1.2,0,2,0,11,2016-04-25,13.2,
8476217,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201618,1.2,0,2,0,0,2016-05-02,0.0,8.142345
8476218,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201619,1.2,0,2,0,0,2016-05-09,0.0,


In [175]:
df_nuevo['n_sales_predict'] = df_nuevo['n_sales_predict'].fillna(df_nuevo['n_sales'])

In [176]:
df_nuevo['n_sales_predict'].describe()

count    8.476220e+06
mean     7.807700e+00
std      2.356298e+01
min      0.000000e+00
25%      0.000000e+00
50%      2.000000e+00
75%      7.000000e+00
max      3.976000e+03
Name: n_sales_predict, dtype: float64

In [177]:
df_nuevo_copy = df_nuevo.copy()

In [178]:
df_nuevo.drop(columns="n_sales", inplace=True)

In [179]:
df_nuevo["n_sales"] = df_nuevo['n_sales_predict']

In [180]:
df_nuevo.drop(columns="n_sales_predict", inplace=True)

In [181]:
df_nuevo = df_nuevo[["id", "item", "category", "department", "store_code", "region", "yearweek", "avg_sell_price", "event", "cluster_store", "cluster_item", "n_sales", "date", "revenue"]]

In [182]:
df_nuevo

Unnamed: 0,id,item,category,department,store_code,region,yearweek,avg_sell_price,event,cluster_store,cluster_item,n_sales,date,revenue
0,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201104,,0,2,0,0.000000,2011-01-24,0.0
1,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201105,,1,2,0,0.000000,2011-01-31,0.0
2,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201106,,0,2,0,0.000000,2011-02-07,0.0
3,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201107,,1,2,0,0.000000,2011-02-14,0.0
4,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201108,,0,2,0,0.000000,2011-02-21,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476215,SUPERMARKET_3_827_PHI_2,3048,2,6,8,2,201620,1.2,0,0,0,0.000000,2016-05-16,0.0
8476216,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201617,1.2,0,2,0,11.000000,2016-04-25,13.2
8476217,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201618,1.2,0,2,0,8.142345,2016-05-02,0.0
8476218,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201619,1.2,0,2,0,0.000000,2016-05-09,0.0


In [183]:
df_prediction_2 = df_nuevo.copy()

In [184]:
df_prediction_2["n_sales"] = round(df_prediction_2["n_sales"],0).astype(int)

In [185]:
df_prediction_2

Unnamed: 0,id,item,category,department,store_code,region,yearweek,avg_sell_price,event,cluster_store,cluster_item,n_sales,date,revenue
0,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201104,,0,2,0,0,2011-01-24,0.0
1,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201105,,1,2,0,0,2011-01-31,0.0
2,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201106,,0,2,0,0,2011-02-07,0.0
3,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201107,,1,2,0,0,2011-02-14,0.0
4,ACCESORIES_1_001_BOS_1,0,0,0,0,0,201108,,0,2,0,0,2011-02-21,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476215,SUPERMARKET_3_827_PHI_2,3048,2,6,8,2,201620,1.2,0,0,0,0,2016-05-16,0.0
8476216,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201617,1.2,0,2,0,11,2016-04-25,13.2
8476217,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201618,1.2,0,2,0,8,2016-05-02,0.0
8476218,SUPERMARKET_3_827_PHI_3,3048,2,6,9,2,201619,1.2,0,2,0,0,2016-05-09,0.0


In [186]:
df_prediction_2.to_parquet('df_prediction_2', engine='pyarrow', index=False, compression='snappy')