In [1]:
import sys
# Make the parent directory importable: the `utilities` module lives one level
# above this notebook's folder.
sys.path.append("..")

# NOTE(review): `warnings`, `pd` and `SalesAnalysis` (and `np` / `stats` used by
# later cells) are not imported explicitly in this cell, so they presumably
# arrive through this star import — confirm against `utilities` and consider
# importing them by name.
from utilities import *

# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Raw input: Excel workbook read from the sibling `data` directory.
raw_data = pd.read_excel("../data/Datos_Market_copy.xlsx")

# Wrap the raw frame in the project's SalesAnalysis helper.
sa = SalesAnalysis(raw_data)

# Module-level frame read by later cells (e.g. `run_preprocessing`).
data = sa.data 




TRANSFORMACIÓN BOX-COX

In [2]:
def boxcox_transform(y, warning=False):
    """
    Fit a Box-Cox transformation on ``y`` and return the transformed values.

    Box-Cox is only defined for strictly positive data. When ``y`` contains
    zeros or negative values, a constant shift is added first so that every
    shifted value is strictly positive.

    Parameters
    ----------
    y : pandas.Series
        Target values to transform (the code relies on Series-style boolean
        masking and ``.min()`` / ``.max()`` / ``.mean()``).
    warning : bool, default False
        When True, print diagnostics before and after the transformation.

    Returns
    -------
    y_boxcox : numpy.ndarray
        Transformed values, as returned by ``scipy.stats.boxcox``.
    boxcox_transformation_info : dict
        ``{'type': 'boxcox', 'lambda': <fitted lambda>, 'constant': <shift>}``;
        everything needed to invert the transformation later.

    Raises
    ------
    ValueError
        If ``y`` is empty, or if ``scipy.stats.boxcox`` rejects the data
        (for example a constant series).

    Notes
    -----
    To invert the transformation:
    ``np.power(y_boxcox * lam + 1, 1 / lam) - constant``, or
    ``np.exp(y_boxcox) - constant`` when ``lam == 0``
    (see ``inverse_boxcox``).
    """
    # An empty input would otherwise surface as an opaque tuple-unpacking
    # ValueError further down; fail early with a clear message instead.
    if len(y) == 0:
        raise ValueError(
            "Box-Cox requiere al menos un valor; la serie recibida no contiene datos."
        )

    if warning:
        print("üìä Verificaci√≥n de datos para transformaci√≥n Box-Cox:")
        print(f"   - Longitud de datos: {len(y)}")
        print(f"   - Valor m√≠nimo: {y.min():.6f}")
        print(f"   - Valor m√°ximo: {y.max():.6f}")
        print(f"   - Valores <= 0: {(y <= 0).sum()}")
        print(f"   - Media: {y.mean():.2f}")

    # Box-Cox requires strictly positive values.
    if (y <= 0).any():
        # Baseline shift: at least 1.0, or the smallest positive observation.
        min_positive = y[y > 0].min() if (y > 0).any() else 1.0
        constant = max(1.0, min_positive)

        # The baseline shift is not enough when the minimum is at or below
        # -constant (e.g. min = -5 with constant = 1 leaves -4). In that case
        # also offset by the minimum so the smallest shifted value equals the
        # baseline shift. Inputs the baseline already handled are unaffected.
        lowest = y.min()
        if lowest + constant <= 0:
            constant = constant - lowest

        y_shifted = y + constant
        if warning:
            print(f"   ‚ö†Ô∏è  Se detectaron valores <= 0. Aplicando shift: y + {constant:.6f}")
    else:
        y_shifted = y.copy()
        constant = 0.0
        if warning:
            print(f"   ‚úÖ Todos los valores son positivos. No se requiere shift")

    # scipy.stats.boxcox estimates the optimal lambda and applies the transform.
    y_boxcox, lambda_optimal = stats.boxcox(y_shifted)

    if warning:
        print(f"\nüìà Estad√≠sticas despu√©s de la transformaci√≥n Box-Cox:")
        print(f"   - Lambda √≥ptimo: {lambda_optimal:.6f}")
        print(f"   - Media Box-Cox: {y_boxcox.mean():.6f}")
        print(f"   - Desviaci√≥n est√°ndar Box-Cox: {y_boxcox.std():.6f}")
        print(f"   - Constante de shift aplicada: {constant:.6f}")

    # Keep the fitted parameters so the transformation can be reverted later.
    boxcox_transformation_info = {
        'type': 'boxcox',
        'lambda': lambda_optimal,
        'constant': constant
    }

    return y_boxcox, boxcox_transformation_info
    
def inverse_boxcox(y_transformed, lambda_val: float, constant: float = 0.0):
    """
    Undo the Box-Cox transformation produced by `boxcox_transform`.

    Parameters
    ----------
    y_transformed : array-like
        Values on the Box-Cox scale.
    lambda_val : float
        Box-Cox lambda used for the forward transformation.
    constant : float
        Shift that was added before the forward transformation; it is
        subtracted from the result.

    Returns
    -------
    numpy.ndarray
        Values mapped back to the original scale.
    """
    values = np.asarray(y_transformed)

    if lambda_val != 0:
        # General case: invert (x**lambda - 1) / lambda.
        base = values * lambda_val + 1
        restored = np.power(base, 1 / lambda_val)
    else:
        # lambda == 0 is the log transform, so the inverse is exp.
        restored = np.exp(values)

    return restored - constant

In [3]:
def _split_by_date(frame, train_fraction: float = 0.8):
    """Sort a copy of `frame` by its 'date' column and split it at a fraction of the date span."""
    # Work on a copy so the caller's frame is never modified.
    ordered = frame.copy()
    ordered["date"] = pd.to_datetime(ordered["date"])
    ordered = ordered.sort_values("date").reset_index(drop=True)

    # The cutoff is a point in time (fraction of the covered date range),
    # not a fraction of the row count.
    first_date = ordered["date"].min()
    last_date = ordered["date"].max()
    train_cutoff = first_date + (last_date - first_date) * train_fraction

    train_part = ordered[ordered["date"] <= train_cutoff].copy()
    test_part = ordered[ordered["date"] > train_cutoff].copy()
    return ordered, train_part, test_part


def run_preprocessing(ARIMA_model: bool = False, train_fraction: float = 0.8):
    """
    Build a chronological train/test split and Box-Cox transform the target.

    Reads the module-level `data` frame. The Box-Cox parameters (lambda and
    shift) are fitted on the training target only and then applied unchanged
    to the test target.

    Parameters
    ----------
    ARIMA_model : bool, default False
        When True, only the rows for brand "brand-35", supermarket
        "supermarket-A", variant "standard" and pack size "351 - 500 GR" are
        used. When False, all rows of `data` are used.
    train_fraction : float, default 0.8
        Fraction of the covered date range assigned to the training set.
        Must lie in (0, 1].

    Returns
    -------
    tuple
        ``(data, filter_data, train_data, test_data, y_train_boxcox,
        y_test_boxcox, boxcox_transformation_info)``.
        `data` is the untouched module-level frame. `filter_data` is the
        filtered, date-sorted frame when ``ARIMA_model`` is True and ``None``
        otherwise.

    Raises
    ------
    ValueError
        If `train_fraction` is outside (0, 1], if the training split has no
        rows, or if the test target still contains values <= 0 after applying
        the shift learned on train.
    """
    if not 0 < train_fraction <= 1:
        raise ValueError("train_fraction debe estar en el intervalo (0, 1].")

    if ARIMA_model:
        series_mask = (
            (data["brand"] == "brand-35")
            & (data["supermarket"] == "supermarket-A")
            & (data["variant"] == "standard")
            & (data["pack.size"] == "351 - 500 GR")
        )
        # The returned filter_data is the sorted frame with a datetime 'date'.
        filter_data, train_data, test_data = _split_by_date(
            data[series_mask], train_fraction
        )
    else:
        filter_data = None
        _, train_data, test_data = _split_by_date(data, train_fraction)

    # Without training rows the Box-Cox fit fails with an opaque unpacking
    # error; report the real cause instead.
    if train_data.empty:
        raise ValueError(
            "El conjunto de entrenamiento no contiene filas; "
            "revisa los filtros o los datos de entrada."
        )

    # Box-Cox: fit on train only.
    y_train_boxcox, boxcox_transformation_info = boxcox_transform(
        train_data["volume.sales"].copy()
    )

    # Apply the same shift and lambda to test.
    y_test = test_data["volume.sales"].copy()
    y_test_shifted = y_test + boxcox_transformation_info["constant"]

    if (y_test_shifted <= 0).any():
        raise ValueError(
            "Box-Cox requiere valores positivos. Tras aplicar el shift aprendido en train, "
            "test sigue teniendo valores <= 0. Revisa la serie o la estrategia de shift."
        )

    y_test_boxcox = stats.boxcox(
        y_test_shifted, lmbda=boxcox_transformation_info["lambda"]
    )

    return (
        data,
        filter_data,
        train_data,
        test_data,
        y_train_boxcox,
        y_test_boxcox,
        boxcox_transformation_info,
    )