# Andre marroquin 22266
# sergio orellana 221122
# nelson garcia
# joaquin puente

---------
# Task 1

## preparacion de datos

In [4]:
# importacion de librerias
import os
import math
import numpy as np
import pandas as pd
from typing import Tuple, Dict
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
random_seed = 42
np.random.seed(random_seed)

# carga de datasets
def load_datasets(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSV files and normalize their columns.

    Column names are stripped and lower-cased. When present, 'date' is parsed
    as datetime (unparseable values become NaT), 'store' and 'item' are
    coerced to integers (unparseable or missing values become 0), and the
    train 'sales' column is coerced to numeric (unparseable values become NaN).

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        ``(train, test)``, each sorted by whichever of store, item and date it
        contains. The original row index is kept (it is not reset).
    """

    def _normalize(frame: pd.DataFrame) -> pd.DataFrame:
        # Shared clean-up applied identically to both frames.
        frame.columns = [name.strip().lower() for name in frame.columns]
        if 'date' in frame.columns:
            frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
        for key in ('store', 'item'):
            if key in frame.columns:
                as_number = pd.to_numeric(frame[key], errors='coerce')
                frame[key] = as_number.fillna(0).astype(int)
        return frame

    train = _normalize(pd.read_csv(train_path))
    test = _normalize(pd.read_csv(test_path))

    # Only the training data carries the target column.
    if 'sales' in train.columns:
        train['sales'] = pd.to_numeric(train['sales'], errors='coerce')

    # Sort without resetting the index.
    sort_keys = ('store', 'item', 'date')
    train = train.sort_values([key for key in sort_keys if key in train.columns])
    test = test.sort_values([key for key in sort_keys if key in test.columns])
    return train, test

# limpieza de datos
# faltantes en sales => 0.0; negativos => 0.0; winsorizacion por (store, item) en p1 y p99
def clean_sales(train: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``train`` with a cleaned, winsorized 'sales' column.

    Cleaning steps, in order:
      1. Missing sales become 0.0 and the column is cast to float.
      2. Negative sales become 0.0.
      3. Sales are clamped to the [1st, 99th] percentile range. The
         percentiles are computed per (store, item) group when both columns
         exist, otherwise over the whole column.

    Args:
        train: Frame that contains a 'sales' column. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``train``.

    Raises:
        ValueError: If ``train`` has no 'sales' column.
    """
    if 'sales' not in train.columns:
        raise ValueError("no se encontro la columna 'sales' en train")

    df = train.copy()
    df['sales'] = df['sales'].fillna(0.0).astype(float).clip(lower=0.0)

    if {'store', 'item'}.issubset(df.columns):
        grouped = df.groupby(['store', 'item'])['sales']
        q01_vals = grouped.transform(lambda s: s.quantile(0.01)).to_numpy(dtype=float)
        q99_vals = grouped.transform(lambda s: s.quantile(0.99)).to_numpy(dtype=float)
        sales_vals = df['sales'].to_numpy(dtype=float)

        # fmax/fmin ignore a NaN bound, so a row without a computed quantile
        # keeps its value. They also allocate new arrays instead of writing
        # into the array returned by to_numpy(), which is read-only under
        # pandas Copy-on-Write.
        df['sales'] = np.fmin(np.fmax(sales_vals, q01_vals), q99_vals)
    else:
        q01 = df['sales'].quantile(0.01)
        q99 = df['sales'].quantile(0.99)
        df['sales'] = df['sales'].clip(lower=q01, upper=q99)

    return df

# dict de listas para insertar columna a columna
def _build_df_from_dict_of_lists(base: pd.DataFrame, extra: dict) -> pd.DataFrame:
    data = {col: base[col].tolist() for col in base.columns}
    for k, v in extra.items():
        data[k] = list(v) if not isinstance(v, list) else v
    return pd.DataFrame(data)

# agregar caracteristicas de calendario
def add_calendar_features(df: pd.DataFrame) -> pd.DataFrame:
    """Append calendar features derived from the 'date' column.

    Added columns: 'dow' (day of week taken from ``dt.dayofweek``), 'month',
    'is_weekend' (1 when dow >= 5, else 0), and sine/cosine encodings of the
    day of week (period 7) and of the month (period 12, using month - 1).

    Args:
        df: Frame with a datetime 'date' column.

    Returns:
        A new DataFrame built by ``_build_df_from_dict_of_lists`` holding the
        original columns followed by the new ones.

    Raises:
        ValueError: If ``df`` has no 'date' column.
    """
    if 'date' not in df.columns:
        raise ValueError("no se encontro la columna 'date'")

    date_parts = df['date'].dt
    weekdays = date_parts.dayofweek.astype(int).tolist()
    months = date_parts.month.astype(int).tolist()

    # Angles on the unit circle, so the last day/month sits next to the first.
    weekday_angles = [2 * math.pi * day / 7.0 for day in weekdays]
    month_angles = [2 * math.pi * (month - 1) / 12.0 for month in months]

    calendar_columns = {
        'dow': weekdays,
        'month': months,
        'is_weekend': [int(day >= 5) for day in weekdays],
        'sin_dow': [math.sin(angle) for angle in weekday_angles],
        'cos_dow': [math.cos(angle) for angle in weekday_angles],
        'sin_month': [math.sin(angle) for angle in month_angles],
        'cos_month': [math.cos(angle) for angle in month_angles],
    }
    return _build_df_from_dict_of_lists(df, calendar_columns)

# transformacion log1p del target sales
def add_transformed_target(train: pd.DataFrame) -> pd.DataFrame:
    """Append 'sales_log1p', the log1p transform of the 'sales' column.

    Sales are cast to float and floored at 0.0 before the transform.

    Args:
        train: Frame with a 'sales' column.

    Returns:
        A new DataFrame built by ``_build_df_from_dict_of_lists`` holding the
        original columns plus 'sales_log1p'.

    Raises:
        ValueError: If ``train`` has no 'sales' column.
    """
    if 'sales' not in train.columns:
        raise ValueError("no se encontro la columna 'sales'")

    non_negative_sales = train['sales'].astype(float).clip(lower=0.0)
    log_target = np.log1p(non_negative_sales.to_numpy())
    return _build_df_from_dict_of_lists(train, {'sales_log1p': log_target})

# Input CSV locations, relative to the current working directory.
train_path = "train.csv" 
test_path  = "test.csv"  

# Run the preparation pipeline. Train is loaded, cleaned and given calendar
# features plus the log1p target; test only receives the calendar features.
train_raw, test_raw = load_datasets(train_path, test_path)
train_clean = clean_sales(train_raw)
train_feat  = add_calendar_features(train_clean)
train_feat  = add_transformed_target(train_feat)
test_feat   = add_calendar_features(test_raw)

# Print basic information: date coverage and resulting columns of each frame.
print("fechas train:", train_feat['date'].min(), "->", train_feat['date'].max())
print("fechas test :", test_feat['date'].min(),  "->", test_feat['date'].max())
print("cols train:", list(train_feat.columns))
print("cols test :", list(test_feat.columns))


fechas train: 2013-01-01 00:00:00 -> 2017-12-31 00:00:00
fechas test : 2018-01-01 00:00:00 -> 2018-03-31 00:00:00
cols train: ['date', 'store', 'item', 'sales', 'dow', 'month', 'is_weekend', 'sin_dow', 'cos_dow', 'sin_month', 'cos_month', 'sales_log1p']
cols test : ['id', 'date', 'store', 'item', 'dow', 'month', 'is_weekend', 'sin_dow', 'cos_dow', 'sin_month', 'cos_month']


## Preprocesamiento de datos

In [5]:
# Data processing: temporal split and fast sequence building with a weekly stride.
# Forecast horizon in days (about 3 months).
horizon = 90
# Length of the history window in days.
window_size = 120

# split temporal
def compute_time_split(train_df: pd.DataFrame, horizon: int) -> Dict[str, pd.Timestamp]:
    """Locate the start of the validation period at the end of the data.

    The validation period is the last ``horizon`` days of ``train_df``,
    counting the latest date itself.

    Args:
        train_df: Frame with a datetime 'date' column.
        horizon: Length of the validation period in days.

    Returns:
        Dict with 'max_date' (the latest date in the frame) and 'val_start'
        (the first day of the validation period).
    """
    latest_date = train_df['date'].max()
    # Stepping back horizon - 1 days keeps latest_date inside the period.
    first_val_date = latest_date - pd.Timedelta(days=horizon - 1)
    return {'max_date': latest_date, 'val_start': first_val_date}

# secuencias rapidas con stride semanal
def _stack_windows(windows: list, trailing_shape: tuple) -> np.ndarray:
    """Stack windows into one float32 array, keeping the rank when there are none."""
    if not windows:
        return np.empty((0,) + tuple(trailing_shape), dtype=np.float32)
    return np.asarray(windows, dtype=np.float32)


def series_to_tensor_fast(
    df: pd.DataFrame,
    window_size: int,
    horizon: int,
    val_start: pd.Timestamp,
    stride_train: int = 7,
    stride_val: int = 7
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Slice every (store, item) series into strided (history, target) windows.

    Each window pairs ``window_size`` consecutive rows of features with the
    following ``horizon`` values of 'sales_log1p'. The feature vector per row
    is the six columns listed in ``feats`` plus the store and item ids, each
    divided by its maximum over ``df``.

    A window goes to the validation set when the date of its last target day
    is on or after ``val_start``; otherwise it goes to the training set.
    Series shorter than ``window_size + horizon`` rows are skipped.

    NOTE(review): a validation window whose target ends shortly after
    ``val_start`` has target days before ``val_start``, a date range that
    training targets also cover. Confirm this overlap is acceptable.

    Args:
        df: Frame with 'store', 'item', 'date' and the columns in ``feats``.
        window_size: Number of history rows per window.
        horizon: Number of target rows per window.
        val_start: First date of the validation period.
        stride_train: Step between window starts while in the training range.
        stride_val: Step between window starts while in the validation range.

    Returns:
        ``(x_tr, y_tr, x_va, y_va)`` as float32 arrays. The x arrays have
        shape (n, window_size, 8) and the y arrays (n, horizon). An empty
        set keeps those trailing dimensions with n = 0.

    Raises:
        ValueError: If either stride is smaller than 1, since the window
            loop would then never advance.
    """
    if stride_train < 1 or stride_val < 1:
        raise ValueError("stride_train y stride_val deben ser >= 1")

    feats = ['sales_log1p', 'sin_dow', 'cos_dow', 'sin_month', 'cos_month', 'is_weekend']
    n_features = len(feats) + 2  # plus the normalized store and item ids
    max_store = df['store'].max()
    max_item  = df['item'].max()
    span = window_size + horizon

    x_tr, y_tr, x_va, y_va = [], [], [], []

    for (s, it), g in df.groupby(['store', 'item'], sort=True):
        g = g.sort_values('date')
        n_rows = len(g)
        if n_rows < span:
            continue

        feat_mat = g[feats].to_numpy(dtype=np.float32)
        store_norm = np.full((n_rows, 1), s / max_store, dtype=np.float32)
        item_norm  = np.full((n_rows, 1), it / max_item,  dtype=np.float32)
        x_full = np.concatenate([feat_mat, store_norm, item_norm], axis=1)
        y_full = g['sales_log1p'].to_numpy(dtype=np.float32)
        dates  = g['date'].to_numpy()

        last_start = n_rows - span
        start = 0
        while start <= last_start:
            end_hist   = start + window_size
            end_target = end_hist + horizon

            x_win = x_full[start:end_hist, :]
            y_win = y_full[end_hist:end_target]

            # The date of the last target day decides which set the window joins.
            if dates[end_target - 1] >= val_start:
                x_va.append(x_win)
                y_va.append(y_win)
                start += stride_val
            else:
                x_tr.append(x_win)
                y_tr.append(y_win)
                start += stride_train

    x_shape = (window_size, n_features)
    y_shape = (horizon,)
    return (
        _stack_windows(x_tr, x_shape),
        _stack_windows(y_tr, y_shape),
        _stack_windows(x_va, x_shape),
        _stack_windows(y_va, y_shape),
    )

# metricas de evaluacion inversa
def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-6) -> float:
    """Symmetric MAPE computed on the original scale of log1p-space inputs.

    Both inputs are inverted with ``expm1``; inverted predictions are floored
    at 0.0. The result is a fraction (it is not multiplied by 100).

    Args:
        y_true: True values in log1p space.
        y_pred: Predicted values in log1p space.
        eps: Small constant added to the denominator to avoid division by zero.

    Returns:
        Mean of ``2 * |pred - true| / (|true| + |pred| + eps)``.
    """
    actual = np.expm1(y_true)
    forecast = np.clip(np.expm1(y_pred), 0.0, None)
    abs_error = np.abs(forecast - actual)
    scale = (np.abs(actual) + np.abs(forecast) + eps)
    return float(np.mean(2.0 * abs_error / scale))

# error absoluto medio inverso
def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute error computed on the original scale of log1p-space inputs.

    Both inputs are inverted with ``expm1``; inverted predictions are floored
    at 0.0 before the error is taken.

    Args:
        y_true: True values in log1p space.
        y_pred: Predicted values in log1p space.

    Returns:
        Mean of ``|pred - true|`` after inversion.
    """
    actual = np.expm1(y_true)
    forecast = np.clip(np.expm1(y_pred), 0.0, None)
    abs_error = np.abs(forecast - actual)
    return float(np.mean(abs_error))

# ejecutar
# Locate the validation period, then cut every series into weekly-strided
# (history, target) windows split into training and validation sets.
split_info = compute_time_split(train_feat, horizon=horizon)
x_tr, y_tr, x_va, y_va = series_to_tensor_fast(
    train_feat, window_size, horizon, split_info['val_start'],
    stride_train=7, stride_val=7
)

# Tensor information: read the model input shape from the training windows.
input_timesteps = x_tr.shape[1]
input_features = x_tr.shape[2]
print("x_tr:", x_tr.shape, "y_tr:", y_tr.shape, "| x_va:", x_va.shape, "y_va:", y_va.shape)
print("input shape (timesteps, features):", (input_timesteps, input_features))


x_tr: (109500, 120, 8) y_tr: (109500, 90) | x_va: (6000, 120, 8) y_va: (6000, 90)
input shape (timesteps, features): (120, 8)


## Selección de modelo:

### justificacion

Elegimos una GRU porque capta dependencias de largo plazo con menos parámetros que una LSTM, por lo que entrena y predice más rápido sin sacrificar precisión en ventanas largas. Frente a una Conv1D, la GRU modela mejor la estacionalidad y las tendencias de varios días, lo cual es clave para un horizonte de 3 meses en series diarias.

In [6]:

# seleccion de modelo GRU 
# Seed TensorFlow's global random generator.
tf.random.set_seed(42)

# GPU + mixed precision when available.
gpus = tf.config.list_physical_devices('GPU')
# Stays False unless the whole GPU setup below succeeds; it later selects the batch size.
using_gpu = False
if gpus:
    try:
        # Enable memory growth on every visible GPU.
        for g in gpus:
            tf.config.experimental.set_memory_growth(g, True)
        from tensorflow.keras import mixed_precision
        mixed_precision.set_global_policy('mixed_float16')
        using_gpu = True
    except Exception as e:
        # Best effort: any failure in the setup above (memory growth included)
        # is reported here and using_gpu remains False.
        print("mixed precision no disponible:", e)

# tf.data datasets
def make_dataset(x, y, batch_size, training=True):
    """Wrap feature/target arrays in a batched, prefetched ``tf.data.Dataset``.

    Args:
        x: Input array, sliced along its first axis.
        y: Target array, sliced along its first axis together with ``x``.
        batch_size: Number of samples per batch; the final partial batch is kept.
        training: When True, samples are shuffled before batching, with a
            buffer of at most 10000 elements, a fixed seed, and a reshuffle
            on every iteration.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((x, y))
    if training:
        buffer_size = min(len(x), 10000)
        pipeline = pipeline.shuffle(buffer_size, seed=42, reshuffle_each_iteration=True)
    batched = pipeline.batch(batch_size, drop_remainder=False)
    return batched.prefetch(tf.data.AUTOTUNE)

# crear datasets batch size adaptativo
# Use a larger batch when the GPU setup succeeded, a smaller one otherwise.
batch_size = 1024 if using_gpu else 256
# Only the training dataset is shuffled; validation keeps its order.
train_ds = make_dataset(x_tr, y_tr, batch_size, training=True)
val_ds   = make_dataset(x_va, y_va, batch_size, training=False)

# modelo gru 
def build_gru(input_timesteps: int, input_features: int, horizon: int) -> keras.Model:
    """Build and compile the GRU forecaster.

    Architecture: GRU(64, returning sequences) -> GRU(32) -> Dense(64, relu)
    -> Dense(horizon, linear). The model is compiled with Adam (learning rate
    1e-3) and MAE loss.

    Compilation is first attempted with ``jit_compile=True``. If that call
    raises, the reason is printed and the model is compiled again without it.

    Args:
        input_timesteps: Number of time steps in each input window.
        input_features: Number of features per time step.
        horizon: Number of output values (forecast steps).

    Returns:
        The compiled ``keras.Model``.
    """
    inp = keras.Input(shape=(input_timesteps, input_features))
    x = layers.GRU(64, return_sequences=True)(inp)
    x = layers.GRU(32)(x)
    x = layers.Dense(64, activation='relu')(x)
    # The output layer is pinned to float32 regardless of the global dtype policy.
    out = layers.Dense(horizon, activation='linear', dtype='float32')(x)

    model = keras.Model(inp, out)
    try:
        model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mae', jit_compile=True)
    except Exception as exc:
        # Deliberate best-effort fallback: report why instead of hiding it,
        # then compile with a fresh optimizer and no jit_compile.
        print("jit_compile no disponible, se compila sin xla:", exc)
        model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mae')
    return model

# construir modelo
# Build the model for the window shape read from the training tensors.
model = build_gru(input_timesteps, input_features, horizon)

# Callbacks: halve the learning rate after 1 epoch without val_loss improvement
# (down to 1e-5), and stop after 2 such epochs while restoring the best weights.
callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-5, verbose=1),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1),
]

# Train for at most 10 epochs; early stopping may end training sooner.
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=2,
    callbacks=callbacks
)

# Validation metrics: mae() and smape() invert the log1p transform themselves,
# so they receive the log-space targets and predictions directly.
y_pred_va = model.predict(x_va, batch_size=batch_size, verbose=0)
val_mae = mae(y_va, y_pred_va)
val_smape = smape(y_va, y_pred_va)
print(f"\nvalidacion -> mae: {val_mae:.5f} | smape: {val_smape:.5f}")

# pronostico para el conjunto de prueba
def build_last_windows_for_test(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    window_size: int
) -> pd.DataFrame:
    """Collect the most recent history window of every (store, item) series.

    Only training rows dated strictly before the earliest test date are
    used. Series with fewer than ``window_size`` such rows are skipped. Each
    window holds the six feature columns plus the store and item ids, each
    divided by its maximum over ``df_train``.

    Args:
        df_train: Training frame with 'store', 'item', 'date' and the feature columns.
        df_test: Test frame; only its earliest 'date' is read.
        window_size: Number of most recent rows kept per series.

    Returns:
        DataFrame with one row per kept series and columns 'store', 'item'
        and 'x_win' (a float32 array of shape (window_size, 8)). The frame is
        empty when no series qualifies.
    """
    feature_cols = ['sales_log1p', 'sin_dow', 'cos_dow', 'sin_month', 'cos_month', 'is_weekend']
    store_scale = df_train['store'].max()
    item_scale = df_train['item'].max()
    cutoff = df_test['date'].min()

    records = []
    for (store_id, item_id), series in df_train.groupby(['store', 'item'], sort=True):
        history = series.loc[series['date'] < cutoff].sort_values('date')
        if len(history) >= window_size:
            recent = history.iloc[-window_size:]
            window = np.concatenate(
                [
                    recent[feature_cols].to_numpy(dtype=np.float32),
                    np.full((window_size, 1), store_id / store_scale, dtype=np.float32),
                    np.full((window_size, 1), item_id / item_scale, dtype=np.float32),
                ],
                axis=1,
            )
            records.append({'store': int(store_id), 'item': int(item_id), 'x_win': window})
    return pd.DataFrame(records)

# predecir para test
def forecast_test(
    model: keras.Model,
    df_train_feat: pd.DataFrame,
    df_test_feat: pd.DataFrame,
    window_size: int,
    horizon: int
) -> pd.DataFrame:
    """Forecast daily sales over the test period for every series.

    The last ``window_size`` training rows of each (store, item) series are
    fed to ``model``. Predictions are inverted with ``expm1`` and floored at
    0.0. The forecast dates start at the earliest test date and always span
    exactly ``horizon`` days: a longer test range is truncated and a shorter
    one is extended.

    The prediction batch size is read from the module-level ``batch_size``.

    Args:
        model: Fitted model whose ``predict`` returns ``horizon`` values per window.
        df_train_feat: Training frame with features and 'sales_log1p'.
        df_test_feat: Test frame; only its 'date' range is read.
        window_size: Number of history rows per input window.
        horizon: Number of days to forecast.

    Returns:
        Long DataFrame with columns 'store', 'item', 'date', 'sales'. It has
        only those column headers and no rows when no series has enough history.
    """
    windows_df = build_last_windows_for_test(df_train_feat, df_test_feat, window_size)
    if windows_df.empty:
        return pd.DataFrame(columns=['store', 'item', 'date', 'sales'])

    first_day = df_test_feat['date'].min()
    last_day = df_test_feat['date'].max()
    future_dates = pd.date_range(start=first_day, end=last_day, freq='D')
    n_days = len(future_dates)
    if n_days > horizon:
        future_dates = future_dates[:horizon]
    elif n_days < horizon:
        future_dates = pd.date_range(start=future_dates.min(), periods=horizon, freq='D')

    x_stack = np.stack(windows_df['x_win'].to_list(), axis=0).astype(np.float32)
    y_hat = model.predict(x_stack, batch_size=batch_size, verbose=0)

    # One frame per series; prediction rows follow the order of windows_df.
    per_series = [
        pd.DataFrame({
            'store': store_id,
            'item': item_id,
            'date': future_dates,
            'sales': np.clip(np.expm1(log_pred), 0.0, None),
        })
        for store_id, item_id, log_pred in zip(windows_df['store'], windows_df['item'], y_hat)
    ]
    return pd.concat(per_series, ignore_index=True)

# Forecast the test period for every series, using the trained model and the
# most recent training history of each series.
preds_daily = forecast_test(
    model,
    df_train_feat=train_feat,
    df_test_feat=test_feat,
    window_size=window_size,
    horizon=horizon
)

# construir archivo de envio
def build_submission(preds_daily: pd.DataFrame, test_df: pd.DataFrame, out_path: str) -> pd.DataFrame:
    """Attach predictions to the test rows and write the submission CSV.

    Predictions are left-joined onto ``test_df`` by (store, item, date), so
    every test row is kept; rows without a prediction get sales = 0.0.

    Args:
        preds_daily: Predictions with columns 'store', 'item', 'date', 'sales'.
        test_df: Test rows to score.
        out_path: Destination path of the CSV (written without the index).

    Returns:
        The written frame: columns ['id', 'sales'] when the merged frame has
        an 'id' column, otherwise ['store', 'item', 'date', 'sales'].
    """
    merged = test_df.merge(preds_daily, on=['store', 'item', 'date'], how='left').copy()
    merged['sales'] = merged['sales'].fillna(0.0)

    has_id = 'id' in merged.columns
    wanted = ['id', 'sales'] if has_id else ['store', 'item', 'date', 'sales']
    submit = merged[wanted].copy()
    submit.to_csv(out_path, index=False)
    return submit

# archivo de envio
# Write the submission file. test_raw is the test frame as returned by
# load_datasets (before the calendar features were added).
submission_path = "submission.csv"
submission_df = build_submission(preds_daily, test_raw, submission_path)
print("\narchivo de envio guardado en:", submission_path)
print(submission_df.head(10))


Epoch 1/10
428/428 - 149s - 348ms/step - loss: 0.5102 - val_loss: 0.1371 - learning_rate: 0.0010
Epoch 2/10
428/428 - 135s - 316ms/step - loss: 0.1332 - val_loss: 0.1269 - learning_rate: 0.0010
Epoch 3/10
428/428 - 147s - 343ms/step - loss: 0.1309 - val_loss: 0.1245 - learning_rate: 0.0010
Epoch 4/10
428/428 - 141s - 328ms/step - loss: 0.1297 - val_loss: 0.1240 - learning_rate: 0.0010
Epoch 5/10
428/428 - 140s - 327ms/step - loss: 0.1290 - val_loss: 0.1226 - learning_rate: 0.0010
Epoch 6/10
428/428 - 138s - 322ms/step - loss: 0.1281 - val_loss: 0.1214 - learning_rate: 0.0010
Epoch 7/10
428/428 - 162s - 378ms/step - loss: 0.1276 - val_loss: 0.1208 - learning_rate: 0.0010
Epoch 8/10

Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
428/428 - 141s - 329ms/step - loss: 0.1270 - val_loss: 0.1214 - learning_rate: 0.0010
Epoch 9/10
428/428 - 146s - 341ms/step - loss: 0.1262 - val_loss: 0.1191 - learning_rate: 5.0000e-04
Epoch 10/10

Epoch 10: ReduceLROnPlateau reduc

## Output Submission
### 45,000 filas: 10 tiendas × 50 artículos = 500 series, cada una con 90 días (3 meses) de pronóstico