In [1]:
# =============================================================================
# File: lesson_04_new_product_launch_forecasting_dl.py
# Topic: Deep Learning for New Product Launch Forecasting (cold-start demand)
# Input: historical SKUs (attributes + weekly demand) + new SKU attributes only
# Output: (1) cold-start forecast, (2) top-K similar SKUs via Siamese Networks
# =============================================================================

from __future__ import annotations

import random
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import tensorflow as tf


In [2]:
# =============================================================================
# 0. TOP-LEVEL CONFIG AND CONSTANTS
# =============================================================================

# Seed handed to set_global_determinism() by the demo driver.
SEED = 42
# Number of weeks simulated per SKU; also the width of the forecaster output.
LAUNCH_HORIZON_WEEKS = 12

# Synthetic dataset sizes
# (used by example 1; examples 2 and 3 pass their own literal sizes)
N_EXISTING_SKUS = 600
N_NEW_SKUS = 8

# Attribute schema (categorical + numeric)
# Every entry of CATEGORIES needs a matching key in category_base_demand().
CATEGORIES = ["Beverage", "Snack", "Dairy", "HomeCare", "PersonalCare"]
PACK_SIZES = ["S", "M", "L"]
BRANDS = ["Value", "Core", "Premium"]

# Numeric features: must match encode_attributes() exactly
# [price_index, promo_intensity, innovation_score, distribution_points_scaled]
NUMERIC_DIM = 4

# Business demo settings
# NOTE(review): SERVICE_LEVEL is not referenced by any cell visible here.
SERVICE_LEVEL = 0.95
# Number of analog SKUs retrieved per new SKU in example 3.
TOP_K_SIMILARS = 5

# Training settings
# EPOCHS_BASE is used by example 1; examples 2 and 3 pass literal epoch counts.
BATCH_SIZE = 64
EPOCHS_BASE = 15
EPOCHS_SIAMESE = 10
# Adam learning rate used when the forecaster and the Siamese model are compiled.
LEARNING_RATE = 1e-3

# Embedding sizes (categoricals)
EMB_DIM_CATEGORY = 4
EMB_DIM_PACK = 3
EMB_DIM_BRAND = 3

# Regularization
DROPOUT_RATE = 0.15
# L2 weight-decay coefficient applied to the regularized Dense kernels.
L2 = 1e-5

# =============================================================================
# 1. DATA STRUCTURES
# =============================================================================

@dataclass(frozen=True)
class ProductAttributes:
    """Immutable launch-time description of one SKU (attributes only, no history).

    The value ranges noted on the fields are the ones produced by
    ``sample_attributes`` in this notebook; the class itself enforces nothing.
    """
    sku_id: str  # identifier, e.g. "SKU_0007" for existing or "NEW_003" for new SKUs
    category: str  # looked up in the category vocabulary and in category_base_demand()
    pack_size: str  # looked up in the pack-size vocabulary
    brand_tier: str  # looked up in the brand vocabulary
    price_index: float  # relative price; generated around 1.0, clipped to [0.6, 1.6]
    promo_intensity: float  # in [0, 1]; used as the per-week promo probability
    innovation_score: float  # in [0, 1]; sets adoption-curve steepness and midpoint
    distribution_points: int  # generated in [20, 260]; min-max scaled when encoded


@dataclass(frozen=True)
class SkuWeeklyDemand:
    """Weekly demand history for a single SKU, keyed by ``sku_id``.

    NOTE(review): with ``frozen=True`` and the default ``eq=True`` the
    generated ``__eq__``/``__hash__`` operate on the ndarray field, so
    comparing or hashing instances is expected to raise. Avoid using
    instances as dict keys or set members.
    """
    sku_id: str  # matches ProductAttributes.sku_id of the same SKU
    demand_weekly: np.ndarray  # shape: [weeks]


In [3]:
# =============================================================================
# 1. DATA STRUCTURES
# =============================================================================

@dataclass(frozen=True)
class ProductAttributes:
    """Immutable launch-time description of one SKU (attributes only, no history).

    NOTE(review): this is a verbatim re-definition of the class already
    declared in the previous cell; running this cell rebinds the name to a
    new, equivalent class object. One of the two definitions can be deleted.

    The value ranges noted on the fields are the ones produced by
    ``sample_attributes`` in this notebook; the class itself enforces nothing.
    """
    sku_id: str  # identifier, e.g. "SKU_0007" for existing or "NEW_003" for new SKUs
    category: str  # looked up in the category vocabulary and in category_base_demand()
    pack_size: str  # looked up in the pack-size vocabulary
    brand_tier: str  # looked up in the brand vocabulary
    price_index: float  # relative price; generated around 1.0, clipped to [0.6, 1.6]
    promo_intensity: float  # in [0, 1]; used as the per-week promo probability
    innovation_score: float  # in [0, 1]; sets adoption-curve steepness and midpoint
    distribution_points: int  # generated in [20, 260]; min-max scaled when encoded


@dataclass(frozen=True)
class SkuWeeklyDemand:
    """Weekly demand history for a single SKU, keyed by ``sku_id``.

    NOTE(review): this is a verbatim re-definition of the class already
    declared in the previous cell; one of the two definitions can be deleted.

    NOTE(review): with ``frozen=True`` and the default ``eq=True`` the
    generated ``__eq__``/``__hash__`` operate on the ndarray field, so
    comparing or hashing instances is expected to raise. Avoid using
    instances as dict keys or set members.
    """
    sku_id: str  # matches ProductAttributes.sku_id of the same SKU
    demand_weekly: np.ndarray  # shape: [weeks]


In [4]:
# =============================================================================
# 2. UTILITIES
# =============================================================================

def set_global_determinism(seed: int) -> None:
    """Seed the TensorFlow, NumPy and Python global RNGs with one value.

    Args:
        seed: Integer seed applied to all three generators.
    """
    # Same order as always: TensorFlow first, then NumPy, then `random`.
    for apply_seed in (tf.random.set_seed, np.random.seed, random.seed):
        apply_seed(seed)


def estimate_launch_fill_rate(forecast_weekly: np.ndarray, initial_inventory: float) -> float:
    """Simple fill-rate proxy across the launch horizon.

    The proxy is the share of total forecast demand that the opening
    inventory can cover, clamped to the closed interval [0, 1].

    Args:
        forecast_weekly: Forecast demand per week over the launch horizon.
        initial_inventory: Units available at launch.

    Returns:
        1.0 when the total forecast is (numerically) zero or negative,
        otherwise ``initial_inventory / total_forecast`` clamped to [0, 1].
    """
    total_demand = float(np.sum(forecast_weekly))
    if total_demand <= 1e-6:
        # Nothing to serve, so treat the launch as fully covered.
        return 1.0

    coverage = float(initial_inventory) / total_demand
    # A fill rate is a proportion: a negative inventory input must not
    # produce a negative rate, and surplus stock cannot exceed 100%.
    return float(min(1.0, max(0.0, coverage)))

In [5]:
# =============================================================================
# 3. SYNTHETIC DATA GENERATION
# =============================================================================

def sample_attributes(sku_id: str) -> ProductAttributes:
    """Draw one random but plausible attribute set for the given SKU id.

    Categorical fields are drawn with Python's ``random`` module and numeric
    fields with NumPy's global RNG. The draw order is part of the
    reproducible random stream, so it is kept fixed.
    """
    # Categorical draws (uniform over each schema list).
    chosen_category = random.choice(CATEGORIES)
    chosen_pack = random.choice(PACK_SIZES)
    chosen_brand = random.choice(BRANDS)

    # Raw numeric draws, in the fixed order price -> promo -> innovation -> distribution.
    raw_price = np.random.normal(loc=1.0, scale=0.15)
    raw_promo = np.random.beta(a=2.0, b=4.5)
    raw_innovation = np.random.beta(a=2.5, b=2.5)
    raw_distribution = np.random.normal(loc=120, scale=50)

    # Clip every numeric draw into its plausible business range.
    return ProductAttributes(
        sku_id=sku_id,
        category=chosen_category,
        pack_size=chosen_pack,
        brand_tier=chosen_brand,
        price_index=float(np.clip(raw_price, 0.6, 1.6)),
        promo_intensity=float(np.clip(raw_promo, 0.0, 1.0)),
        innovation_score=float(np.clip(raw_innovation, 0.0, 1.0)),
        distribution_points=int(np.clip(raw_distribution, 20, 260)),
    )


def category_base_demand(category: str) -> float:
    """Return the baseline weekly volume for a product category.

    Raises:
        KeyError: If ``category`` is not one of the known categories.
    """
    baseline_by_category = dict(
        [
            ("Beverage", 900.0),
            ("Snack", 750.0),
            ("Dairy", 620.0),
            ("HomeCare", 420.0),
            ("PersonalCare", 380.0),
        ]
    )
    weekly_baseline = baseline_by_category[category]
    return float(weekly_baseline)


def simulate_adoption_curve(weeks: int, innovation_score: float) -> np.ndarray:
    """Logistic ramp-up-then-plateau adoption curve, one value per week.

    A higher ``innovation_score`` makes the curve steeper and moves its
    midpoint earlier.

    Returns:
        float32 array of length ``weeks`` with values in (0, 1).
    """
    week_index = np.arange(weeks)
    steepness = 0.25 + 0.9 * innovation_score
    inflection_week = 3 + (1.0 - innovation_score) * 4
    exponent = -steepness * (week_index - inflection_week)
    return (1.0 / (1.0 + np.exp(exponent))).astype(np.float32)


def generate_weekly_demand(attrs: ProductAttributes, weeks: int) -> np.ndarray:
    """Simulate ``weeks`` of weekly demand for one SKU from its attributes.

    Demand is the product of a category baseline, a distribution-reach
    factor, a price factor, a logistic adoption ramp, a random promo uplift
    and multiplicative noise, floored at zero.

    Returns:
        float32 array of length ``weeks``.
    """
    category_volume = category_base_demand(attrs.category)

    # More distribution points => more reach (linear between 0.4x and 1.3x).
    reach_factor = np.interp(attrs.distribution_points, [20, 260], [0.4, 1.3])

    # Higher relative price => lower volume.
    price_factor = (1.0 / attrs.price_index) ** 0.6

    ramp = simulate_adoption_curve(weeks, attrs.innovation_score)

    # Random draws, kept in this order so the global random stream is stable:
    # promo flags, then uplift sizes, then noise.
    promo_flags = np.random.binomial(n=1, p=attrs.promo_intensity, size=weeks)
    promo_factor = 1.0 + promo_flags * np.random.uniform(0.10, 0.40, size=weeks)
    jitter = np.random.normal(loc=1.0, scale=0.08, size=weeks)

    steady_state = category_volume * reach_factor * price_factor
    weekly_units = steady_state * ramp * promo_factor * jitter

    # Noise can push a week below zero; demand cannot be negative.
    return np.clip(weekly_units, 0.0, None).astype(np.float32)


def build_synthetic_dataset(
    n_existing_skus: int,
    n_new_skus: int,
    weeks: int,
) -> Tuple[List[ProductAttributes], List[SkuWeeklyDemand], List[ProductAttributes]]:
    """Create existing SKUs (attributes + history) and new SKUs (attributes only).

    Returns:
        ``(existing_attrs, existing_demand, new_attrs)`` where the first two
        lists are aligned by position.
    """
    # Attributes and demand are generated SKU by SKU so the random draws
    # interleave the same way on every run.
    history: List[Tuple[ProductAttributes, SkuWeeklyDemand]] = []
    for position in range(n_existing_skus):
        existing_id = f"SKU_{position:04d}"
        sku_attrs = sample_attributes(existing_id)
        sku_series = generate_weekly_demand(sku_attrs, weeks)
        history.append(
            (sku_attrs, SkuWeeklyDemand(sku_id=existing_id, demand_weekly=sku_series))
        )

    existing_attrs = [sku_attrs for sku_attrs, _ in history]
    existing_demand = [sku_record for _, sku_record in history]

    # New SKUs have no demand history by construction.
    new_attrs = [sample_attributes(f"NEW_{position:03d}") for position in range(n_new_skus)]

    return existing_attrs, existing_demand, new_attrs


In [6]:
# =============================================================================
# 4. ENCODING (attributes -> tensors)
# =============================================================================

def build_vocab(values: List[str]) -> Dict[str, int]:
    """Map each distinct string to its rank in sorted order (0-based)."""
    distinct_sorted = sorted(set(values))
    return dict(zip(distinct_sorted, range(len(distinct_sorted))))


def encode_attributes(
    attrs_list: List[ProductAttributes],
    vocab_category: Dict[str, int],
    vocab_pack: Dict[str, int],
    vocab_brand: Dict[str, int],
) -> Dict[str, np.ndarray]:
    """Turn a list of SKU attributes into model-ready arrays.

    Categorical fields become int32 index vectors (later fed to embeddings);
    numeric fields become one float32 matrix with columns
    ``[price_index, promo_intensity, innovation_score, distribution_scaled]``.

    Raises:
        KeyError: If a categorical value is missing from its vocabulary.
    """

    def _index_column(vocab: Dict[str, int], field: str) -> np.ndarray:
        return np.array([vocab[getattr(item, field)] for item in attrs_list], dtype=np.int32)

    def _float_column(field: str) -> np.ndarray:
        return np.array([getattr(item, field) for item in attrs_list], dtype=np.float32)

    encoded = {
        "category_idx": _index_column(vocab_category, "category"),
        "pack_idx": _index_column(vocab_pack, "pack_size"),
        "brand_idx": _index_column(vocab_brand, "brand_tier"),
    }

    price = _float_column("price_index")
    promo = _float_column("promo_intensity")
    innovation = _float_column("innovation_score")

    # Min-max scale distribution points from [20, 260] to [0, 1].
    dist_low, dist_high = 20.0, 260.0
    distribution_scaled = (_float_column("distribution_points") - dist_low) / (dist_high - dist_low)

    encoded["numeric_features"] = np.stack(
        [price, promo, innovation, distribution_scaled],
        axis=1,
    ).astype(np.float32)

    return encoded


def demand_targets(existing_demand: List[SkuWeeklyDemand]) -> np.ndarray:
    """Stack per-SKU weekly histories into one float32 matrix ``[n_skus, weeks]``."""
    weekly_rows = [record.demand_weekly for record in existing_demand]
    stacked = np.stack(weekly_rows, axis=0)
    return stacked.astype(np.float32)



In [7]:
# =============================================================================
# 5. MODEL BUILDING (encoder, forecaster, siamese)
# =============================================================================

def build_product_encoder(
    n_categories: int,
    n_packs: int,
    n_brands: int,
    numeric_dim: int,
    embedding_dim_out: int = 16,
) -> tf.keras.Model:
    """Build the product encoder (attributes -> unit-length embedding).

    The model takes a dict of four named inputs (``category_idx``,
    ``pack_idx``, ``brand_idx``, ``numeric_features``). The names matter
    because training and inference pass dicts keyed the same way.

    Args:
        n_categories: Vocabulary size of the category embedding.
        n_packs: Vocabulary size of the pack-size embedding.
        n_brands: Vocabulary size of the brand embedding.
        numeric_dim: Number of numeric feature columns.
        embedding_dim_out: Width of the output product embedding.

    Returns:
        Uncompiled Keras model named ``product_encoder``.
    """
    layers = tf.keras.layers

    model_inputs = {
        "category_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="category_idx"),
        "pack_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="pack_idx"),
        "brand_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="brand_idx"),
        "numeric_features": tf.keras.Input(
            shape=(numeric_dim,), dtype=tf.float32, name="numeric_features"
        ),
    }

    # One learned embedding per categorical input; numerics pass through as-is.
    feature_parts = [
        layers.Embedding(n_categories, EMB_DIM_CATEGORY, name="emb_category")(
            model_inputs["category_idx"]
        ),
        layers.Embedding(n_packs, EMB_DIM_PACK, name="emb_pack")(model_inputs["pack_idx"]),
        layers.Embedding(n_brands, EMB_DIM_BRAND, name="emb_brand")(model_inputs["brand_idx"]),
        model_inputs["numeric_features"],
    ]
    hidden = layers.Concatenate(name="concat_all")(feature_parts)

    hidden = layers.Dense(
        32,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(L2),
        name="enc_dense_1",
    )(hidden)
    hidden = layers.Dropout(DROPOUT_RATE, name="enc_dropout")(hidden)

    raw_embedding = layers.Dense(
        embedding_dim_out,
        activation=None,
        kernel_regularizer=tf.keras.regularizers.l2(L2),
        name="enc_dense_out",
    )(hidden)

    # Unit-length rows make a plain dot product equal to cosine similarity.
    unit_embedding = layers.Lambda(
        lambda t: tf.math.l2_normalize(t, axis=1), name="l2_norm"
    )(raw_embedding)

    return tf.keras.Model(inputs=model_inputs, outputs=unit_embedding, name="product_encoder")


def build_attribute_forecaster(encoder: tf.keras.Model, horizon_weeks: int) -> tf.keras.Model:
    """Stack a forecasting head on top of ``encoder`` and compile it.

    The head predicts ``log1p(demand)`` for each of ``horizon_weeks`` weeks
    through a linear output layer, so targets must be log1p-transformed for
    training and predictions passed through ``expm1`` afterwards.

    The returned model shares its layers with ``encoder``, so training it
    also updates the encoder weights.

    Returns:
        Keras model named ``attribute_forecaster``, compiled with Adam and MAE.
    """
    layers = tf.keras.layers

    head = layers.Dense(
        32,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(L2),
        name="fc_dense_1",
    )(encoder.output)
    head = layers.Dropout(DROPOUT_RATE, name="fc_dropout")(head)

    log_demand = layers.Dense(
        horizon_weeks,
        activation="linear",
        name="forecast_log_demand",
    )(head)

    forecaster = tf.keras.Model(
        inputs=encoder.inputs, outputs=log_demand, name="attribute_forecaster"
    )
    forecaster.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="mae",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")],
    )
    return forecaster


def build_siamese_network(encoder: tf.keras.Model, numeric_dim: int) -> tf.keras.Model:
    """Build and compile a Siamese pair classifier around a shared encoder.

    Both sides of a pair go through the same ``encoder`` instance; their
    cosine distance feeds a single sigmoid unit that outputs the probability
    that the two SKUs are similar.

    The model inputs are named ``left_*`` and ``right_*``; the dict passed
    to ``fit`` must use exactly those keys.

    Returns:
        Keras model named ``siamese_similarity_model``, compiled with Adam,
        binary cross-entropy and an AUC metric.
    """
    # Left side first, then right side.
    pair_inputs = {
        "left_category_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="left_category_idx"),
        "left_pack_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="left_pack_idx"),
        "left_brand_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="left_brand_idx"),
        "left_numeric_features": tf.keras.Input(
            shape=(numeric_dim,), dtype=tf.float32, name="left_numeric_features"
        ),
        "right_category_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="right_category_idx"),
        "right_pack_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="right_pack_idx"),
        "right_brand_idx": tf.keras.Input(shape=(), dtype=tf.int32, name="right_brand_idx"),
        "right_numeric_features": tf.keras.Input(
            shape=(numeric_dim,), dtype=tf.float32, name="right_numeric_features"
        ),
    }

    def _embed_side(category_key: str, pack_key: str, brand_key: str, numeric_key: str):
        """Run one side of the pair through the shared encoder tower."""
        return encoder(
            {
                "category_idx": pair_inputs[category_key],
                "pack_idx": pair_inputs[pack_key],
                "brand_idx": pair_inputs[brand_key],
                "numeric_features": pair_inputs[numeric_key],
            }
        )

    left_emb = _embed_side(
        "left_category_idx", "left_pack_idx", "left_brand_idx", "left_numeric_features"
    )
    right_emb = _embed_side(
        "right_category_idx", "right_pack_idx", "right_brand_idx", "right_numeric_features"
    )

    # Encoder outputs are L2-normalized, so the row-wise dot product is the
    # cosine similarity and (1 - dot) the cosine distance.
    cosine_distance = tf.keras.layers.Lambda(
        lambda z: 1.0 - tf.reduce_sum(z[0] * z[1], axis=1, keepdims=True),
        name="cosine_dist",
    )([left_emb, right_emb])

    prob_similar = tf.keras.layers.Dense(1, activation="sigmoid", name="prob_similar")(
        cosine_distance
    )

    pair_model = tf.keras.Model(
        inputs=pair_inputs,
        outputs=prob_similar,
        name="siamese_similarity_model",
    )
    pair_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc")],
    )
    return pair_model

In [8]:
# =============================================================================
# 6. SIAMESE PAIR LABELS + SAMPLING
# =============================================================================

def make_similarity_labels_from_demand(
    demand: np.ndarray,
    similar_quantile: float = 0.15,
) -> np.ndarray:
    """Create pseudo similarity labels from demand curve shape.

    Approach:
    - Normalize each SKU curve by its mean, so only the shape matters.
    - Compute the L2 distance between every unordered pair of SKUs.
    - Label a pair "similar" (1) when its distance is within the closest
      ``similar_quantile`` share of all pair distances, else 0.

    Args:
        demand: Array of shape ``[n_skus, weeks]``.
        similar_quantile: Quantile of the pair-distance distribution used as
            the similarity threshold (default 0.15 = closest 15% of pairs).

    Returns:
        Symmetric int32 matrix of shape ``[n_skus, n_skus]`` with a zero
        diagonal. With fewer than two SKUs there are no pairs and the result
        is all zeros.
    """
    demand_norm = demand / (np.mean(demand, axis=1, keepdims=True) + 1e-6)

    n = demand_norm.shape[0]
    labels = np.zeros((n, n), dtype=np.int32)
    if n < 2:
        # No pairs exist, so there is no distance distribution to threshold.
        return labels

    # Each unordered pair (i < j) once, in row-major order. Distances are
    # computed a single time and reused for both the threshold and the
    # labels. Memory is O(n^2 * weeks), which is fine at this notebook's scale.
    rows, cols = np.triu_indices(n, k=1)
    pair_dists = np.linalg.norm(demand_norm[rows] - demand_norm[cols], axis=1)
    # Compare in float64 so the threshold and the distances share one precision.
    pair_dists = pair_dists.astype(np.float64)

    threshold = float(np.quantile(pair_dists, similar_quantile))
    is_similar = pair_dists <= threshold

    labels[rows, cols] = is_similar
    labels[cols, rows] = is_similar
    return labels


def sample_pairs(
    encoded_existing: Dict[str, np.ndarray],
    labels: np.ndarray,
    n_pairs: int,
    positive_ratio: float,
) -> Tuple[Dict[str, np.ndarray], np.ndarray]:
    """Draw a shuffled mix of similar / dissimilar SKU pairs for Siamese training.

    Up to ``int(n_pairs * positive_ratio)`` positive pairs and the remainder
    as negative pairs are sampled without replacement. Fewer pairs are
    returned when either pool is too small.

    Returns:
        ``(pair_X, y)`` where ``pair_X`` is keyed exactly like the Siamese
        model inputs (``left_*`` / ``right_*``) and ``y`` is a float32 vector
        of 1.0 (similar) / 0.0 (dissimilar).
    """
    n = labels.shape[0]

    # Unordered pairs (i < j), listed in row-major order.
    upper_triangle = np.triu(np.ones((n, n), dtype=bool), k=1)
    is_positive = labels == 1
    pos_pairs: List[Tuple[int, int]] = [
        (int(i), int(j)) for i, j in zip(*np.nonzero(upper_triangle & is_positive))
    ]
    neg_pairs: List[Tuple[int, int]] = [
        (int(i), int(j)) for i, j in zip(*np.nonzero(upper_triangle & ~is_positive))
    ]

    want_pos = int(n_pairs * positive_ratio)
    want_neg = n_pairs - want_pos

    # Positives are sampled first, then negatives, then the union is shuffled.
    pairs = random.sample(pos_pairs, k=min(want_pos, len(pos_pairs)))
    pairs = pairs + random.sample(neg_pairs, k=min(want_neg, len(neg_pairs)))
    random.shuffle(pairs)

    y = np.fromiter(
        (1.0 if labels[i, j] == 1 else 0.0 for i, j in pairs),
        dtype=np.float32,
        count=len(pairs),
    )

    def _gather(field: str, side: int, dtype) -> np.ndarray:
        """Pick ``field`` for the left (side=0) or right (side=1) pair member."""
        source = encoded_existing[field]
        return np.array([source[pair[side]] for pair in pairs], dtype=dtype)

    pair_X = {
        "left_category_idx": _gather("category_idx", 0, np.int32),
        "right_category_idx": _gather("category_idx", 1, np.int32),
        "left_pack_idx": _gather("pack_idx", 0, np.int32),
        "right_pack_idx": _gather("pack_idx", 1, np.int32),
        "left_brand_idx": _gather("brand_idx", 0, np.int32),
        "right_brand_idx": _gather("brand_idx", 1, np.int32),
        "left_numeric_features": _gather("numeric_features", 0, np.float32),
        "right_numeric_features": _gather("numeric_features", 1, np.float32),
    }

    return pair_X, y



In [9]:
# =============================================================================
# 7. INFERENCE HELPERS
# =============================================================================

def predict_weekly_demand_from_log(forecaster: tf.keras.Model, X: Dict[str, np.ndarray]) -> np.ndarray:
    """Predict with a log1p-trained forecaster and return demand in units.

    The model output is read as ``log1p(demand)``; it is inverted with
    ``expm1`` and floored at zero.

    Returns:
        float32 array with the same shape as the model output.
    """
    log_forecast = forecaster.predict(X, verbose=0).astype(np.float32)
    demand_units = np.expm1(log_forecast)
    non_negative = np.clip(demand_units, 0.0, None)
    return non_negative.astype(np.float32)


def top_k_similar_skus(
    encoder: tf.keras.Model,
    X_existing: Dict[str, np.ndarray],
    X_new: Dict[str, np.ndarray],
    existing_ids: List[str],
    new_ids: List[str],
    top_k: int,
) -> Dict[str, List[Tuple[str, float]]]:
    """Retrieve the ``top_k`` most similar existing SKUs for every new SKU.

    Similarity is the dot product of encoder embeddings, which equals the
    cosine similarity when the encoder emits unit-length vectors.

    Returns:
        Mapping ``new_id -> [(existing_id, similarity), ...]`` sorted from
        most to least similar.
    """
    existing_vectors = encoder.predict(X_existing, verbose=0)
    new_vectors = encoder.predict(X_new, verbose=0)

    # Rows: new SKUs, columns: existing SKUs.
    similarity = new_vectors @ existing_vectors.T

    ranked: Dict[str, List[Tuple[str, float]]] = {}
    for row_number, new_id in enumerate(new_ids):
        scores = similarity[row_number]
        # Negate so argsort's ascending order yields the largest scores first.
        best_columns = np.argsort(-scores)[:top_k]
        ranked[new_id] = [(existing_ids[col], float(scores[col])) for col in best_columns]
    return ranked



In [10]:
# =============================================================================
# 8. PROGRESSIVE EXAMPLES
# =============================================================================

def example_1_attribute_forecasting_only() -> None:
    """Example 1: cold-start forecasting from attributes alone.

    Trains the attribute forecaster on synthetic existing SKUs, forecasts the
    launch horizon for the new SKUs and prints, for the first three of them,
    the forecast total / weekly average and a fill-rate proxy for an opening
    inventory sized as a fixed share of the forecast.

    The horizon and inventory-share labels in the printed report are derived
    from the values actually used, so they stay correct when
    ``LAUNCH_HORIZON_WEEKS`` or the coverage share is changed.
    """
    # Opening inventory as a share of the total forecast (demo assumption).
    inventory_coverage = 0.7

    existing_attrs, existing_demand, new_attrs = build_synthetic_dataset(
        n_existing_skus=N_EXISTING_SKUS,
        n_new_skus=N_NEW_SKUS,
        weeks=LAUNCH_HORIZON_WEEKS,
    )

    # Vocabularies cover existing and new SKUs so every new value has an index.
    all_attrs = existing_attrs + new_attrs
    vocab_category = build_vocab([a.category for a in all_attrs])
    vocab_pack = build_vocab([a.pack_size for a in all_attrs])
    vocab_brand = build_vocab([a.brand_tier for a in all_attrs])

    X_existing = encode_attributes(existing_attrs, vocab_category, vocab_pack, vocab_brand)
    y_existing = demand_targets(existing_demand)
    # The forecaster is trained on log1p(demand).
    y_existing_log = np.log1p(y_existing).astype(np.float32)

    X_new = encode_attributes(new_attrs, vocab_category, vocab_pack, vocab_brand)

    encoder = build_product_encoder(
        n_categories=len(vocab_category),
        n_packs=len(vocab_pack),
        n_brands=len(vocab_brand),
        numeric_dim=NUMERIC_DIM,
        embedding_dim_out=16,
    )
    forecaster = build_attribute_forecaster(encoder, horizon_weeks=LAUNCH_HORIZON_WEEKS)

    forecaster.fit(
        X_existing,
        y_existing_log,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS_BASE,
        validation_split=0.15,
        verbose=0,
    )

    preds_new = predict_weekly_demand_from_log(forecaster, X_new)

    print("\n=== Example 1: Attribute-based forecasting (cold-start) ===")
    for i, attrs in enumerate(new_attrs[:3]):
        weekly = preds_new[i]
        total = float(np.sum(weekly))
        avg = float(np.mean(weekly))

        print(f"\nNew SKU: {attrs.sku_id} | {attrs.category}/{attrs.brand_tier}/{attrs.pack_size}")
        print(
            f"  Forecast total ({LAUNCH_HORIZON_WEEKS}w): {total:,.0f} units | "
            f"Avg weekly: {avg:,.0f}"
        )

        initial_inventory = inventory_coverage * total
        fill_rate = estimate_launch_fill_rate(weekly, initial_inventory=initial_inventory)
        print(
            f"  Launch fill-rate proxy (inventory={inventory_coverage:.0%} of forecast): "
            f"{fill_rate:.2%}"
        )


def example_2_transfer_learning_freeze_encoder() -> None:
    """Example 2: pre-train end-to-end, then freeze the encoder and tune the head.

    After the initial training run the encoder is frozen, the model is
    re-compiled with a smaller learning rate, and only the forecasting head
    is fine-tuned on a random 120-SKU subset. Prints the forecast total per
    new SKU.
    """
    hist_attrs, hist_demand, launch_attrs = build_synthetic_dataset(
        n_existing_skus=500,
        n_new_skus=3,
        weeks=LAUNCH_HORIZON_WEEKS,
    )

    # Vocabularies span existing and new SKUs alike.
    every_sku = hist_attrs + launch_attrs
    category_vocab = build_vocab([sku.category for sku in every_sku])
    pack_vocab = build_vocab([sku.pack_size for sku in every_sku])
    brand_vocab = build_vocab([sku.brand_tier for sku in every_sku])

    train_X = encode_attributes(hist_attrs, category_vocab, pack_vocab, brand_vocab)
    train_y = demand_targets(hist_demand)
    train_y_log = np.log1p(train_y).astype(np.float32)

    launch_X = encode_attributes(launch_attrs, category_vocab, pack_vocab, brand_vocab)

    product_encoder = build_product_encoder(
        n_categories=len(category_vocab),
        n_packs=len(pack_vocab),
        n_brands=len(brand_vocab),
        numeric_dim=NUMERIC_DIM,
        embedding_dim_out=16,
    )
    demand_model = build_attribute_forecaster(product_encoder, horizon_weeks=LAUNCH_HORIZON_WEEKS)

    # Stage 1: train encoder and head together on the full history.
    demand_model.fit(train_X, train_y_log, batch_size=BATCH_SIZE, epochs=10, verbose=0)

    # Stage 2: freeze the encoder; the re-compile makes the new trainable
    # flags and the lower learning rate take effect.
    product_encoder.trainable = False
    demand_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
        loss="mae",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")],
    )

    # Small fine-tuning set: 120 SKUs drawn without replacement.
    subset = np.random.choice(len(hist_attrs), size=120, replace=False)
    small_X = {name: column[subset] for name, column in train_X.items()}
    small_y_log = train_y_log[subset]

    demand_model.fit(small_X, small_y_log, batch_size=BATCH_SIZE, epochs=6, verbose=0)

    launch_forecast = predict_weekly_demand_from_log(demand_model, launch_X)

    print("\n=== Example 2: Transfer learning (freeze encoder, fine-tune head) ===")
    for row, sku in enumerate(launch_attrs):
        print(
            f"{sku.sku_id}: total={np.sum(launch_forecast[row]):,.0f} | "
            f"cat={sku.category}, brand={sku.brand_tier}, pack={sku.pack_size}"
        )


def example_3_siamese_similarity_and_analog_forecast() -> None:
    """Example 3: Siamese similarity for analog SKUs, next to the model forecast.

    Steps:
    1. Pre-train the encoder through the forecasting task.
    2. Take the model forecast for the new SKUs.
    3. Fine-tune the same encoder inside a Siamese network on pseudo
       similarity labels derived from demand-curve shape.
    4. Retrieve the top-K most similar existing SKUs per new SKU and average
       their histories into an "analog" forecast.

    The model forecast (step 2) is taken before Siamese training because the
    forecaster shares its encoder layers with the Siamese network. After
    step 3 the encoder weights have moved while the forecasting head has
    not, so a later forecast would use a mismatched encoder/head pair.
    """
    existing_attrs, existing_demand, new_attrs = build_synthetic_dataset(
        n_existing_skus=450,
        n_new_skus=5,
        weeks=LAUNCH_HORIZON_WEEKS,
    )

    all_attrs = existing_attrs + new_attrs
    vocab_category = build_vocab([a.category for a in all_attrs])
    vocab_pack = build_vocab([a.pack_size for a in all_attrs])
    vocab_brand = build_vocab([a.brand_tier for a in all_attrs])

    X_existing = encode_attributes(existing_attrs, vocab_category, vocab_pack, vocab_brand)
    y_existing = demand_targets(existing_demand)
    y_existing_log = np.log1p(y_existing).astype(np.float32)

    X_new = encode_attributes(new_attrs, vocab_category, vocab_pack, vocab_brand)

    existing_ids = [a.sku_id for a in existing_attrs]
    new_ids = [a.sku_id for a in new_attrs]
    # One-off id -> row lookup instead of a linear list search per analog.
    row_of_existing = {sku_id: row for row, sku_id in enumerate(existing_ids)}

    # Pre-train encoder using forecasting task
    encoder = build_product_encoder(
        n_categories=len(vocab_category),
        n_packs=len(vocab_pack),
        n_brands=len(vocab_brand),
        numeric_dim=NUMERIC_DIM,
        embedding_dim_out=16,
    )
    forecaster = build_attribute_forecaster(encoder, horizon_weeks=LAUNCH_HORIZON_WEEKS)
    forecaster.fit(X_existing, y_existing_log, batch_size=BATCH_SIZE, epochs=10, verbose=0)

    # Forecast now, while the encoder still matches the trained head.
    preds_new = predict_weekly_demand_from_log(forecaster, X_new)

    # Siamese training on pseudo-labels (updates the shared encoder weights)
    labels = make_similarity_labels_from_demand(y_existing)
    pair_X, pair_y = sample_pairs(X_existing, labels, n_pairs=7000, positive_ratio=0.5)

    siamese = build_siamese_network(encoder, numeric_dim=NUMERIC_DIM)
    siamese.fit(pair_X, pair_y, batch_size=BATCH_SIZE, epochs=EPOCHS_SIAMESE, verbose=0)

    # Similar SKUs by embedding cosine similarity (uses the Siamese-tuned encoder)
    topk = top_k_similar_skus(
        encoder=encoder,
        X_existing=X_existing,
        X_new=X_new,
        existing_ids=existing_ids,
        new_ids=new_ids,
        top_k=TOP_K_SIMILARS,
    )

    print("\n=== Example 3: Siamese similarity + analog explanation ===")
    for i, attrs in enumerate(new_attrs[:3]):
        analog_idx = [row_of_existing[sid] for sid, _ in topk[attrs.sku_id]]

        # Analog forecast = plain average of the analog SKUs' actual histories.
        analog_curve = np.mean(y_existing[analog_idx], axis=0)
        model_curve = preds_new[i]

        print(f"\nNew SKU: {attrs.sku_id} | {attrs.category}/{attrs.brand_tier}/{attrs.pack_size}")
        print("  Top similar SKUs (id, cosine sim):")
        for sid, s in topk[attrs.sku_id]:
            print(f"   - {sid} | {s:.3f}")

        print(f"  Model forecast total:  {np.sum(model_curve):,.0f}")
        print(f"  Analog forecast total: {np.sum(analog_curve):,.0f}")


In [11]:
# =============================================================================
# 9. WHEN TO USE WHAT (recommendations)
# =============================================================================

def recommendations_when_to_use_which() -> None:
    """Print a short guide on which of the three techniques fits which situation."""
    report_lines = (
        "\n=== Recommendations: when to use what ===",
        "1) Attribute NN: best default for scalable cold-start forecasting.",
        "2) Transfer learning: best when new data is small or market shifts.",
        "3) Siamese similarity: best for analog explanation and portfolio logic.",
    )
    for report_line in report_lines:
        print(report_line)

In [12]:
# =============================================================================
# 10. DEMO / BUILT-IN CHECKS (run examples end-to-end)
# =============================================================================

if __name__ == "__main__":
    # Seed every RNG once, then run the demos in lesson order. The order
    # matters: each demo consumes random numbers from the shared global streams.
    set_global_determinism(SEED)

    for run_demo in (
        example_1_attribute_forecasting_only,
        example_2_transfer_learning_freeze_encoder,
        example_3_siamese_similarity_and_analog_forecast,
        recommendations_when_to_use_which,
    ):
        run_demo()

    # Try-yourself tasks:
    # - TODO: Change TOP_K_SIMILARS and check analog stability.
    # - TODO: Replace pseudo similarity labels with rule-based labels (category + price band).
    # - TODO: Add a new categorical attribute (e.g., "channel") and re-train models.



=== Example 1: Attribute-based forecasting (cold-start) ===

New SKU: NEW_000 | Snack/Core/M
  Forecast total (12w): 459 units | Avg weekly: 38
  Launch fill-rate proxy (inventory=70% of forecast): 70.00%

New SKU: NEW_001 | HomeCare/Premium/S
  Forecast total (12w): 463 units | Avg weekly: 39
  Launch fill-rate proxy (inventory=70% of forecast): 70.00%

New SKU: NEW_002 | Beverage/Premium/M
  Forecast total (12w): 463 units | Avg weekly: 39
  Launch fill-rate proxy (inventory=70% of forecast): 70.00%

=== Example 2: Transfer learning (freeze encoder, fine-tune head) ===
NEW_000: total=40 | cat=PersonalCare, brand=Core, pack=M
NEW_001: total=41 | cat=Snack, brand=Premium, pack=M
NEW_002: total=41 | cat=Snack, brand=Premium, pack=S

=== Example 3: Siamese similarity + analog explanation ===

New SKU: NEW_000 | Dairy/Premium/M
  Top similar SKUs (id, cosine sim):
   - SKU_0067 | 0.999
   - SKU_0230 | 0.997
   - SKU_0441 | 0.995
   - SKU_0012 | 0.995
   - SKU_0386 | 0.994
  Model foreca