# Sales Forecast Demo

This notebook demonstrates:
- Querying recent data from BigQuery
- Calling the deployed Vertex AI Endpoint for online prediction
- Evaluating the deployed Endpoint on the validation window (last 20% of the data)



In [None]:
import os
from google.cloud import bigquery, aiplatform
import pandas as pd

# Connection settings; each one can be overridden through an environment variable.
PROJECT_ID = os.getenv("PROJECT_ID", "my-forecast-project-18870")
REGION = os.getenv("REGION", "us-central1")
DATASET = os.getenv("DATASET", "gk2_takeaway_sales")
FINAL_TABLE = os.getenv("FINAL_TABLE", "daily_sales_features")

client = bigquery.Client(project=PROJECT_ID)

# Fetch the 30 most recent rows, newest first.
recent_rows_sql = f"""
SELECT * FROM `{PROJECT_ID}.{DATASET}.{FINAL_TABLE}`
ORDER BY Date DESC
LIMIT 30
"""
query_job = client.query(recent_rows_sql)
df = query_job.to_dataframe()

# The frame is sorted newest-first, so the tail shows the three oldest of the
# 30 fetched rows.
df.tail(3)


In [None]:
# Smoke-test the deployed endpoint. ENDPOINT_ID must be exported before the
# kernel starts; it may be a bare id or a full "projects/..." resource name.
ENDPOINT_ID = os.getenv("ENDPOINT_ID")
assert ENDPOINT_ID, "Set ENDPOINT_ID in environment to your deployed endpoint id"

aiplatform.init(project=PROJECT_ID, location=REGION)

# Expand a bare id into the fully qualified resource name.
if ENDPOINT_ID.startswith("projects/"):
    endpoint_name = ENDPOINT_ID
else:
    endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_ID}"
endpoint = aiplatform.Endpoint(endpoint_name)

# Placeholder payload: replace the keys/values with the features the model expects.
instance = {"feature1": 1.0, "feature2": 2.0}

pred = endpoint.predict([instance])
pred.predictions


## Evaluate on last 20% (validation) via Vertex Endpoint

This section:
- Loads the full table from BigQuery
- Rebuilds features exactly as in training
- Splits by time (80/20) to get the same validation window
- Sends the validation features to the deployed Endpoint in batches
- Computes RMSE/MAE/R² against ground truth


In [None]:
import os
import sys
import pathlib
import math
from typing import Any, Dict, List

import numpy as np
import pandas as pd
from google.cloud import bigquery, aiplatform
from sklearn.metrics import mean_absolute_error, r2_score
try:
    from sklearn.metrics import root_mean_squared_error
except ImportError:  # older scikit-learn releases do not provide root_mean_squared_error
    def root_mean_squared_error(y_true, y_pred):
        """Return the root mean squared error between targets and predictions.

        Fallback used only when scikit-learn does not export
        ``root_mean_squared_error``. It is computed directly with NumPy so it
        does not depend on a second import from the package that just failed
        to provide the function.

        Args:
            y_true: Array-like of ground-truth values.
            y_pred: Array-like of predicted values, same shape as ``y_true``.

        Returns:
            The RMSE as a built-in ``float``.

        Raises:
            ValueError: If the inputs differ in shape or are empty.
        """
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        # Reject mismatched inputs explicitly; NumPy broadcasting could
        # otherwise silently combine e.g. shapes (n,) and (1,).
        if actual.shape != predicted.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
            )
        if actual.size == 0:
            raise ValueError("y_true and y_pred must not be empty")
        return float(np.sqrt(np.mean(np.square(actual - predicted))))

# Make project modules importable
# NOTE(review): this treats the kernel's current working directory as the
# repository root (the folder that contains `src/`) — confirm how the notebook
# is launched, otherwise the `sales_forecast` imports below will fail.
repo_root = pathlib.Path.cwd()
src_path = repo_root / "src"
# Guarded so that re-running the cell does not append the same entry twice.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from sales_forecast.config import config
from sales_forecast.features import generate_features, prepare_dataset_for_modeling

# Runtime settings: an environment variable wins; otherwise the value comes
# from the project config (REGION falls back to a literal default).
PROJECT_ID = os.getenv("PROJECT_ID", config.bq.PROJECT_ID)
REGION = os.getenv("REGION", "us-central1")
DATASET = os.getenv("DATASET", config.bq.DATASET)
FINAL_TABLE = os.getenv("FINAL_TABLE", config.bq.FINAL_TABLE)

# Endpoint to evaluate: ENDPOINT_ID from the environment, else the inline id.
ENDPOINT_ID = os.getenv("ENDPOINT_ID", "5630246102608379904")
assert ENDPOINT_ID, "Set ENDPOINT_ID or provide it inline"

# BigQuery client and Vertex AI SDK initialisation.
bq = bigquery.Client(project=PROJECT_ID)
aiplatform.init(project=PROJECT_ID, location=REGION)

# Accept either a bare endpoint id or a full "projects/..." resource name.
if ENDPOINT_ID.startswith("projects/"):
    endpoint_name = ENDPOINT_ID
else:
    endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{ENDPOINT_ID}"
endpoint = aiplatform.Endpoint(endpoint_name)

# Pull the whole table, oldest row first, as input for feature generation.
full_table_sql = f"""
SELECT * FROM `{PROJECT_ID}.{DATASET}.{FINAL_TABLE}`
ORDER BY Date ASC
"""
raw_df = bq.query(full_table_sql).to_dataframe()

# Make sure Date is a pandas datetime column.
raw_df["Date"] = pd.to_datetime(raw_df["Date"])
raw_df.head(3)


In [None]:
# Rebuild features exactly as in training
# NOTE(review): generate_features comes from sales_forecast.features; the
# columns it adds and whether it drops rows are not visible here — confirm it
# is the same routine the training pipeline uses.
featured_df = generate_features(raw_df)

# Time-based 80/20 split to replicate training's validation window
# raw_df was loaded in ascending Date order, so the last 20% of rows are the
# most recent ones, assuming generate_features keeps row order — TODO confirm.
split_index = int(len(featured_df) * 0.8)
val_df = featured_df.iloc[split_index:].copy()

# Build model inputs (X) and ground truth (y)
X_val, y_val = prepare_dataset_for_modeling(val_df, config.features.TARGET_COL)
# Reset to a fresh 0..n-1 index so rows can be matched by position with the
# list of predictions produced later in the notebook.
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Sanity check: total featured rows vs. number of validation rows.
len(featured_df), len(X_val)


In [None]:
# Convert validation features to JSON-serializable instances

def to_native(v):
    """Convert a NumPy scalar to the equivalent built-in Python value.

    NumPy scalar types such as ``np.float32``, ``np.int64`` and ``np.bool_``
    are not accepted by the standard JSON encoder, so they are converted to
    ``float``, ``int`` and ``bool`` respectively before a request payload is
    built.

    Args:
        v: Any value taken from a feature record.

    Returns:
        A built-in ``bool``/``float``/``int`` when ``v`` is the matching NumPy
        scalar; otherwise ``v`` itself, unchanged.
    """
    # np.bool_ is neither np.integer nor a subclass of Python bool, so it
    # needs its own branch.
    if isinstance(v, np.bool_):
        return bool(v)
    if isinstance(v, np.floating):
        return float(v)
    if isinstance(v, np.integer):
        return int(v)
    return v

# One JSON-ready dict per validation row, with NumPy scalars converted to
# built-in Python values.
instances: List[Dict[str, Any]] = []
for record in X_val.to_dict(orient="records"):
    instances.append({name: to_native(value) for name, value in record.items()})

# Row count plus the first five feature names as a quick sanity check.
len(instances), list(instances[0])[:5]


In [None]:
# Predict in batches to avoid request size limits

def predict_in_batches(endpoint, instances: List[Dict[str, Any]], batch_size: int = 100) -> List[float]:
    """Send instances to an endpoint in fixed-size chunks and collect scalar predictions.

    Args:
        endpoint: Object exposing ``predict(instances)`` whose result has a
            ``predictions`` attribute that can be iterated.
        instances: Request payloads, one dict per row.
        batch_size: Maximum number of instances per ``predict`` call; must be
            at least 1.

    Returns:
        One float per returned prediction, in request order. An empty
        ``instances`` list yields an empty list without calling the endpoint.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative value would
            otherwise produce an empty result without any request being sent).
        TypeError: If a prediction is a sequence that does not hold exactly one
            value, or cannot be converted to ``float``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    preds: List[float] = []
    for start in range(0, len(instances), batch_size):
        chunk = instances[start:start + batch_size]
        resp = endpoint.predict(chunk)
        for p in resp.predictions:
            # A prediction may arrive as a bare value or wrapped as [value].
            if isinstance(p, (list, tuple)):
                if len(p) != 1:
                    raise TypeError(
                        f"Expected a scalar or single-element prediction, got {len(p)} values: {p!r}"
                    )
                p = p[0]
            preds.append(float(p))
    return preds

# Score every validation instance through the endpoint, 100 per request.
preds = predict_in_batches(endpoint, instances, batch_size=100)
# Number of predictions received, plus a peek at the first three values.
len(preds), preds[:3]


In [None]:
# Compute metrics against ground truth (validation window)
import pandas as pd

# Every validation row needs exactly one prediction before scoring.
assert len(preds) == len(y_val), "Mismatch between predictions and labels"

# Materialise both vectors once and reuse them for all three metrics.
_actual = y_val.values
_predicted = np.array(preds)
rmse = root_mean_squared_error(_actual, _predicted)
mae = mean_absolute_error(_actual, _predicted)
r2 = r2_score(_actual, _predicted)

print(dict(rmse=float(rmse), mae=float(mae), r2=float(r2)))

# Side-by-side view of the last ten validation rows: date, actual, predicted.
_val_dates = val_df["Date"].reset_index(drop=True)
preview = pd.DataFrame({"Date": _val_dates, "actual": y_val, "pred": preds}).iloc[-10:]
preview
