# PantryPalML: A2 Cloud Demo (Colab/Binder)

This notebook runs the full pipeline in a cloud runtime for the A2 submission:

- Environment setup (pip install, repo clone if needed)
- Load small sample CSVs from the repo (no external DB)
- Train LightGBM model (fast config)
- Evaluate on held-out split
- Demo inference: get top-N recommendations for a sample user

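Run: Runtime → Run all (Colab). On Binder or a local Jupyter, use Run → Run All Cells; note that the setup cell only installs dependencies and clones the repo when it detects Colab.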


In [None]:
# If running on Colab, install dependencies and clone repo without IPython magics
import sys, subprocess, os, pathlib

# True only when the google.colab module has been loaded into this interpreter.
IN_COLAB = "google.colab" in sys.modules
repo_root = pathlib.Path.cwd()

if IN_COLAB:
    # Install required packages. This stays best-effort (the notebook continues
    # either way), but a failed install is now reported: with check=False a
    # non-zero pip exit code raises nothing, so it has to be inspected by hand.
    try:
        pip_result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q",
             "lightgbm", "pandas", "numpy", "scikit-learn", "matplotlib", "seaborn"],
            check=False,
        )
        if pip_result.returncode != 0:
            print(f"pip install warning: pip exited with code {pip_result.returncode}")
    except Exception as e:
        # Raised when the subprocess itself could not be launched.
        print(f"pip install warning: {e}")

    # Clone the repo if we are not already inside it.
    if not (repo_root / "recipe_recommender").exists():
        clone_dir = repo_root / "PantryPalML"
        # A checkout left behind by an earlier run must be reused: `git clone`
        # refuses to write into an existing non-empty directory, and check=True
        # would turn that refusal into a CalledProcessError on every re-run.
        if not clone_dir.exists():
            subprocess.run(["git", "clone", "-q", "https://github.com/marcel-qayoom-taylor/PantryPalML.git"], check=True)
        os.chdir(clone_dir)
        repo_root = pathlib.Path.cwd()

print(f"Environment ready. Project root: {repo_root}")


In [None]:
# Fast config: paths to sample CSVs included in repo
from pathlib import Path
import pandas as pd

# Resolve the project root: either the current directory is the repo itself,
# or the repo was cloned into a "PantryPalML" sub-directory of it.
ROOT = Path.cwd()
if (ROOT / "notebooks").exists() and (ROOT / "recipe_recommender").exists():
    PROJECT_ROOT = ROOT
elif (ROOT / "PantryPalML").exists():
    PROJECT_ROOT = ROOT / "PantryPalML"
else:
    PROJECT_ROOT = ROOT

OUTPUT_DIR = PROJECT_ROOT / "recipe_recommender" / "output"

# Load small sample CSVs (fallback to generated minimal frames if missing)
train_path = OUTPUT_DIR / "hybrid_train_data.csv"
val_path = OUTPUT_DIR / "hybrid_val_data.csv"
test_path = OUTPUT_DIR / "hybrid_test_data.csv"

if all(p.exists() for p in [train_path, val_path, test_path]):
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
else:
    # Minimal synthetic fallback for the demo.
    import numpy as np
    rng = np.random.default_rng(42)

    # Generate one pool and cut it into DISJOINT splits. Sampling val/test out
    # of the training rows would make the "held-out" metrics measure rows the
    # model has already seen.
    n_train, n_val, n_test = 100, 20, 20
    n_total = n_train + n_val + n_test
    synthetic_df = pd.DataFrame({
        "user_id": [f"u{i}" for i in range(n_total)],
        "recipe_id": [f"r{i % 20}" for i in range(n_total)],
        "label": rng.integers(0, 2, size=n_total),
        "avg_rating": rng.uniform(2.0, 5.0, size=n_total),
        "ingredient_count": rng.integers(3, 15, size=n_total),
        "complexity_score": rng.uniform(1.0, 10.0, size=n_total),
    })
    # Shuffle once with a fixed seed so the split is reproducible.
    shuffled_df = synthetic_df.sample(frac=1.0, random_state=1).reset_index(drop=True)
    train_df = shuffled_df.iloc[:n_train].reset_index(drop=True)
    val_df = shuffled_df.iloc[n_train:n_train + n_val].reset_index(drop=True)
    test_df = shuffled_df.iloc[n_train + n_val:].reset_index(drop=True)

print(train_df.shape, val_df.shape, test_df.shape)


In [None]:
# Train a small LightGBM model (fast settings)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# Every column that is not an identifier, the target, or the timestamp is
# treated as a model feature.
feature_cols = [
    col
    for col in train_df.columns
    if col not in {"user_id", "recipe_id", "label", "datetime"}
]

X_train = train_df[feature_cols]
y_train = train_df["label"]
X_val = val_df[feature_cols]
y_val = val_df["label"]

train_set = lgb.Dataset(X_train, label=y_train)
val_set = lgb.Dataset(X_val, label=y_val)

# Small, quick-to-train binary classifier configuration.
params = dict(
    objective="binary",
    metric=["auc"],
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=10,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    verbosity=-1,
)

# Train for at most 100 rounds, stopping once the validation metric has not
# improved for 10 consecutive rounds.
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=100,
    valid_sets=[val_set],
    valid_names=["val"],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

# Predicted scores on the validation split, then hard labels at a 0.5 cut-off.
val_pred = model.predict(X_val)
val_label = (val_pred >= 0.5).astype(int)

# AUC uses the raw scores; the remaining metrics use the thresholded labels.
print({
    "AUC": float(roc_auc_score(y_val, val_pred)),
    **{
        metric_name: float(metric_fn(y_val, val_label, zero_division=0))
        for metric_name, metric_fn in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
})


In [None]:
# Demo inference: score and rank recipes for a sample user
import numpy as np

def recommend_for_user(user_id: str, candidate_df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Score candidate rows with the trained model and return the best recipes.

    The score is not personalised: ``user_id`` is only stamped onto the output
    rows. In a full system, per-user features would be built before scoring.

    Args:
        user_id: Identifier written into the ``user_id`` column of the result.
        candidate_df: Frame containing ``recipe_id`` and every column listed in
            the notebook-level ``feature_cols``.
        top_n: Maximum number of recipes to return; must be >= 0.

    Returns:
        A frame with columns ``recipe_id``, ``score`` and ``user_id``, sorted by
        descending score, holding at most ``top_n`` distinct recipes.

    Raises:
        ValueError: If ``top_n`` is negative.
        KeyError: If ``candidate_df`` lacks ``recipe_id`` or a feature column.
    """
    # A negative n makes DataFrame.head() return "all but the last n" rows,
    # which is never what a top-N request means.
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    out = candidate_df[["recipe_id"]].copy()
    if len(candidate_df) == 0:
        # Nothing to score: skip the model call and return an empty result that
        # still carries the expected columns.
        out["score"] = pd.Series(dtype="float64")
    else:
        out["score"] = model.predict(candidate_df[feature_cols])
    out["user_id"] = user_id

    ranked = out.sort_values("score", ascending=False, kind="mergesort")  # stable: ties keep input order
    # The same recipe can appear in several candidate rows; keep only its
    # best-scoring row so one recipe cannot occupy multiple top-N slots.
    return ranked.drop_duplicates(subset="recipe_id", keep="first").head(top_n)

# Placeholder user for the demo; the test split doubles as the candidate pool.
sample_user = "demo_user"
recs = recommend_for_user(user_id=sample_user, candidate_df=test_df, top_n=10)
# Last expression of the cell, so the notebook renders it as a table.
recs.head(10)


In [None]:
# Smoke test: end-to-end checks
import numpy as np

def smoke_test():
    """Run end-to-end sanity checks on the notebook's data, model and recommender.

    Reads the notebook-level names ``train_df``, ``val_df``, ``test_df``,
    ``feature_cols``, ``model`` and ``recommend_for_user``. Every failed check
    is collected so that all problems are reported in a single run.

    Raises:
        AssertionError: If any check failed; the message lists the failures.
    """
    errors = []

    # Basic dataset checks
    try:
        frames = [("train_df", train_df), ("val_df", val_df), ("test_df", test_df)]
    except NameError as e:
        frames = []
        errors.append(f"Dataframes not defined: {e}")
    for name, df in frames:
        if df is None or len(df) == 0:
            errors.append(f"{name} is empty or not loaded")

    # Feature column checks. All three splits are inspected: predictions run on
    # val_df and recommendations on test_df, so a schema mismatch there should
    # be reported by name rather than as an opaque prediction failure.
    try:
        for name, df in frames:
            if df is None:
                continue
            missing = [c for c in feature_cols if c not in df.columns]
            if missing:
                errors.append(f"Missing feature columns in {name}: {missing[:5]} ...")

            # is_numeric_dtype accepts bool columns and copes with pandas
            # extension dtypes, unlike np.issubdtype(dtype, np.number).
            non_numeric = [
                c for c in feature_cols
                if c in df.columns and not pd.api.types.is_numeric_dtype(df[c])
            ]
            if non_numeric:
                errors.append(f"Non-numeric features found in {name}: {non_numeric[:5]} ...")
    except Exception as e:
        errors.append(f"Feature column validation failed: {e}")

    # Model prediction checks
    try:
        _val_pred = model.predict(val_df[feature_cols])
        if len(_val_pred) != len(val_df):
            errors.append("Prediction length mismatch with validation data")
        if not np.all(np.isfinite(_val_pred)):
            errors.append("Non-finite values in predictions")
    except Exception as e:
        errors.append(f"Model prediction failed: {e}")

    # Recommendation output checks
    try:
        _recs = recommend_for_user("smoke_user", test_df.head(20).copy(), top_n=5)
        required_cols = {"user_id", "recipe_id", "score"}
        if not required_cols.issubset(set(_recs.columns)):
            errors.append(f"Recommendation output missing columns: {required_cols - set(_recs.columns)}")
        if len(_recs) == 0:
            errors.append("No recommendations returned")
    except Exception as e:
        errors.append(f"Recommendation function failed: {e}")

    if errors:
        print("SMOKE TEST: FAIL")
        for err in errors:
            print(" -", err)
        # Carry the details in the exception too, so they survive when the
        # cell's stdout is not captured.
        raise AssertionError("Smoke test failed: " + "; ".join(errors))

    print("SMOKE TEST: PASS")
    print(f"Train/Val/Test sizes: {len(train_df)}, {len(val_df)}, {len(test_df)}")
    print(f"Features used: {len(feature_cols)}")

# Run the checks now; smoke_test() raises AssertionError if any check recorded an error.
smoke_test()
