# PantryPalML: A2 Cloud Demo (Colab/Binder)

This notebook runs the full pipeline in a cloud runtime for the A2 submission:

- Environment setup (pip install, repo clone if needed)
- Load small sample CSVs from the repo (no external DB)
- Train LightGBM model (fast config)
- Evaluate on held-out split
- Demo inference: get top-N recommendations for a sample user

Run: Runtime → Run all.


## Environment Setup


### Env Setup

In [10]:
# If running on Colab, install dependencies and clone repo without IPython magics
import sys, subprocess, os, pathlib

IN_COLAB = "google.colab" in sys.modules
repo_root = pathlib.Path.cwd()

if IN_COLAB:
    # Install required packages
    try:
        subprocess.run([sys.executable, "-m", "pip", "install", "-q",
                        "lightgbm", "pandas", "numpy", "scikit-learn", "matplotlib", "seaborn"],
                       check=False)
    except Exception as e:
        print(f"pip install warning: {e}")

    # Clone the repo if not present
    if not (repo_root / "recipe_recommender").exists():
        subprocess.run(["git", "clone", "-q", "https://github.com/marcel-qayoom-taylor/PantryPalML.git"], check=True)
        os.chdir("PantryPalML")
        repo_root = pathlib.Path.cwd()

print(f"Environment ready. Project root: {repo_root}")


Environment ready. Project root: /Users/marcelqayoomtaylor/Documents/GitHub/PantryPalML/notebooks


### Source Data

In [11]:
# Fast config: paths to sample CSVs included in repo
from pathlib import Path
import pandas as pd

ROOT = Path.cwd()
# If we're in the notebooks directory, go up one level to find the project root
if ROOT.name == "notebooks" and (ROOT.parent / "recipe_recommender").exists():
    PROJECT_ROOT = ROOT.parent
elif (ROOT / "notebooks").exists() and (ROOT / "recipe_recommender").exists():
    PROJECT_ROOT = ROOT
elif (ROOT / "PantryPalML").exists():
    PROJECT_ROOT = ROOT / "PantryPalML"
else:
    PROJECT_ROOT = ROOT

print(f"Project root set to: {PROJECT_ROOT}")

OUTPUT_DIR = PROJECT_ROOT / "recipe_recommender" / "output"

# Load small sample CSVs (fallback to generated minimal frames if missing)
train_path = OUTPUT_DIR / "hybrid_train_data.csv"
val_path = OUTPUT_DIR / "hybrid_val_data.csv"
test_path = OUTPUT_DIR / "hybrid_test_data.csv"
print("Checking for dataframes")
print(train_path)

if all(p.exists() for p in [train_path, val_path, test_path]):
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
else:
    # Minimal fallback for demo
    import numpy as np
    print("Generating minimal fallback dataframes")
    rng = np.random.default_rng(42)
    cols = ["user_id", "recipe_id", "label", "avg_rating", "ingredient_count", "complexity_score"]
    train_df = pd.DataFrame({
        "user_id": [f"u{i}" for i in range(100)],
        "recipe_id": [f"r{i%20}" for i in range(100)],
        "label": rng.integers(0, 2, size=100),
        "avg_rating": rng.uniform(2.0, 5.0, size=100),
        "ingredient_count": rng.integers(3, 15, size=100),
        "complexity_score": rng.uniform(1.0, 10.0, size=100),
    })[cols]
    val_df = train_df.sample(frac=0.2, random_state=1).reset_index(drop=True)
    test_df = train_df.sample(frac=0.2, random_state=2).reset_index(drop=True)

print(train_df.shape, val_df.shape, test_df.shape)


Project root set to: /Users/marcelqayoomtaylor/Documents/GitHub/PantryPalML
Checking for dataframes
/Users/marcelqayoomtaylor/Documents/GitHub/PantryPalML/recipe_recommender/output/hybrid_train_data.csv
(12438, 40) (4146, 40) (4146, 40)


### Train Model

In [18]:
# Train a small LightGBM model (fast settings)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import numpy as np

# Check class balance first
print(f"Class distribution:")
print(f"Positive (label=1): {train_df['label'].sum()} ({train_df['label'].mean():.2%})")
print(f"Negative (label=0): {(train_df['label'] == 0).sum()} ({(train_df['label'] == 0).mean():.2%})")

# Select numeric features and remove likely leaky features
exclude_cols = ["user_id", "recipe_id", "label", "datetime"]
# Remove likely leaky features that might directly relate to the target
leaky_features = ["rating", "total_rating", "avg_rating", "rating_std", "total_interactions"]
exclude_cols.extend(leaky_features)

numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in exclude_cols]

print(f"\nUsing {len(feature_cols)} numeric features for training:")
print(feature_cols[:10], "..." if len(feature_cols) > 10 else "")

X_train, y_train = train_df[feature_cols], train_df["label"]
X_val, y_val = val_df[feature_cols], val_df["label"]

# Check if we have any features left
if len(feature_cols) == 0:
    print("WARNING: No features available after filtering!")
    # Use minimal features for demo
    feature_cols = ["unique_recipes", "activity_days", "prep_time"][:3]  # Use first available
    X_train, y_train = train_df[feature_cols], train_df["label"]
    X_val, y_val = val_df[feature_cols], val_df["label"]

train_set = lgb.Dataset(X_train, label=y_train)
val_set = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "binary",
    "metric": ["auc"],
    "learning_rate": 0.1,
    "num_leaves": 31,
    "min_data_in_leaf": 10,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 1,
    "verbosity": -1,
}

model = lgb.train(
    params,
    train_set,
    num_boost_round=100,
    valid_sets=[val_set],
    valid_names=["val"],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

# Get predictions
val_pred = model.predict(X_val)
print(f"\nPrediction stats: min={val_pred.min():.4f}, mean={val_pred.mean():.4f}, max={val_pred.max():.4f}")



# Fixed production threshold chosen via ROC curve (Youden's J) on validation set
MODEL_THRESHOLD = 0.0475
val_label_fixed = (val_pred >= MODEL_THRESHOLD).astype(int)

print(f"\nResults with fixed threshold = {MODEL_THRESHOLD:.4f} (from ROC curve / Youden's J on validation set):")
print({
    "AUC": float(roc_auc_score(y_val, val_pred)),
    "Precision": float(precision_score(y_val, val_label_fixed, zero_division=0)),
    "Recall": float(recall_score(y_val, val_label_fixed, zero_division=0)),
    "F1": float(f1_score(y_val, val_label_fixed, zero_division=0)),
})


Class distribution:
Positive (label=1): 2488 (20.00%)
Negative (label=0): 9950 (80.00%)

Using 20 numeric features for training:
['unique_recipes', 'activity_days', 'interactions_per_day', 'engagement_score', 'prep_time', 'cook_time', 'total_time', 'servings', 'author_id', 'instruction'] ...

Prediction stats: min=0.0297, mean=0.2007, max=0.8890

Results with fixed threshold = 0.0475 (from ROC curve / Youden's J on validation set):
{'AUC': 0.9994948710684768, 'Precision': 0.8962162162162162, 'Recall': 1.0, 'F1': 0.9452679589509693}


### Perform Inference on a Sample User

In [21]:
# Demo inference: score and rank recipes for a sample user
def recommend_for_user(user_id: str, candidate_df: pd.DataFrame, top_n: int = 5):
    # In a full system, we'd build personalized features; here we score with trained model
    X = candidate_df[feature_cols]
    scores = model.predict(X)
    out = candidate_df[["recipe_id"]].copy()
    out["score"] = scores
    out["user_id"] = user_id
    return out.sort_values("score", ascending=False).head(top_n)

sample_user = "demo_user"
# Use test_df as candidates for demo
recs = recommend_for_user(sample_user, test_df, top_n=10)
recs.head(10)


Unnamed: 0,recipe_id,score,user_id
1714,B90D1B38-A110-4488-9976-673EB98BB878,0.889048,demo_user
3840,28BA7C9C-F2D9-446A-B5EF-CDA07AC97F7D,0.889048,demo_user
3564,3D52259D-B5A1-4805-A464-6D45A478BCEC,0.889048,demo_user
1193,22BA225A-FC6B-46E4-895F-C95FC01074BF,0.889048,demo_user
3836,EA49A478-2600-424D-8C77-40D1B3C6F263,0.889048,demo_user
3838,4A604C96-D440-4F0D-883E-6735E5DB0F18,0.889048,demo_user
2813,D9746C0A-8C57-4327-8420-743ABD422E8C,0.889048,demo_user
520,5FA20A04-7FFF-4F62-8038-E4B5487A7A15,0.889048,demo_user
3560,0C3BC142-6A8E-4EBD-944D-D22DADF1D4F5,0.889048,demo_user
525,F93C37F8-02D2-4A8D-B6F0-5BDF0CEC1C2E,0.889048,demo_user


### Test

In [22]:
# Smoke test: end-to-end checks
import numpy as np

def smoke_test():
    errors = []

    # Basic dataset checks
    try:
        for name, df in [("train_df", train_df), ("val_df", val_df), ("test_df", test_df)]:
            if df is None or len(df) == 0:
                errors.append(f"{name} is empty or not loaded")
    except NameError as e:
        errors.append(f"Dataframes not defined: {e}")

    # Feature column checks
    try:
        missing = [c for c in feature_cols if c not in train_df.columns]
        if missing:
            errors.append(f"Missing feature columns in train_df: {missing[:5]} ...")

        non_numeric = [c for c in feature_cols if not np.issubdtype(train_df[c].dtype, np.number)]
        if non_numeric:
            errors.append(f"Non-numeric features found: {non_numeric[:5]} ...")
    except Exception as e:
        errors.append(f"Feature column validation failed: {e}")

    # Model prediction checks
    try:
        _val_pred = model.predict(val_df[feature_cols])
        if len(_val_pred) != len(val_df):
            errors.append("Prediction length mismatch with validation data")
        if not np.all(np.isfinite(_val_pred)):
            errors.append("Non-finite values in predictions")
    except Exception as e:
        errors.append(f"Model prediction failed: {e}")

    # Recommendation output checks
    try:
        _recs = recommend_for_user("smoke_user", test_df.head(20).copy(), top_n=5)
        required_cols = {"user_id", "recipe_id", "score"}
        if not required_cols.issubset(set(_recs.columns)):
            errors.append(f"Recommendation output missing columns: {required_cols - set(_recs.columns)}")
        if len(_recs) == 0:
            errors.append("No recommendations returned")
    except Exception as e:
        errors.append(f"Recommendation function failed: {e}")

    if errors:
        print("SMOKE TEST: FAIL")
        for err in errors:
            print(" -", err)
        raise AssertionError("Smoke test failed")
    else:
        print("SMOKE TEST: PASS")
        print(f"Train/Val/Test sizes: {len(train_df)}, {len(val_df)}, {len(test_df)}")
        print(f"Features used: {len(feature_cols)}")

smoke_test()


SMOKE TEST: PASS
Train/Val/Test sizes: 12438, 4146, 4146
Features used: 20
