In [None]:
!pip install pandas scikit-learn xgboost sentence-transformers torch

In [None]:
import json

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [None]:
# Load dataset: read the JSON file at DATA_PATH and turn it into a DataFrame.
DATA_PATH = "menu_dataset.json"  # 👈 replace with your dataset path

with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Check column availability *before* reporting success.
# "course" and "veg" are included because generate_menu() selects them when it
# formats its result; validating them here makes a bad dataset fail immediately
# instead of after the classifier and the embeddings have been computed.
required_cols = [
    "dish_name",
    "cuisine_type",
    "occasion_tags",
    "description",
    "course",
    "veg",
]
# Collect every missing column so the user can fix the dataset in one pass.
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required column(s): {', '.join(missing_cols)}")

print("✅ Dataset loaded successfully!")
print(df.head())


In [None]:
# Prepare features and labels.
# Both columns are cast to str so TF-IDF always receives text and the labels
# are plain cuisine names.
X = df["occasion_tags"].astype(str)
y = df["cuisine_type"].astype(str)

# Train-test split (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


class StringLabelClassifier(ClassifierMixin, BaseEstimator):
    """Wrap a classifier that needs integer labels so it can use string labels.

    Current XGBoost releases require class labels to be the integers
    0..n_classes-1 and raise at ``fit`` when given cuisine names. This wrapper
    encodes the labels before fitting and decodes predictions afterwards, so
    the surrounding pipeline keeps accepting and returning cuisine strings.

    NOTE: the class is defined in the notebook, so a pickled pipeline that
    contains it can only be unpickled where a class with the same name is
    defined in ``__main__`` (or importable from the same module path).

    Parameters
    ----------
    estimator : scikit-learn compatible classifier
        Unfitted classifier; a clone of it is fitted on the encoded labels.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y):
        """Fit the label encoder on ``y`` and the wrapped classifier on the codes."""
        # The encoder only sees the labels present in this training set, so the
        # resulting codes are always contiguous (0..k-1) as XGBoost expects.
        self.label_encoder_ = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder_.classes_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, self.label_encoder_.transform(y))
        return self

    def predict(self, X):
        """Return the predicted labels in their original (string) form."""
        encoded = self.estimator_.predict(X)
        return self.label_encoder_.inverse_transform(encoded)

    def predict_proba(self, X):
        """Return class probabilities; columns follow the order of ``classes_``."""
        return self.estimator_.predict_proba(X)


# Build TF-IDF + XGBoost pipeline.
# `use_label_encoder` is intentionally not passed: labels are encoded by the
# wrapper above and the flag no longer has any effect in current XGBoost.
clf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", StringLabelClassifier(
        XGBClassifier(
            eval_metric="mlogloss",
            max_depth=6,
            n_estimators=200,
            learning_rate=0.1,
        )
    )),
])

# Train model
clf_pipeline.fit(X_train, y_train)

# Evaluate on the held-out split; predictions are cuisine strings, like y_test.
y_pred = clf_pipeline.predict(X_test)
print("\n📊 Classifier Performance:\n")
print(classification_report(y_test, y_pred))


In [None]:
# Sentence-embedding model used for dish texts (here) and occasion queries
# (in generate_menu).
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Combine dish info for better context: "<name> - <description> (<cuisine>)".
# Every part is null-filled and cast to str; otherwise a single missing or
# non-string value would turn the whole concatenation into NaN (or raise), and
# the encoder cannot handle a non-text entry.
df["dish_text"] = (
    df["dish_name"].fillna("").astype(str)
    + " - "
    + df["description"].fillna("").astype(str)
    + " (" + df["cuisine_type"].fillna("").astype(str) + ")"
)

# Encode all dishes once; row i of dish_embeddings corresponds to row i of df.
dish_embeddings = embedder.encode(df["dish_text"].tolist(), convert_to_tensor=True)
print("✅ Embeddings generated for all dishes!")


In [None]:
def generate_menu(occasion_text, df, clf_pipeline, embedder, dish_embeddings, top_k=10):
    """Generate a menu for an occasion using the hybrid classifier + embedding model.

    The classifier predicts a cuisine for the occasion, the dishes of that
    cuisine are ranked by cosine similarity between the occasion text and the
    dish embeddings, and the best ``top_k`` dishes are returned.

    Args:
        occasion_text: Free-text description of the occasion.
        df: Dish table. Must contain the columns ``cuisine_type``,
            ``dish_name``, ``course`` and ``veg``; ``dish_text`` is only needed
            when ``dish_embeddings`` cannot be used (see below).
        clf_pipeline: Object whose ``predict([text])`` returns the cuisine label.
        embedder: Object whose ``encode(text, convert_to_tensor=True)`` returns
            a tensor embedding.
        dish_embeddings: Precomputed embeddings, one row per row of ``df`` in
            the same order. When it is ``None`` or its length differs from
            ``len(df)``, the dish texts of the selected subset are encoded on
            the fly instead.
        top_k: Maximum number of dishes to return.

    Returns:
        dict with the keys ``occasion``, ``predicted_cuisine`` and ``menu``
        (a list of ``{"dish_name", "course", "veg"}`` records, best match first).

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    # Step 1: Predict cuisine
    predicted_cuisine = clf_pipeline.predict([occasion_text])[0]

    # Step 2: Filter dishes by predicted cuisine.
    # Both sides are compared as str so non-string cuisine values still match
    # a string prediction. Row *positions* are kept so the same rows can be
    # picked out of dish_embeddings.
    cuisine_matches = (df["cuisine_type"].astype(str) == str(predicted_cuisine)).tolist()
    positions = [pos for pos, matched in enumerate(cuisine_matches) if matched]
    if not positions:
        positions = list(range(len(df)))  # fallback to all dishes if none found
    if not positions:
        # Empty dish table: nothing to rank.
        return {
            "occasion": occasion_text,
            "predicted_cuisine": predicted_cuisine,
            "menu": [],
        }
    subset = df.iloc[positions].reset_index(drop=True)

    # Step 3: Compute similarity with subset.
    query_embedding = embedder.encode(occasion_text, convert_to_tensor=True)
    if dish_embeddings is not None and len(dish_embeddings) == len(df):
        # Reuse the precomputed embeddings instead of running the encoder over
        # every dish again on each call.
        all_embeddings = torch.as_tensor(dish_embeddings)
        row_ids = torch.as_tensor(positions, dtype=torch.long, device=all_embeddings.device)
        subset_embeddings = all_embeddings[row_ids]
    else:
        # Embeddings are missing or not aligned with df: encode the subset.
        subset_embeddings = embedder.encode(subset["dish_text"].tolist(), convert_to_tensor=True)
    # Keep both operands on one device (e.g. embeddings loaded on CPU, model on GPU).
    subset_embeddings = subset_embeddings.to(query_embedding.device)

    cosine_scores = util.cos_sim(query_embedding, subset_embeddings)[0]
    # Clamp k to [0, len(subset)] so oversized or negative top_k cannot crash topk.
    k = max(0, min(top_k, len(subset)))
    top_results = torch.topk(cosine_scores, k=k)

    # Step 4: Format results (topk indices are positions within `subset`).
    recommended = subset.iloc[top_results.indices.tolist()]
    menu_items = recommended[["dish_name", "course", "veg"]].to_dict(orient="records")

    return {
        "occasion": occasion_text,
        "predicted_cuisine": predicted_cuisine,
        "menu": menu_items,
    }


In [None]:
# Smoke-test the recommender: build and print a menu for a few example occasions.
test_occasions = [
    "Corporate lunch",
    "South Indian wedding dinner",
    "Festival buffet",
    "Birthday party",
]

for occasion in test_occasions:
    menu = generate_menu(
        occasion_text=occasion,
        df=df,
        clf_pipeline=clf_pipeline,
        embedder=embedder,
        dish_embeddings=dish_embeddings,
        top_k=8,
    )

    # Header: occasion title followed by the cuisine the classifier picked.
    print(
        f"\n=== {occasion.upper()} ===",
        f"Predicted Cuisine: {menu['predicted_cuisine']}\n",
        sep="\n",
    )

    # One bullet per recommended dish, best match first.
    for dish in menu["menu"]:
        if dish["veg"]:
            veg_label = "Veg"
        else:
            veg_label = "Non-Veg"
        print(f" - {dish['dish_name']} ({dish['course']}, {veg_label})")


In [None]:
import pickle
import torch

# Save classifier.
# NOTE: pickle files execute arbitrary code when loaded; only unpickle
# "occasion_classifier.pkl" from a source you trust.
with open("occasion_classifier.pkl", "wb") as f:
    pickle.dump(clf_pipeline, f)

# Save embeddings and data.
# The tensor is detached and moved to CPU first so the file can be loaded on a
# machine without a GPU, regardless of the device the embeddings were computed on.
torch.save(dish_embeddings.detach().cpu(), "dish_embeddings.pt")
df.to_csv("menu_data_processed.csv", index=False)

print("✅ Models and embeddings saved!")
