# In [4]:
import re
import random
import webbrowser
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd

# ML / NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# GUI
import tkinter as tk
from tkinter import messagebox

# =========================
# Config
# =========================
# Recipe data files handed to load_all_datasets(); each entry is a CSV or JSON path.
DATASETS: List[str] = [
    r"C:\Users\MANASVI\Desktop\Hybrid recipe generator\final_indian_recipes.csv",
]

# Cut-off rank used when the script evaluates the model.
TOP_K_DEFAULT: int = 5

# Seed for the evaluation sampler and for the global RNGs seeded in the script entry point.
RANDOM_SEED: int = 42

# Number of recipe cards the GUI renders per page.
PAGE_SIZE: int = 5

# Ingredient synonym / alias mapping
# Regex pattern -> canonical ingredient name.
#
# NOTE: apply_synonyms() runs these substitutions one after another in
# insertion order, each on the output of the previous one, so ORDER MATTERS:
#   * a multi-word phrase must be listed before any rule that would rewrite
#     one of its own words (e.g. "kasuri methi" before "methi"), otherwise the
#     shorter rule fires first and the phrase pattern can never match;
#   * spelling normalisers ("chilli" -> "chili") are listed before the phrases
#     written with the canonical spelling ("green chili"), so both spellings
#     end up collapsing to the same term.
SYNONYM_MAP_PHRASES: Dict[str, str] = {
    r"\bcapsicum\b": "bell pepper",
    r"\bmirchi\b": "chili",
    r"\bchilli(es)?\b": "chili",
    r"\bchilies\b": "chili",
    r"\bgreen chili(es)?\b": "chili",
    r"\bred chili(es)?\b": "chili",
    r"\bcoriander leaves\b": "coriander",
    r"\bdhania\b": "coriander",
    r"\bcilantro\b": "coriander",
    r"\bjeera\b": "cumin",
    r"\bzeera\b": "cumin",
    r"\bhari mirch\b": "chili",
    r"\bhing\b": "asafoetida",
    r"\bgur\b": "jaggery",
    r"\bgud\b": "jaggery",
    r"\bmaida\b": "all purpose flour",
    r"\batta\b": "wheat flour",
    r"\bdahi\b": "yogurt",
    r"\bcurd\b": "yogurt",
    # Phrase first: the bare "methi" rule below would otherwise turn
    # "kasuri methi" into "kasuri fenugreek" before this pattern is tried.
    r"\bkasuri methi\b": "fenugreek",
    r"\bmethi\b": "fenugreek",
    r"\baloo\b": "potato",
    r"\bbhindi\b": "okra",
    r"\btadka\b": "tempering",
}

# Minimal noise words
# Tokens dropped by normalize_ingredients() after tokenisation.
NOISE_TOKENS = {
    # preparation / state descriptors
    "fresh",
    "finely",
    "chopped",
    "sliced",
    "diced",
    "ground",
    "powder",
    "optional",
    # "to taste"
    "to",
    "taste",
    # sizes
    "medium",
    "large",
    "small",
    # measures
    "cup",
    "cups",
    "tsp",
    "tbsp",
    "tablespoon",
    "teaspoon",
    "pinch",
    "piece",
    "pieces",
    "handful",
    # connectives
    "and",
    "or",
    "of",
}

# =========================
# Loading & Unification
# =========================
def load_dataset(path: Union[str, Path]) -> pd.DataFrame:
    """Read one CSV or JSON file into a DataFrame with normalised column names.

    Column labels are stripped, lower-cased and have each run of whitespace
    replaced by a single underscore (``"Recipe  Name "`` -> ``"recipe_name"``).

    Args:
        path: Location of the data file; the format is chosen from the
            (case-insensitive) file extension.

    Returns:
        The loaded frame with normalised column labels.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is neither ``.csv`` nor ``.json``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {path}")

    suffix = path.suffix.lower()
    if suffix == ".csv":
        # Malformed rows are skipped rather than aborting the whole load.
        df = pd.read_csv(path, encoding="utf-8", engine="python", on_bad_lines="skip")
    elif suffix == ".json":
        df = pd.read_json(path)
    else:
        raise ValueError("Unsupported file type. Provide CSV or JSON.")

    # Labels are not guaranteed to be strings (a JSON array of arrays yields
    # integer labels), so coerce with str() before using string methods.
    df.columns = [re.sub(r"\s+", "_", str(c).strip().lower()) for c in df.columns]
    return df


def unify_dataset(df: pd.DataFrame, src_name: str) -> pd.DataFrame:
    """Project a raw recipe frame onto the common five-column string schema.

    For each output column the first matching candidate column name present in
    ``df`` is used; when none is present the column is filled with empty
    strings (or ``recipe_<i>`` placeholders for ``name``).

    Args:
        df: Raw frame, expected to carry normalised (lower-case, underscored)
            column names.
        src_name: Identifier of the source; currently unused and kept only so
            existing callers keep working.

    Returns:
        A frame with the columns ``name``, ``ingredients``, ``instructions``,
        ``cuisine`` and ``source``, all holding strings. Missing cells are
        empty strings.
    """
    def first_present(candidates: List[str]):
        # First candidate column name that exists in df, else None.
        return next((c for c in candidates if c in df.columns), None)

    name_col = first_present(["name", "title", "recipe_name", "dish", "recipe"])
    ing_col = first_present(["ingredients", "ingredient", "ingredients_name", "translatedingredients"])
    ins_col = first_present(["instructions", "steps", "directions", "method", "translatedinstructions"])
    cuisine_col = first_present(["recipecuisine", "cuisine", "recipe_cuisine"])
    source_col = first_present(["url", "source", "link"])

    out = pd.DataFrame({
        "name": df[name_col] if name_col else [f"recipe_{i}" for i in range(len(df))],
        "ingredients": df[ing_col] if ing_col else "",
        "instructions": df[ins_col] if ins_col else "",
        "cuisine": df[cuisine_col] if cuisine_col else "",
        "source": df[source_col] if source_col else "",
    })
    for c in ["name", "ingredients", "instructions", "cuisine", "source"]:
        # Blank out missing values BEFORE stringifying: astype(str) would turn
        # NaN/None into the literal text "nan"/"None", which a later fillna can
        # no longer detect. Going through object dtype keeps the "" fill valid
        # for numeric columns as well.
        col = out[c].astype(object)
        out[c] = col.where(col.notna(), "").astype(str)
    return out


def load_all_datasets(paths: List[str]) -> pd.DataFrame:
    """Load every file in ``paths`` and stack them into one unified frame.

    Each file is read with ``load_dataset`` and mapped to the common schema
    with ``unify_dataset``; the results are concatenated with a fresh
    0..n-1 index.

    Args:
        paths: Data file locations (CSV or JSON).

    Returns:
        The row-wise concatenation of all unified frames.

    Raises:
        ValueError: If ``paths`` is empty. (Errors raised while loading an
            individual file propagate unchanged.)
    """
    if not paths:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No dataset paths provided; expected at least one CSV or JSON file.")

    frames = [unify_dataset(load_dataset(p), p) for p in paths]
    return pd.concat(frames, ignore_index=True)

# =========================
# Normalization helpers
# =========================
def apply_synonyms(text: str) -> str:
    """Rewrite known ingredient aliases in ``text`` to their canonical names.

    Every pattern of SYNONYM_MAP_PHRASES is substituted in turn, in the
    mapping's iteration order, each one operating on the output of the
    previous substitution. Matching is case-sensitive.
    """
    result = text
    for alias_regex in SYNONYM_MAP_PHRASES:
        canonical = SYNONYM_MAP_PHRASES[alias_regex]
        result = re.sub(alias_regex, canonical, result)
    return result


def normalize_ingredients(value: Any) -> str:
    """Reduce a raw ingredient string to space-separated, lower-case tokens.

    Pipeline: lower-case -> alias replacement -> numbers removed -> every
    character other than ``a-z``, comma, semicolon, whitespace and hyphen
    blanked -> split on whitespace/commas/semicolons -> noise tokens dropped.

    Args:
        value: Raw cell value; a missing value (NaN/None) yields ``""``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if pd.isna(value):
        return ""

    lowered = apply_synonyms(str(value).lower())
    # Quantities: fractions, decimals and plain integers.
    without_numbers = re.sub(r"(\d+\/\d+|\d+\.\d+|\d+)", " ", lowered)
    # Anything that is not a letter, separator or hyphen becomes a space.
    cleaned = re.sub(r"[^a-z,;\s\-]", " ", without_numbers)

    kept: List[str] = []
    for piece in re.split(r"[\s,;]+", cleaned):
        if not piece or piece in NOISE_TOKENS:
            continue
        kept.append(piece)

    joined = " ".join(kept)
    return re.sub(r"\s+", " ", joined).strip()


def _tokenize_norm(text: str) -> List[str]:
    """Return the normalised ingredient tokens of ``text`` (``[]`` for empty input)."""
    # str.split() with no argument never yields empty strings, so no extra
    # filtering of the pieces is needed.
    return normalize_ingredients(text).split() if text else []

# =========================
# Model
# =========================
class RecipeRecord:
    """One indexed recipe: raw display fields plus its normalised ingredient text."""

    def __init__(self, idx: int, name: str, ing_text_norm: str, raw_ing: str, raw_ins: str, cuisine: str, source: str):
        """Store the recipe fields and pre-compute the ingredient token set.

        Args:
            idx: Position of the recipe in the indexed collection.
            name: Recipe title.
            ing_text_norm: Normalised, space-separated ingredient tokens.
            raw_ing: Ingredient text as found in the dataset.
            raw_ins: Instruction text as found in the dataset.
            cuisine: Cuisine label.
            source: Source reference (the GUI opens it in a browser).
        """
        # Identity
        self.idx: int = idx
        self.name: str = name

        # Untouched text shown to the user
        self.raw_ingredients: str = raw_ing
        self.raw_instructions: str = raw_ins
        self.cuisine: str = cuisine
        self.source: str = source

        # Search representation: the normalised string and its distinct tokens
        self.ing_text: str = ing_text_norm
        self.token_set = {tok for tok in ing_text_norm.split()}


class IngredientSearch:
    """Ingredient-to-recipe retriever.

    Ranking combines cosine similarity from two TF-IDF spaces (word n-grams
    and character n-grams) with token-overlap signals computed against each
    recipe's normalised ingredient token set.
    """

    def __init__(self):
        # Word uni-/bi-grams with sub-linear term frequency.
        self.vec_word = TfidfVectorizer(
            analyzer="word", ngram_range=(1, 2), min_df=1, strip_accents="unicode", sublinear_tf=True
        )
        # Character 3- to 6-grams (presumably to tolerate spelling variants).
        self.vec_char = TfidfVectorizer(
            analyzer="char", ngram_range=(3, 6), min_df=1, strip_accents="unicode"
        )
        self.records: List[RecipeRecord] = []
        # TF-IDF matrices of the corpus; populated by fit().
        self.M_word = None
        self.M_char = None
        # Per-term IDF of the word vectoriser, plus the fallback used for
        # query tokens that are not in its vocabulary.
        self.idf_lookup: Dict[str, float] = {}
        self.idf_default: float = 1.0

    def fit(self, df: pd.DataFrame) -> "IngredientSearch":
        """Index the recipes in ``df``.

        Args:
            df: Frame with the columns ``name``, ``ingredients``,
                ``instructions``, ``cuisine`` and ``source``.

        Returns:
            ``self``, so construction and fitting can be chained.
        """
        ing_norm = df["ingredients"].apply(normalize_ingredients)

        # Walk the columns in lockstep instead of materialising a row Series
        # via df.iloc[i] for every single field.
        columns = (
            df["name"], ing_norm, df["ingredients"],
            df["instructions"], df["cuisine"], df["source"],
        )
        self.records = [
            RecipeRecord(i, str(name), norm, str(raw_ing), str(raw_ins), str(cuisine), str(source))
            for i, (name, norm, raw_ing, raw_ins, cuisine, source) in enumerate(zip(*columns))
        ]

        corpus = [r.ing_text for r in self.records]
        self.M_word = self.vec_word.fit_transform(corpus)
        self.M_char = self.vec_char.fit_transform(corpus)

        vocab = self.vec_word.vocabulary_
        idfs = self.vec_word.idf_
        self.idf_lookup = {term: float(idfs[idx]) for term, idx in vocab.items()}
        # Median IDF stands in for tokens missing from the vocabulary.
        self.idf_default = float(np.median(idfs)) if len(idfs) else 1.0
        return self

    def search(self, query: Union[str, List[str]], top_k: int = 5) -> List[Tuple[float, "RecipeRecord"]]:
        """Rank the indexed recipes against a set of ingredients.

        Args:
            query: Free text, or a list of ingredient strings; either form is
                normalised and de-duplicated into tokens.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` ``(score, record)`` pairs, ordered by number of
            matched query tokens and then by score, both descending. Recipes
            matching fewer than 60% of the query tokens (rounded up) are
            dropped unless that would leave nothing. An empty query yields
            ``[]``.
        """
        if isinstance(query, list):
            tokens: List[str] = []
            for part in query:
                tokens.extend(_tokenize_norm(str(part)))
        else:
            tokens = _tokenize_norm(str(query))

        tokens = sorted(set(tokens))
        if not tokens:
            return []

        q_text = " ".join(tokens)

        # Blend of word-level and character-level cosine similarity.
        q_w = self.vec_word.transform([q_text])
        q_c = self.vec_char.transform([q_text])
        s_w = linear_kernel(q_w, self.M_word)[0]
        s_c = linear_kernel(q_c, self.M_char)[0]
        base_sim = 0.65 * s_w + 0.35 * s_c

        # Total IDF mass of the query; the epsilon avoids division by zero.
        denom_idf = sum(self.idf_lookup.get(t, self.idf_default) for t in tokens) + 1e-6
        min_match = max(1, int(np.ceil(0.6 * len(tokens))))

        results: List[Tuple[float, RecipeRecord, int, float]] = []
        token_set_q = set(tokens)

        for i, rec in enumerate(self.records):
            rec_tokens = rec.token_set
            matched = token_set_q & rec_tokens
            overlap = len(matched)
            # Share of the query's IDF mass covered by this recipe.
            coverage_idf = sum(self.idf_lookup.get(t, self.idf_default) for t in matched) / denom_idf

            union_sz = len(rec_tokens | token_set_q) + 1e-6
            jaccard = overlap / union_sz
            # Shrinks towards 0 as the recipe has more tokens than the query.
            length_penalty = 1.0 / (1.0 + 0.02 * max(0, len(rec_tokens) - len(tokens)))

            score = float(base_sim[i])
            score += 0.75 * coverage_idf
            score += 0.10 * jaccard
            score += 0.05 * length_penalty

            # Bonus when every query token appears in the recipe.
            if overlap == len(tokens):
                score += 0.50

            results.append((score, rec, overlap, coverage_idf))

        # Primary key: matched-token count; secondary key: blended score.
        results.sort(key=lambda x: (x[2], x[0]), reverse=True)

        filtered = [r for r in results if r[2] >= min_match]
        ranked = filtered if filtered else results

        # Honour the requested cut-off. The previous `max(top_k, 50)` returned
        # up to 50 results no matter how small top_k was.
        top = ranked[:max(0, top_k)]
        return [(float(score), rec) for score, rec, _, _ in top]


# =========================
# Evaluation
# =========================
def evaluate_model(model: IngredientSearch, top_k: int = 5, num_queries: int = 200, seed: int = RANDOM_SEED) -> Dict[str, float]:
    """Estimate retrieval quality with synthetic "find this recipe again" queries.

    Recipes with at least three normalised ingredient tokens are sampled; for
    each one a query is built from 3 to 8 of its own tokens drawn at random,
    and the recipe counts as found when it appears among the first ``top_k``
    search results.

    Args:
        model: Search model exposing ``records`` and ``search(query, top_k=...)``.
        top_k: Rank cut-off for a hit.
        num_queries: Maximum number of recipes to sample.
        seed: Seed of the private RNG used for sampling.

    Returns:
        Metric name -> value. ``n_queries`` is the number of queries actually
        evaluated (0.0 when no recipe qualifies); every rate is averaged over
        that number. ``precision@k`` counts ``1/top_k`` and ``recall@k`` counts
        ``1`` per hit, since each query has exactly one relevant recipe.
        ``overall_accuracy`` is the mean of top-1 and top-k accuracy.
    """
    rng = random.Random(seed)
    candidates = [r for r in model.records if len(r.ing_text.split()) >= 3]
    # Clamp at 0 so a negative num_queries means "evaluate nothing" instead of
    # crashing inside rng.sample.
    sample = rng.sample(candidates, k=max(0, min(num_queries, len(candidates))))

    top1 = topk = 0
    mrr = 0.0
    prec_sum = 0.0
    rec_sum = 0.0
    f1_sum = 0.0

    for target in sample:
        toks = target.ing_text.split()
        # len(toks) >= 3 is guaranteed by the candidate filter above.
        q_len = rng.randint(3, min(8, len(toks)))
        q = " ".join(rng.sample(toks, q_len))

        results = model.search(q, top_k=top_k)
        # 1-based rank of the target within the first top_k results, if any.
        rank = next(
            (pos for pos, (_, rec) in enumerate(results[:top_k], start=1) if rec.idx == target.idx),
            None,
        )
        if rank is None:
            continue

        if rank == 1:
            top1 += 1
        topk += 1
        mrr += 1.0 / rank
        p = 1.0 / top_k
        r_ = 1.0
        prec_sum += p
        rec_sum += r_
        f1_sum += 2 * p * r_ / (p + r_)

    n_evaluated = len(sample)
    # Denominator only; keeps the rates at 0.0 instead of dividing by zero.
    n = max(1, n_evaluated)
    return {
        # Report the real count: previously an empty sample was reported as 1.
        "n_queries": float(n_evaluated),
        "top1_acc": top1 / n,
        f"top{top_k}_acc": topk / n,
        f"mrr@{top_k}": mrr / n,
        f"precision@{top_k}": prec_sum / n,
        f"recall@{top_k}": rec_sum / n,
        f"f1@{top_k}": f1_sum / n,
        "overall_accuracy": (top1 + topk) / (2 * n)
    }

# =========================
# GUI
# =========================
class App:
    """Tkinter front end: an ingredient entry box above a scrollable, paged list of recipe cards."""

    def __init__(self, model: IngredientSearch):
        """Build the main window and all of its widgets.

        Args:
            model: Search model whose ``search(query, top_k=...)`` returns
                ``(score, RecipeRecord)`` pairs.
        """
        self.model = model
        # Results of the latest search, and the index of the first result that
        # has not been rendered yet.
        self.results_all: List[Tuple[float, RecipeRecord]] = []
        self.cursor = 0

        self.root = tk.Tk()
        # The arrow used to be stored as mojibake ("â†’", i.e. UTF-8 bytes
        # decoded with a legacy code page); restored to the intended character.
        self.root.title("Recipe Search (Ingredients → Recipes)")

        tk.Label(self.root, text="Enter Ingredients (comma-separated):").pack(pady=(8, 2))
        self.entry = tk.Entry(self.root, width=90)
        self.entry.pack(padx=8, pady=(0, 6))

        self.btn_frame = tk.Frame(self.root)
        self.btn_frame.pack(pady=(0, 4))
        tk.Button(self.btn_frame, text="Search", command=self.on_search).pack(side=tk.LEFT, padx=4)
        # Enabled only while un-rendered results remain.
        self.show_more_btn = tk.Button(self.btn_frame, text="Show More", state=tk.DISABLED, command=self.on_show_more)
        self.show_more_btn.pack(side=tk.LEFT, padx=4)
        tk.Button(self.btn_frame, text="Clear", command=self.on_clear).pack(side=tk.LEFT, padx=4)

        # Scrollable area: a frame embedded in a canvas; the scroll region is
        # recomputed whenever the frame's size changes.
        canvas = tk.Canvas(self.root, height=500)
        scrollbar = tk.Scrollbar(self.root, orient="vertical", command=canvas.yview)
        self.results_frame = tk.Frame(canvas)
        self.results_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
        canvas.create_window((0, 0), window=self.results_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)
        canvas.pack(side="left", fill="both", expand=True)
        scrollbar.pack(side="right", fill="y")

        # Placeholder hint shown until the first search.
        tk.Label(
            self.results_frame,
            text="Type a few ingredients (e.g., 'chicken, garlic, chili, tomato') and press Search.",
            wraplength=600,
            justify="left",
            fg="gray",
        ).pack(pady=10)

    @staticmethod
    def _format_instructions(ins: str) -> str:
        """Split instruction text into steps and renumber them, one ``Step N: ...`` per line."""
        # NOTE(review): the digits are optional here, so the bare word "step"
        # inside a sentence also acts as a separator; confirm this is intended.
        if re.search(r"(?i)\bstep\s*\d*", ins):
            steps = re.split(r"(?i)\bstep\s*\d*[:\-]?\s*", ins)
        else:
            # No step markers: fall back to splitting after sentence-ending punctuation.
            steps = re.split(r'(?<=[.!?])\s+', ins)
        steps = [s.strip() for s in steps if s.strip()]
        return "\n".join([f"Step {i+1}: {s}" for i, s in enumerate(steps)])

    def display_results(self, start: int, page_size: int):
        """Append one card per result in ``results_all[start:start + page_size]``.

        Existing cards are left in place; callers clear the frame themselves
        when starting a new search.
        """
        end = min(start + page_size, len(self.results_all))
        batch = self.results_all[start:end]

        for score, rec in batch:
            card = tk.Frame(self.results_frame, relief="groove", borderwidth=2, padx=8, pady=6)
            card.pack(fill="x", padx=8, pady=6)

            # Recipe name
            tk.Label(card, text=rec.name, font=("Arial", 12, "bold")).pack(anchor="w")

            # Cuisine (skipped when empty or the stringified missing value "nan")
            if rec.cuisine and rec.cuisine.lower() != "nan":
                tk.Label(card, text=f"Cuisine: {rec.cuisine}", fg="blue").pack(anchor="w")

            # Ingredients
            ing = rec.raw_ingredients.strip()
            if ing:
                tk.Label(
                    card, text=f"Ingredients:\n{ing}",
                    wraplength=700, justify="left"
                ).pack(anchor="w", pady=(2, 4))

            # Instructions
            ins = rec.raw_instructions.strip()
            if ins:
                ins_text = self._format_instructions(ins)

                tk.Label(
                    card, text=f"Instructions:\n{ins_text}",
                    wraplength=700, justify="left"
                ).pack(anchor="w", pady=(2, 4))

            # View Source
            if rec.source and rec.source.lower() != "nan":
                link = tk.Label(card, text="View Source", fg="green", cursor="hand2")
                link.pack(anchor="w")
                # Bind the URL as a default argument so every card opens its
                # own source rather than the last one of the loop.
                link.bind("<Button-1>", lambda e, url=rec.source: webbrowser.open(url))

    def on_search(self):
        """Run a search for the entry text and show the first page of results."""
        query = self.entry.get().strip()
        if not query:
            messagebox.showwarning("Input Error", "Please enter some ingredients.")
            return

        for widget in self.results_frame.winfo_children():
            widget.destroy()

        self.results_all = self.model.search(query, top_k=50)
        self.cursor = 0

        if not self.results_all:
            tk.Label(self.results_frame, text="No results. Try different ingredients.", fg="red").pack(pady=10)
            self.show_more_btn.config(state=tk.DISABLED)
            return

        self.display_results(0, PAGE_SIZE)
        self.cursor += PAGE_SIZE
        self.show_more_btn.config(state=(tk.NORMAL if self.cursor < len(self.results_all) else tk.DISABLED))

    def on_show_more(self):
        """Render the next page and disable the button once everything is shown."""
        self.display_results(self.cursor, PAGE_SIZE)
        self.cursor += PAGE_SIZE
        if self.cursor >= len(self.results_all):
            self.show_more_btn.config(state=tk.DISABLED)

    def on_clear(self):
        """Remove all result cards and reset the paging state."""
        for widget in self.results_frame.winfo_children():
            widget.destroy()
        self.cursor = 0
        self.results_all = []
        self.show_more_btn.config(state=tk.DISABLED)

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.root.mainloop()

# =========================
# Main
# =========================
if __name__ == "__main__":
    # Seed the global NumPy and `random` generators.
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Load every configured dataset and index it.
    df_all = load_all_datasets(DATASETS)
    model = IngredientSearch().fit(df_all)

    # Offline self-retrieval check, printed before the GUI opens.
    metrics = evaluate_model(model, top_k=TOP_K_DEFAULT, num_queries=200)
    print("\nEvaluation (synthetic retrieval):")
    for metric_name, metric_value in metrics.items():
        # Floats are shown with four decimals; anything else uses its default format.
        spec = ".4f" if isinstance(metric_value, float) else ""
        print(f"  {metric_name:>16}: {metric_value:{spec}}")

    # Hand control to the GUI; this call runs the Tk main loop.
    App(model).run()


# Captured output of the cell above:
#
# Evaluation (synthetic retrieval):
#          n_queries: 200.0000
#           top1_acc: 0.7850
#           top5_acc: 0.9100
#              mrr@5: 0.8319
#        precision@5: 0.1820
#           recall@5: 0.9100
#               f1@5: 0.3033
#   overall_accuracy: 0.8475
