**Having performed a thorough exploratory data analysis, the roadmap for preprocessing the data is clear.**

## Required Preprocessing Steps:
1. [Load raw data.](#data-loading)

2. [Normalize `["recipe_title", "description", "ingredients", "directions"]` fields for consistent formating.](#normalization)

3. [Delete duolicates based on `["recipe_title", "description", "ingredients", "directions"]` combination.](#duplicate-deletion)

4. [Extract `["recipe_title", "ingredients", "directions"]` columns only.](#feature-extraction)

5. [Clean ingredients using *stage 1 and stage 2* functions.](#2-stage-ingredient-cleaning)

6. [Final Cleaning of Ingredients.](#final-cleaning)

7. [Build TF-IDF matrix with canonicalized ingredients](#tf-idf)

8. [Compute cosine similarity](#similarity-computation)

9. [Embeddings (Semantic Matching).](#embeddings-semantic-matching)

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import hdbscan
import pickle
import re
import ast
import json
import os
from collections import Counter, defaultdict
from datasets import load_dataset
from itertools import combinations
from functools import reduce
from spacy.matcher import PhraseMatcher
from rapidfuzz import process, fuzz  # faster fuzzy matching
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer

### Data Loading

In [2]:
# Loading raw data
raw_data = load_dataset("json", data_files="../data/recipe.json")
df = raw_data["train"].to_pandas()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62126 entries, 0 to 62125
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   recipe_title     62126 non-null  object
 1   category         62126 non-null  object
 2   subcategory      62126 non-null  object
 3   description      62126 non-null  object
 4   ingredients      62126 non-null  object
 5   directions       62126 non-null  object
 6   num_ingredients  62126 non-null  int64 
 7   num_steps        62126 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 3.8+ MB


### Normalization

In [3]:
# Ensuring consistency in string formatting
def normalize_column(col):
    """
    Normalize a column for duplicate detection:
    - Strings → lowercase, stripped
    - Lists/arrays → lowercase, stripped, sorted, converted to tuple
    """
    if isinstance(col, str):
        return col.strip().lower()
    elif isinstance(col, (list, np.ndarray)):
        # Lowercase each element, strip spaces, sort, convert to tuple
        cleaned = tuple(sorted([str(x).strip().lower() for x in col]))
        return cleaned
    return col

# Columns to check for duplicates
cols_to_check = ["recipe_title", "description", "ingredients", "directions"]

# Create normalized columns
normalized_cols = {col: df[col].apply(normalize_column) for col in cols_to_check}

# Combine into a DataFrame
norm_df = df.assign(**normalized_cols)

### Duplicate Deletion

In [4]:
# Deleting duplicates
deduplicated_df = norm_df.drop_duplicates(subset=["recipe_title", "description", "ingredients", "directions"])
deduplicated_df = deduplicated_df.copy()
deduplicated_df.shape

(25024, 8)

### Feature Extraction

In [5]:
# Extracting relevant fields
recipes_df = deduplicated_df[["recipe_title", "ingredients", "directions"]]
recipes_df = recipes_df.copy()
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25024 entries, 0 to 62032
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   recipe_title  25024 non-null  object
 1   ingredients   25024 non-null  object
 2   directions    25024 non-null  object
dtypes: object(3)
memory usage: 782.0+ KB


### Capitalize each word in Recipe Title

In [6]:
recipes_df["recipe_title"] = recipes_df["recipe_title"].str.title()

### 2-Stage Ingredient-Cleaning

In [7]:
# Medium language model
nlp = spacy.load("en_core_web_md")

#### Stage 1

In [8]:
def clean_stage1(raw):
    raw = raw.lower().strip()
    
    # Replace unicode fractions
    UNICODE_FRAC = {"½": "1/2", "⅓": "1/3", "¼": "1/4", "⅛": "1/8", "⅔": "2/3", "¾" : "3/4", "⅜" : "3/8", "⅞" : "7/8"}
    for frac, val in UNICODE_FRAC.items():
        raw = raw.replace(frac, val)
        
    # Remove “such as ...” example clauses
    raw = re.sub(r"such as [a-zA-Z\s']+", " ", raw)
    
    # Remove numbers but keep B12/B6 etc.
    raw = re.sub(r'\b\d+(\.\d+)?\b(?![a-zA-Z])', ' ', raw)

    # Remove numeric quantities
    # raw = re.sub(r"(\d+\/\d+|\d+\.\d+|\d+)", " ", raw)

    # Remove measurement units
    raw = re.sub(r"\b(cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|oz|ounce|ounces|gram|grams|kg|kilogram|kilograms|pound|pounds|lb|\
                 pinch|pint|quart|quartered|dash|sprig|inch|inches|pieces|sized|size|whole)\b", " ", raw)

    # Remove preparation-only words (but do not remove meaningful adjectives)
    PREP_WORDS = r"(chopped|diced|minced|sliced|slices|skinned|peeled|halved|shucked|shredded|ground|grated|granulated|trimmed|rinsed|patted|divided|optional|crush|crushed|garnish|\
        cooked|prepared|cut|pat|dry|thaw|drained|refrigerate|frozen|thawed|remove|dusting|squeezed|scrubbed|finely|coarse|coarsely|cold|unsalted|lightly|crumbled|thick|processed)"
    raw = re.sub(rf"\b{PREP_WORDS}\b", " ", raw)

    # Remove other terms
    OTHER_WORDS = r"(plus|more|into|for|extra|additional|taste|package|bag|box|can|cans|canned|tube|jar|bottle|container|about|total|desired|needed|serving|note|icing|dipping|wooden|\
        toothpicks|skewers|parchment|packet|baby|everything|italian-style|japanese-style|american)"
    raw = re.sub(rf"\b{OTHER_WORDS}\b", " ", raw)

    # Remove punctuation while leaving some possible ingredient connectors
    raw = re.sub(r"[^\w\s/,&-]", " ", raw)

    # Collapse whitespace
    raw = re.sub(r"\s+", " ", raw).strip()
    
    return raw

In [9]:
# Apply to dataset
recipes_df["clean_ingredients_stage1"] = recipes_df["ingredients"].apply(
    lambda lst: [clean_stage1(raw) for raw in lst]
)

# Preview
recipes_df[['ingredients', 'clean_ingredients_stage1']].head()

Unnamed: 0,ingredients,clean_ingredients_stage1
0,"(1 tablespoon worcestershire sauce, 1/2 cup be...","[worcestershire sauce, / beer, / garlic powder..."
1,"(1 pound pork belly, 1 tablespoon honey,, 1 te...","[pork belly, honey,, ginger, / gochujang, soy ..."
2,"(1 tablespoon olive oil, 1 ¼ pounds chicken te...","[olive oil, / chicken tenders, / bagel seasoni..."
3,"(1 1/2 cups panko breadcrumbs, 2 large eggs, 2...","[/ panko breadcrumbs, large eggs, milk, ablesp..."
4,"(1 tablespoon honey, 1 tablespoon soy sauce, 1...","[honey, soy sauce, sriracha, rice vinegar, / g..."


#### Stage 2

In [10]:
def split_tokens(text):
    """
    Split Stage-1 cleaned text into tokens.
    Uses commas, slashes, ' and ', ' or ' etc.
    """
    # replace and/or with comma (but not inside ingredient names)
    text = re.sub(r"\s+(and|or|\&)\s+", ",", text)

    # split on commas or slashes
    raw_tokens = re.split(r"[,/]", text)

    # clean whitespace
    return [t.strip() for t in raw_tokens if t.strip()]


def looks_like_garbage(token):
    """
    Shape-based garbage detection.
    No vocabulary lists — entirely rule-based.
    """
    t = token.lower().strip()

    # too short (except valid short ingredients)
    if len(t) <= 2 and t not in {"oil", "yam", "tea"}:
        return True

    # remove tokens ending in filler words
    if re.search(r"(needed|serving|taste|note)$", t):
        return True

    # remove repeated nonsense like "wet wet sauce"
    if re.search(r"\b(\w+)\s+\1\b", t):
        return True

    # no alphabetic characters
    if not re.search(r"[a-zA-Z]", t):
        return True
    
    return False


def extract_ingredient_phrase(t):
    """
    Extracts the main ingredient phrase using POS-based noun chunking,
    preserving multiword ingredients naturally.
    """
    t = t.strip().lower()

    # Salt-and-pepper pattern
    if " and " in t:
        parts = [extract_ingredient_phrase(x) for x in t.split(" and ")]
        flat = []
        for p in parts:
            if isinstance(p, list):
                flat.extend(p)
            else:
                flat.append(p)
        return flat

    doc = nlp(t)

    # POS-based chunks (noun phrases)
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]

    if noun_chunks:
        phrase = noun_chunks[-1]  # get the main noun phrase
    else:
        phrase = t

    # Remove undesirable descriptors but keep food words
    DESCRIPTORS = {
        "large", "small", "fresh", "freshly", "boneless", "skinless", "zested", "juiced", "minced", "toasted", "cooked", "flaked", "unsweetened", "roasted",
        "4ounce", "bone-in", "skin", "round", "salted", "uncooked", "seasoned", "ground", "crushed", "sliced", "diced", "creamy", "halved", "beaten",
        "melted", "softened", "cooked", "split", "nugget", "dried", "s", "lbs", "-", "-half", "ablespoons", "nonstick", "cooking", "spray"
    }

    words = []
    for w in phrase.split():
        if w not in DESCRIPTORS:
            words.append(w)

    cleaned = " ".join(words).strip()

    return cleaned


def remove_filler_words(phrase):
    """
    Removes standalone filler words from final ingredient tokens,
    but does NOT destroy valid multiword ingredient names.
    """
    FILLER_STOPWORDS = {
        "and", "to", "or", "for", "with", "in", "of", "the",
        "a", "an", "as", "on", "into", "at"
        }
    words = phrase.split()
    words = [w for w in words if w not in FILLER_STOPWORDS]
    return " ".join(words).strip()


def clean_stage2(stage1_output):
    tokens = split_tokens(stage1_output)

    cleaned = []

    for t in tokens:
        t = t.strip().lower()
        
        # remove leading/trailing punctuation
        t = re.sub(r"^[^\w]+|[^\w]+$", "", t)

        # Skip garbage tokens
        if looks_like_garbage(t):
            continue

        # Extract ingredient phrase
        extracted = extract_ingredient_phrase(t)

        # handle salt-and-pepper cases (list return)
        if isinstance(extracted, list):
            for x in extracted:
                x = remove_filler_words(x)
                if x and not looks_like_garbage(x):
                    cleaned.append(x)
        else:
            x = remove_filler_words(extracted)
            if x and not looks_like_garbage(x):
                cleaned.append(x)

    # Remove duplicates while maintaining order
    final = list(dict.fromkeys(cleaned))
    return final

In [11]:
# Apply to actual dataset
recipes_df["clean_ingredients_stage2"] = recipes_df["clean_ingredients_stage1"].apply(
    lambda lst: [clean_stage2(raw) for raw in lst]
)

# Preview
recipes_df[['clean_ingredients_stage1', 'clean_ingredients_stage2']].head(10)

Unnamed: 0,clean_ingredients_stage1,clean_ingredients_stage2
0,"[worcestershire sauce, / beer, / garlic powder...","[[worcestershire sauce], [beer], [garlic powde..."
1,"[pork belly, honey,, ginger, / gochujang, soy ...","[[pork belly], [honey], [ginger], [gochujang],..."
2,"[olive oil, / chicken tenders, / bagel seasoni...","[[olive oil], [chicken tenders], [bagel season..."
3,"[/ panko breadcrumbs, large eggs, milk, ablesp...","[[breadcrumbs], [eggs], [milk], [], [all-purpo..."
4,"[honey, soy sauce, sriracha, rice vinegar, / g...","[[honey], [soy sauce], [sriracha], [rice vineg..."
5,"[lime juice, ears corn, and, cotija cheese, sp...","[[lime juice], [ears], [cotija cheese], [cilan..."
6,"[lime, zested and juiced, fresh ginger, soy sa...","[[lime], [ginger], [soy sauce], [sriracha], [g..."
7,"[onion, / cayenne pepper, / all-purpose flour,...","[[onion], [cayenne pepper], [all-purpose flour..."
8,"[carrots, butter,, hot honey,, nonstick cookin...","[[carrots], [butter], [hot honey], []]"
9,"[/ olive oil, large onion, salt and freshly bl...","[[olive oil], [onion], [salt, black pepper]]"


### Flatten Nested Lists

In [12]:
# flatten clean_ingredients_stage2 output one level: [[...], [...]] → [...]
def parse_ingredients(x):
    if isinstance(x, str):
        x = ast.literal_eval(x)
    return [item for sublist in x for item in sublist]

recipes_df["clean_ingredients"] = (
    recipes_df["clean_ingredients_stage2"]
    .apply(parse_ingredients)
)

recipes_df['clean_ingredients'].head()

0    [worcestershire sauce, beer, garlic powder, on...
1    [pork belly, honey, ginger, gochujang, soy sauce]
2        [olive oil, chicken tenders, bagel seasoning]
3    [breadcrumbs, eggs, milk, all-purpose flour, c...
4    [honey, soy sauce, sriracha, rice vinegar, gar...
Name: clean_ingredients, dtype: object

### Final Cleaning

In [13]:
def final_cleaning(ingredients):
    """
    ingredients: List[str]
    returns: List[str]
    """
    if not ingredients:
        return []
    
    cleaned = []
    
    # Remove undesirable descriptors
    UNWANTED_WORDS = {
        "large", "small", "fresh", "freshly", "boneless", "skinless", "zested", "juiced", "minced", "toasted", "cooked", "flaked", "unsweetened", "roasted",
        "4ounce", "bone-in", "skin", "round", "salted", "uncooked", "seasoned", "ground", "crushed", "sliced", "diced", "creamy", "halved", "beaten",
        "melted", "softened", "cooked", "split", "nugget", "dried", "s", "lbs", "-", "half", "ablespoons", "nonstick", "cooking", "spray", "all", "purpose",
        "2tablespoons", "4cup", "3x1", "added", "white", "brown", "red", "green", "black", "yellow", "undrained", "aluminium", "foil", "packaged", "reduced", "medium", "sodium", 
        "pure", "stemmed", "color", "flavor", "allpurpose", "almondflavored", "work", "surface", "very", "hot", "soft", "thin", "thick", "bunch", "plain", "italian", "glutenfree",
        "sweet", "sugarbased", "sugarfree", "semisweet", "seeded", "seedless"
        }
    
    for ing in ingredients:

        # remove punctuation
        ing = re.sub(r'[^\w\s]', '', ing)
        
        tokens = [
            t for t in ing.split() if t not in UNWANTED_WORDS
        ]
        if tokens:
            cleaned.append(" ".join(tokens))

    return cleaned

In [14]:
# Apply to dataset
recipes_df["clean_ingredients"] = recipes_df["clean_ingredients"].apply(final_cleaning)

# Preview
recipes_df['clean_ingredients'].head()

0    [worcestershire sauce, beer, garlic powder, on...
1    [pork belly, honey, ginger, gochujang, soy sauce]
2        [olive oil, chicken tenders, bagel seasoning]
3    [breadcrumbs, eggs, milk, flour, chicken cutle...
4    [honey, soy sauce, sriracha, rice vinegar, gar...
Name: clean_ingredients, dtype: object

In [None]:
# Save clean dataset
# clean_data = recipes_df[['recipe_title', 'clean_ingredients', 'directions']]
# clean_data.to_json("../data/cleaned.json")

# Save clean dataset with proper field naming for inference
# final_recipes = []
# for _, row in recipes_df.iterrows():
#     final_recipes.append({
#         "name": row["recipe_title"],  # Map recipe_title -> name for inference
#         "ingredients": row["clean_ingredients"],
#         "directions": row["directions"]
#     })

# output_path = os.path.join(os.path.dirname(__file__), "..", "data", "processed", "cleaned.json")
# os.makedirs(os.path.dirname(output_path), exist_ok=True)

# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(final_recipes, f, indent=2)

# print(f"Exported {len(final_recipes)} recipes to {output_path}")

### Singularize Nouns

In [16]:
def light_normalize(ingredients, nlp):
    normalized = []

    for ing in ingredients:
        doc = nlp(ing)
        words = []
        for t in doc:
            if t.pos_ == "NOUN":
                words.append(t.lemma_)
            else:
                words.append(t.text)
        normalized.append(" ".join(words))

    return normalized

In [17]:
recipes_df["clean_ingredients_norm"] = (
    recipes_df["clean_ingredients"]
    .apply(lambda x: light_normalize(x, nlp))
)

recipes_df['clean_ingredients_norm'].head()

0    [worcestershire sauce, beer, garlic powder, on...
1    [pork belly, honey, ginger, gochujang, soy sauce]
2         [olive oil, chicken tender, bagel seasoning]
3    [breadcrumb, egg, milk, flour, chicken cutlet,...
4    [honey, soy sauce, sriracha, rice vinegar, gar...
Name: clean_ingredients_norm, dtype: object

### TF-IDF

In [18]:
documents = recipes_df['clean_ingredients_norm'].apply("|".join)

In [19]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=3,                # drop very rare noise
    max_df=0.85,             # suppress salt/oil/water
    norm="l2",               # cosine similarity friendly
    use_idf=True,
    token_pattern=r"[^|]+",
    smooth_idf=True,
    sublinear_tf=True        # log(1 + tf)
)

In [20]:
X = vectorizer.fit_transform(documents)

In [21]:
feature_names = vectorizer.get_feature_names_out()
idf = vectorizer.idf_

In [22]:
# (very rare ingredients)
top = sorted(zip(feature_names, idf), key=lambda x: -x[1])[:1000]
top

[('acini di pepe pasta', np.float64(9.74133624306353)),
 ('aged cheddar cheese', np.float64(9.74133624306353)),
 ('ahi tuna steak', np.float64(9.74133624306353)),
 ('ajun seasoning', np.float64(9.74133624306353)),
 ('albacore tuna', np.float64(9.74133624306353)),
 ('aleppo chili', np.float64(9.74133624306353)),
 ('allbeef dog', np.float64(9.74133624306353)),
 ('almond breeze original almondmilk', np.float64(9.74133624306353)),
 ('almond oil', np.float64(9.74133624306353)),
 ('alphabet pasta', np.float64(9.74133624306353)),
 ('alum', np.float64(9.74133624306353)),
 ('amaranth', np.float64(9.74133624306353)),
 ('andouille sausage link', np.float64(9.74133624306353)),
 ('anjou', np.float64(9.74133624306353)),
 ('annatto powder', np.float64(9.74133624306353)),
 ('any type', np.float64(9.74133624306353)),
 ('apple brandy', np.float64(9.74133624306353)),
 ('asian chile paste', np.float64(9.74133624306353)),
 ('assorted cracker', np.float64(9.74133624306353)),
 ('assorted wild mushroom', np.f

In [23]:
# Least informative (very commom ingrdients)
bottom = sorted(zip(feature_names, idf), key=lambda x: x[1])[:20]
bottom

[('salt', np.float64(1.7262567348112625)),
 ('sugar', np.float64(2.0643990972454507)),
 ('pepper', np.float64(2.1611468251189896)),
 ('butter', np.float64(2.231727041967839)),
 ('onion', np.float64(2.2383222843405735)),
 ('egg', np.float64(2.2948970121870067)),
 ('flour', np.float64(2.456515330494927)),
 ('water', np.float64(2.7457990506978605)),
 ('clove garlic', np.float64(2.84085737305217)),
 ('vanilla', np.float64(2.947029656588973)),
 ('olive oil', np.float64(2.954901499317949)),
 ('milk', np.float64(3.004369285061675)),
 ('cinnamon', np.float64(3.251371721880195)),
 ('vegetable oil', np.float64(3.312020072148231)),
 ('powder', np.float64(3.336521093573394)),
 ('soda', np.float64(3.4923267179240067)),
 ('bell pepper', np.float64(3.6642675586634)),
 ('garlic powder', np.float64(3.771989501209168)),
 ('carrot', np.float64(3.7822657637665524)),
 ('lemon juice', np.float64(3.805780890395065))]

In [None]:
# Save tfidf vectorizer
# with open("tfidf_vectorizer.pkl", "wb") as f:
#     pickle.dump(vectorizer, f)

### Embeddings (Semantic Matching)

In [25]:
model = SentenceTransformer("all-MiniLM-L6-v2")

ingredient_texts = recipes_df["clean_ingredients_norm"].apply(
    lambda x: ", ".join(x)
).tolist()

embeddings = model.encode(
    ingredient_texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True
)

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

In [None]:
# Save
# embeddings = embeddings.astype(np.float32)
# np.save("recipe_embeddings.npy", embeddings)

### Similarity Computation

In [57]:
# X is the TF-IDF sparse matrix
X_sparse = csr_matrix(X)

# Normalize for cosine similarity (makes it dot product)
X_norm = normalize(X_sparse, norm='l2', axis=1)

# Convert ingredient lists to sets
# ingredient_sets = recipes_df['clean_ingredients_norm'].apply(set).tolist()

In [74]:
# TF-IDF similarity matrix
tfidf_sim = cosine_similarity(X_norm)

In [None]:
# Embeddings similarity matrix
embed_sim = cosine_similarity(embeddings)

## Test Recommendation Engine

In [28]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def find_similar_recipes(user_ingredients, top_k=5):
    """
    Calculates hybrid similarity (Keyword + Semantic) and returns top matches.
    
    Args:
        user_ingredients (List[str]): List of ingredients from user.
        artifacts (dict): The dictionary returned by load_model_artifacts().
        top_k (int): Number of recipes to return.
    """
    
    # Unpack the artifacts for easier use
    recipes = load_dataset("json", data_files="../data/processed/cleaned.json")["train"]
    tfidf_vectorizer = vectorizer
    tfidf_matrix = X
    recipe_embeddings = embeddings
    bert_model = model

    # 1. Preprocess the User Query
    # Join list ["tomato", "cheese"] -> string "tomato cheese"
    query_text = " ".join(user_ingredients)
    logger.info(f"Processing query: {query_text}")

    # ---------------------------------------------------------
    # PART A: Keyword Similarity (TF-IDF)
    # ---------------------------------------------------------
    # Convert query to vector
    query_tfidf = tfidf_vectorizer.transform([query_text])
    
    # Calculate Cosine Similarity against all recipes
    # Result is a list of scores: [0.1, 0.5, 0.9, ...]
    tfidf_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # ---------------------------------------------------------
    # PART B: Semantic Similarity (Embeddings)
    # ---------------------------------------------------------
    # Convert query to vector
    query_embedding = bert_model.encode([query_text])
    
    # Calculate Cosine Similarity
    semantic_scores = cosine_similarity(query_embedding, recipe_embeddings).flatten()

    # ---------------------------------------------------------
    # PART C: Hybrid Scoring 
    # ---------------------------------------------------------
    # We combine both scores. 
    # Alpha controls the balance. 0.5 means 50% keyword, 50% meaning.
    alpha = 0.5 
    final_scores = (tfidf_scores * alpha) + (semantic_scores * (1 - alpha))

    # ---------------------------------------------------------
    # PART D: Sorting & Formatting
    # ---------------------------------------------------------
    # Get the indices of the top_k highest scores
    # argsort sorts low-to-high, so we take the last k and reverse them
    top_indices = final_scores.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        score = final_scores[idx]
        
        # Filter out bad matches (Optional)
        if score < 0.1: 
            continue

        recipe = recipes[idx]
        results.append({
            "name": recipe["name"],
            "ingredients": recipe["ingredients"],
            "directions": recipe.get("directions", []),
            "match_score": float(score) # Convert numpy float to python float
        })

    logger.info(f"Found {len(results)} matches.")
    return results

In [29]:
user_ing = ["onion", "cheese", "chicken", "bread"]
find_similar_recipes(user_ing)

INFO:__main__:Processing query: onion cheese chicken bread


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Found 5 matches.


[{'name': 'Feta Chicken',
  'ingredients': ['tomato basil feta cheese',
   'chicken breast halves',
   'bread'],
  'directions': ['bake 25 to 30 minutes in the preheated oven, or until chicken is no longer pink and juices run clear.',
   'place chicken breasts between 2 pieces of waxed paper. gently pound chicken with flat side of meat mallet or rolling pin until about 1/4 inch thick; remove wax paper. place 1 ounce of feta cheese in the center of each chicken breast, and fold in half.',
   'preheat oven to 350 degrees f (175 degrees c). lightly grease a 9x13 inch baking dish.',
   'spread 2 tablespoons bread crumbs in the bottom of the prepared baking dish. arrange chicken in the dish, and top with remaining bread crumbs.'],
  'match_score': 0.33859556913375854},
 {'name': 'Great Easter Appetizer',
  'ingredients': ['onion', 'cream cheese', 'ham', 'eggs', 'bread crumbs'],
  'directions': ['bake in preheated oven 4 to 5 minutes, until golden and hot. serve warm.',
   'in a small bowl, 

### Sanity Check
Check:  
Are recommended recipes actually similar?  
Do they differ only in minor ingredients?  
Are results dominated by “salt/oil” (if yes → tweak max_df)?

In [62]:
def recommend(idx, sim_matrix, k=5):
    scores = sim_matrix[idx]
    top = scores.argsort()[::-1][1:k+1]
    return recipes_df.iloc[top][["recipe_title", "clean_ingredients_norm"]]

In [None]:
recommend(10, tfidf_sim) # tfidf matrix alone

Unnamed: 0,recipe_title,clean_ingredients_norm
3882,Easiest Sheet Pan Tacos,"[cheddar cheese, refried bean, beef, chili pow..."
2270,Taco Meatballs,"[taco seasoning mix, onion, egg, beef, tortill..."
50493,Classic Pub Style Nachos,"[tortilla chip, lime, salt, pepper, cheddar ch..."
28590,Iowa Enchiladas,"[taco seasoning, sour cream, onion, tomato, le..."
232,20 Layer Air Fryer Nachos,"[bean, olive, tortilla chip, avocado, onion, s..."


In [None]:
recommend(10, embed_sim) # embeddings matrix alone

Unnamed: 0,recipe_title,clean_ingredients_norm
232,20 Layer Air Fryer Nachos,"[bean, olive, tortilla chip, avocado, onion, s..."
4347,Mac And Cheese Cowboy Cups,"[tomato, chile, taco seasoning, beef, onion, m..."
42066,Mexican Lasagna Rollups,"[taco seasoning mix, con queso sauce, salsa, c..."
32958,Mexican Chicken Tortilla Lasagna,"[chile, halfway, mild enchilada sauce, egg, ch..."
28584,Cheese And Beef Enchiladas,"[taco seasoning, tomato, lettuce, bell pepper,..."


In [64]:
recipes_df.iloc[10][["recipe_title", "clean_ingredients_norm"]]

recipe_title                                          Air Fryer Pasta Tacos
clean_ingredients_norm    [manicotti pasta, chile, tel, taco seasoning, ...
Name: 10, dtype: object

### Ingredient Hide-and-Seek
What this test asks:  
“If I only know SOME of the ingredients, can I still find the right recipe?”  

This simulates a real user who says:  
“I have chicken, soy sauce, and garlic — what can I cook?”

In [None]:
# for tfidf
def partial_query_eval(recipe_idx, k=10):
    ingredients = recipes_df.loc[recipe_idx, "clean_ingredients_norm"]
    cut = int(len(ingredients) * 0.7)
    query = ingredients[:cut]

    query_vec = vectorizer.transform(["|".join(query)])
    sims = cosine_similarity(query_vec, X_norm)[0]

    top_k = sims.argsort()[::-1][:k]
    return recipe_idx in top_k

In [68]:
partial_query_eval(10)

True

In [None]:
# for embeddings
def partial_query_eval2(recipe_idx, k=10):
    ingredients = recipes_df.loc[recipe_idx, "clean_ingredients_norm"]
    cut = int(len(ingredients) * 0.7)
    query = ingredients[:cut]

    query_text = ", ".join(query)
    query_emb = model.encode([query_text], normalize_embeddings=True)
    sims = cosine_similarity(query_emb, embeddings)[0]

    top_k = sims.argsort()[::-1][:k]
    return recipe_idx in top_k

In [72]:
partial_query_eval2(10)

True

### Hybrid Recommender

In [81]:
def tfidf_query_similarity(query_ingredients):
    query_doc = "|".join(query_ingredients)
    query_vec = vectorizer.transform([query_doc])
    sims = cosine_similarity(query_vec, X_norm)[0]
    return sims


def embedding_query_similarity(query_ingredients):
    query_text = ", ".join(query_ingredients)
    query_emb = model.encode(
        [query_text],
        normalize_embeddings=True
    )
    sims = cosine_similarity(query_emb, embeddings)[0]
    return sims


def recommend_from_ingredients(query_ingredients, k=5, alpha=0.6):
    tfidf_sims = tfidf_query_similarity(query_ingredients)
    embed_sims = embedding_query_similarity(query_ingredients)

    hybrid_sims = alpha * tfidf_sims + (1 - alpha) * embed_sims

    top_k = hybrid_sims.argsort()[::-1][:k]
    return recipes_df.iloc[top_k][["recipe_title", "clean_ingredients_norm"]]

In [82]:
test_ingredients = ["chicken", "soy sauce", "garlic"]
recommend_from_ingredients(test_ingredients)

Unnamed: 0,recipe_title,clean_ingredients_norm
14724,Teriyaki Roasted Chicken,"[chicken, ginger, clove garlic, sugar, soy sauce]"
14760,Easiest Teriyaki Marinade Ever,"[garlic, cinnamon, sugar, soy sauce, water]"
4187,Japchae,"[sesame oil, garlic, sugar, soy sauce]"
23137,Orange Soya Sauce Marinated Chicken,"[chicken breast, garlic, orange juice, soy sauce]"
30122,Hibachi-Style Fried Rice,"[grilled chicken, rice, egg, garlic, soy sauce..."
