Imports

In [None]:
# Configuration file for the ML Food Buddy Recommender project
import os
import re
import html
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from rapidfuzz import process, fuzz

Functions

In [2]:
def data_loader(filename: str, source: str = "raw"):
    """
    Load a CSV dataset from the repo's data/raw or data/preprocessed folder.
    
    Parameters:
        filename: str - the CSV file name (can be compressed .zip or .gz)
        source: str - "raw" or "preprocessed" to select folder
    
    Returns:
        pd.DataFrame
    """
    if source not in ["raw", "preprocessed"]:
        raise ValueError("source must be 'raw' or 'preprocessed'")

    cwd = os.getcwd()
    repo_root = cwd

    # Walk upwards until we find the desired folder
    while True:
        data_path = os.path.join(repo_root, "data", source, filename)
        if os.path.exists(data_path):
            break
        parent = os.path.dirname(repo_root)
        if parent == repo_root:  # reached root of filesystem
            raise FileNotFoundError(f"Could not find {filename} in data/{source} from {cwd}")
        repo_root = parent

    # Detect compression type
    compression_type = None
    if filename.endswith('.zip'):
        compression_type = 'zip'
    elif filename.endswith('.gz'):
        compression_type = 'gzip'

    return pd.read_csv(data_path, compression=compression_type)


In [3]:
def parse_time(t):
    """
    Parse time in minutes. Supports:
    - Raw numeric strings (e.g., "45")
    - Already numeric values
    
    Returns:
        float (minutes) or np.nan if parsing fails
    """
    if pd.isna(t):
        return np.nan
    if isinstance(t, str):
        t = t.strip()
        if t.startswith("PT"):  # ISO 8601 duration
            hours = re.search(r'(\d+)H', t)
            minutes = re.search(r'(\d+)M', t)
            secs = re.search(r'(\d+)S', t)
            total_minutes = 0
            if hours:
                total_minutes += int(hours.group(1)) * 60
            if minutes:
                total_minutes += int(minutes.group(1))
            if secs:
                total_minutes += int(secs.group(1)) / 60
            return total_minutes if total_minutes > 0 else np.nan
        # fallback: try to parse as float
        try:
            return float(t)
        except:
            return np.nan
    # If already numeric
    try:
        return float(t)
    except:
        return np.nan


In [4]:
def format_time(t):
    """
    Convert a time in minutes to a human-readable string format.

    Args:
        t (int or float or None): Total time in minutes. Can be NaN.

    Returns:
        str or None: A string representing the time in hours and minutes, e.g.:
                     - 135 → "2 hours 15 minutes"
                     - 60  → "1 hour"
                     - 45  → "45 minutes"
                     - 0 or NaN → "0 minutes" or None if input is NaN
    """
    if pd.isna(t):
        return None
    t = int(round(t))
    hours, minutes = divmod(t, 60)
    parts = []
    if hours > 0:
        parts.append(f"{hours} hour{'s' if hours > 1 else ''}")
    if minutes > 0:
        parts.append(f"{minutes} minute{'s' if minutes > 1 else ''}")
    return " ".join(parts) if parts else "0 minutes"


In [5]:
def clip_top_outliers(df, cols, z_thresh=3.5):
    """
    Clip only extreme outliers of selected columns using modified Z-score.
    
    Parameters:
    - df: pd.DataFrame
    - cols: list of str, columns to clip
    - z_thresh: float, threshold for modified Z-score (default=3.5)
    
    Returns:
    - df_clipped: pd.DataFrame with clipped values
    - thresholds: dict of column:clip_value for reference
    """
    df_clipped = df.copy()
    thresholds = {}
    
    for col in cols:
        if col not in df.columns:
            continue
        series = df[col]
        median = series.median()
        mad = np.median(np.abs(series - median))
        if mad == 0:
            continue  # can't detect outliers if MAD is zero
        mod_z = 0.6745 * (series - median) / mad
        upper_limit = series[mod_z <= z_thresh].max()  # largest non-outlier
        df_clipped[col] = np.minimum(series, upper_limit)
        thresholds[col] = upper_limit
    
    return df_clipped, thresholds

In [6]:
def correct_query(query, recipes, vocab_columns=None, threshold=80):
    """
    Corrects a user query using RapidFuzz, matching words to recipe vocabulary.
    Only replaces words if a close match is found above threshold.
    
    Args:
        query: Input query string (e.g., "Italain paste vegeterian")
        recipes: DataFrame with vocabulary columns
        vocab_columns: List of column names to extract vocabulary from. 
        threshold: Minimum similarity score to accept correction (0-100)
    
    Returns:
        Corrected query string (e.g., "Italian pasta vegetarian")
    """
    # Return original query if empty
    if vocab_columns is None:
        return query
    
    # Extract vocabulary from specified columns
    vocab = set()
    
    for col_name in vocab_columns:
        if col_name in recipes.columns:
            for cell_value in recipes[col_name].dropna():
                if isinstance(cell_value, str):
                    words = cell_value.replace(',', ' ').split()
                    vocab.update(word.strip().lower() for word in words if word.strip())
                elif isinstance(cell_value, list):
                    vocab.update(word.strip().lower() for word in cell_value if isinstance(word, str) and word.strip())
    
    vocab = [word for word in vocab if len(word) > 2]  # Filter out very short words
    
    if not vocab:
        return query
    
    words = query.lower().split()
    corrected = []
    
    for word in words:
        # Skip very short words
        if len(word) <= 2:
            corrected.append(word)
            continue
            
        # Find the best match from vocab
        result = process.extractOne(word, vocab, scorer=fuzz.WRatio)
        
        if result and result[1] >= threshold:
            corrected.append(result[0])
        else:
            corrected.append(word)
    
    return ' '.join(corrected)

In [7]:
def parse_r_list_column(col, extended_clean=False):
    """
    Parse and clean R-style list strings in a pandas Series.
    Args:
        col: pandas Series containing R-style list strings
        extended_clean: bool, if False applies basic cleaning only,
                       if True applies extended cleaning (whitespace collapse + trailing comma removal)
    Returns:
        pandas Series with cleaned text
    """
    # Handle None input
    if col is None:
        return pd.Series([], dtype=str)
    
    # Basic cleaning (always applied)
    cleaned = col.fillna("").astype(str).str.strip()
    
    # Remove c( and ) wrapper (always applied)
    cleaned = cleaned.str.replace(r'^c\(|\)$', '', regex=True)  # Fixed: added $ at the end
    
    # Remove quotes (always applied)
    cleaned = cleaned.str.replace(r'"', '', regex=False)
    
    # Extended cleaning (only if requested)
    if extended_clean:
        # Collapse multiple spaces/newlines
        cleaned = cleaned.apply(lambda x: re.sub(r'\s+', ' ', x).strip())
        
        # Remove comma immediately following a period (anywhere in text, not just end)
        cleaned = cleaned.str.replace(r'\.(?=\s*,)', '.', regex=True)
        cleaned = cleaned.str.replace(r'\.,\s*', '. ', regex=True)
        
        # Decode HTML entities
        cleaned = cleaned.apply(lambda x: html.unescape(x))
    
    return cleaned

In [8]:
def clean_text(text):
    """
    Clean a text string by normalizing case, whitespace, and punctuation.

    - Converts to lowercase
    - Replaces multiple whitespace characters with a single space
    - Removes punctuation and special characters
    - Strips leading and trailing whitespace
    """
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

In [9]:
def time_bin(minutes):
    """
    Categorize a recipe's total time into a time bin.

    Parameters:
    minutes (float or int): Total time of the recipe in minutes.

    Returns:
    str or None: 
        - "fast" if time is 30 minutes or less
        - "medium" if time is between 31 and 90 minutes
        - "long" if time is more than 90 minutes
        - None if input is NaN
    """
    if pd.isna(minutes):
        return None
    if minutes <= 30:
        return "fast"
    elif minutes <= 90:
        return "medium"
    else:
        return "long"

In [10]:
def calorie_bin(cals):
    """
    Categorize a recipe's calories into a calorie bin.

    Parameters:
    cals (float or int): Total calories of the recipe.

    Returns:
    str or None:
        - "low" if calories are less than 300
        - "medium" if calories are between 300 and 600
        - "high" if calories are more than 600
        - None if input is NaN
    """
    if pd.isna(cals):
        return None
    if cals < 300:
        return "low"
    elif cals <= 600:
        return "medium"
    else:
        return "high"


In [11]:
def get_first_url(cell):
    """
    Extract the first URL from a string that contains one or more URLs enclosed in quotes.

    Parameters:
        cell (str or any): A string containing URLs in quotes, or any value (e.g., NaN).

    Returns:
        str or None: 
            - The first URL found in quotes.
            - None if no URL is found or if the input is NaN.

    """
    if pd.isna(cell):
        return None
    text = str(cell)
    match = re.search(r'"(.*?)"', text)
    if match:
        return match.group(1)
    return None


In [12]:
def apply_categorical_filters(dataset, filters: dict):
    """
    Apply generic categorical filters to a DataFrame.

    Args:
        dataset (pd.DataFrame): DataFrame to filter.
        filters (dict): Dictionary where keys are column names and values are filter values
                        (str or list of str) to keep in that column.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    filtered = dataset.copy()
    
    for column, values in filters.items():
        if column not in filtered.columns or not values:
            continue
        if isinstance(values, str):
            values = [values]
        clean_values = [str(v).strip().lower() for v in values]
        filtered = filtered[filtered[column].fillna('').astype(str).str.strip().str.lower().isin(clean_values)]
    
    return filtered


In [13]:
def recommend(
    user_prefs,
    dataset,
    recipe_vectors_matrix,
    vectorizer,
    top_n=3,
    time_pref=None,
    calorie_pref=None,
    vocab_column='combined_text',
    time_column='time_bin',
    calorie_column='calorie_bin',
    return_columns=None
):
    """
    Recommend recipes based on user preferences using TF-IDF cosine similarity.

    This function cleans and autocorrects the user query, computes similarity
    with recipe vectors, applies optional filters, and returns the top N matching recipes.

    Args:
        user_prefs (str): User query describing desired recipe.
        dataset (pd.DataFrame): DataFrame containing recipe information.
        recipe_vectors_matrix (np.ndarray): Precomputed TF-IDF vectors of the recipes.
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer used to transform queries.
        top_n (int, optional): Number of top recipes to return. Defaults to 3.
        time_pref (str or list of str, optional): Filter recipes by time category.
        calorie_pref (str or list of str, optional): Filter recipes by calorie category.
        vocab_column (str, optional): Column for query correction vocabulary. Defaults to 'combined_text'.
        time_column (str, optional): Column name for recipe time categories. Defaults to 'time_bin'.
        calorie_column (str, optional): Column name for recipe calorie categories. Defaults to 'calorie_bin'.
        return_columns (list of str, optional): Columns to return from the results.

    Returns:
        pd.DataFrame: Top N recommended recipes with requested columns and 'TotalTime_str' if available.
    """

    # Clean and correct query
    cleaned_query = clean_text(user_prefs)
    corrected_query = correct_query(query=cleaned_query, recipes=dataset, vocab_columns=[vocab_column])

    # Vectorize query
    user_vec = vectorizer.transform([corrected_query]).toarray()[0]
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-10)

    # Cosine similarity
    dataset = dataset.copy()
    dataset['similarity'] = recipe_vectors_matrix @ user_vec

    # If max similarity is 0, return a message
    if dataset['similarity'].max() == 0:
        return pd.DataFrame([{
            'message': 'No matching recipes found. Please add correct ingredients.'
        }])

    # Apply filters
    filters = {
        time_column: time_pref,
        calorie_column: calorie_pref
    }
    dataset = apply_categorical_filters(dataset, filters)

    # Sort by similarity and pick top_n
    results = dataset.sort_values(by='similarity', ascending=False).head(top_n).copy()

    # Format TotalTime
    if 'TotalTime_min' in results.columns:
        results['TotalTime_str'] = results['TotalTime_min'].apply(format_time)
    else:
        results['TotalTime_str'] = ""

    # Return requested columns
    if return_columns:
        valid_columns = [c for c in return_columns if c in results.columns]
        if not valid_columns:
            valid_columns = results.columns.tolist()
    else:
        valid_columns = results.columns.tolist()

    return results[valid_columns]

EDA

In [14]:
# Load the dataset
recipes = data_loader("recipes.csv")
recipes.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


In [15]:
# Info
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517 non-null  int64  
 3   AuthorName                  522517 non-null  object 
 4   CookTime                    439972 non-null  object 
 5   PrepTime                    522517 non-null  object 
 6   TotalTime                   522517 non-null  object 
 7   DatePublished               522517 non-null  object 
 8   Description                 522512 non-null  object 
 9   Images                      522516 non-null  object 
 10  RecipeCategory              521766 non-null  object 
 11  Keywords                    505280 non-null  object 
 12  RecipeIngredientQuantities  522514 non-null  object 
 13  RecipeIngredie

In [16]:
# Convert PrepTime into minutes
recipes['PrepTime_min'] = recipes['PrepTime'].apply(parse_time)

# Convert CookTime into minutes
recipes['CookTime_min'] = recipes['CookTime'].apply(parse_time)

# Convert TotalTime into minutes
recipes['TotalTime_min'] = recipes['TotalTime'].apply(parse_time)

In [17]:
# Identify unparseable PrepTime entries
invalid_prep_times = recipes[recipes['PrepTime'].apply(parse_time).isna()][['PrepTime']]
print("Invalid or unparseable PrepTime entries:")
print(invalid_prep_times.value_counts())

# Identify unparseable CookTime entries
invalid_cooking_times = recipes[recipes['CookTime'].apply(parse_time).isna()][['CookTime']]
print()
print("Invalid or unparseable CookTime entries:")
print(invalid_cooking_times.value_counts())

# Identify unparseable TotalTime entries
invalid_total_times = recipes[recipes['TotalTime'].apply(parse_time).isna()][['TotalTime']]
print()
print("Invalid or unparseable TotalTime entries:")
print(invalid_total_times.value_counts())

Invalid or unparseable PrepTime entries:
PrepTime
PT0S        15010
Name: count, dtype: int64

Invalid or unparseable CookTime entries:
Series([], Name: count, dtype: int64)

Invalid or unparseable TotalTime entries:
TotalTime
PT0S         2129
Name: count, dtype: int64


**Note:**  
The entries for Prep and Cook time identified as invalid or unparseable are likely incorrect or missing.  
It is recommended to replace these entries with the **median** or **average** value of the respective column to avoid skewing any analysis. 
Also, let's check if they are affecting TotalTime.

In [18]:
# Compute difference
recipes['time_diff'] = recipes['PrepTime_min'] + recipes['CookTime_min'] - recipes['TotalTime_min']

# Find rows where the difference is not zero
mismatch = recipes[recipes['time_diff'].abs() != 0]

# Show relevant columns
mismatch.head()


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,PrepTime_min,CookTime_min,TotalTime_min,time_diff
8,46,A Jad - Cucumber Pickle,1533,Dancer,,PT25M,PT25M,1999-08-11T19:48:00Z,Make and share this A Jad - Cucumber Pickle re...,character(0),...,0.2,0.2,0.1,,1 cup,"c(""Slice the cucumber in four lengthwise, then...",25.0,,25.0,
10,48,Boston Cream Pie,1545,Nancy Van Ess,,PT2H15M,PT2H15M,1999-08-24T04:35:00Z,Make and share this Boston Cream Pie recipe fr...,character(0),...,1.6,46.2,8.8,8.0,1 pie,"c(""Beat egg whites until soft peaks form."", ""G...",135.0,,135.0,
14,52,Cafe Cappuccino,2178,troyh,,PT5M,PT5M,1999-08-31T21:05:00Z,Make and share this Cafe Cappuccino recipe fro...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,11.8,2.7,18.0,2 1/4 cups,"c(""Stir ingredients together."", ""Process in a ...",5.0,,5.0,
19,57,Black Bean Salsa,1569,Linda7,,PT10M,PT10M,1999-08-31T21:02:00Z,Make and share this Black Bean Salsa recipe fr...,character(0),...,5.5,1.4,5.4,8.0,,"c(""Combine all ingredients in a bowl."", ""Serve...",10.0,,10.0,
22,60,Blueberry Dessert,1545,Nancy Van Ess,,PT35M,PT35M,1999-08-16T05:59:00Z,Make and share this Blueberry Dessert recipe f...,character(0),...,1.6,36.9,3.9,12.0,,"c(""Heat oven to 400 degrees."", ""Mix 2 cups bak...",35.0,,35.0,


In [19]:
# Percentage of mismatches
mismatch.shape[0]/recipes.shape[0]

0.18157112591552046

In [20]:
# Fill missing prep/cook times with median
recipes['PrepTime_min'].fillna(recipes['PrepTime_min'].median(), inplace=True)
recipes['CookTime_min'].fillna(recipes['CookTime_min'].median(), inplace=True)

# Recalculate total time to ensure consistency
recipes['TotalTime_min'] = recipes['PrepTime_min'] + recipes['CookTime_min']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipes['PrepTime_min'].fillna(recipes['PrepTime_min'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipes['CookTime_min'].fillna(recipes['CookTime_min'].median(), inplace=True)


In [21]:
# Convert DatePublished to datetime
recipes['DatePublished'] = pd.to_datetime(recipes['DatePublished'], errors='coerce')

In [22]:
# Remove timezone to make tz-naive
recipes['DatePublished'] = recipes['DatePublished'].dt.tz_convert(None)

# Identify unparseable dates
invalid_dates = recipes[recipes['DatePublished'].isna()][['DatePublished']]
print("Invalid or unparseable dates:")
print(invalid_dates.value_counts())

# Define a reasonable date range
today = pd.Timestamp.today()
min_reasonable_date = pd.Timestamp('1980-01-01')

# Identify dates outside reasonable range
unreasonable_dates = recipes[(recipes['DatePublished'] < min_reasonable_date) |
                             (recipes['DatePublished'] > today)][['DatePublished']]
print("Unreasonable dates:")
print(unreasonable_dates.value_counts())


Invalid or unparseable dates:
Series([], Name: count, dtype: int64)
Unreasonable dates:
Series([], Name: count, dtype: int64)


In [23]:
# Missing values percentage, excluding columns with 0% missing
missing_pct = recipes.isna().sum() / len(recipes) * 100
missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
print(missing_pct)

RecipeYield                   66.614292
AggregatedRating              48.462155
ReviewCount                   47.364775
RecipeServings                35.005751
time_diff                     18.152902
CookTime                      15.797572
Keywords                       3.298840
RecipeCategory                 0.143727
Description                    0.000957
RecipeIngredientQuantities     0.000574
Images                         0.000191
dtype: float64


In [24]:
# Drop rows where 'description' is NaN
recipes = recipes.dropna(subset=['Description'])

In [25]:
# Show rows where Keywords is missing
recipes[recipes['Keywords'].isna()]

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,PrepTime_min,CookTime_min,TotalTime_min,time_diff
25,63,Cabbage and Sausage Soup,1544,tranch,PT25M,PT15M,PT40M,1999-09-07 12:52:00,Make and share this Cabbage and Sausage Soup r...,character(0),...,3.5,11.6,30.2,6.0,,"c(""In a medium stockpot or Dutch oven heat oli...",15.0,25.0,40.0,0.0
188,240,Chicken Fried Brown Rice,1572,Ed Paulhus,PT6M,PT10M,PT16M,1999-09-13 03:05:00,Make and share this Chicken Fried Brown Rice r...,"""https://img.sndimg.com/food/image/upload/w_55...",...,6.5,4.7,20.3,4.0,,"c(""Heat large nonstick skillet over medium hea...",10.0,6.0,16.0,0.0
198,252,Sorrel Tarragon Sauce,1554,Jacques Lorrain,PT4H,PT25M,PT4H25M,1999-10-03 23:28:00,Make and share this Sorrel Tarragon Sauce reci...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.7,11.9,5.3,,1 1/2 cups,"c(""Mix all ingredients in medium bowl."", ""Seas...",25.0,240.0,265.0,0.0
493,580,Garlic Mushroom Sauce,1543,Doreen Randal,PT6M,PT30M,PT36M,1999-08-27 05:13:00,Make and share this Garlic Mushroom Sauce reci...,character(0),...,0.5,1.1,1.8,4.0,,"c(""Crushed garlic, peel and chop finely."", ""Wi...",30.0,6.0,36.0,0.0
591,680,Creamy Smoked Salmon & Dijon Pasta,1556,Strawberry Girl,PT5M,PT45M,PT50M,1999-09-06 04:24:00,Make and share this Creamy Smoked Salmon & Dij...,character(0),...,2.4,3.0,20.1,4.0,,"c(""Cook the pasta."", ""Drain but do not rinse. ...",45.0,5.0,50.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522467,541334,Easy Tater Tot Hotdish,274666,Wendelina,PT55M,PT15M,PT1H10M,2020-12-08 19:58:00,Make and share this Easy Tater Tot Hotdish rec...,character(0),...,6.7,2.1,38.0,6.0,1 9x13 pan,"c(""In a large saucepan, mix ground beef, onion...",15.0,55.0,70.0,0.0
522470,541337,Firehouse Favorite Casserole,2001361961,Dori K.,PT55M,PT15M,PT1H10M,2020-12-15 19:17:00,Originally I got this recipe off of the Grandm...,character(0),...,5.8,11.6,44.7,6.0,,"c(""Cook frozen egg noodles according to packag...",15.0,55.0,70.0,0.0
522476,541343,Shrimp Cocktail Bar,2001112113,Jonathan Melendez,PT45M,PT20M,PT1H5M,2020-12-16 18:40:00,Whether you're ringing in the New Year or just...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,8.5,11.9,45.9,,,"c(""You can decide to go either the classic rou...",20.0,45.0,65.0,0.0
522486,541353,Jewish-Style Braised Beef Brisket,181957,Oliver1010,PT4H,PT45M,PT4H45M,2020-12-21 15:50:00,This is a classic &quot;Jewish&quot; style bri...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,4.1,8.2,72.7,8.0,,"c(""Preheat oven to 300&deg;F Season brisket a...",45.0,240.0,285.0,0.0


In [26]:
# Display summary statistics for all
print(recipes.describe())

            RecipeId      AuthorId                  DatePublished  \
count  522512.000000  5.225120e+05                         522512   
mean   271823.979206  4.572629e+07  2008-01-18 07:15:34.635109120   
min        38.000000  2.700000e+01            1999-08-06 00:40:00   
25%    137209.750000  6.947400e+04            2005-09-13 10:36:30   
50%    271760.500000  2.389615e+05            2007-12-13 16:30:00   
75%    406146.500000  5.658280e+05            2009-12-31 09:46:00   
max    541383.000000  2.002886e+09            2020-12-22 22:12:00   
std    155494.450593  2.929728e+08                            NaN   

       AggregatedRating    ReviewCount       Calories     FatContent  \
count     269291.000000  275025.000000  522512.000000  522512.000000   
mean           4.632011       5.227790     484.429822      24.614011   
min            1.000000       1.000000       0.000000       0.000000   
25%            4.500000       1.000000     174.200000       5.600000   
50%            5.0

**Dataset Summary Observations:**

- **Extreme scales & outliers:** `Calories`, `FatContent`, `SodiumContent`, `SugarContent`, `RecipeServings` max far above 75th percentile – likely errors.  
- **Skewed distributions:** Means > medians for nutrition columns – right-skewed.  
- **Missing data:** `AggregatedRating`, `ReviewCount`, `RecipeServings` have many NaNs.  
- **Time inconsistencies:** `PrepTime_min`, `CookTime_min`, `TotalTime_min` have unrealistic maxima

Preprocessing / Cleaning

In [27]:
# Parse and clean string columns
recipes['ingredients_clean'] = parse_r_list_column(recipes['RecipeIngredientParts'])
recipes['category_clean'] = parse_r_list_column(recipes['RecipeCategory']) if 'RecipeCategory' in recipes.columns else ""
recipes['keywords_clean'] = parse_r_list_column(recipes['Keywords']) if 'Keywords' in recipes.columns else ""
recipes['recipe_instructions_clean'] = parse_r_list_column(recipes['RecipeInstructions'], extended_clean=True) if 'RecipeInstructions' in recipes.columns else ""
recipes['Name'] = parse_r_list_column(recipes['Name'], extended_clean=True) if 'Name' in recipes.columns else ""

In [28]:
# Apply cleaning to all relevant columns
recipes['combined_text'] = (
    recipes['ingredients_clean'] + ", " +
    recipes['category_clean'] + ", " +
    recipes['keywords_clean'] + ", " +
    recipes['Description'].fillna("")
).apply(clean_text)

In [29]:
# Columns to clip
numeric_cols_to_clip = [
    "TotalTime_min", "Calories", "FatContent", "SaturatedFatContent",
    "CholesterolContent", "SodiumContent", "CarbohydrateContent",
    "FiberContent", "SugarContent", "ProteinContent"
]

# Apply clipping
recipes_clipped, clip_thresholds = clip_top_outliers(recipes, numeric_cols_to_clip)

# Show thresholds
for col, val in clip_thresholds.items():
    print(f"{col} clipped at: {val:.2f}")


TotalTime_min clipped at: 122.00
Calories clipped at: 1173.20
FatContent clipped at: 64.10
SaturatedFatContent clipped at: 24.40
CholesterolContent clipped at: 263.60
SodiumContent clipped at: 1792.20
CarbohydrateContent clipped at: 120.00
FiberContent clipped at: 10.50
SugarContent clipped at: 32.80
ProteinContent clipped at: 46.90


In [30]:
# Apply these functions to the df
recipes_clipped['time_bin'] = recipes_clipped['TotalTime_min'].apply(time_bin)
recipes_clipped['calorie_bin'] = recipes_clipped['Calories'].apply(calorie_bin)
recipes_clipped['Image_first'] = recipes_clipped['Images'].apply(get_first_url)


In [31]:
# Replace 0 calories with median of non-zero values
recipes_clipped['Calories'] = recipes_clipped['Calories'].replace(0, recipes_clipped[recipes_clipped['Calories'] > 0]['Calories'].median())

In [32]:
# Columns needed for recommendation
important_columns = [
    'Name',
    'Image_first',
    'recipe_instructions_clean',
    'Calories',
    'combined_text',    
    'TotalTime_min',            
    'time_bin',        
    'calorie_bin'     
]

# Create the final_df
final_df = recipes_clipped[important_columns].copy()
final_df.reset_index(drop=True, inplace=True)
final_df.head()

Unnamed: 0,Name,Image_first,recipe_instructions_clean,Calories,combined_text,TotalTime_min,time_bin,calorie_bin
0,Low-Fat Berry Blue Frozen Dessert,https://img.sndimg.com/food/image/upload/w_555...,Toss 2 cups berries with sugar. Let stand for ...,170.9,blueberries granulated sugar vanilla yogurt le...,122.0,long,low
1,Biryani,https://img.sndimg.com/food/image/upload/w_555...,Soak saffron in warm milk for 5 minutes and pu...,1110.7,saffron milk hot green chili peppers onions ga...,122.0,long,high
2,Best Lemonade,https://img.sndimg.com/food/image/upload/w_555...,"Into a 1 quart Jar with tight fitting lid, put...",311.1,sugar lemons rind of lemon zest of fresh water...,35.0,medium,medium
3,Carina's Tofu-Vegetable Kebabs,https://img.sndimg.com/food/image/upload/w_555...,"Drain the tofu, carefully squeezing out excess...",536.1,extra firm tofu eggplant zucchini mushrooms so...,122.0,long,medium
4,Cabbage Soup,https://img.sndimg.com/food/image/upload/w_555...,Mix everything together and bring to a boil. R...,103.6,plain tomato juice cabbage onion carrots celer...,50.0,medium,low


TF-IDF

In [None]:
# Create TF-IDF Vectors
vectorizer = TfidfVectorizer(max_features=2000)
recipe_vectors = vectorizer.fit_transform(final_df['combined_text'])
recipe_vectors = recipe_vectors.toarray()

# Normalize for cosine similarity
recipe_vectors = recipe_vectors / (np.linalg.norm(recipe_vectors, axis=1, keepdims=True) + 1e-10)

# Keep as separate numpy array instead of storing in DataFrame
recipe_vectors_matrix = recipe_vectors 

Recommender

In [None]:
top_recipes = recommend(
    user_prefs="Italiaan pastaa vegegtarian",
    dataset=final_df,
    recipe_vectors_matrix=recipe_vectors_matrix,
    vectorizer=vectorizer,
    top_n=5,
    time_pref="fast",
    calorie_pref="low",
    vocab_column="combined_text",
    time_column="time_bin",
    calorie_column="calorie_bin",
    return_columns=["Name", "Image_first", "recipe_instructions_clean", "TotalTime_str", "Calories"]
)
print(top_recipes)

                                    Name  \
101092  Fast, Tasty, Low Fat Pasta Sauce   
299097   Vegetarian Low Fat "sloppy Joe"   
38913                         Easy pasta   
221436                Easy Italian Pasta   
62523     Zesty Italian Pasta Bean Salad   

                                                                                                                Image_first  \
101092                                                                                                                 None   
299097                                                                                                                 None   
38913                                                                                                                  None   
221436                                                                                                                 None   
62523   https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipe

Tests

In [None]:
# Functionality tests
basic_queries = [
    "italian pasta",
    "chicken recipe", 
    "vegetarian meal",
    "quick breakfast",
    "chocolate dessert"
]

for query in basic_queries:
    print(f"\n--- Testing: '{query}' ---")
    results = recommend(
        user_prefs=query,
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=3,
        return_columns=["Name", "TotalTime_str", "Calories", "similarity"]
    )
    print(f"Results found: {len(results)}")
    if len(results) > 0:
        print(f"Top result: {results.iloc[0]['Name']} (similarity: {results.iloc[0]['similarity']:.3f})")



--- Testing: 'italian pasta' ---
Results found: 3
Top result: Italian Pasta Salad (similarity: 0.650)

--- Testing: 'chicken recipe' ---
Results found: 3
Top result: Peachy Chicken (similarity: 0.754)

--- Testing: 'vegetarian meal' ---
Results found: 3
Top result: Vegetarian Shepherd's Pie (similarity: 0.646)

--- Testing: 'quick breakfast' ---
Results found: 3
Top result: Quick Breakfast Scramble (similarity: 0.546)

--- Testing: 'chocolate dessert' ---
Results found: 3
Top result: Chocolate Truffle Ganache Torte II (similarity: 0.820)


In [None]:
# filtering tests
filter_tests = [
    {
        "query": "pasta recipe",
        "time_pref": "fast",
        "calorie_pref": None,
        "description": "Fast pasta recipes"
    },
    {
        "query": "healthy meal",
        "time_pref": None,
        "calorie_pref": "low",
        "description": "Low calorie healthy meals"
    },
    {
        "query": "italian food",
        "time_pref": "fast",
        "calorie_pref": "low",
        "description": "Fast, low-calorie Italian food"
    },
    {
        "query": "comfort food",
        "time_pref": "long",
        "calorie_pref": "high",
        "description": "Long-cooking, high-calorie comfort food"
    }
]

for test in filter_tests:
    print(f"\n--- Testing: {test['description']} ---")
    results = recommend(
        user_prefs=test["query"],
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=3,
        time_pref=test["time_pref"],
        calorie_pref=test["calorie_pref"],
        return_columns=["Name", "time_bin", "calorie_bin", "Calories", "TotalTime_str"]
    )
    print(f"Results found: {len(results)}")
    if len(results) > 0:
        for idx, row in results.iterrows():
            print(f"  - {row['Name']} | Time: {row['time_bin']} ({row['TotalTime_str']}) | Calories: {row['calorie_bin']} ({row['Calories']:.0f})")


--- Testing: Fast pasta recipes ---
Results found: 3
  - Easy pasta | Time: fast (9 minutes) | Calories: low (319)
  - Pasta E Olio | Time: fast (16 minutes) | Calories: medium (574)
  - How to Cook Perfect Pasta | Time: fast (15 minutes) | Calories: medium (423)

--- Testing: Low calorie healthy meals ---
Results found: 3
  - Potato Tapas | Time: medium (40 minutes) | Calories: low (215)
  - Matzo Soup Balls | Time: medium (40 minutes) | Calories: low (29)
  - Bean and Cheese Burrito Bake | Time: medium (50 minutes) | Calories: low (237)

--- Testing: Fast, low-calorie Italian food ---
Results found: 3
  - Italian Straciatella Soup | Time: fast (20 minutes) | Calories: low (216)
  - Italian Style Vegetables | Time: fast (30 minutes) | Calories: low (168)
  - Italian Meatballs Venice Italy | Time: fast (27 minutes) | Calories: low (121)

--- Testing: Long-cooking, high-calorie comfort food ---
Results found: 3
  - Mad Dog’s Sweet-And-Sour Stuffed Cabbage | Time: long (1 hour 45 minute

In [None]:
# type correction testing
typo_queries = [
    "italiaan pastaa vegeterian",  # italian pasta vegetarian
    "chickeen recepie",              # chicken recipe  
    "chocoolate desseart",             # chocolate dessert
    "quick breakfaast",              # breakfast
    "vegatabell soup",              # vegetable soup
]

for query in typo_queries:
    print(f"\n--- Testing typos: '{query}' ---")
    
    # Show what the correction does
    corrected = correct_query(query, final_df, vocab_columns=['combined_text'])
    print(f"Corrected to: '{corrected}'")
    
    results = recommend(
        user_prefs=query,
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=2,
        return_columns=["Name", "similarity"]
    )
    print(f"Results found: {len(results)}")
    if len(results) > 0:
        print(f"Top result: {results.iloc[0]['Name']} (similarity: {results.iloc[0]['similarity']:.3f})")


--- Testing typos: 'italiaan pastaa vegeterian' ---
Corrected to: 'italian pasta vegeterian'
Results found: 2
Top result: Italian Pasta Salad (similarity: 0.650)

--- Testing typos: 'chickeen recepie' ---
Corrected to: 'chicken recepie'
Results found: 2
Top result: Chicken Chambray (Like Ponderosa's Chicken Monterey (similarity: 0.798)

--- Testing typos: 'chocoolate desseart' ---
Corrected to: 'chocolate dessert'
Results found: 2
Top result: Chocolate Truffle Ganache Torte II (similarity: 0.820)

--- Testing typos: 'quick breakfaast' ---
Corrected to: 'quick breakfast'
Results found: 2
Top result: Quick Breakfast Scramble (similarity: 0.546)

--- Testing typos: 'vegatabell soup' ---
Corrected to: 'bell soup'
Results found: 2
Top result: Stuffed Bell Pepper Soup (similarity: 0.569)


Function is not perfect as we can see that *vegatabell* wasn't correct as expected — but it serves its purpose.

In [None]:
# edge case testing
edge_cases = [
    "",                    # Empty query
    "a",                   # Single character
    "xyz123",              # Nonsense query
    "!@#$%^&*()",         # Special characters only
    "the and or but",      # Stop words only
    "recipe recipe recipe", # Repeated words
    "super ultra mega delicious amazing fantastic incredible recipe", # Very long query
]

for query in edge_cases:
    print(f"\n--- Testing edge case: '{query}' ---")
    try:
        results = recommend(
            user_prefs=query,
            dataset=final_df,
            recipe_vectors_matrix=recipe_vectors_matrix,
            vectorizer=vectorizer,
            top_n=3,
            return_columns=["Name", "similarity"]
        )
        print(f"✓ Success: {len(results)} results")
        if len(results) > 0:
            print(f"  Top similarity: {results.iloc[0]['similarity']:.3f}")
    except Exception as e:
        print(f"✗ Error: {str(e)}")


--- Testing edge case: '' ---
✓ Success: 1 results
✗ Error: 'similarity'

--- Testing edge case: 'a' ---
✓ Success: 1 results
✗ Error: 'similarity'

--- Testing edge case: 'xyz123' ---
✓ Success: 1 results
✗ Error: 'similarity'

--- Testing edge case: '!@#$%^&*()' ---
✓ Success: 1 results
✗ Error: 'similarity'

--- Testing edge case: 'the and or but' ---
✓ Success: 3 results
  Top similarity: 0.413

--- Testing edge case: 'recipe recipe recipe' ---
✓ Success: 3 results
  Top similarity: 0.420

--- Testing edge case: 'super ultra mega delicious amazing fantastic incredible recipe' ---
✓ Success: 3 results
  Top similarity: 0.420


In [None]:
# performance testing
performance_queries = ["italian pasta vegetarian", "quick chicken dinner", "healthy breakfast"]

for query in performance_queries:
    times = []
    for _ in range(10):  # Run 10 times for average
        start_time = time.time()
        results = recommend(
            user_prefs=query,
            dataset=final_df,
            recipe_vectors_matrix=recipe_vectors_matrix,
            vectorizer=vectorizer,
            top_n=5
        )
        end_time = time.time()
        times.append(end_time - start_time)
    
    avg_time = sum(times) / len(times)
    print(f"Query: '{query}' | Avg time: {avg_time:.4f}s | Results: {len(results)}")

Query: 'italian pasta vegetarian' | Avg time: 7.9423s | Results: 5
Query: 'quick chicken dinner' | Avg time: 7.5440s | Results: 5
Query: 'healthy breakfast' | Avg time: 7.6738s | Results: 5


In [None]:
# diversity testing
def check_diversity(results, column='Name'):
    """Check if results are diverse"""
    if len(results) <= 1:
        return "N/A"
    
    # Simple diversity check - are all results different?
    unique_results = results[column].nunique()
    total_results = len(results)
    diversity_ratio = unique_results / total_results
    
    return f"{diversity_ratio:.2f} ({unique_results}/{total_results} unique)"

diversity_queries = [
    "recipe",              # Very broad
    "pasta",               # Specific ingredient
    "italian",             # Specific cuisine
    "vegetarian protein"   # Specific dietary need
]

for query in diversity_queries:
    print(f"\n--- Testing diversity for: '{query}' ---")
    results = recommend(
        user_prefs=query,
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=5,
        return_columns=["Name", "time_bin", "calorie_bin", "similarity"]
    )
    
    print(f"Results: {len(results)}")
    print(f"Name diversity: {check_diversity(results, 'Name')}")
    print(f"Time diversity: {check_diversity(results, 'time_bin')}")
    print(f"Calorie diversity: {check_diversity(results, 'calorie_bin')}")
    
    if len(results) > 0:
        print("Similarity scores:", [f"{sim:.3f}" for sim in results['similarity'].values])


--- Testing diversity for: 'recipe' ---
Results: 5
Name diversity: 1.00 (5/5 unique)
Time diversity: 0.40 (2/5 unique)
Calorie diversity: 0.60 (3/5 unique)
Similarity scores: ['0.420', '0.397', '0.344', '0.338', '0.331']

--- Testing diversity for: 'pasta' ---
Results: 5
Name diversity: 1.00 (5/5 unique)
Time diversity: 0.40 (2/5 unique)
Calorie diversity: 0.60 (3/5 unique)
Similarity scores: ['0.793', '0.745', '0.740', '0.725', '0.723']

--- Testing diversity for: 'italian' ---
Results: 5
Name diversity: 1.00 (5/5 unique)
Time diversity: 0.60 (3/5 unique)
Calorie diversity: 0.60 (3/5 unique)
Similarity scores: ['0.718', '0.622', '0.606', '0.605', '0.604']

--- Testing diversity for: 'vegetarian protein' ---
Results: 5
Name diversity: 1.00 (5/5 unique)
Time diversity: 0.40 (2/5 unique)
Calorie diversity: 0.40 (2/5 unique)
Similarity scores: ['0.583', '0.579', '0.566', '0.552', '0.520']


In [None]:
# real-world scenario testing
scenarios = [
    {
        "query": "quick weeknight dinner for family",
        "time_pref": "fast",
        "calorie_pref": None,
        "scenario": "Busy parent needs quick family meal"
    },
    {
        "query": "healthy lunch meal prep",
        "time_pref": None,
        "calorie_pref": "low",
        "scenario": "Health-conscious person meal prepping"
    },
    {
        "query": "romantic dinner date night",
        "time_pref": "medium",
        "calorie_pref": None,
        "scenario": "Special occasion cooking"
    },
    {
        "query": "college student budget meal",
        "time_pref": "fast",
        "calorie_pref": "high",
        "scenario": "Budget-conscious student needs filling meal"
    },
    {
        "query": "party appetizer finger food",
        "time_pref": None,
        "calorie_pref": None,
        "scenario": "Hosting a party"
    }
]

for scenario in scenarios:
    print(f"\n--- Scenario: {scenario['scenario']} ---")
    print(f"Query: '{scenario['query']}'")
    
    results = recommend(
        user_prefs=scenario["query"],
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=3,
        time_pref=scenario["time_pref"],
        calorie_pref=scenario["calorie_pref"],
        return_columns=["Name", "TotalTime_str", "Calories", "time_bin", "calorie_bin"]
    )
    
    print(f"Results: {len(results)}")
    for idx, row in results.iterrows():
        print(f"  - {row['Name']}")
        print(f"    Time: {row['TotalTime_str']} ({row['time_bin']}) | Calories: {row['Calories']:.0f} ({row['calorie_bin']})")


--- Scenario: Busy parent needs quick family meal ---
Query: 'quick weeknight dinner for family'
Results: 3
  - Quick Cinnamon Rolls
    Time: 21 minutes (fast) | Calories: 632 (high)
  - Fresh Fruit Dip
    Time: 12 minutes (fast) | Calories: 268 (low)
  - Sandra Lee's Noodles Alfredo
    Time: 15 minutes (fast) | Calories: 1173 (high)

--- Scenario: Health-conscious person meal prepping ---
Query: 'healthy lunch meal prep'
Results: 3
  - Quick Clam Chowder
    Time: 15 minutes (fast) | Calories: 156 (low)
  - Courtney's Cajun Couscous With a Kick!
    Time: 10 minutes (fast) | Calories: 276 (low)
  - Chinh's Lunch Special
    Time: 3 minutes (fast) | Calories: 268 (low)

--- Scenario: Special occasion cooking ---
Query: 'romantic dinner date night'
Results: 3
  - Ukrainian Almond Date Horikhivnyk (Almond Date Bars
    Time: 1 hour 10 minutes (medium) | Calories: 232 (low)
  - Date Balls
    Time: 1 hour (medium) | Calories: 1173 (high)
  - Easy Date Squares
    Time: 40 minutes (med

In [None]:
def simple_keyword_search(query, dataset, top_n=3):
    """Simple baseline: count keyword matches"""
    query_words = set(query.lower().split())
    
    def count_matches(text):
        if pd.isna(text):
            return 0
        text_words = set(str(text).lower().split())
        return len(query_words.intersection(text_words))
    
    dataset = dataset.copy()
    dataset['keyword_score'] = dataset['combined_text'].apply(count_matches)
    return dataset.nlargest(top_n, 'keyword_score')[['Name', 'keyword_score']]

comparison_queries = ["italian pasta", "chicken breast", "chocolate cake"]

for query in comparison_queries:
    print(f"\n--- Comparing methods for: '{query}' ---")
    
    # Our rec method
    tfidf_results = recommend(
        user_prefs=query,
        dataset=final_df,
        recipe_vectors_matrix=recipe_vectors_matrix,
        vectorizer=vectorizer,
        top_n=3,
        return_columns=["Name", "similarity"]
    )
    
    # Simple keyword method
    keyword_results = simple_keyword_search(query, final_df, top_n=3)
    
    print("TF-IDF Results:")
    for idx, row in tfidf_results.iterrows():
        print(f"  - {row['Name']} (sim: {row['similarity']:.3f})")
    
    print("Keyword Results:")
    for idx, row in keyword_results.iterrows():
        print(f"  - {row['Name']} (matches: {row['keyword_score']})")


--- Comparing methods for: 'italian pasta' ---
TF-IDF Results:
  - Italian Pasta Salad (sim: 0.650)
  - Italian Sausage Rigatoni (sim: 0.619)
  - Spicy Italian Sausage and Pasta (sim: 0.617)
Keyword Results:
  - Italian Sausage Soup (matches: 2)
  - Crusty Garlic Bread (matches: 2)
  - Enzo's Penne a la Vodka (matches: 2)

--- Comparing methods for: 'chicken breast' ---
TF-IDF Results:
  - Poached Chicken Breast (sim: 0.846)
  - Stuffed Chicken Breast (sim: 0.791)
  - Breast of Chicken ALaska (sim: 0.782)
Keyword Results:
  - Biryani (matches: 2)
  - Chicken Breasts Lombardi (matches: 2)
  - Braised Chicken with Onions and Tomatoes (matches: 2)

--- Comparing methods for: 'chocolate cake' ---
TF-IDF Results:
  - Chocolate Sponge Cake (sim: 0.715)
  - Princess Dress Cake Truffles (sim: 0.696)
  - Chocolate Chocolate Chocolate Bundt Cake (sim: 0.677)
Keyword Results:
  - Brownie Heart Cake (matches: 2)
  - Chocolate Zucchini Cake (matches: 2)
  - Better Than Sex Cake (matches: 2)
