Imports

In [53]:
# Configuration file for the ML Food Buddy Recommender project
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import re
from rapidfuzz import process, fuzz

Functions

In [54]:
def data_loader(filename: str, source: str = "raw"):
    """
    Load a CSV dataset from the repo's data/raw or data/preprocessed folder.
    
    Parameters:
        filename: str - the CSV file name (can be compressed .zip or .gz)
        source: str - "raw" or "preprocessed" to select folder
    
    Returns:
        pd.DataFrame
    """
    if source not in ["raw", "preprocessed"]:
        raise ValueError("source must be 'raw' or 'preprocessed'")

    cwd = os.getcwd()
    repo_root = cwd

    # Walk upwards until we find the desired folder
    while True:
        data_path = os.path.join(repo_root, "data", source, filename)
        if os.path.exists(data_path):
            break
        parent = os.path.dirname(repo_root)
        if parent == repo_root:  # reached root of filesystem
            raise FileNotFoundError(f"Could not find {filename} in data/{source} from {cwd}")
        repo_root = parent

    # Detect compression type
    compression_type = None
    if filename.endswith('.zip'):
        compression_type = 'zip'
    elif filename.endswith('.gz'):
        compression_type = 'gzip'

    return pd.read_csv(data_path, compression=compression_type)


In [55]:
def parse_time(t):
    """
    Parse time in minutes. Supports:
    - Raw numeric strings (e.g., "45")
    - Already numeric values
    
    Returns:
        float (minutes) or np.nan if parsing fails
    """
    if pd.isna(t):
        return np.nan
    if isinstance(t, str):
        t = t.strip()
        if t.startswith("PT"):  # ISO 8601 duration
            hours = re.search(r'(\d+)H', t)
            minutes = re.search(r'(\d+)M', t)
            secs = re.search(r'(\d+)S', t)
            total_minutes = 0
            if hours:
                total_minutes += int(hours.group(1)) * 60
            if minutes:
                total_minutes += int(minutes.group(1))
            if secs:
                total_minutes += int(secs.group(1)) / 60
            return total_minutes if total_minutes > 0 else np.nan
        # fallback: try to parse as float
        try:
            return float(t)
        except:
            return np.nan
    # If already numeric
    try:
        return float(t)
    except:
        return np.nan


In [56]:
def format_time(t):
    """
    Convert a time in minutes to a human-readable string format.

    Args:
        t (int or float or None): Total time in minutes. Can be NaN.

    Returns:
        str or None: A string representing the time in hours and minutes, e.g.:
                     - 135 → "2 hours 15 minutes"
                     - 60  → "1 hour"
                     - 45  → "45 minutes"
                     - 0 or NaN → "0 minutes" or None if input is NaN
    """
    if pd.isna(t):
        return None
    t = int(round(t))
    hours, minutes = divmod(t, 60)
    parts = []
    if hours > 0:
        parts.append(f"{hours} hour{'s' if hours > 1 else ''}")
    if minutes > 0:
        parts.append(f"{minutes} minute{'s' if minutes > 1 else ''}")
    return " ".join(parts) if parts else "0 minutes"


In [57]:
def clip_top_outliers(df, cols, z_thresh=3.5):
    """
    Clip only extreme outliers of selected columns using modified Z-score.
    
    Parameters:
    - df: pd.DataFrame
    - cols: list of str, columns to clip
    - z_thresh: float, threshold for modified Z-score (default=3.5)
    
    Returns:
    - df_clipped: pd.DataFrame with clipped values
    - thresholds: dict of column:clip_value for reference
    """
    df_clipped = df.copy()
    thresholds = {}
    
    for col in cols:
        if col not in df.columns:
            continue
        series = df[col]
        median = series.median()
        mad = np.median(np.abs(series - median))
        if mad == 0:
            continue  # can't detect outliers if MAD is zero
        mod_z = 0.6745 * (series - median) / mad
        upper_limit = series[mod_z <= z_thresh].max()  # largest non-outlier
        df_clipped[col] = np.minimum(series, upper_limit)
        thresholds[col] = upper_limit
    
    return df_clipped, thresholds

In [58]:
def correct_query(query, recipes, vocab_columns=None, threshold=85):
    """
    Corrects a user query using RapidFuzz, matching words to recipe vocabulary.
    Only replaces words if a close match is found above threshold.
    
    Args:
        query: Input query string (e.g., "Italain paste vegeterian")
        recipes: DataFrame with vocabulary columns
        vocab_columns: List of column names to extract vocabulary from. 
                      Defaults to ['ingredients_clean', 'category_clean', 'keywords_clean']
        threshold: Minimum similarity score to accept correction (0-100)
    
    Returns:
        Corrected query string (e.g., "Italian pasta vegetarian")
    """
    # Return original query if empty
    if vocab_columns is None:
        return query
    
    # Extract vocabulary from specified columns
    vocab = set()
    
    for col_name in vocab_columns:
        if col_name in recipes.columns:
            for cell_value in recipes[col_name].dropna():
                if isinstance(cell_value, str):
                    words = cell_value.replace(',', ' ').split()
                    vocab.update(word.strip().lower() for word in words if word.strip())
                elif isinstance(cell_value, list):
                    vocab.update(word.strip().lower() for word in cell_value if isinstance(word, str) and word.strip())
    
    vocab = [word for word in vocab if len(word) > 2]  # Filter out very short words
    
    if not vocab:
        return query
    
    words = query.lower().split()
    corrected = []
    
    for word in words:
        # Skip very short words
        if len(word) <= 2:
            corrected.append(word)
            continue
            
        # Find the best match from vocab
        result = process.extractOne(word, vocab, scorer=fuzz.WRatio)
        
        if result and result[1] >= threshold:
            corrected.append(result[0])
        else:
            corrected.append(word)
    
    return ' '.join(corrected)

In [59]:
def parse_r_list_column(col):
    """
    Parse a column that looks like R-style list strings.

    - Fills NaNs with empty strings
    - Removes the c(...) wrapper
    - Strips out quotes
    """
    cleaned = col.fillna("").astype(str).str.strip()
    cleaned = cleaned.str.replace(r'^c\(|\)$', '', regex=True)  # remove c( and )
    cleaned = cleaned.str.replace(r'"', '', regex=False)       # remove quotes
    return cleaned


In [60]:
def clean_r_instructions(col):
    """
    Parse and clean R-style list strings in a pandas Series while keeping original capitalization.

    Args:
        col: pandas Series

    Returns:
        pandas Series with cleaned text
    """
    if col is None:
        return pd.Series([], dtype=str)

    cleaned = col.fillna("").astype(str).str.strip()
    # remove c( and ) around the string
    cleaned = cleaned.str.replace(r'^c\(|\)$', '', regex=True)
    # remove quotes
    cleaned = cleaned.str.replace(r'"', '', regex=False)
    # collapse multiple spaces/newlines
    cleaned = cleaned.apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    
    return cleaned


In [61]:
def clean_text(text):
    """
    Clean a text string by normalizing case, whitespace, and punctuation.

    - Converts to lowercase
    - Replaces multiple whitespace characters with a single space
    - Removes punctuation and special characters
    - Strips leading and trailing whitespace
    """
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

In [62]:
def time_bin(minutes):
    """
    Categorize a recipe's total time into a time bin.

    Parameters:
    minutes (float or int): Total time of the recipe in minutes.

    Returns:
    str or None: 
        - "fast" if time is 30 minutes or less
        - "medium" if time is between 31 and 90 minutes
        - "long" if time is more than 90 minutes
        - None if input is NaN
    """
    if pd.isna(minutes):
        return None
    if minutes <= 30:
        return "fast"
    elif minutes <= 90:
        return "medium"
    else:
        return "long"

In [63]:
def calorie_bin(cals):
    """
    Categorize a recipe's calories into a calorie bin.

    Parameters:
    cals (float or int): Total calories of the recipe.

    Returns:
    str or None:
        - "low" if calories are less than 300
        - "medium" if calories are between 300 and 600
        - "high" if calories are more than 600
        - None if input is NaN
    """
    if pd.isna(cals):
        return None
    if cals < 300:
        return "low"
    elif cals <= 600:
        return "medium"
    else:
        return "high"


In [64]:
def get_first_url(cell):
    """
    Extract the first URL from a string that contains one or more URLs enclosed in quotes.

    Parameters:
        cell (str or any): A string containing URLs in quotes, or any value (e.g., NaN).

    Returns:
        str or None: 
            - The first URL found in quotes.
            - None if no URL is found or if the input is NaN.

    """
    if pd.isna(cell):
        return None
    text = str(cell)
    match = re.search(r'"(.*?)"', text)
    if match:
        return match.group(1)
    return None


In [65]:
def recommend(
    user_prefs,
    dataset,
    recipe_vectors_matrix,
    vectorizer,
    top_n=3,
    time_pref=None,
    calorie_pref=None,
    vocab_column='combined_text',
    time_column='time_bin',
    calorie_column='calorie_bin',
    return_columns=None
):
    # Clean and correct query
    cleaned_query = clean_text(user_prefs)
    corrected_query = correct_query(query=cleaned_query, recipes=dataset, vocab_columns=[vocab_column])

    # Vectorize query
    user_vec = vectorizer.transform([corrected_query]).toarray()[0]
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-10)

    # Cosine similarity
    dataset = dataset.copy()
    dataset['similarity'] = recipe_vectors_matrix @ user_vec

    # -------------------
    # Apply time filter
    if time_pref and time_column in dataset.columns:
        if isinstance(time_pref, str):
            time_pref = [time_pref]
        time_pref_clean = [str(t).strip().lower() for t in time_pref]
        dataset = dataset[dataset[time_column].fillna('').astype(str).str.strip().str.lower().isin(time_pref_clean)]

    # Apply calorie filter
    if calorie_pref and calorie_column in dataset.columns:
        if isinstance(calorie_pref, str):
            calorie_pref = [calorie_pref]
        calorie_pref_clean = [str(c).strip().lower() for c in calorie_pref]
        dataset = dataset[dataset[calorie_column].fillna('').astype(str).str.strip().str.lower().isin(calorie_pref_clean)]
    # -------------------

    # Sort by similarity and pick top_n
    results = dataset.sort_values(by='similarity', ascending=False).head(top_n).copy()

    # TotalTime_str
    if 'TotalTime_min' in results.columns:
        results['TotalTime_str'] = results['TotalTime_min'].apply(format_time)
    else:
        results['TotalTime_str'] = ""

    # Return requested columns
    if return_columns:
        valid_columns = [c for c in return_columns if c in results.columns]
        if not valid_columns:
            valid_columns = results.columns.tolist()
    else:
        valid_columns = results.columns.tolist()

    return results[valid_columns]

EDA

In [66]:
# Load the dataset
recipes = data_loader("recipes.csv")
recipes.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen Dessert recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/YUeirxMLQaeE1h3v3qnM_229%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/AFPDDHATWzQ0b1CDpDAT_255%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/UYgf9nwMT2SGGJCuzILO_228%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/PeBMJN2TGSaYks2759BA_20140722_202142.jpg"", \n""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/picuaETeN.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/pictzvxW5.jpg"")",Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"", ""Healthy"", ""Free Of..."", ""Summer"", ""Weeknight"", ""Freezer"", ""Easy"")","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla yogurt"", ""lemon juice"")",4.5,4.0,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stand for 45 minutes, stirring occasionally."", ""Transfer berry-sugar mixture to food processor."", ""Add yogurt and process until smooth."", ""Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft. Transfer to processor and blend until smooth again."", ""Return to pan and freeze until edges are solid."", ""Transfer to processor and blend until smooth again."", \n""Fold in remaining 2 cups of blueberries."", ""Pour into plastic mold and freeze overnight. Let soften slightly to serve."")"
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/39/picM9Mhnw.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/39/picHv4Ocr.jpg"")",Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"", ""Meat"", ""Asian"", ""Indian"", ""Weeknight"", ""Stove Top"")","c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ""1"", ""1"", ""1/4"", ""1/4"", ""1/2"", ""1/4"", ""2"", ""3"", NA, ""2"", ""1"", ""1"", ""8"", ""2"", ""1/3"", ""1/3"", ""1/3"", ""6"")","c(""saffron"", ""milk"", ""hot green chili peppers"", ""onions"", ""garlic"", ""clove"", ""peppercorns"", ""cardamom seed"", ""cumin seed"", ""poppy seed"", ""mace"", ""cilantro"", ""mint leaf"", ""fresh lemon juice"", ""plain yogurt"", ""boneless chicken"", ""salt"", ""ghee"", ""onion"", ""tomatoes"", ""basmati rice"", ""long-grain rice"", ""raisins"", ""cashews"", ""eggs"")",3.0,1.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and puree in blender."", ""Add chiles, onions, ginger, garlic, cloves, peppercorns, cardamom seeds, cinnamon, coriander and cumin seeds, poppy seeds, nutmeg, mace, cilantro or mint leaves and lemon juice. Blend into smooth paste. Put paste into large bowl, add yogurt and mix well."", ""Marinate chicken in yogurt mixture with salt, covered for at least 2 - 6 hours in refrigerator."", ""In skillet. heat oil over medium heat for 1 minute. Add ghee and 15 seconds later add onion and fry for about8 minutes."", \n""Reserve for garnish."", ""In same skillet, cook chicken with its marinade with tomatoes for about 10 minutes over medium heat, uncovered."", ""Remove chicken pieces from the sauce and set aside. Add rice to sauce, bring to boil, and cook, covered over low heat for 15 minutes."", ""Return chicken and add raisins, cashews and almonds; mix well."", ""Simmer, covered for 5 minutes."", ""Place chicken, eggs and rice in large serving dish in such a way that yellow of the eggs, the saffron-colored rice, the nuts and the chicken make a colorful display."", \n""Add reserved onion as garnish."")"
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,"This is from one of my first Good House Keeping cookbooks. You must use a *zester* in order to avoid getting any of that bitter rind, and when you zest the lemons, zest them onto some sugar from the recipe (the sugar will 'catch' all of the oils). I also advise you from personal experience to use only the best skinned lemons for the best flavor.","c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/40/picJ4Sz3N.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/40/pic23FWio.jpg"")",Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"", ""Summer"", ""< 60 Mins"")","c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of"", ""fresh water"", ""fresh lemon juice"")",4.5,10.0,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, put sugar and lemon peel, or zest; add 1 1/2 cups very hot water (not from tap!). With lid fitted firmly, shake jar until sugar is dissolved."", ""Add lemon juice. Refrigerate until chilled."", ""To Serve: Into each 12-ounce glass, over ice cubes, pour 1/4 cup of the lemon syrup."", ""Then add chilled club soda or, if you prefer, water."", ""Stir to mix well."")"
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to allow the ingredients to soak in the marinade overnight.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/41/picmbLig8.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/41/picL02w0s.jpg"")",Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""Weeknight"", ""Broil/Grill"", ""Oven"")","c(""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2"", ""2"", ""1"", ""2"", ""1/2"", ""1/4"", ""4"")","c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""mushrooms"", ""soy sauce"", ""low sodium soy sauce"", ""olive oil"", ""maple syrup"", ""honey"", ""red wine vinegar"", ""lemon juice"", ""garlic cloves"", ""mustard powder"", ""black pepper"")",4.5,2.0,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out excess water, and pat dry with paper towels."", ""Cut tofu into one-inch squares."", ""Set aside. Cut eggplant lengthwise in half, then cut each half into approximately three strips."", ""Cut strips crosswise into one-inch cubes."", ""Slice zucchini into half-inch thick slices."", ""Cut red pepper in half, removing stem and seeds, and cut each half into one-inch squares."", ""Wipe mushrooms clean with a moist paper towel and remove stems."", ""Thread tofu and vegetables on to barbecue skewers in alternating color combinations: For example, first a piece of eggplant, then a slice of tofu, then zucchini, then red pepper, baby corn and mushrooms."", \n""Continue in this way until all skewers are full."", ""Make the marinade by putting all ingredients in a blender, and blend on high speed for about one minute until mixed."", ""Alternatively, put all ingredients in a glass jar, cover tightly with the lid and shake well until mixed."", ""Lay the kebabs in a long, shallow baking pan or on a non-metal tray, making sure they lie flat. Evenly pour the marinade over the kebabs, turning them once so that the tofu and vegetables are coated."", ""Refrigerate the kebabs for three to eight hours, occasionally spooning the marinade over them."", \n""Broil or grill the kebabs at 450 F for 15-20 minutes, or on the grill, until the vegetables are browned."", ""Suggestions This meal can be served over cooked, brown rice. Amounts can easily be doubled to make four servings."")"
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from Food.com.,"""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/42/picVEMxk8.jpg""",Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""Healthy"", ""Winter"", ""< 60 Mins"", ""Easy"")","c(""46"", ""4"", ""1"", ""2"", ""1"")","c(""plain tomato juice"", ""cabbage"", ""onion"", ""carrots"", ""celery"")",4.5,11.0,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil."", ""Reduce heat and simmer for 30 minutes (longer if you prefer your veggies to be soft)."", ""Refrigerate until cool."", ""Serve chilled with sour cream."")"


In [67]:
# Info
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517 non-null  int64  
 3   AuthorName                  522517 non-null  object 
 4   CookTime                    439972 non-null  object 
 5   PrepTime                    522517 non-null  object 
 6   TotalTime                   522517 non-null  object 
 7   DatePublished               522517 non-null  object 
 8   Description                 522512 non-null  object 
 9   Images                      522516 non-null  object 
 10  RecipeCategory              521766 non-null  object 
 11  Keywords                    505280 non-null  object 
 12  RecipeIngredientQuantities  522514 non-null  object 
 13  RecipeIngredie

In [68]:
# Convert PrepTime into minutes
recipes['PrepTime_min'] = recipes['PrepTime'].apply(parse_time)

# Convert CookTime into minutes
recipes['CookTime_min'] = recipes['CookTime'].apply(parse_time)

# Convert TotalTime into minutes
recipes['TotalTime_min'] = recipes['TotalTime'].apply(parse_time)

In [69]:
# Identify unparseable PrepTime entries
invalid_prep_times = recipes[recipes['PrepTime'].apply(parse_time).isna()][['PrepTime']]
print("Invalid or unparseable PrepTime entries:")
print(invalid_prep_times.value_counts())

# Identify unparseable CookTime entries
invalid_cooking_times = recipes[recipes['CookTime'].apply(parse_time).isna()][['CookTime']]
print()
print("Invalid or unparseable CookTime entries:")
print(invalid_cooking_times.value_counts())

# Identify unparseable TotalTime entries
invalid_total_times = recipes[recipes['TotalTime'].apply(parse_time).isna()][['TotalTime']]
print()
print("Invalid or unparseable TotalTime entries:")
print(invalid_total_times.value_counts())

Invalid or unparseable PrepTime entries:
PrepTime
PT0S        15010
Name: count, dtype: int64

Invalid or unparseable CookTime entries:
Series([], Name: count, dtype: int64)

Invalid or unparseable TotalTime entries:
TotalTime
PT0S         2129
Name: count, dtype: int64


**Note:**  
The entries for Prep and Cook time identified as invalid or unparseable are likely incorrect or missing.  
It is recommended to replace these entries with the **median** or **average** value of the respective column to avoid skewing any analysis. 
Also, Let's check if they are affecting TotalTime.

In [70]:
# Compute difference
recipes['time_diff'] = recipes['PrepTime_min'] + recipes['CookTime_min'] - recipes['TotalTime_min']

# Find rows where the difference is not zero
mismatch = recipes[recipes['time_diff'].abs() != 0]

# Show relevant columns
mismatch.head()


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,PrepTime_min,CookTime_min,TotalTime_min,time_diff
8,46,A Jad - Cucumber Pickle,1533,Dancer,,PT25M,PT25M,1999-08-11T19:48:00Z,Make and share this A Jad - Cucumber Pickle recipe from Food.com.,character(0),Vegetable,"c(""Thai"", ""Asian"", ""Free Of..."", ""< 30 Mins"")","c(""1/2"", ""5"", ""2"", ""1"", ""1"", ""1"")","c(""rice vinegar"", ""haeo"")",5.0,2.0,4.3,0.0,0.0,0.0,0.7,1.1,0.2,0.2,0.1,,1 cup,"c(""Slice the cucumber in four lengthwise, then slice the pieces to segments about an eighth of an inch thick."", ""Slice the tops of the chilies (green ones can be used if red are not available, but Thais like the color contrast), tap out any loose seeds and discard, then slice the chilies across into thin rounds."", ""Slice the shallots and water chestnuts."", ""Combine and serve. This will keep 2 or 3 weeks in a refrigerator."")",25.0,,25.0,
10,48,Boston Cream Pie,1545,Nancy Van Ess,,PT2H15M,PT2H15M,1999-08-24T04:35:00Z,Make and share this Boston Cream Pie recipe from Food.com.,character(0),Pie,"c(""Dessert"", ""Weeknight"", ""Oven"", ""< 4 Hours"")","c(""1/2"", ""2 1/4"", ""3"", ""1"", ""1 1/2"", ""1/3"", ""1 1/2"", ""2"", ""1"", ""1/3"", ""1"", ""1 1/2"", ""1"", ""1/2"", ""1"", ""1/4"", ""1"", ""1 1/2"", ""2"", ""1"")","c(""margarine"", ""cake flour"", ""baking powder"", ""salt"", ""sugar"", ""vanilla"", ""eggs"", ""milk"", ""sugar"", ""cornstarch"", ""milk"", ""flour"", ""salt"", ""vanilla"", ""butter"", ""vanilla"")",2.0,2.0,688.2,36.4,12.9,105.9,722.3,84.0,1.6,46.2,8.8,8.0,1 pie,"c(""Beat egg whites until soft peaks form."", ""Gradually add 1/2 cup sugar, beating until very stiff peaks form."", ""Sift together remaining dry ingredients into another bowl."", ""Add oil, half the milk and vanilla."", ""Beat 1 minute at medium speed."", ""Add remaining milk and egg yolks."", ""Beat 1 minute, scrape bowl."", ""Gently fold in egg whites."", ""Bake in two greased 9x 1.5-inch round pans in 350°F oven for 25 minutes."", ""Cool 10 minutes and then remove from pans."", ""Cool completely."", ""Fill with cream filling."", \n""Frost with Chocolate Glaze."", ""French Custard Filling: In saucepan, combine sugar, flour, cornstarch and salt. Gradually stir in milk."", ""Cook and stir until mixture thickens and boils; cook and stir 2-3 minutes longer."", ""Stir a little of hot mixture into egg yolk; return to hot mixture."", ""Cook and stir until mixture just boils."", ""Add vanilla; cool."", ""Beat until smooth; fold in whipped cream."", ""Chocolate Glaze: Melt chocolate and butter over low heat, stirring constantly."", ""Remove from heat. Stir in confectioners' sugar and vanilla until crumbly."", \n""Blend in 3 tablespoons boiling water."", ""Add enough water (about 2 teaspoons), a teaspoon at a time, to form medium glaze of pouring consistency."", ""Pour quickly over top of cake; spread glaze evenly over top and sides."")",135.0,,135.0,
14,52,Cafe Cappuccino,2178,troyh,,PT5M,PT5M,1999-08-31T21:05:00Z,Make and share this Cafe Cappuccino recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/52/picAkC0UW.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/52/piccYWEE1.jpg"")",Beverages,"c(""Low Cholesterol"", ""High Protein"", ""Healthy"", ""High In..."", ""< 15 Mins"", ""For Large Groups"", ""No Cook"", ""Easy"")","c(""1/2"", ""3/4"", ""1"", ""1/2"")","c(""instant coffee"", ""sugar"", ""nonfat dry milk solid"")",5.0,1.0,62.2,0.1,0.0,1.3,36.6,12.8,0.0,11.8,2.7,18.0,2 1/4 cups,"c(""Stir ingredients together."", ""Process in a blender until powdered."", ""Use 2 Tbsp. of mixture for each cup of hot water."")",5.0,,5.0,
19,57,Black Bean Salsa,1569,Linda7,,PT10M,PT10M,1999-08-31T21:02:00Z,Make and share this Black Bean Salsa recipe from Food.com.,character(0),Sauces,"c(""Black Beans"", ""Beans"", ""Mexican"", ""Low Cholesterol"", ""Healthy"", ""< 15 Mins"", ""No Cook"", ""Easy"")","c(""2"", ""2 -3"", ""2 -3"", ""2 -3"", ""2 -3"", ""2"", ""2"", NA, NA)","c(""black beans"", ""tomatoes"", ""roma tomatoes"", ""green onions"", ""garlic"", ""cilantro"", ""white pepper"")",3.0,1.0,114.3,3.8,0.5,0.0,3.7,15.6,5.5,1.4,5.4,8.0,,"c(""Combine all ingredients in a bowl."", ""Serve with tortilla chips."")",10.0,,10.0,
22,60,Blueberry Dessert,1545,Nancy Van Ess,,PT35M,PT35M,1999-08-16T05:59:00Z,Make and share this Blueberry Dessert recipe from Food.com.,character(0),Dessert,"c(""Berries"", ""Fruit"", ""< 60 Mins"", ""Oven"")","c(""2"", ""1/2"", ""1/2"", ""2"", ""16"", ""1"", ""1"", ""1/4"")","c(""Bisquick baking mix"", ""sugar"", ""butter"", ""margarine"", ""eggs"", ""frozen blueberries"", ""Bisquick baking mix"", ""sugar"", ""firm butter"", ""margarine"")",3.0,1.0,381.1,17.3,8.8,62.1,437.6,54.5,1.6,36.9,3.9,12.0,,"c(""Heat oven to 400 degrees."", ""Mix 2 cups baking mix, 1/2 cup sugar, 1/2 cup butter and egg yolks until crumbly."", ""Press into ungreased rectangular pan, 13x9x2\""."", ""Bake 10 min."", ""Beat egg whites until stiff; fold in blueberries."", ""Spread over baked layer."", ""Mix remaining ingredients until crumbly; sprinkle over blueberry mixture. Bake until golden brown, about 20 min."")",35.0,,35.0,


In [71]:
# Percentage of mismatches
mismatch.shape[0]/recipes.shape[0]

0.18157112591552046

In [72]:
# Fill missing prep/cook times with median
recipes['PrepTime_min'].fillna(recipes['PrepTime_min'].median(), inplace=True)
recipes['CookTime_min'].fillna(recipes['CookTime_min'].median(), inplace=True)

# Recalculate total time to ensure consistency
recipes['TotalTime_min'] = recipes['PrepTime_min'] + recipes['CookTime_min']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipes['PrepTime_min'].fillna(recipes['PrepTime_min'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipes['CookTime_min'].fillna(recipes['CookTime_min'].median(), inplace=True)


In [73]:
# Convert DatePublished to datetime
recipes['DatePublished'] = pd.to_datetime(recipes['DatePublished'], errors='coerce')

In [74]:
# Remove timezone to make tz-naive
recipes['DatePublished'] = recipes['DatePublished'].dt.tz_convert(None)

# Identify unparseable dates
invalid_dates = recipes[recipes['DatePublished'].isna()][['DatePublished']]
print("Invalid or unparseable dates:")
print(invalid_dates.value_counts())

# Define a reasonable date range
today = pd.Timestamp.today()
min_reasonable_date = pd.Timestamp('1980-01-01')

# Identify dates outside reasonable range
unreasonable_dates = recipes[(recipes['DatePublished'] < min_reasonable_date) |
                             (recipes['DatePublished'] > today)][['DatePublished']]
print("Unreasonable dates:")
print(unreasonable_dates.value_counts())


Invalid or unparseable dates:
Series([], Name: count, dtype: int64)
Unreasonable dates:
Series([], Name: count, dtype: int64)


In [75]:
# Missing values percentage, excluding columns with 0% missing
missing_pct = recipes.isna().sum() / len(recipes) * 100
missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
print(missing_pct)

RecipeYield                   66.614292
AggregatedRating              48.462155
ReviewCount                   47.364775
RecipeServings                35.005751
time_diff                     18.152902
CookTime                      15.797572
Keywords                       3.298840
RecipeCategory                 0.143727
Description                    0.000957
RecipeIngredientQuantities     0.000574
Images                         0.000191
dtype: float64


In [76]:
# Drop rows where 'description' is NaN
recipes = recipes.dropna(subset=['Description'])

In [None]:
# Show rows where Keywords is missing
recipes[recipes['Keywords'].isna()]

In [None]:
# Display summary statistics for all
print(recipes.describe())

            RecipeId      AuthorId                  DatePublished  \
count  522512.000000  5.225120e+05                         522512   
mean   271823.979206  4.572629e+07  2008-01-18 07:15:34.635109120   
min        38.000000  2.700000e+01            1999-08-06 00:40:00   
25%    137209.750000  6.947400e+04            2005-09-13 10:36:30   
50%    271760.500000  2.389615e+05            2007-12-13 16:30:00   
75%    406146.500000  5.658280e+05            2009-12-31 09:46:00   
max    541383.000000  2.002886e+09            2020-12-22 22:12:00   
std    155494.450593  2.929728e+08                            NaN   

       AggregatedRating    ReviewCount       Calories     FatContent  \
count     269291.000000  275025.000000  522512.000000  522512.000000   
mean           4.632011       5.227790     484.429822      24.614011   
min            1.000000       1.000000       0.000000       0.000000   
25%            4.500000       1.000000     174.200000       5.600000   
50%            5.0

**Dataset Summary Observations:**

- **Extreme scales & outliers:** `Calories`, `FatContent`, `SodiumContent`, `SugarContent`, `RecipeServings` max far above 75th percentile – likely errors.  
- **Skewed distributions:** Means > medians for nutrition columns – right-skewed.  
- **Missing data:** `AggregatedRating`, `ReviewCount`, `RecipeServings` have many NaNs.  
- **Time inconsistencies:** `PrepTime_min`, `CookTime_min`, `TotalTime_min` have unrealistic maxima

Preprocessing / Cleaning

In [None]:
# Parse and clean string columns
recipes['ingredients_clean'] = parse_r_list_column(recipes['RecipeIngredientParts'])
recipes['category_clean'] = parse_r_list_column(recipes['RecipeCategory']) if 'RecipeCategory' in recipes.columns else ""
recipes['keywords_clean'] = parse_r_list_column(recipes['Keywords']) if 'Keywords' in recipes.columns else ""
recipes['recipe_instructions_clean'] = clean_r_instructions(recipes['RecipeInstructions']) if 'RecipeInstructions' in recipes.columns else ""

In [None]:
# Apply cleaning to all relevant columns
recipes['combined_text'] = (
    recipes['ingredients_clean'] + ", " +
    recipes['category_clean'] + ", " +
    recipes['keywords_clean'] + ", " +
    recipes['Description'].fillna("")
).apply(clean_text)

In [None]:
# Columns to clip
numeric_cols_to_clip = [
    "TotalTime_min", "Calories", "FatContent", "SaturatedFatContent",
    "CholesterolContent", "SodiumContent", "CarbohydrateContent",
    "FiberContent", "SugarContent", "ProteinContent"
]

# Apply clipping
recipes_clipped, clip_thresholds = clip_top_outliers(recipes, numeric_cols_to_clip)

# Show thresholds
for col, val in clip_thresholds.items():
    print(f"{col} clipped at: {val:.2f}")


TotalTime_min clipped at: 122.00
Calories clipped at: 1173.20
FatContent clipped at: 64.10
SaturatedFatContent clipped at: 24.40
CholesterolContent clipped at: 263.60
SodiumContent clipped at: 1792.20
CarbohydrateContent clipped at: 120.00
FiberContent clipped at: 10.50
SugarContent clipped at: 32.80
ProteinContent clipped at: 46.90


In [None]:
# Apply these functions to the df
recipes_clipped['time_bin'] = recipes_clipped['TotalTime_min'].apply(time_bin)
recipes_clipped['calorie_bin'] = recipes_clipped['Calories'].apply(calorie_bin)
recipes_clipped['Image_first'] = recipes_clipped['Images'].apply(get_first_url)


In [None]:
# Columns needed for recommendation
important_columns = [
    'Name',
    'Image_first',
    'recipe_instructions_clean',
    'Calories',
    'combined_text',    
    'TotalTime_min',            
    'time_bin',        
    'calorie_bin'     
]

# Create the final_df
final_df = recipes_clipped[important_columns].copy()
final_df.reset_index(drop=True, inplace=True)
final_df.head()

Unnamed: 0,Name,Image_first,recipe_instructions_clean,Calories,combined_text,TotalTime_min,time_bin,calorie_bin
0,Low-Fat Berry Blue Frozen Dessert,"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/YUeirxMLQaeE1h3v3qnM_229%20berry%20blue%20frzn%20dess.jpg","Toss 2 cups berries with sugar., Let stand for 45 minutes, stirring occasionally., Transfer berry-sugar mixture to food processor., Add yogurt and process until smooth., Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft. Transfer to processor and blend until smooth again., Return to pan and freeze until edges are solid., Transfer to processor and blend until smooth again., Fold in remaining 2 cups of blueberries., Pour into plastic mold and freeze overnight. Let soften slightly to serve.",170.9,blueberries granulated sugar vanilla yogurt lemon juice frozen desserts dessert low protein low cholesterol healthy free of summer weeknight freezer easy make and share this lowfat berry blue frozen dessert recipe from foodcom,122.0,long,low
1,Biryani,"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/39/picM9Mhnw.jpg","Soak saffron in warm milk for 5 minutes and puree in blender., Add chiles, onions, ginger, garlic, cloves, peppercorns, cardamom seeds, cinnamon, coriander and cumin seeds, poppy seeds, nutmeg, mace, cilantro or mint leaves and lemon juice. Blend into smooth paste. Put paste into large bowl, add yogurt and mix well., Marinate chicken in yogurt mixture with salt, covered for at least 2 - 6 hours in refrigerator., In skillet. heat oil over medium heat for 1 minute. Add ghee and 15 seconds later add onion and fry for about8 minutes., Reserve for garnish., In same skillet, cook chicken with its marinade with tomatoes for about 10 minutes over medium heat, uncovered., Remove chicken pieces from the sauce and set aside. Add rice to sauce, bring to boil, and cook, covered over low heat for 15 minutes., Return chicken and add raisins, cashews and almonds; mix well., Simmer, covered for 5 minutes., Place chicken, eggs and rice in large serving dish in such a way that yellow of the eggs, the saffron-colored rice, the nuts and the chicken make a colorful display., Add reserved onion as garnish.",1110.7,saffron milk hot green chili peppers onions garlic clove peppercorns cardamom seed cumin seed poppy seed mace cilantro mint leaf fresh lemon juice plain yogurt boneless chicken salt ghee onion tomatoes basmati rice longgrain rice raisins cashews eggs chicken breast chicken thigh leg chicken poultry meat asian indian weeknight stove top make and share this biryani recipe from foodcom,122.0,long,high
2,Best Lemonade,"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/40/picJ4Sz3N.jpg","Into a 1 quart Jar with tight fitting lid, put sugar and lemon peel, or zest; add 1 1/2 cups very hot water (not from tap!). With lid fitted firmly, shake jar until sugar is dissolved., Add lemon juice. Refrigerate until chilled., To Serve: Into each 12-ounce glass, over ice cubes, pour 1/4 cup of the lemon syrup., Then add chilled club soda or, if you prefer, water., Stir to mix well.",311.1,sugar lemons rind of lemon zest of fresh water fresh lemon juice beverages low protein low cholesterol healthy summer 60 mins this is from one of my first good house keeping cookbooks you must use a zester in order to avoid getting any of that bitter rind and when you zest the lemons zest them onto some sugar from the recipe the sugar will catch all of the oils i also advise you from personal experience to use only the best skinned lemons for the best flavor,35.0,medium,medium
3,Carina's Tofu-Vegetable Kebabs,"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/41/picmbLig8.jpg","Drain the tofu, carefully squeezing out excess water, and pat dry with paper towels., Cut tofu into one-inch squares., Set aside. Cut eggplant lengthwise in half, then cut each half into approximately three strips., Cut strips crosswise into one-inch cubes., Slice zucchini into half-inch thick slices., Cut red pepper in half, removing stem and seeds, and cut each half into one-inch squares., Wipe mushrooms clean with a moist paper towel and remove stems., Thread tofu and vegetables on to barbecue skewers in alternating color combinations: For example, first a piece of eggplant, then a slice of tofu, then zucchini, then red pepper, baby corn and mushrooms., Continue in this way until all skewers are full., Make the marinade by putting all ingredients in a blender, and blend on high speed for about one minute until mixed., Alternatively, put all ingredients in a glass jar, cover tightly with the lid and shake well until mixed., Lay the kebabs in a long, shallow baking pan or on a non-metal tray, making sure they lie flat. Evenly pour the marinade over the kebabs, turning them once so that the tofu and vegetables are coated., Refrigerate the kebabs for three to eight hours, occasionally spooning the marinade over them., Broil or grill the kebabs at 450 F for 15-20 minutes, or on the grill, until the vegetables are browned., Suggestions This meal can be served over cooked, brown rice. Amounts can easily be doubled to make four servings.",536.1,extra firm tofu eggplant zucchini mushrooms soy sauce low sodium soy sauce olive oil maple syrup honey red wine vinegar lemon juice garlic cloves mustard powder black pepper soytofu beans vegetable low cholesterol weeknight broilgrill oven this dish is best prepared a day in advance to allow the ingredients to soak in the marinade overnight,122.0,long,medium
4,Cabbage Soup,"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/42/picVEMxk8.jpg","Mix everything together and bring to a boil., Reduce heat and simmer for 30 minutes (longer if you prefer your veggies to be soft)., Refrigerate until cool., Serve chilled with sour cream.",103.6,plain tomato juice cabbage onion carrots celery vegetable low protein vegan low cholesterol healthy winter 60 mins easy make and share this cabbage soup recipe from foodcom,50.0,medium,low


TF-IDF

In [None]:
# Create TF-IDF Vectors
vectorizer = TfidfVectorizer(max_features=2000)
recipe_vectors = vectorizer.fit_transform(final_df['combined_text'])
recipe_vectors = recipe_vectors.toarray()

# Normalize for cosine similarity
recipe_vectors = recipe_vectors / (np.linalg.norm(recipe_vectors, axis=1, keepdims=True) + 1e-10)

# Keep as separate numpy array instead of storing in DataFrame
recipe_vectors_matrix = recipe_vectors 

In [None]:
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Show full column width (no truncation)
pd.set_option('display.max_colwidth', None)

In [None]:
top_recipes = recommend(
    user_prefs="Italiaan pastaa vegegtarian",
    dataset=final_df,
    recipe_vectors_matrix=recipe_vectors_matrix,
    vectorizer=vectorizer,
    top_n=5,
    time_pref="fast",
    calorie_pref="low",
    vocab_column="combined_text",
    time_column="time_bin",
    calorie_column="calorie_bin",
    return_columns=["Name", "Image_first", "recipe_instructions_clean", "TotalTime_str", "Calories"]
)
print(top_recipes)

                                             Name  \
101092          Fast, Tasty, Low Fat  Pasta Sauce   
299097  Vegetarian Low Fat &quot;sloppy Joe&quot;   
38913                                  Easy pasta   
221436                         Easy Italian Pasta   
62523              Zesty Italian Pasta Bean Salad   

                                                                                                                Image_first  \
101092                                                                                                                 None   
299097                                                                                                                 None   
38913                                                                                                                  None   
221436                                                                                                                 None   
62523   https://img.sndimg.com/food/image/uplo