## Phase 1 - Project Setup ##

In [36]:
# imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ast
import re

In [37]:
df = pd.read_csv("1_Recipe_csv.csv") # Loads csv file
df.head() # Displays first few rows in csv

Unnamed: 0,recipe_title,category,subcategory,description,ingredients,directions,num_ingredients,num_steps
0,Air Fryer Potato Slices with Dipping Sauce,Air Fryer Recipes,Air Fryer Recipes,"These air fryer potato slices, served with a b...","[""3/4 cup ketchup"", ""1/2 cup beer"", ""1 tablesp...","[""Combine ketchup, beer, Worcestershire sauce,...",9,5
1,Gochujang Pork Belly Bites,Air Fryer Recipes,Air Fryer Recipes,These gochujang pork belly bites are sweet and...,"[""1 pound pork belly"", ""1/4 cup gochujang"", ""2...","[""Preheat an air fryer to 400 degrees F (200 d...",5,4
2,3-Ingredient Air Fryer Everything Bagel Chicke...,Air Fryer Recipes,Air Fryer Recipes,These 3-ingredient air fryer everything bagel ...,"[""1 \u00bc pounds chicken tenders"", ""1 tablesp...","[""Gather all ingredients. Preheat an air fryer...",3,4
3,Air Fryer Everything Bagel Chicken Cutlets,Air Fryer Recipes,Air Fryer Recipes,These air fryer everything bagel chicken cutle...,"[""4 chicken cutlets (about 1 pound total)"", ""s...","[""Preheat an air fryer to 400 degrees F (200 d...",9,9
4,Air Fryer Honey Sriracha Salmon Bites,Air Fryer Recipes,Air Fryer Recipes,These air fryer honey Sriracha salmon bites ar...,"[""1 tablespoon soy sauce"", ""1 tablespoon honey...","[""Preheat an air fryer to 400 degrees F (200 d...",5,5


In [38]:
df.describe() # Generates descriptive statistics

Unnamed: 0,num_ingredients,num_steps
count,62126.0,62126.0
mean,9.017448,4.661865
std,3.830323,2.310253
min,1.0,1.0
25%,6.0,3.0
50%,9.0,4.0
75%,11.0,6.0
max,35.0,25.0


In [39]:
df.isnull() # Checks for missing values

Unnamed: 0,recipe_title,category,subcategory,description,ingredients,directions,num_ingredients,num_steps
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
62121,False,False,False,False,False,False,False,False
62122,False,False,False,False,False,False,False,False
62123,False,False,False,False,False,False,False,False
62124,False,False,False,False,False,False,False,False


In [40]:
df.columns 

Index(['recipe_title', 'category', 'subcategory', 'description', 'ingredients',
       'directions', 'num_ingredients', 'num_steps'],
      dtype='object')

In [41]:
df.dtypes

recipe_title       object
category           object
subcategory        object
description        object
ingredients        object
directions         object
num_ingredients     int64
num_steps           int64
dtype: object

In [42]:
df.isnull().sum()

recipe_title       0
category           0
subcategory        0
description        0
ingredients        0
directions         0
num_ingredients    0
num_steps          0
dtype: int64

## Phase 2 - Data Cleaning & Preprocessing ##

**Parse Ingredients**

In [40]:
# Parse ingredients
def parse_ingredients(x):
    
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # treat the whole string as one ingredient
            return [x]
            # if it's something weird return empty list
            return []

df["ingredients_list"] = df["ingredients"].apply(parse_ingredients)

df["ingredients_list"].head()

0    [3/4 cup ketchup, 1/2 cup beer, 1 tablespoon W...
1    [1 pound pork belly, 1/4 cup gochujang, 2 tabl...
2    [1 ¼ pounds chicken tenders, 1 tablespoon oliv...
3    [4 chicken cutlets (about 1 pound total), salt...
4    [1 tablespoon soy sauce, 1 tablespoon honey, 1...
Name: ingredients_list, dtype: object

**Clean & Normalize Ingredients**

In [38]:
# Common measurement words to remove
MEASUREMENT_WORDS = {
    "cup", "cups",
    "tablespoon", "tablespoons", "tbsp", "tbsps",
    "teaspoon", "teaspoons", "tsp", "tsps",
    "pound", "pounds", "lb", "lbs",
    "ounce", "ounces", "oz",
    "gram", "grams", "g", "kg",
    "ml", "liter", "liters", "l",
    "slice", "slices",
    "clove", "cloves",
    "pinch", "dash",
    "package", "packages",
    "can", "cans"
}

# Normalizes raw ingredient string
def clean_single_ingredient(ing):
    
    if not isinstance(ing, str):
        return ""
    
    # lowercase
    ing = ing.lower()
    
    # remove fractions
    ing = re.sub(r"\d+/\d+", " ", ing)
    # remove digits
    ing = re.sub(r"\d+", " ", ing)
    
    # remove parentheses content
    ing = re.sub(r"\(.*?\)", " ", ing)
    
    # replace nonletter characters with spaces 
    ing = re.sub(r"[^a-z\s]", " ", ing)
    
    # split into tokens
    tokens = ing.split()
    
    # remove measurement words
    tokens = [t for t in tokens if t not in MEASUREMENT_WORDS]
    
    # collapse multiple spaces and join back
    cleaned = " ".join(tokens).strip()
    
    return cleaned

def clean_ingredients_list(ing_list):
    cleaned_list = []
    for ing in ing_list:
        cleaned = clean_single_ingredient(ing)
        if cleaned:  # skip empty
            cleaned_list.append(cleaned)
    return cleaned_list

df["clean_ingredients_list"] = df["ingredients_list"].apply(clean_ingredients_list)

df[["recipe_title", "ingredients_list", "clean_ingredients_list"]].head()


Unnamed: 0,recipe_title,ingredients_list,clean_ingredients_list
0,Air Fryer Potato Slices with Dipping Sauce,"[3/4 cup ketchup, 1/2 cup beer, 1 tablespoon W...","[ketchup, beer, worcestershire sauce, onion po..."
1,Gochujang Pork Belly Bites,"[1 pound pork belly, 1/4 cup gochujang, 2 tabl...","[pork belly, gochujang, soy sauce, honey, grou..."
2,3-Ingredient Air Fryer Everything Bagel Chicke...,"[1 ¼ pounds chicken tenders, 1 tablespoon oliv...","[chicken tenders, olive oil, everything bagel ..."
3,Air Fryer Everything Bagel Chicken Cutlets,"[4 chicken cutlets (about 1 pound total), salt...","[chicken cutlets, salt and freshly ground blac..."
4,Air Fryer Honey Sriracha Salmon Bites,"[1 tablespoon soy sauce, 1 tablespoon honey, 1...","[soy sauce, honey, sriracha, rice vinegar, gra..."


**Tokenize Ingredients**

In [46]:
df["ingredients_text"] = df["clean_ingredients_list"].apply(lambda lst: " ".join(lst))

df["ingredients_text"].head()


0    ketchup beer worcestershire sauce onion powder...
1    pork belly gochujang soy sauce honey ground gi...
2    chicken tenders olive oil everything bagel sea...
3    chicken cutlets salt and freshly ground black ...
4    soy sauce honey sriracha rice vinegar granulat...
Name: ingredients_text, dtype: object

**TF–IDF Matrix (for classification and clustering)**

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over ingredient tokens
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r"\b[a-zA-Z]+\b",  
    ngram_range=(1, 2)               
)

X_tfidf = tfidf_vectorizer.fit_transform(df["ingredients_text"])

print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (62126, 67698)


In [36]:
import numpy as np

# Build a vocabulary of unique ingredient phrases across all recipes
all_ingredients = set()
for lst in df["clean_ingredients_list"]:
    all_ingredients.update(lst)

ingredient_vocab = sorted(all_ingredients)
ingredient_index = {ing: i for i, ing in enumerate(ingredient_vocab)}

print("Number of unique cleaned ingredient phrases:", len(ingredient_vocab))

# Binary matrix
num_recipes = df.shape[0]
num_ingredients = len(ingredient_vocab)

X_binary = np.zeros((num_recipes, num_ingredients), dtype=int)

for row_idx, ing_list in enumerate(df["clean_ingredients_list"]):
    for ing in ing_list:
        col_idx = ingredient_index.get(ing)
        if col_idx is not None:
            X_binary[row_idx, col_idx] = 1

print("Binary ingredient matrix shape:", X_binary.shape)


Number of unique cleaned ingredient phrases: 31003
Binary ingredient matrix shape: (62126, 31003)


## Phase 5 - Clustering ##