In [1]:
import spacy
spacy.cli.download("en_core_web_sm")


import pandas as pd
import json

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Data Preprocessing Pipeline

import pandas as pd
import json

# Load CSV file
raw_recipes = pd.read_csv("RAW_recipes.csv")

# Load JSON file
with open("train.json", "r") as file:
    train_recipes = json.load(file)  # Loads as a list of dicts
    train_recipes = pd.DataFrame(train_recipes)  # Convert to DataFrame

# View Data
print(raw_recipes.head())
print(train_recipes.head())


                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4          amish  tomato ketchup  for canning   44061      190   

   contributor_id   submitted  \
0           47892  2005-09-16   
1           26278  2002-06-17   
2          196586  2005-02-25   
3           68585  2003-04-14   
4           41706  2002-10-25   

                                                tags  \
0  ['60-minutes-or-less', 'time-to-make', 'course...   
1  ['30-minutes-or-less', 'time-to-make', 'course...   
2  ['time-to-make', 'course', 'preparation', 'mai...   
3  ['60-minutes-or-less', 'time-to-make', 'course...   
4  ['weeknight', 'time-to-make', 'course', 'main-...   

                                    nutrition  n_steps  \
0       [

In [3]:
# Select relevant columns

raw_recipes = raw_recipes[['name', 'ingredients', 'steps', 'minutes', 'tags', 'nutrition']]
train_recipes = train_recipes[['id', 'ingredients']]


In [4]:
# Convert Ingredients Lists into a Standard Format

import ast

def clean_ingredients(ingredient_list):
    """Convert stringified list into an actual Python list."""
    try:
        return ast.literal_eval(ingredient_list)  # Convert string to list
    except (ValueError, SyntaxError):
        return []  # Return empty list if there's an issue

# Apply function to datasets
raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(clean_ingredients)
train_recipes['ingredients'] = train_recipes['ingredients'].apply(clean_ingredients)


In [5]:
# Normalize Ingredients for NLP Processing

import re

def normalize_ingredients(ingredient_list):
    """Clean and normalize ingredient names."""
    normalized = []
    for ingredient in ingredient_list:
        ingredient = ingredient.lower()  # Lowercase
        ingredient = re.sub(r"\(.*?\)", "", ingredient)  # Remove parentheses
        ingredient = re.sub(r"[^a-zA-Z0-9\s]", "", ingredient)  # Remove special chars
        ingredient = ingredient.strip()
        normalized.append(ingredient)
    return normalized

# Apply normalization
raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(normalize_ingredients)
train_recipes['ingredients'] = train_recipes['ingredients'].apply(normalize_ingredients)


In [6]:
# Extract Cooking Time

raw_recipes['minutes'] = pd.to_numeric(raw_recipes['minutes'], errors='coerce')
raw_recipes = raw_recipes.dropna(subset=['minutes'])  # Drop invalid entries


In [7]:
import nltk
import os
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

nltk.data.path.insert(0, "/Users/nikitaudayshinde/nltk_data")
punkt_path = "/Users/nikitaudayshinde/nltk_data/tokenizers/punkt/english.pickle"

if os.path.exists(punkt_path):
    print("✅ 'punkt' tokenizer found! Loading manually...")

    with open(punkt_path, "rb") as f:
        tokenizer = pickle.load(f)

    nltk.tokenize.sent_tokenize = tokenizer.tokenize
else:
    print("❌ 'punkt' tokenizer NOT found! Check the path.")

# Verify tokenizer works now:
text = "Hello. This is a test sentence."
print(nltk.tokenize.sent_tokenize(text))

✅ 'punkt' tokenizer found! Loading manually...
['Hello.', 'This is a test sentence.']


In [8]:
# Tokenize Recipe Steps (For NLP)
import nltk
from nltk.tokenize import sent_tokenize

# Tokenize steps into sentences
raw_recipes['steps'] = raw_recipes['steps'].apply(lambda x: sent_tokenize(x) if isinstance(x, str) else [])


In [9]:
# Saves the Processed Data Locally
# these are to large to push

raw_recipes.to_csv("processed_recipes.csv", index=False)
train_recipes.to_json("processed_train.json", orient="records")

In [10]:
import pandas as pd
import re
import ast

# Load CSV file
csv_path = "RAW_recipes.csv"

try:
    raw_recipes = pd.read_csv(csv_path)
except FileNotFoundError:
    raw_recipes = None

# Feature Engineering Functions

def clean_ingredients(ingredient_list):
    """Convert stringified list into an actual Python list."""
    try:
        return ast.literal_eval(ingredient_list)  # Convert string to list
    except (ValueError, SyntaxError):
        return []  # Return empty list if there's an issue

def normalize_ingredients(ingredient_list):
    """Clean and normalize ingredient names."""
    normalized = []
    for ingredient in ingredient_list:
        ingredient = ingredient.lower()  # Lowercase
        ingredient = re.sub(r"\(.*?\)", "", ingredient)  # Remove parentheses
        ingredient = re.sub(r"[^a-zA-Z0-9\s]", "", ingredient)  # Remove special chars
        ingredient = ingredient.strip()
        normalized.append(ingredient)
    return normalized

if raw_recipes is not None:
    # Selecting relevant columns
    raw_recipes = raw_recipes[['name', 'ingredients', 'steps', 'minutes', 'tags', 'nutrition']]

    # Convert ingredients and steps into structured lists
    raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(clean_ingredients)
    raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(normalize_ingredients)

    # Extract primary ingredient (first in list as a heuristic)
    raw_recipes['primary_ingredient'] = raw_recipes['ingredients'].apply(lambda x: x[0] if x else "unknown")

    # Tokenizing Steps (Simple Word Tokenization)
    raw_recipes['tokenized_steps'] = raw_recipes['steps'].apply(lambda x: re.findall(r'\b\w+\b', str(x).lower()))

    # Extract Cooking Time Categories
    def categorize_time(minutes):
        """Categorize recipes based on time."""
        if minutes < 15:
            return "quick"
        elif minutes < 30:
            return "moderate"
        elif minutes < 60:
            return "long"
        else:
            return "very long"

    raw_recipes['time_category'] = raw_recipes['minutes'].apply(categorize_time)


In [11]:
import pandas as pd
import re
import ast

# Load Recipes Data
csv_path = "RAW_recipes.csv"  # Update the path if needed

try:
    raw_recipes = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: RAW_recipes.csv not found. Please check your file path.")

# Convert stringified ingredient lists into actual lists
def clean_ingredients(ingredient_list):
    """Convert stringified list into an actual Python list."""
    try:
        return ast.literal_eval(ingredient_list)  # Convert string to list
    except (ValueError, SyntaxError):
        return []  # Return empty list if there's an issue

# Normalize ingredient text
def normalize_ingredients(ingredient_list):
    """Clean and normalize ingredient names."""
    normalized = []
    for ingredient in ingredient_list:
        ingredient = ingredient.lower()  # Lowercase
        ingredient = re.sub(r"\(.*?\)", "", ingredient)  # Remove parentheses
        ingredient = re.sub(r"[^a-zA-Z0-9\s]", "", ingredient)  # Remove special chars
        ingredient = ingredient.strip()
        normalized.append(ingredient)
    return normalized

# Data Cleaning
if raw_recipes is not None:
    raw_recipes = raw_recipes[['name', 'ingredients', 'steps', 'minutes', 'tags', 'nutrition']]

    # Process ingredients
    raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(clean_ingredients)
    raw_recipes['ingredients'] = raw_recipes['ingredients'].apply(normalize_ingredients)

    # Extract time categories
    def categorize_time(minutes):
        """Categorize recipes based on time."""
        if minutes < 15:
            return "quick"
        elif minutes < 30:
            return "moderate"
        elif minutes < 60:
            return "long"
        else:
            return "very long"

    raw_recipes['time_category'] = raw_recipes['minutes'].apply(categorize_time)

# Rule-Based Recipe Search Function
def find_recipes(user_ingredients, baking_method=None, time_category=None):
    """
    Rule-based recipe search.
    Filters recipes based on user-provided ingredients, optional baking method, and time constraint.
    """
    if raw_recipes is None:
        print("Error: Recipe data not loaded.")
        return None

    # Convert user ingredients to lowercase for matching
    user_ingredients = [ing.lower() for ing in user_ingredients]

    # Filter recipes that contain at least one user-provided ingredient
    def ingredient_match(recipe_ingredients):
        return any(ing in recipe_ingredients for ing in user_ingredients)

    filtered_recipes = raw_recipes[raw_recipes["ingredients"].apply(ingredient_match)]

    # If a baking method is provided, filter by method in steps
    if baking_method:
        filtered_recipes = filtered_recipes[
            filtered_recipes["steps"].str.contains(baking_method, case=False, na=False)
        ]

    # If a time category is provided, filter by time constraint
    if time_category:
        filtered_recipes = filtered_recipes[filtered_recipes["time_category"] == time_category]

    # Sort recipes by the number of matching ingredients
    filtered_recipes["ingredient_match_count"] = filtered_recipes["ingredients"].apply(
        lambda recipe_ing: sum(ing in recipe_ing for ing in user_ingredients)
    )
    filtered_recipes = filtered_recipes.sort_values(by="ingredient_match_count", ascending=False)

    # Select top results
    top_results = filtered_recipes[["name", "ingredients", "steps", "minutes"]].head(5)
    
    return top_results

# Example Usage
user_ingredients = ["flour", "sugar", "butter"]
baking_method = "bake"
time_category = "quick"

recommended_recipes = find_recipes(user_ingredients, baking_method, time_category)


if recommended_recipes is not None:
    print("Recommended Recipes:")
    display(recommended_recipes)  # Use Pandas' built-in display function in Jupyter

Recommended Recipes:


Unnamed: 0,name,ingredients,steps,minutes
128,250 chocolate chip cookies recipe,"[butter, brown sugar, vanilla, blended oatmeal...",['blended oatmeal: measure and blend in a blen...,6
49386,chocolate bar cake,"[chocolate bars, butter, boiling water, flour,...","['heat oven to 350 degrees f', 'grease and flo...",0
60709,cranberry orange cookies jar mix,"[light brown sugar, sugar, flour, baking soda,...","['jarmix instructions', 'in 1-qt jar , place b...",10
55856,coconut orange squares,"[butter, sugar, egg, orange rind, milk, coconu...","['crem butter and sugar until light', 'beat in...",0
55495,coconut brownies,"[butter, sugar, eggs, vanilla, flour, unsweete...","['melt butter over low heat', 'then remove', '...",0


In [12]:
# Converts The Full Recipe Dataset
#  
import pandas as pd

# Load datasets from local files
df1 = pd.read_csv("PP_recipes.csv")
df2 = pd.read_csv("RAW_recipes.csv")
df3 = pd.read_json("test.json")
df4 = pd.read_json("train.json")

# Store datasets in a dictionary for easier access
datasets = {
    "PP_recipes": df1,
    "RAW_recipes": df2,
    "Test_Recipes": df3,
    "Train_Recipes": df4
}

# Print dataset column names to understand their structure
for name, df in datasets.items():
    print(f"{name} Columns:", df.columns)
# unit mapping
unit_mapping = {
    "tbsp": "tablespoon",
    "tbs": "tablespoon",
    "tsp": "teaspoon",
    "oz": "ounce",
    "g": "gram",
    "kg": "kilogram",
}

def standardize_units(ingredient):
    words = ingredient.split()
    for i, word in enumerate(words):
        if word in unit_mapping:
            words[i] = unit_mapping[word]
    return " ".join(words)

df["ingredients"] = df["ingredients"].apply(lambda x: [standardize_units(i) for i in x])

# tokenizing
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize_steps(steps):
    return [sent.text for sent in nlp(steps).sents]

df["steps"] = df["steps"].apply(tokenize_steps)


# unit mapping
unit_mapping = {
    "tbsp": "tablespoon",
    "tbs": "tablespoon",
    "tsp": "teaspoon",
    "oz": "ounce",
    "g": "gram",
    "kg": "kilogram",
}

def standardize_units(ingredient):
    words = ingredient.split()
    for i, word in enumerate(words):
        if word in unit_mapping:
            words[i] = unit_mapping[word]
    return " ".join(words)

df["ingredients"] = df["ingredients"].apply(lambda x: [standardize_units(i) for i in x])

# tokenizing
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize_steps(steps):
    return [sent.text for sent in nlp(steps).sents]

df["steps"] = df["steps"].apply(tokenize_steps)





PP_recipes Columns: Index(['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens',
       'techniques', 'calorie_level', 'ingredient_ids'],
      dtype='object')
RAW_recipes Columns: Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')
Test_Recipes Columns: Index(['id', 'ingredients'], dtype='object')
Train_Recipes Columns: Index(['id', 'cuisine', 'ingredients'], dtype='object')


KeyError: 'steps'

In [None]:
print(df.columns)

In [None]:
print(df.head())  # View the first few rows


In [None]:
df = pd.read_csv("RAW_recipes.csv")  # Ensure you're using the right file
print(df.columns)  # Check available columns


In [None]:
import pandas as pd
import re
import ast
import spacy

# ✅ Load the correct dataset
df = pd.read_csv("RAW_recipes.csv")  # Make sure this is the right file

# ✅ Ensure all necessary columns are present
required_columns = {"name", "ingredients", "steps", "minutes"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# ✅ Load spaCy model (Make sure you installed it: `python -m spacy download en_core_web_sm`)
nlp = spacy.load("en_core_web_sm")

# ✅ Function to clean and normalize ingredients
def clean_ingredients(ingredient_list):
    """Convert stringified list into an actual Python list."""
    try:
        return ast.literal_eval(ingredient_list)  # Convert string to list
    except (ValueError, SyntaxError):
        return []  # Return empty list if there's an issue

def normalize_ingredients(ingredient_list):
    """Clean and normalize ingredient names."""
    normalized = []
    for ingredient in ingredient_list:
        ingredient = ingredient.lower().strip()  # Lowercase & Trim
        ingredient = re.sub(r"\(.*?\)", "", ingredient)  # Remove text in parentheses
        ingredient = re.sub(r"[^a-zA-Z0-9\s]", "", ingredient)  # Remove special characters
        normalized.append(ingredient)
    return normalized

# ✅ Function to tokenize recipe steps
def tokenize_steps(steps):
    """Tokenize recipe steps into sentences."""
    if pd.isnull(steps) or not isinstance(steps, str):  # Handle NaN values
        return []
    return [sent.text for sent in nlp(steps).sents]

# ✅ Apply preprocessing functions
df["ingredients"] = df["ingredients"].apply(clean_ingredients)
df["ingredients"] = df["ingredients"].apply(normalize_ingredients)

# ✅ Ensure "steps" is a string before tokenizing
df["steps"] = df["steps"].astype(str)
df["tokenized_steps"] = df["steps"].apply(tokenize_steps)

# ✅ Extract time categories
def categorize_time(minutes):
    """Categorize recipes based on time required."""
    if minutes < 15:
        return "quick"
    elif minutes < 30:
        return "moderate"
    elif minutes < 60:
        return "long"
    else:
        return "very long"

df["time_category"] = df["minutes"].apply(categorize_time)

# ✅ Display processed data
import ace_tools as tools
tools.display_dataframe_to_user(name="Processed Recipes Data", dataframe=df)

print("✅ Data preprocessing complete!")


In [None]:
import pandas as pd
import spacy

# ✅ Load dataset
df = pd.read_csv("RAW_recipes.csv")

# ✅ Load spaCy model (Disable unused features to speed up processing)
nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])

# ✅ Convert "steps" column to strings
df["steps"] = df["steps"].astype(str)

# ✅ Process all steps in **bulk** (MUCH faster than apply)
df["tokenized_steps"] = list(nlp.pipe(df["steps"], batch_size=100))

print("✅ Tokenization completed successfully!")
