In [1]:
# Data Preprocessing Pipeline

import pandas as pd
import json

# Load the recipes CSV into a DataFrame
raw_recipes = pd.read_csv("RAW_recipes.csv")

# Load the JSON file. JSON text is UTF-8 by specification, so the encoding is
# pinned explicitly instead of relying on the platform default, which is not
# UTF-8 on every system and would garble non-ASCII ingredient names.
with open("train.json", "r", encoding="utf-8") as file:
    train_records = json.load(file)  # expected to be a list of dicts

# Build the DataFrame once the file handle has been closed
train_recipes = pd.DataFrame(train_records)

# View Data
print(raw_recipes.head())
print(train_recipes.head())


                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4          amish  tomato ketchup  for canning   44061      190   

   contributor_id   submitted  \
0           47892  2005-09-16   
1           26278  2002-06-17   
2          196586  2005-02-25   
3           68585  2003-04-14   
4           41706  2002-10-25   

                                                tags  \
0  ['60-minutes-or-less', 'time-to-make', 'course...   
1  ['30-minutes-or-less', 'time-to-make', 'course...   
2  ['time-to-make', 'course', 'preparation', 'mai...   
3  ['60-minutes-or-less', 'time-to-make', 'course...   
4  ['weeknight', 'time-to-make', 'course', 'main-...   

                                    nutrition  n_steps  \
0       [

In [2]:
# Select relevant columns
#
# .copy() turns each selection into an independent DataFrame. Later cells
# assign into these frames (e.g. frame['ingredients'] = ...); without the copy
# pandas may emit SettingWithCopyWarning because the frame is derived from a
# selection of another frame.
raw_recipes = raw_recipes[['name', 'ingredients', 'steps', 'minutes', 'tags', 'nutrition']].copy()
train_recipes = train_recipes[['id', 'ingredients']].copy()


In [3]:
# Convert Ingredients Lists into a Standard Format

import ast

def clean_ingredients(ingredient_list):
    """Return the ingredients as an actual Python list.

    Args:
        ingredient_list: Either a real list/tuple of ingredients, or a string
            containing a Python list literal (e.g. "['salt', 'pepper']").

    Returns:
        list: The ingredients as a list. An empty list is returned when the
        value cannot be interpreted as a list (malformed string, NaN, None,
        or a literal that is not a list/tuple).
    """
    # Values that are already sequences must be passed through unchanged:
    # ast.literal_eval() rejects non-string input with ValueError, which would
    # otherwise silently replace valid data with [].
    if isinstance(ingredient_list, (list, tuple)):
        return list(ingredient_list)

    try:
        parsed = ast.literal_eval(ingredient_list)  # Convert string to literal
    except (ValueError, SyntaxError, TypeError):
        return []  # Unparseable or non-string input

    # Only list-like literals honour the "returns a list" contract.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Parse the 'ingredients' column of both datasets with clean_ingredients
for recipes_frame in (raw_recipes, train_recipes):
    recipes_frame['ingredients'] = recipes_frame['ingredients'].apply(clean_ingredients)


In [4]:
# Normalize Ingredients for NLP Processing

import re

def normalize_ingredients(ingredient_list):
    """Clean and normalize ingredient names.

    Each name is lower-cased, parenthesised remarks are removed, every
    character other than ASCII letters, digits and whitespace is dropped, and
    runs of whitespace are collapsed to a single space.

    Args:
        ingredient_list: Iterable of ingredient names.

    Returns:
        list[str]: The normalized names, in input order. Entries that are not
        strings (e.g. None or NaN) are skipped.
    """
    normalized = []
    for ingredient in ingredient_list:
        if not isinstance(ingredient, str):
            continue  # Non-text entries cannot be normalized

        text = ingredient.lower()  # Lowercase
        text = re.sub(r"\(.*?\)", "", text)  # Remove parentheses
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special chars
        # Removing text can leave doubled spaces behind ("salt & pepper" ->
        # "salt  pepper"); collapse them so equal ingredients compare equal.
        # split()/join also trims leading and trailing whitespace.
        text = " ".join(text.split())
        normalized.append(text)
    return normalized

# Normalize the 'ingredients' column of both datasets
for recipes_frame in (raw_recipes, train_recipes):
    recipes_frame['ingredients'] = recipes_frame['ingredients'].apply(normalize_ingredients)


In [5]:
# Extract Cooking Time

# Coerce 'minutes' to a numeric dtype; values that cannot be parsed become NaN
minutes_numeric = pd.to_numeric(raw_recipes['minutes'], errors='coerce')
raw_recipes['minutes'] = minutes_numeric

# Keep only the rows that still have a cooking time after coercion
raw_recipes = raw_recipes.dropna(subset=['minutes'])


In [6]:
import nltk
import os
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Locate the NLTK data directory under the current user's home directory
# rather than hard-coding one machine's absolute path, so the notebook also
# runs for other users.
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
punkt_path = os.path.join(nltk_data_dir, "tokenizers", "punkt", "english.pickle")

# Give this directory priority in NLTK's resource search path
nltk.data.path.insert(0, nltk_data_dir)

if os.path.exists(punkt_path):
    print("✅ 'punkt' tokenizer found! Loading manually...")

    # SECURITY: pickle.load() can execute arbitrary code from the file.
    # Only load a punkt model obtained from a trusted source.
    with open(punkt_path, "rb") as f:
        tokenizer = pickle.load(f)

    # Replace nltk's sent_tokenize with the manually loaded tokenizer so that
    # later `from nltk.tokenize import sent_tokenize` picks it up.
    nltk.tokenize.sent_tokenize = tokenizer.tokenize
else:
    print("❌ 'punkt' tokenizer NOT found! Check the path.")

# Verify tokenizer works now:
text = "Hello. This is a test sentence."
print(nltk.tokenize.sent_tokenize(text))

✅ 'punkt' tokenizer found! Loading manually...
['Hello.', 'This is a test sentence.']


In [7]:
# Tokenize Recipe Steps (For NLP)
import nltk
from nltk.tokenize import sent_tokenize

# Tokenize steps into sentences
def _steps_to_sentences(steps):
    """Split a steps string into sentences.

    Lists are returned unchanged so that re-running this cell does not wipe
    out steps that were already tokenized; any other value yields [].
    """
    if isinstance(steps, str):
        return sent_tokenize(steps)
    if isinstance(steps, list):
        return steps  # Already tokenized by a previous run of this cell
    return []

raw_recipes['steps'] = raw_recipes['steps'].apply(_steps_to_sentences)


In [8]:
# Persist the processed datasets to the working directory
recipes_csv_path = "processed_recipes.csv"
train_json_path = "processed_train.json"

# Recipes are written as CSV without the DataFrame index column
raw_recipes.to_csv(recipes_csv_path, index=False)

# Training data is written as a JSON array with one object per row
train_recipes.to_json(train_json_path, orient="records")