In [20]:
import pandas as pd
import numpy as np
import string, re

import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

file_path = 'Food Ingredients and Recipe Dataset with Image Name Mapping.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)
#df.head()
df = df.set_index('Id')
df = df.dropna(subset=['Title'])    #1292

In [21]:
import pandas as pd
import re

# List of measurement words
measure_words = [
    'All' , 'purpose','you want' ,'you need','our','sharp','fork', 
    'half','Filling' , 'finely' , 'into' , 'and' , 'for' ,'with',
    'heat', 'inch' ,'piece','asian', 'taste','sprig','deep frying',
    'bottle', 'bottles', 'box', 'boxes', 'bunch', 'bunches', 'bushel', 'bushels',
    'can', 'cans', 'container', 'cup', 'cups', 'carton', 'cartons', 'dash', 'dashes',
    'drop', 'drops', 'fl', 'fl.', 'fluid', 'jar', 'jars', 'ounce', 'ounces', 
    'gallon', 'gallons', 'glass', 'glasses', 'gram', 'grams', 'kg', 'kgs', 'lb', 'lbs',
    'liter', 'liters',  'large', 'medium', 'ml', 'mls', 'package', 'pkg', 'small',
    'to taste', 'pinch', 'pinches', 'pint', 'pints', 'pound', 'pounds', 'qt', 'qts',
    'quart', 'quarts', 'scoop', 'scoops', 'sliced', 'slivered', 'stick', 'sticks',
    'tablespoon', 'tablespoons', 'tbs', 'tbsp', 'tbsps', 'teaspoon', 'teaspoons', 'tsp',
    'tsps', 'whole' ,  'all' , 'preserved','cut' ,'evaporated','fresh','powder','more',
    'divided', 'plus' ,  'unsweetened' , 'divided','peeled' ,'seeded','squash','cubes','including' ,
    'Tbsp', 'dark' ,  'round' , 'chopped','smoked' ,'new','melted','brown','hot' ,'grated' 
]

# Function to clean the ingredients text
def clean_once(text):
    if not isinstance(text, str):  # Check if text is a string
        return []
    
    ingredlist = []
    for ingred in text.split("#item,"):
        ingred = ingred.replace('#item', '')  # Scraping artifact on last item
        ingred = re.sub(r'\([^)]*\)', '', ingred)  # Remove anything inside parentheses
        #ingred = ingred.split(',')[0]  # Remove anything after a comma
        ingred = re.sub(r'\w*\d\w*', ' ', ingred)  # Remove numeric characters
        ingred = ingred.replace('⁄', ' ')  # Fraction slashes are annoying
        ingred = ' ' + ingred + ' '  # Padding in case measure_word is first or last
        
        for measure_word in measure_words:  # Remove measure words by themselves
            ingred = ingred.replace(measure_word, ' ')
        
        ingred = ingred.strip()
        # Remove words with one or two letters
        ingred = ' '.join([word for word in ingred.split() if len(word) > 2])
        
        # Remove non-alphabetic characters
        ingred = re.sub(r'[^a-zA-Z\s]', '', ingred)
        
        if ingred and not ingred.isupper():  # Important for multi-part recipes
            ingredlist.append(ingred)
    
    return ingredlist


# Applying the cleaning function to the DataFrame
df['CleanIngredients'] = df['Ingredients'].apply(clean_once)

# Drop rows where 'Ingredients' or 'CleanIngredients' columns contain NaN values
df = df.dropna(subset=['Ingredients', 'CleanIngredients'])


# Print the cleaned ingredients column
print(df['CleanIngredients'])

Id
0        [chicken kosher salt acorn sage rosemary unsal...
1        [egg whites potatoes kosher salt black pepper ...
2        [milk milk garlic onion paprika black pepper k...
3        [Italian loaf olive oil sweet Italian sausage ...
4        [sugar water oz bon oz lemon juice apple butte...
                               ...                        
13496    [cocoa doubleacting baking salt eggs granulate...
13497    [lemon butternut dice olive oil onion Israeli ...
13498    [Leftover katsuo bushi from making katsuo bush...
13499    [unsalted butter baby spinach feta crumbled nu...
13500    [poblano chiles tomatoes juice garlic cloves c...
Name: CleanIngredients, Length: 13491, dtype: object


In [22]:
df

Unnamed: 0_level_0,Title,Ingredients,Instructions,Image_Name,CleanIngredients
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,[chicken kosher salt acorn sage rosemary unsal...
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,[egg whites potatoes kosher salt black pepper ...
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,[milk milk garlic onion paprika black pepper k...
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,[Italian loaf olive oil sweet Italian sausage ...
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,[sugar water oz bon oz lemon juice apple butte...
...,...,...,...,...,...
13496,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408,[cocoa doubleacting baking salt eggs granulate...
13497,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...,[lemon butternut dice olive oil onion Israeli ...
13498,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,[Leftover katsuo bushi from making katsuo bush...
13499,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344,[unsalted butter baby spinach feta crumbled nu...


In [23]:
df.to_csv('CleanedRecipes.csv', index=False)