In [40]:
import ast
import re

import nltk
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Make sure the NLTK resources used by the helpers imported above are available
# locally: 'punkt' (word_tokenize), 'wordnet' (WordNetLemmatizer),
# 'stopwords' (stopwords.words) and 'averaged_perceptron_tagger' (pos_tag).
# Each call reports whether the package was fetched or is already up to date;
# the value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [64]:

# Every character that is not an ASCII letter or a space is replaced by a space.
_NON_ALPHA_RE = re.compile('[^a-zA-Z ]')
# Matches "and" only as a standalone word, so "candy" or "brandy" stay intact.
_AND_WORD_RE = re.compile(r'\band\b')
# POS tags that represent nouns.
_NOUN_TAGS = frozenset(('NN', 'NNS', 'NNP', 'NNPS'))


def preprocess_ingredients(ingredients_str):
    """Reduce a recipe's ingredient list to de-duplicated noun phrases.

    Each ingredient is lower-cased, stripped of non-letter characters and
    split on the standalone word "and". Every resulting component is
    tokenized and POS-tagged; only noun tokens are kept, lemmatized, and
    filtered (stop words and lemmas of two characters or fewer are dropped).
    The surviving lemmas of a component are joined with single spaces.

    Args:
        ingredients_str: The string form of a Python list of ingredient
            strings (e.g. ``"['salt and pepper', 'eggs']"``). An already
            parsed list/iterable of strings is accepted as well.

    Returns:
        list[str]: Unique processed components, in first-seen order.

    Raises:
        ValueError, SyntaxError: If ``ingredients_str`` is a string that is
            not a valid Python literal.
    """
    if isinstance(ingredients_str, str):
        # SECURITY: the original used eval() here, which would execute any
        # code embedded in the CSV cell. literal_eval only accepts literals.
        ingredients_list = ast.literal_eval(ingredients_str)
    else:
        ingredients_list = ingredients_str

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def is_meaningful(word):
        return len(word) > 2 and word not in stop_words

    processed_ingredients = []
    for ingr in ingredients_list:
        cleaned = _NON_ALPHA_RE.sub(' ', ingr.lower()).strip()

        for comp in _AND_WORD_RE.split(cleaned):
            comp = comp.strip()
            if not comp:
                continue

            nouns = []
            for token, tag in pos_tag(word_tokenize(comp)):
                if tag not in _NOUN_TAGS:
                    continue
                lemma = lemmatizer.lemmatize(token.strip())
                if not is_meaningful(lemma):
                    continue
                # The original lemmatized and filtered each noun a second
                # time; that pass is kept so results stay unchanged in case
                # lemmatizing a lemma yields a different word.
                lemma = lemmatizer.lemmatize(lemma)
                if is_meaningful(lemma):
                    nouns.append(lemma)

            if nouns:
                processed_ingredients.append(' '.join(nouns))

    # dict.fromkeys de-duplicates while keeping a stable order; a set would
    # produce a different ordering from one kernel start to the next.
    return list(dict.fromkeys(processed_ingredients))


def load_and_preprocess_data(file_path):
    """Read a recipe CSV and normalise its ``ingredients`` column.

    Column names are stripped of surrounding whitespace, then every value of
    the ``ingredients`` column is passed through ``preprocess_ingredients``
    and the column is overwritten with the results.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded frame with cleaned headers and the
        processed ``ingredients`` column.
    """
    frame = pd.read_csv(file_path)

    # Headers may carry stray padding; trim it before looking up columns.
    trimmed_headers = frame.columns.str.strip()
    frame.columns = trimmed_headers

    # Replace each raw ingredient entry with its processed form.
    processed_column = frame['ingredients'].apply(preprocess_ingredients)
    frame['ingredients'] = processed_column

    return frame


# Relative path to the recipe CSV (resolved against the notebook's working directory).
file_path = 'dataset/RAW_merged_top_smallest.csv'
# Load the CSV; its 'ingredients' column is replaced by the processed ingredient lists.
data = load_and_preprocess_data(file_path)

In [65]:
# Print each recipe's processed ingredient list, one per line, for a visual check.
# NOTE(review): this prints every row of the frame; consider limiting the
# preview (e.g. data['ingredients'].head()) if the dataset is large.
for ing in data['ingredients']:
    print(ing)

['chicken', 'potato', 'celery', 'salt', 'chicken broth', 'pepper', 'vegetable', 'pie crust', 'half', 'onion', 'margarine', 'flour']
['brown sugar', 'cake mix', 'egg', 'vanilla flavor pudding', 'water', 'pie filling', 'oil', 'pecan']
['skewer', 'salt', 'egg', 'milk', 'sugar', 'dog', 'powder', 'cornmeal', 'oil', 'flour']
['plain chocolate', 'hershey chocolate kiss', 'hershey hug chocolate', 'pretzel']
['chicken', 'salt', 'chicken broth', 'pepper', 'milk', 'vegetable', 'pie crust', 'onion', 'margarine', 'flour']
['vanilla', 'miniature peanut butter cup', 'peanut butter', 'brown sugar', 'salt', 'egg', 'sugar', 'soda', 'butter', 'flour']
['vanilla', 'almond', 'confectioner', 'cream cheese', 'strawberry']
['salt', 'egg', 'yeast', 'milk', 'sugar', 'butter', 'flour']
['ground cinnamon', 'buttermilk', 'salt', 'egg', 'sugar', 'soda', 'powder', 'butter', 'flour']
['boneless skinless chicken breast', 'salt', 'breadcrumb', 'monterey jack cheese', 'margarine', 'cheese', 'parsley']
['water', 'butter'