In [25]:
from nltk.corpus import wordnet
synonyms = []
for syn in wordnet.synsets("eggplant"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(synonyms)  # Output: ['coriander', 'cilantro', ...]

['eggplant', 'aubergine', 'mad_apple', 'eggplant', 'aubergine', 'brinjal', 'eggplant_bush', 'garden_egg', 'mad_apple', 'Solanum_melongena']


In [5]:
# Import NLTK
import nltk

# Download the WordNet resource
nltk.download('wordnet')

# Verify it works by importing and using WordNet
from nltk.corpus import wordnet
print("WordNet successfully installed!")
print(f"Example: Synsets for 'python': {wordnet.synsets('python')}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paili\AppData\Roaming\nltk_data...


WordNet successfully installed!
Example: Synsets for 'python': [Synset('python.n.01'), Synset('python.n.02'), Synset('python.n.03')]


In [30]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download WordNet data (only needed first time)
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

test_ingredients = [
    # "japanese eggplants",
    # "persian cucumbers",
    # "baby corns",
    # "cauliflower florets",
    # "roma tomatoes",
    # "fresh basil leaves",
    # "extra virgin olive oil",
    # "minced garlic cloves",
    # "grated parmesan cheese",
    # "whole wheat spaghetti"
    "coleslaw mix",
    "banana leaves",
    "savoy cabbage leaves",
    "napa cabbage leaves",
    "cauliflower florets",
    "celery leaves",
    "red belgian endive",
    "plain yogurt",
    "baby corn",
    "low fat vanilla yogurt",
    "persian cucumber",
    "japanese eggplant",
    "bottled garlic",
    "bottled ginger",
    "lacinato kale",
    "mixed mushrooms",
]

def normalize_with_wordnet(ingredient):
    words = ingredient.split()
    # Try lemmatizing as noun first, then verb if that doesn't change the word
    lemmatized = [lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'n'), 'v') 
                 for word in words]
    return ' '.join(lemmatized)

print("WordNet Lemmatizer Results:")
print("-" * 40)
for ingredient in test_ingredients:
    normalized = normalize_with_wordnet(ingredient)
    print(f"{ingredient.ljust(25)} => {normalized}")

WordNet Lemmatizer Results:
----------------------------------------
coleslaw mix              => coleslaw mix
banana leaves             => banana leaf
savoy cabbage leaves      => savoy cabbage leaf
napa cabbage leaves       => napa cabbage leaf
cauliflower florets       => cauliflower floret
celery leaves             => celery leaf
red belgian endive        => red belgian endive
plain yogurt              => plain yogurt
baby corn                 => baby corn
low fat vanilla yogurt    => low fat vanilla yogurt
persian cucumber          => persian cucumber
japanese eggplant         => japanese eggplant
bottled garlic            => bottle garlic
bottled ginger            => bottle ginger
lacinato kale             => lacinato kale
mixed mushrooms           => mix mushroom


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paili\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\paili\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

test_ingredients = [
    # "japanese eggplants",
    # "persian cucumbers",
    # "baby corns",
    # "cauliflower florets",
    # "roma tomatoes",
    # "fresh basil leaves",
    # "extra virgin olive oil",
    # "minced garlic cloves",
    # "grated parmesan cheese",
    # "whole wheat spaghetti"

"canned cannellini beans",
"canned blackeyed peas",
"dried red lentils",
"canned butter beans",
"red miso",
"kecap manis",
"firm tofu",
"milk substitute",
"lower sodium shoyu",
"lamb",
"corn muffin mix",
"crusty bread",
"crostini",
"wheat flatbreads",
"bread cubes",
"panko",
"matzo",
"oyster crackers",
"shortcrust pastry",
"sesame seed burger bun",
"whole wheat hot dog bun",
"wrap",
"wonton wrappers",
"calumet baking powder",
"flatbread"
]

def normalize_with_spacy(ingredient):
    doc = nlp(ingredient)
    
    # Strategy 1: Get the root/main noun
    root = [token.text for token in doc if token.dep_ == 'ROOT']
    
    # Strategy 2: Get all nouns
    nouns = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    
    # Strategy 3: Get compound nouns
    compounds = []
    for token in doc:
        if token.dep_ in ('compound', 'amod'):
            compounds.append(token.text)
        if token.pos_ == 'NOUN':
            compounds.append(token.lemma_)
            break
    
    return {
        'root': root[0] if root else None,
        'nouns': ', '.join(nouns) if nouns else None,
        'compounds': ', '.join(compounds) if compounds else None
    }

print("\nspaCy Results:")
print("-" * 40)
for ingredient in test_ingredients:
    results = normalize_with_spacy(ingredient)
    print(f"Original: {ingredient}")
    print(f"  Root: {results['root']}")
    print(f"  Nouns: {results['nouns']}")
    print(f"  Compounds: {results['compounds']}")
    print("-" * 20)


spaCy Results:
----------------------------------------
Original: canned cannellini beans
  Root: beans
  Nouns: cannellini, bean
  Compounds: canned, cannellini, cannellini
--------------------
Original: canned blackeyed peas
  Root: canned
  Nouns: pea
  Compounds: blackeyed, pea
--------------------
Original: dried red lentils
  Root: lentils
  Nouns: lentil
  Compounds: dried, red, lentil
--------------------
Original: canned butter beans
  Root: beans
  Nouns: butter, bean
  Compounds: canned, butter, butter
--------------------
Original: red miso
  Root: miso
  Nouns: miso
  Compounds: red, miso
--------------------
Original: kecap manis
  Root: manis
  Nouns: None
  Compounds: kecap
--------------------
Original: firm tofu
  Root: tofu
  Nouns: firm, tofu
  Compounds: firm, firm
--------------------
Original: milk substitute
  Root: substitute
  Nouns: milk, substitute
  Compounds: milk, milk
--------------------
Original: lower sodium shoyu
  Root: shoyu
  Nouns: sodium, shoyu

In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")

test_ingredients = [
    # "japanese eggplants",
    # "persian cucumbers",
    # "baby corns",
    # "cauliflower florets",
    # "roma tomatoes",
    # "fresh basil leaves",
    # "extra virgin olive oil",
    # "minced garlic cloves",
    # "grated parmesan cheese",
    # "whole wheat spaghetti"

"canned cannellini beans",
"canned blackeyed peas",
"dried red lentils",
"canned butter beans",
"red miso",
"kecap manis",
"firm tofu",
"milk substitute",
"lower sodium shoyu",
"lamb",
"corn muffin mix",
"crusty bread",
"crostini",
"wheat flatbreads",
"bread cubes",
"panko",
"matzo",
"oyster crackers",
"shortcrust pastry",
"sesame seed burger bun",
"whole wheat hot dog bun",
"wrap",
"wonton wrappers",
"calumet baking powder",
"flatbread"
]

def normalize_ingredient(name):
    # Check against known exceptions first
    # for pattern, replacement in normalization_rules.items():
    #     if re.search(pattern, name, re.I):
    #         return replacement
    
    # Otherwise use NLP approach
    doc = nlp(name.lower())
    # Get nouns and compound nouns
    tokens = [token.lemma_ for token in doc if token.pos_ in ('NOUN', 'PROPN')]
    return ' '.join(tokens[-1:])  # Take the last noun as most important

    # nouns = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    # return max(set(nouns), key=nouns.count) if nouns else name


print("\nspaCy Results:")
print("-" * 40)
for ingredient in test_ingredients:
    result = normalize_ingredient(ingredient)
    print(f"Original: {ingredient}")
    print(f"  Result: {result}")
    print("-" * 20)


spaCy Results:
----------------------------------------
Original: canned cannellini beans
  Result: bean
--------------------
Original: canned blackeyed peas
  Result: pea
--------------------
Original: dried red lentils
  Result: lentil
--------------------
Original: canned butter beans
  Result: bean
--------------------
Original: red miso
  Result: miso
--------------------
Original: kecap manis
  Result: kecap manis
--------------------
Original: firm tofu
  Result: firm
--------------------
Original: milk substitute
  Result: milk
--------------------
Original: lower sodium shoyu
  Result: shoyu
--------------------
Original: lamb
  Result: lamb
--------------------
Original: corn muffin mix
  Result: corn
--------------------
Original: crusty bread
  Result: bread
--------------------
Original: crostini
  Result: crostini
--------------------
Original: wheat flatbreads
  Result: wheat
--------------------
Original: bread cubes
  Result: cube
--------------------
Original: panko


In [17]:
import spacy
from sqlalchemy import create_engine, MetaData, Table, select, text
from sqlalchemy.exc import SQLAlchemyError

# Initialize NLP processor
print("Loading spaCy language model...")
nlp = spacy.load("en_core_web_sm")

# MSSQL Database configuration with Integrated Security
DB_CONFIG = {
    'driver': 'ODBC Driver 17 for SQL Server',
    'server': '(localdb)\\MSSQLLocalDB',
    'database': 'GroceryDB',
    'schema': 'dbo',
    'ingredients_table': 'Ingredients',
    'results_table': 'IngredientName'
}

def get_db_engine():
    """Create SQLAlchemy engine for MSSQL using Windows Authentication"""
    connection_string = (
        f"mssql+pyodbc://{DB_CONFIG['server']}/{DB_CONFIG['database']}?"
        f"driver={DB_CONFIG['driver']}&"
        f"trusted_connection=yes"
    )
    # connection_string = "DRIVER={ODBC Driver 17 for SQL Server};SERVER=(localdb)\\MSSQLLocalDB;DATABASE=GroceryDB;Trusted_Connection=yes;"
    return create_engine(connection_string)

def extract_nouns(text):
    """Extract nouns from text using spaCy"""
    if not text or str(text).strip() == '':
        return None, None
        
    doc = nlp(str(text).lower())
    nouns = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    
    last_noun = nouns[-1] if nouns else text
    all_nouns = ' '.join(nouns) if nouns else text
    
    return last_noun, all_nouns

def get_unprocessed_ingredients(engine):
    """Retrieve only ingredients that haven't been processed yet"""
    metadata = MetaData()
    
    ingredients = Table(
        DB_CONFIG['ingredients_table'],
        metadata,
        autoload_with=engine,
        schema=DB_CONFIG['schema']
    )
    
    processed = Table(
        DB_CONFIG['results_table'],
        metadata,
        autoload_with=engine,
        schema=DB_CONFIG['schema']
    )
    
    query = select(
        ingredients.c.IngredientId,
        ingredients.c.Name
    ).select_from(
        ingredients.outerjoin(
            processed,
            ingredients.c.IngredientId == processed.c.IngredientId
        )
    ).where(
        processed.c.IngredientId == None
    )
    
    with engine.connect() as conn:
        result = conn.execute(query)
        return result.fetchall()

def process_ingredients():
    """Main processing function with incremental update support"""
    engine = get_db_engine()
    
    try:
        # Get only unprocessed ingredients
        unprocessed = get_unprocessed_ingredients(engine)
        
        if not unprocessed:
            print("No new ingredients to process")
            return
            
        print(f"Found {len(unprocessed)} new ingredients to process...")
        
        # Process in batches
        batch_size = 100
        for i in range(0, len(unprocessed), batch_size):
            batch = unprocessed[i:i + batch_size]
            processed_data = []
            
            for id, name in batch:
                try:
                    last_noun, all_nouns = extract_nouns(name)
                    processed_data.append({
                        'IngredientId': id,
                        'OriginalName': name,
                        'LastNoun': last_noun,
                        'AllNouns': all_nouns
                    })
                except Exception as e:
                    processed_data.append({
                        'IngredientId': id,
                        'OriginalName': name,
                        'LastNoun': name,
                        'AllNouns': name
                    })
            
            # Insert batch into database
            if processed_data:
                insert_stmt = f"""
                INSERT INTO {DB_CONFIG['schema']}.{DB_CONFIG['results_table']} 
                (IngredientId, OriginalName, LastNoun, Processed)
                VALUES (:IngredientId, :OriginalName, :LastNoun, :AllNouns)
                """
                with engine.begin() as conn:
                    conn.execute(text(insert_stmt), processed_data)
            
            print(f"Processed {min(i + batch_size, len(unprocessed))}/{len(unprocessed)}")
        
        print("Processing complete!")
        
    except SQLAlchemyError as e:
        print(f"Database error: {e}")

if __name__ == "__main__":
    process_ingredients()

Loading spaCy language model...
Found 1828 new ingredients to process...
Processed 100/1828
Processed 200/1828
Processed 300/1828
Processed 400/1828
Processed 500/1828
Processed 600/1828
Processed 700/1828
Processed 800/1828
Processed 900/1828
Processed 1000/1828
Processed 1100/1828
Processed 1200/1828
Processed 1300/1828
Processed 1400/1828
Processed 1500/1828
Processed 1600/1828
Processed 1700/1828
Processed 1800/1828
Processed 1828/1828
Processing complete!


In [11]:
pip install sqlalchemy

Collecting sqlalchemy
  Downloading sqlalchemy-2.0.40-cp39-cp39-win_amd64.whl.metadata (9.9 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.1-cp39-cp39-win_amd64.whl.metadata (4.2 kB)
Downloading sqlalchemy-2.0.40-cp39-cp39-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------- ----- 1.8/2.1 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 9.1 MB/s eta 0:00:00
Downloading greenlet-3.2.1-cp39-cp39-win_amd64.whl (294 kB)
Installing collected packages: greenlet, sqlalchemy
Successfully installed greenlet-3.2.1 sqlalchemy-2.0.40
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install pyodbc

Collecting pyodbc
  Downloading pyodbc-5.2.0-cp39-cp39-win_amd64.whl.metadata (2.8 kB)
Downloading pyodbc-5.2.0-cp39-cp39-win_amd64.whl (68 kB)
Installing collected packages: pyodbc
Successfully installed pyodbc-5.2.0
Note: you may need to restart the kernel to use updated packages.
