In [10]:
import os
import random

import numpy as np
import pandas as pd
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full recipe table from disk
df = pd.read_csv("recipe_dataset_large.csv")  # Replace with your actual file path

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Training set size: 1784913
Testing set size: 446229


In [13]:
import spacy
from spacy.tokens import DocBin
import re
import ast

# Measurement units recognised after an amount (matched case-insensitively).
_QUANTITY_UNITS = (
    "cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|"
    "oz|ounce|ounces|lb|pound|pounds|g|gram|grams|kg|kilogram|kilograms|"
    "ml|milliliter|milliliters|l|liter|liters"
)

# Amount forms, ordered so that the most specific form is tried first at any
# start position. The bare integer must come last, otherwise it would win
# over "1 and 1/2", "2-3", "1.5", ... that begin with the same digits.
_QUANTITY_AMOUNT = "|".join([
    r"\d+\s*and\s*\d+/\d+",  # mixed number: "1 and 1/2"
    r"\d+\s*-\s*\d+",        # dash range: "2-3", "2 - 3"
    r"\d+\s*to\s*\d+",       # word range: "2 to 3"
    r"\d+/\d+",              # fraction: "1/2"
    r"\d+\s*\.\d+",          # decimal: "1.5"
    r"\d+",                  # plain integer: "2"
    r"½",                    # vulgar-fraction character
])

_QUANTITY_RE = re.compile(
    rf"\b(?:{_QUANTITY_AMOUNT})\s*(?:{_QUANTITY_UNITS})\b",
    re.IGNORECASE,
)


def extract_quantities(text):
    """Find "<amount> <unit>" quantities in ``text``.

    A single alternation is scanned left to right, so every quantity is
    reported exactly once and no two reported spans overlap. Previously the
    separate patterns re-matched the same text several times and also
    matched the tail of longer quantities (e.g. "5 cups" inside "1.5 cups").

    Args:
        text: The string to scan. Non-string values (for example a NaN read
            from a missing CSV cell) yield an empty result.

    Returns:
        A list of ``(start, end, "QUANTITY")`` character-offset tuples in
        order of appearance; ``end`` is exclusive.
    """
    if not isinstance(text, str):
        return []
    return [
        (match.start(), match.end(), "QUANTITY")
        for match in _QUANTITY_RE.finditer(text)
    ]

def get_clean_ingredients(ingredient_list):
    """Parse a stringified list of ingredient names into a list of strings.

    Args:
        ingredient_list: Usually the text of a Python list literal such as
            ``'["salt", "sugar"]'``. A value that is already a list or tuple
            is accepted as-is.

    Returns:
        A list holding the string items of the parsed value. An empty list is
        returned when the value cannot be parsed or does not evaluate to a
        list/tuple, so callers can always iterate over ingredient names.
    """
    if isinstance(ingredient_list, (list, tuple)):
        parsed = ingredient_list
    else:
        try:
            parsed = ast.literal_eval(ingredient_list)  # Convert string to list
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    # literal_eval returns whatever literal the text encodes (a str, a number,
    # a dict, ...); only a sequence of names is meaningful here.
    if not isinstance(parsed, (list, tuple)):
        return []

    # Non-string entries cannot be searched for in recipe text, so drop them.
    return [item for item in parsed if isinstance(item, str)]

def extract_ingredients(text, ingredients):
    """Locate every whole-word occurrence of each ingredient name in ``text``.

    Matching is case-insensitive and runs on the original string, so the
    returned offsets always index into ``text`` itself. Every occurrence is
    reported (not only the first), and a name embedded in a longer word
    ("oil" in "Boil") is not reported.

    Args:
        text: The string to search. Non-string or empty values yield an
            empty result.
        ingredients: Iterable of ingredient names. Entries that are not
            strings, or are empty/whitespace-only, are ignored.

    Returns:
        A list of unique ``(start, end, "INGREDIENT")`` character-offset
        tuples, grouped by ingredient in the order given and, within an
        ingredient, in order of appearance; ``end`` is exclusive.
    """
    if not isinstance(text, str) or not text:
        return []

    entities = []
    seen = set()
    for ingredient in ingredients:
        if not isinstance(ingredient, str):
            continue
        name = ingredient.strip()
        if not name:
            continue
        # Lookarounds instead of \b so names that start or end with a
        # non-word character (e.g. "salt (kosher)") can still match.
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        for match in re.finditer(pattern, text, re.IGNORECASE):
            entity = (match.start(), match.end(), "INGREDIENT")
            if entity not in seen:
                seen.add(entity)
                entities.append(entity)
    return entities

# Initialize SpaCy blank English model (only its tokenizer is used here)
nlp = spacy.blank("en")

# Define chunk size
chunk_size = 1000  # Adjust based on your memory capacity

# The chunk files are written into this folder; create it up front so the
# first to_disk call does not fail on a missing directory.
os.makedirs("chunks", exist_ok=True)

# Process the training data in chunks so only one DocBin is built at a time
for i in range(0, len(train_df), chunk_size):
    doc_bin = DocBin()
    chunk = train_df.iloc[i:i + chunk_size]

    # Iterate the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for recipe_text, ner_raw in zip(chunk['ingredients'], chunk['NER']):
        # A missing cell is read as NaN (a float); there is no text to annotate
        if not isinstance(recipe_text, str):
            continue

        ner_labels = get_clean_ingredients(ner_raw)  # Clean the NER column

        # Extract both quantities and ingredients from the text
        all_entities = (
            extract_quantities(recipe_text)
            + extract_ingredients(recipe_text, ner_labels)
        )

        doc = nlp.make_doc(recipe_text)

        # Snap each character range onto token boundaries; ranges that do not
        # cover at least one whole token come back empty/None and are dropped.
        candidate_spans = []
        for start, end, label in all_entities:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span:
                candidate_spans.append(span)

        # doc.ents cannot hold overlapping spans. filter_spans removes
        # duplicates and, on overlap, keeps the longest span, so "1.5 cups"
        # wins over "5 cups" and "brown sugar" over "sugar".
        entities_to_add = spacy.util.filter_spans(candidate_spans)

        # Assign entities to the doc and add to DocBin
        if entities_to_add:
            doc.ents = entities_to_add
            doc_bin.add(doc)

    # Save the processed training data in SpaCy format for each chunk
    doc_bin.to_disk(f"chunks/ner_training_data_chunk_{i // chunk_size}.spacy")

print("SpaCy training data saved in chunks.")

SpaCy training data saved in chunks.


In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
import glob

# Load the training data from chunks
nlp = spacy.blank("en")

# Read the annotated docs straight out of each chunk file. sorted() fixes the
# load order, which glob does not guarantee.
docs = []
for chunk_file in sorted(glob.glob("chunks/ner_training_data_chunk_*.spacy")):
    docs.extend(DocBin().from_disk(chunk_file).get_docs(nlp.vocab))

# Create a new entity recognizer
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add labels to the entity recognizer (each distinct label once)
for label in sorted({ent.label_ for doc in docs for ent in doc.ents}):
    ner.add_label(label)

n_iterations = 20   # Number of passes over the training docs
batch_size = 32     # Docs per optimizer step
shuffle_rng = random.Random(42)  # Seeded so the shuffle order is repeatable

# Train only the NER component
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.initialize()
    for i in range(n_iterations):
        # Re-order the docs every pass so batches differ between iterations
        shuffle_rng.shuffle(docs)
        for batch_docs in spacy.util.minibatch(docs, size=batch_size):
            # The predicted side must be a fresh, un-annotated doc; the loaded
            # doc (which carries the gold entities) is the reference side.
            batch = [Example(nlp.make_doc(doc.text), doc) for doc in batch_docs]
            nlp.update(batch, drop=0.5, sgd=optimizer)

# Save the model (make sure the parent directory exists first)
os.makedirs("./output", exist_ok=True)
nlp.to_disk("./output/model-best")

In [None]:
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

# Load the trained model
nlp = spacy.load("./output/model-best")

# Prepare the test data: one Example per recipe, pairing an un-annotated doc
# (for the model to predict on) with a reference doc carrying gold entities.
test_docs = []
for recipe_text, ner_raw in zip(test_df['ingredients'], test_df['NER']):
    # A missing cell is read as NaN (a float); there is no text to score
    if not isinstance(recipe_text, str):
        continue

    ner_labels = get_clean_ingredients(ner_raw)  # Clean the NER column

    # Extract both quantities and ingredients from the text
    all_entities = (
        extract_quantities(recipe_text)
        + extract_ingredients(recipe_text, ner_labels)
    )

    # Build the gold annotations on token boundaries and without overlaps
    reference = nlp.make_doc(recipe_text)
    candidate_spans = []
    for start, end, label in all_entities:
        span = reference.char_span(start, end, label=label, alignment_mode="contract")
        if span:
            candidate_spans.append(span)
    reference.ents = spacy.util.filter_spans(candidate_spans)

    test_docs.append(Example(nlp.make_doc(recipe_text), reference))

# Evaluate the model. nlp.evaluate runs the pipeline on the predicted docs and
# scores them against the references; it performs no weight updates, so the
# test set is never trained on.
scores = nlp.evaluate(test_docs)

# Print evaluation results
print(scores)

In [None]:
# Run the pipeline on one hand-written sentence and list what it tagged,
# one "text label" pair per line
doc = nlp("Add 2 cups of flour and 1 tablespoon of sugar.")
for entity in doc.ents:
    print(entity.text, entity.label_)