In [None]:
import ast
import random
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [None]:
# Load the recipe CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv('recipe_dataset_large.csv')

In [None]:
# Number of rows in the loaded DataFrame.
len(data)

In [None]:
# Work on the first 1000 rows only. Take an explicit copy so that adding
# columns later writes to an independent frame instead of a slice of the
# original (which can trigger pandas' SettingWithCopyWarning).
data = data.head(1000).copy()

In [None]:
# Concatenate title, ingredients and directions into one text per recipe.
# Missing values are replaced by empty strings first: with plain `+`, a single
# NaN field would turn the whole combined value into NaN, and a non-string
# value cannot be passed to the spaCy pipeline later on.
data['combined'] = (
    data['title'].fillna('').astype(str)
    + ' ' + data['ingredients'].fillna('').astype(str)
    + ' ' + data['directions'].fillna('').astype(str)
)

In [None]:
# Show the combined text of the row labelled 0.
data.loc[0, 'combined']

In [None]:
# Plain Python list of the combined recipe texts.
texts = data['combined'].tolist()

# The NER column stores each list as its string representation. Parse it with
# ast.literal_eval rather than eval: literal_eval only accepts Python literals,
# so a crafted cell in the CSV cannot execute arbitrary code.
ner_tags = data['NER'].apply(ast.literal_eval).tolist()

In [None]:
# Inspect one combined recipe text (index 89).
texts[89]

In [None]:
# Inspect the parsed NER list for the same index (89).
ner_tags[89]

In [None]:
# Blank English pipeline: tokenizer only, no trained components.
nlp = spacy.blank("en")

# Collect every distinct (lowercased) ingredient term from the NER lists.
# A dict keeps first-seen order and gives O(1) duplicate checks.
terms = {}
for tags in ner_tags:
    for tag in tags:
        key = tag.lower()
        if not key.strip():
            continue  # blank tags would only produce empty patterns
        terms.setdefault(key, {'label': 'INGREDIENT'})

# make_doc runs only the tokenizer, which is all a phrase pattern needs.
patterns = [nlp.make_doc(term) for term in terms]

# Match on the LOWER attribute: the patterns are lowercased, so comparing the
# exact token text (the default) would miss capitalised mentions in the recipes.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# spaCy v3 signature: add(key, list_of_docs). The v2 form add(key, None, *docs)
# raises a ValueError in v3.
matcher.add("INGREDIENT", patterns)

In [None]:
# Inspect the pipeline components registered on `nlp` so far.
nlp.analyze_pipes()

In [None]:
@Language.component("ingredient_extractor")
def ingredient_extractor(doc):
    """Tag phrase-matcher hits in `doc` as INGREDIENT entities.

    Runs the module-level `matcher` over the doc, wraps every hit in a Span
    labelled 'INGREDIENT', removes overlapping spans with
    spacy.util.filter_spans and overwrites `doc.ents` with the result.
    Returns the same doc.
    """
    candidates = []
    for _, first, last in matcher(doc):
        candidates.append(Span(doc, first, last, label='INGREDIENT'))

    # filter_spans keeps the longest span among overlapping candidates.
    doc.ents = spacy.util.filter_spans(candidates)
    return doc


# Register the component at the end of the pipeline.
nlp.add_pipe("ingredient_extractor", last=True)

import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher

# Unit tokens accepted directly after a number-like token.
QUANTITY_UNITS = [
    "cup", "cups", "tablespoon", "tablespoons", "tsp", "teaspoon", "teaspoons",
    "oz", "ounce", "ounces", "pound", "pounds", "lb", "lbs",
    "gram", "grams", "kg", "kilogram", "kilograms",
]

# Build the rule matcher once instead of on every document; it shares the
# vocab of the pipeline the component is added to below.
quantity_matcher = Matcher(nlp.vocab)
quantity_matcher.add(
    "QUANTITY",
    [[
        {"LIKE_NUM": True},                      # number-like token
        {"LOWER": {"IN": QUANTITY_UNITS}},       # followed by a unit
    ]],
)


@Language.component("quantity_extractor")
def quantity_extractor(doc):
    """Add QUANTITY entities (number + unit) to `doc.ents`.

    Existing entities with a label other than QUANTITY are kept. The combined
    list is passed through spacy.util.filter_spans so the final entity set
    never overlaps; assigning overlapping spans to `doc.ents` raises a
    ValueError. When spans overlap, filter_spans prefers the longest one.
    Returns the same doc.
    """
    quantity_spans = [
        Span(doc, start, end, label="QUANTITY")
        for _, start, end in quantity_matcher(doc)
    ]
    kept_ents = [ent for ent in doc.ents if ent.label_ != "QUANTITY"]
    doc.ents = spacy.util.filter_spans(kept_ents + quantity_spans)
    return doc


nlp.add_pipe("quantity_extractor", last=True)

In [None]:
# Re-inspect the pipeline components currently registered on `nlp`.
nlp.analyze_pipes()

In [None]:
from spacy.tokens import DocBin

# Run the rule-based pipeline over every text and record the entities it finds
# as (start_char, end_char, label) triples, paired with the source text.
train_data = []
for text in texts:
    doc = nlp(text)
    offsets = [(span.start_char, span.end_char, span.label_) for span in doc.ents]
    train_data.append((text, {"entities": offsets}))

In [None]:
def save_training_data(data, output_file):
    """Serialize (text, {"entities": [...]}) pairs to a spaCy DocBin file.

    Each text is tokenized with a blank English pipeline. Every
    (start, end, label) offset triple is converted with Doc.char_span; triples
    for which char_span returns None are left out. The resulting docs are
    written to `output_file`.
    """
    tokenizer_nlp = spacy.blank("en")
    doc_bin = DocBin()
    for text, annotations in data:
        doc = tokenizer_nlp.make_doc(text)
        candidate_spans = (
            doc.char_span(start, end, label=label)
            for start, end, label in annotations["entities"]
        )
        doc.ents = [span for span in candidate_spans if span is not None]
        doc_bin.add(doc)
    doc_bin.to_disk(output_file)


save_training_data(train_data, 'training_data.spacy')

In [None]:
import random
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load the serialized training docs into a fresh blank pipeline.
nlp = spacy.blank("en")
db = DocBin().from_disk("training_data.spacy")
docs = list(db.get_docs(nlp.vocab))

# Fetch or create the NER component so `ner` is bound in both cases.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Register every label that occurs in the loaded docs.
for doc in docs:
    for ent in doc.ents:
        ner.add_label(ent.label_)

# Pair a fresh, unannotated doc (prediction side) with the annotated doc
# (reference side). Reusing the annotated doc as the prediction side would
# hand the gold entities to the model as pre-set annotations.
examples = [Example(nlp.make_doc(doc.text), doc) for doc in docs]

# Train only the NER component.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.select_pipes(disable=other_pipes):
    # initialize() is the spaCy v3 replacement for the deprecated begin_training().
    optimizer = nlp.initialize()
    for itn in range(5):  # 5 iterations
        random.shuffle(examples)
        losses = {}
        # Update once per minibatch (the original built batches but then
        # updated one document at a time).
        for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.5)):
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {itn}, Losses: {losses}")

# Save the trained model to disk
nlp.to_disk("ner_model")

In [None]:
# Quick check: run the current pipeline on one sentence and list its entities.
doc = nlp("Add 2 cups of flour and 1 tablespoon of sugar.")
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
# Confirm the container type of train_data.
type(train_data)

In [None]:
def evaluate_model(model, test_data):
    """Score `model` on annotated examples.

    Args:
        model: a spaCy pipeline providing `make_doc` and `evaluate`.
        test_data: iterable of (text, annotations) pairs, where annotations is
            a dict accepted by Example.from_dict (e.g. {"entities": [...]}).

    Returns:
        Whatever `model.evaluate` returns for the built examples.
    """
    # Tokenize with the model being evaluated, not with whatever the global
    # `nlp` currently is, so the examples always match the model's tokenizer.
    examples = [
        Example.from_dict(model.make_doc(text), annotations)
        for text, annotations in test_data
    ]
    return model.evaluate(examples)

In [None]:
# Score the pipeline on train_data. These are the same examples the model was
# trained on, so the numbers are not a held-out estimate.
evaluation_results = evaluate_model(model=nlp, test_data=train_data)
print("Evaluation Results:", evaluation_results)

In [None]:
# Test the model on some examples
# A single recipe-directions string; the resulting `doc` is reused by later cells.
test_texts = "In a heavy 2-quart saucepan, mix 2 lbs brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper.Let stand until firm, about 30 minutes."
doc = nlp(test_texts)

In [None]:
from spacy import displacy

# Render the entities of `doc` with displaCy's entity visualizer.
displacy.render(doc, style='ent')

In [None]:
# Multi-line sample text, one instruction per line; it is passed to the
# pipeline in a later cell. The text is kept verbatim.
real_text = """Peel potatoes and keep aside
Heat oil in a wok
Crackle mustard seeds and urad dal
Add Kari Patta
soute ginger garlic paste and chopped onion
Aad potatoes and soute for 1 to 2minutes
Add all dry masala
Cook with lid on slow flame for 10 minutes
Garnish with chopped coriander
Serve hot with roti or poori
"""

In [None]:
# Run the pipeline on the multi-line sample and visualize its entities.
doc2 = nlp(real_text)

displacy.render(doc2, style='ent')

In [None]:
import json


# Index INGREDIENT entities by their first token so the ingredient that starts
# exactly where a quantity ends can be looked up directly. setdefault keeps the
# first entity seen for a given start token.
ingredient_by_start = {}
for span in doc.ents:
    if span.label_ == "INGREDIENT":
        ingredient_by_start.setdefault(span.start, span)

# Pair each QUANTITY with the INGREDIENT that immediately follows it; list any
# ingredient not yet present with an empty quantity.
entities = []
for span in doc.ents:
    if span.label_ == "QUANTITY":
        following = ingredient_by_start.get(span.end)
        if following is not None:
            entities.append({"ingredient": following.text, "quantity": span.text})
    elif span.label_ == "INGREDIENT":
        already_listed = False
        for entry in entities:
            if entry["ingredient"] == span.text:
                already_listed = True
                break
        if not already_listed:
            entities.append({"ingredient": span.text, "quantity": ""})

entities_json = json.dumps(entities, indent=2)

print(entities_json)

In [None]:
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate_model(model, test_data):
    """Score `model` on annotated examples without modifying the model.

    Args:
        model: a spaCy pipeline providing `make_doc` and `evaluate`.
        test_data: iterable of (text, annotations) pairs, where annotations is
            a dict accepted by Example.from_dict (e.g. {"entities": [...]}).

    Returns:
        The scores returned by `model.evaluate` for the built examples.
    """
    examples = [
        Example.from_dict(model.make_doc(text), annotations)
        for text, annotations in test_data
    ]
    # Evaluation must not call model.update(): that is a training step and
    # would fit the model to the evaluation data. model.evaluate() runs the
    # pipeline on each example and scores the predictions.
    return model.evaluate(examples)


evaluation_results = evaluate_model(nlp, train_data)
print("Evaluation Results:", evaluation_results)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full recipe dataset from CSV.
df = pd.read_csv("recipe_dataset_large.csv")  # Replace with your actual file path

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

print(
    f"Training set size: {len(train_df)}",
    f"Testing set size: {len(test_df)}",
    sep="\n",
)

In [None]:
import spacy
from spacy.tokens import DocBin
import re
import ast

# Unit words/abbreviations accepted after the numeric part.
_QUANTITY_UNITS = (
    r'(?:cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|oz|ounce|ounces|'
    r'lb|pound|pounds|g|gram|grams|kg|kilogram|kilograms|ml|milliliter|milliliters|l|liter|liters)'
)

# Numeric forms, most specific first, so the longest form starting at a given
# position is the one that gets matched.
_QUANTITY_NUMBER = (
    r'(?:'
    r'\d+\s+\d+/\d+'            # mixed number: 1 1/2
    r'|\d+\s*and\s*\d+/\d+'     # 1 and 1/2
    r'|\d+\s*-\s*\d+'           # range: 2-3
    r'|\d+\s*to\s*\d+'          # range: 2 to 3
    r'|\d+\s*\.\d+'             # decimal: 1.5
    r'|\d+/\d+'                 # fraction: 1/2
    r'|\d+'                     # integer: 2
    r'|½'
    r')'
)

_QUANTITY_RE = re.compile(
    r'\b' + _QUANTITY_NUMBER + r'\s*' + _QUANTITY_UNITS + r'\b',
    re.IGNORECASE,
)


def extract_quantities(text):
    """Extract quantities from the text.

    Returns a list of (start, end, "QUANTITY") character-offset tuples in
    left-to-right order. A single combined pattern is scanned once, so the
    result contains no duplicates and no overlapping spans. Running several
    independent patterns reported e.g. "2 cup" (inside "1/2 cup") in addition
    to, and ahead of, the full "1/2 cup".

    Non-string input (such as a missing value) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return [
        (match.start(), match.end(), "QUANTITY")
        for match in _QUANTITY_RE.finditer(text)
    ]

def get_clean_ingredients(ingredient_list):
    """Clean the ingredients list to be used for NER.

    Args:
        ingredient_list: the string representation of a list of ingredient
            names (e.g. "['salt', 'sugar']"), or an already-parsed list/tuple.

    Returns:
        A list of the non-blank string items. Anything that cannot be parsed,
        or that parses to something other than a list/tuple, yields [].
    """
    if isinstance(ingredient_list, (list, tuple)):
        parsed = ingredient_list
    else:
        try:
            parsed = ast.literal_eval(ingredient_list)  # Convert string to list
        except (ValueError, SyntaxError, TypeError):
            return []

    # A bare string literal would otherwise be iterated character by character
    # by callers that expect a list of names.
    if not isinstance(parsed, (list, tuple)):
        return []

    # Drop non-string and blank entries; a blank name matches at every position.
    return [item for item in parsed if isinstance(item, str) and item.strip()]

def extract_ingredients(text, ingredients):
    """Extract ingredients from the text.

    Finds every case-insensitive, whole-word occurrence of each ingredient
    name in `text`.

    Args:
        text: the text to search. Non-string input yields [].
        ingredients: iterable of ingredient names; non-string or blank entries
            are ignored.

    Returns:
        A list of unique (start, end, "INGREDIENT") character-offset tuples,
        grouped by ingredient in input order and left-to-right within each.
    """
    if not isinstance(text, str):
        return []

    entities = []
    seen = set()
    for ingredient in ingredients:
        if not isinstance(ingredient, str):
            continue
        needle = ingredient.strip()
        if not needle:
            continue
        # Lookarounds instead of \b so names that begin or end with a non-word
        # character still match, while hits inside a longer word
        # ("oil" in "boiled") are rejected.
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        for match in re.finditer(pattern, text, re.IGNORECASE):
            span = (match.start(), match.end(), "INGREDIENT")
            if span not in seen:
                seen.add(span)
                entities.append(span)
    return entities

# Initialize SpaCy blank English model
nlp = spacy.blank("en")
doc_bin = DocBin()

# Process the training data. Iterating the two columns directly avoids the
# per-row Series construction that iterrows() performs.
for recipe_text, raw_labels in zip(train_df['ingredients'], train_df['NER']):
    # A missing value is read as a float NaN; the regex-based helpers below
    # need a string, so skip such rows instead of crashing.
    if not isinstance(recipe_text, str):
        continue

    ner_labels = get_clean_ingredients(raw_labels)  # Clean the NER column

    # Quantities are listed first, so on overlap a quantity span wins over an
    # ingredient span.
    all_entities = (
        extract_quantities(recipe_text)
        + extract_ingredients(recipe_text, ner_labels)
    )

    # Keep only spans that do not overlap an already accepted one.
    added_spans = []
    entities_to_add = []
    doc = nlp.make_doc(recipe_text)

    for start, end, label in all_entities:
        overlaps = any(
            start < existing_end and end > existing_start
            for existing_start, existing_end in added_spans
        )
        if overlaps:
            continue
        # "contract" snaps the character range inwards to token boundaries.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span:
            entities_to_add.append(span)
            added_spans.append((start, end))

    # Assign entities to the doc and add to DocBin (docs without entities are skipped)
    if entities_to_add:
        doc.ents = entities_to_add
        doc_bin.add(doc)

# Save the processed training data in SpaCy format
doc_bin.to_disk("ner_training_data.spacy")

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

# Load the training data
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("ner_training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Create a new entity recognizer
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add labels to the entity recognizer
for doc in docs:
    for ent in doc.ents:
        ner.add_label(ent.label_)

# Pair a fresh, unannotated doc (prediction side) with the annotated doc
# (reference side). Reusing the annotated doc as the prediction side would
# hand the gold entities to the model as pre-set annotations.
examples = [Example(nlp.make_doc(doc.text), doc) for doc in docs]

# Disable other pipelines during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    # initialize() is the spaCy v3 replacement for the deprecated begin_training().
    optimizer = nlp.initialize()
    for i in range(20):  # Number of iterations
        # Shuffle each epoch so updates are not always seen in the same order.
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=16):
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {i}, Losses: {losses}")

# Save the model. Create the parent directory first so saving into a nested
# path also works on a fresh checkout.
output_dir = Path("./output/model-best")
output_dir.parent.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)

In [None]:
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

# Load the trained model
nlp = spacy.load("./output/model-best")

# Prepare the test data: one Example per recipe, pairing an unannotated doc
# (for the model to predict on) with a reference doc carrying the gold spans.
test_docs = []
for recipe_text, raw_labels in zip(test_df['ingredients'], test_df['NER']):
    # A missing value is read as a float NaN; skip it instead of crashing.
    if not isinstance(recipe_text, str):
        continue

    ner_labels = get_clean_ingredients(raw_labels)  # Clean the NER column

    # Quantities first, so on overlap a quantity span wins over an ingredient span.
    all_entities = (
        extract_quantities(recipe_text)
        + extract_ingredients(recipe_text, ner_labels)
    )

    # Drop overlapping spans, the same way the training data was built;
    # overlapping entity annotations are rejected by spaCy.
    reference = nlp.make_doc(recipe_text)
    accepted_ranges = []
    gold_spans = []
    for start, end, label in all_entities:
        overlaps = any(
            start < existing_end and end > existing_start
            for existing_start, existing_end in accepted_ranges
        )
        if overlaps:
            continue
        span = reference.char_span(start, end, label=label, alignment_mode="contract")
        if span:
            gold_spans.append(span)
            accepted_ranges.append((start, end))
    reference.ents = gold_spans

    test_docs.append(Example(nlp.make_doc(recipe_text), reference))

# Evaluate the model. nlp.evaluate() runs the pipeline on every example and
# scores the predictions; nlp.update() must not be called here because it is a
# training step and would fit the model to the test set.
scores = nlp.evaluate(test_docs)

# Print evaluation results
print(scores)

In [None]:
# Quick check: run the loaded model on one sentence and list its entities.
doc = nlp("Add 2 cups of flour and 1 tablespoon of sugar.")
for entity in doc.ents:
    print(entity.text, entity.label_)