In [2]:
import pandas as pd 
import numpy as np
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

In [3]:
data = pd.read_csv('recipe_dataset_large.csv')

In [4]:
len(data)

2231142

In [5]:
data = data.head(1000)

In [6]:
data['combined'] = data['title'] + ' ' + data['ingredients'] + ' ' + data['directions']

In [7]:
data['combined'][0]

'No-Bake Nut Cookies ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"] ["In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."]'

In [8]:
texts = data['combined'].tolist()

ner_tags = data['NER'].apply(eval).tolist()  # Convert string representations of lists to actual lists

In [9]:
texts[89]

'Cheese Ball ["2 pkg. cream cheese", "2 c. shredded Cheddar cheese (8 oz.)", "1 Tbsp. chopped pimento", "1 Tbsp. chopped green pepper", "1 Tbsp. chopped onion", "1 tsp. lemon juice", "2 tsp. Worcestershire sauce", "dash of Tabasco sauce", "dash of salt"] ["Combine softened cream and Cheddar cheeses.", "Add chopped up items.", "Mix and add liquids last.", "Mix well.", "Roll into ball and cover with chopped nuts."]'

In [10]:
ner_tags[89]

['cream cheese',
 'Cheddar cheese',
 'pimento',
 'green pepper',
 'onion',
 'lemon juice',
 'Worcestershire sauce',
 'Tabasco sauce',
 'salt']

In [11]:
# Create a dictionary of terms
terms = {}
patterns = []
nlp = spacy.blank("en")

for tags in ner_tags:
    for tag in tags:
        if tag.lower() not in terms:
            terms[tag.lower()] = {'label': 'INGREDIENT'}
            patterns.append(nlp(tag.lower()))

# Initialize the PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)  # nlp.vocab ~ A storage class for vocabulary and other data shared across a language
matcher.add("INGREDIENT", None, *patterns)

In [12]:
nlp.analyze_pipes()

{'summary': {}, 'problems': {}, 'attrs': {}}

In [13]:
# @Language.component("ingredient_extractor")
# def ingredient_extractor(doc):
#     matches = matcher(doc)
#     spans = [Span(doc, start, end, label='INGREDIENT') for match_id, start, end in matches]

#     # Resolve overlaps by keeping the longest span
#     filtered_spans = spacy.util.filter_spans(spans)
    
#     doc.ents = filtered_spans
#     return doc

# # Add the custom component to the pipeline
# nlp.add_pipe("ingredient_extractor", last=True)

In [14]:
@Language.component("ingredient_extractor")
def ingredient_extractor(doc):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label='INGREDIENT') for match_id, start, end in matches]

    # Resolve overlaps by keeping the longest span
    filtered_spans = spacy.util.filter_spans(spans)
    
    doc.ents = filtered_spans
    return doc
# Add the custom component to the pipeline
nlp.add_pipe("ingredient_extractor", last=True)

import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher

@Language.component("quantity_extractor")
def quantity_extractor(doc):
    matcher = Matcher(nlp.vocab)
    pattern = [
        {"LIKE_NUM": True},  # Match numbers
        {"LOWER": {"IN": ["cup", "cups", "tablespoon", "tablespoons", "tsp", "teaspoon", "teaspoons", "oz", "ounce", "ounces", "pound", "pounds", "lb", "lbs", "gram", "grams", "kg", "kilogram", "kilograms"]}}
    ]
    matcher.add("QUANTITY", [pattern])
    matches = matcher(doc)
    spans = [Span(doc, start, end, label="QUANTITY") for match_id, start, end in matches]
    filtered_spans = spacy.util.filter_spans(spans)
    
    # Ensure no overlapping entities
    new_ents = [ent for ent in doc.ents if ent.label_ != "QUANTITY"]
    doc.ents = new_ents + filtered_spans
    return doc
nlp.add_pipe("quantity_extractor", last=True)

<function __main__.quantity_extractor(doc)>

In [15]:
nlp.analyze_pipes()

{'summary': {'ingredient_extractor': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'quantity_extractor': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'ingredient_extractor': [], 'quantity_extractor': []},
 'attrs': {}}

In [16]:
from spacy.tokens import DocBin

train_data = [(text, {"entities": []}) for text in texts]

for i, (text, annotations) in enumerate(train_data):
    doc = nlp(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    train_data[i] = (text, {"entities": entities})

In [17]:
def save_training_data(data, output_file):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is not None:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_file)

save_training_data(train_data, 'training_data.spacy')

In [18]:
import random
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load the training data
nlp = spacy.blank("en")
db = DocBin().from_disk("training_data.spacy")
docs = list(db.get_docs(nlp.vocab))

# Create the NER component and add it to the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)

# Add the labels to the NER component
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Disable other pipes during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(5):  # 5 iterations
        random.shuffle(docs)
        losses = {}
        batches = minibatch(docs, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            for doc in batch:
                example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
                nlp.update([example], drop=0.5, losses=losses)
        print(f"Iteration {itn}, Losses: {losses}")

# Save the trained model to disk
nlp.to_disk("ner_model")

Iteration 0, Losses: {'ner': 9461.843557978062}
Iteration 1, Losses: {'ner': 2846.7632647632736}
Iteration 2, Losses: {'ner': 2048.8373965888636}
Iteration 3, Losses: {'ner': 1607.7984022433698}
Iteration 4, Losses: {'ner': 1487.080349277643}


In [19]:
# import spacy
# from spacy.tokens import Span
# from spacy.matcher import Matcher

# # Define a custom component for Quantity NER
# @Language.component("quantity_ner")
# def quantity_ner(doc):
#     matcher = Matcher(nlp.vocab)
#     pattern = [
#         {"LIKE_NUM": True},  # Match numbers
#         {"LOWER": {"IN": ["cup", "cups", "tablespoon", "tablespoons", "tsp", "teaspoon", "teaspoons", "oz", "ounce", "ounces", "pound", "pounds", "lb", "lbs", "gram", "grams", "kg", "kilogram", "kilograms"]}}
#     ]
#     matcher.add("QUANTITY", [pattern])
#     matches = matcher(doc)
#     spans = [Span(doc, start, end, label="QUANTITY") for match_id, start, end in matches]
#     filtered_spans = spacy.util.filter_spans(spans)
    
#     # Ensure no overlapping entities
#     new_ents = [ent for ent in doc.ents if ent.label_ != "QUANTITY"]
#     doc.ents = new_ents + filtered_spans
#     return doc

# if "quantity_extractor" not in nlp.pipe_names:
#     nlp.add_pipe("quantity_extractor", after="ner")

# Test the custom NER
doc = nlp("Add 2 cups of flour and 1 tablespoon of sugar.")
for ent in doc.ents:
    print(ent.text, ent.label_)

2 cups QUANTITY
flour INGREDIENT
1 tablespoon QUANTITY
sugar INGREDIENT


In [20]:
type(train_data)

list

In [21]:
def evaluate_model(model, test_data):
    examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in test_data]
    scorer = model.evaluate(examples)
    return scorer

In [22]:
evaluation_results = evaluate_model(nlp, train_data)
print("Evaluation Results:", evaluation_results)

Evaluation Results: {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.980236171535115, 'ents_r': 0.9736403481696401, 'ents_f': 0.976927126885317, 'ents_per_type': {'INGREDIENT': {'p': 0.9771424424932879, 'r': 0.9692650975311308, 'f': 0.9731878297318782}, 'QUANTITY': {'p': 0.9987007362494587, 'r': 1.0, 'f': 0.9993499458288191}}, 'speed': 12550.41133649976}


In [23]:
# Test the model on some examples
test_texts = "In a heavy 2-quart saucepan, mix 2 lbs brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper.Let stand until firm, about 30 minutes."
doc = nlp(test_texts)

In [24]:
from spacy import displacy

displacy.render(doc, style='ent')

In [25]:
real_text = """Peel potatoes and keep aside
Heat oil in a wok
Crackle mustard seeds and urad dal
Add Kari Patta
soute ginger garlic paste and chopped onion
Aad potatoes and soute for 1 to 2minutes
Add all dry masala
Cook with lid on slow flame for 10 minutes
Garnish with chopped coriander
Serve hot with roti or poori
"""

In [26]:
doc2 = nlp(real_text)

displacy.render(doc2, style='ent')

In [27]:
import json


entities = []
for ent in doc.ents:
    if ent.label_ == "QUANTITY":
        for i_ent in doc.ents:
            if i_ent.label_ == "INGREDIENT" and i_ent.start == ent.end:
                entities.append({"ingredient": i_ent.text, "quantity": ent.text})
                break
    elif ent.label_ == "INGREDIENT":
        if not any(e["ingredient"] == ent.text for e in entities):
            entities.append({"ingredient": ent.text, "quantity": ""})

entities_json = json.dumps(entities, indent=2)

print(entities_json)

[
  {
    "ingredient": "mix",
    "quantity": ""
  },
  {
    "ingredient": "brown sugar",
    "quantity": "2 lbs"
  },
  {
    "ingredient": "nuts",
    "quantity": ""
  },
  {
    "ingredient": "milk",
    "quantity": ""
  },
  {
    "ingredient": "butter",
    "quantity": ""
  },
  {
    "ingredient": "margarine",
    "quantity": ""
  },
  {
    "ingredient": "vanilla",
    "quantity": ""
  },
  {
    "ingredient": "cereal",
    "quantity": ""
  }
]


In [30]:
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate_model(model, test_data):
	examples = [Example.from_dict(model.make_doc(text), annotations) for text, annotations in test_data]
	scorer = Scorer()
	for example in examples:
		model.update([example], sgd=None, losses={})
		scorer.score([example])  # Pass a list containing the example
	return scorer.score(examples)  # Use the score method to get the evaluation results

evaluation_results = evaluate_model(nlp, train_data)
print("Evaluation Results:", evaluation_results)

Evaluation Results: {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'INGREDIENT': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'QUANTITY': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
