In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data = pd.read_csv('/content/drive/MyDrive/preprocessed_labelled_data.csv')

In [6]:
data.head()

Unnamed: 0,NER,normalized_combined
0,"['bite size shredded rice biscuits', 'vanilla'...",no-bake nut cookies 1 cup firmly packed brown ...
1,"['cream of mushroom soup', 'beef', 'sour cream...",jewell ball s chicken 1 small jar chipped beef...
2,"['frozen corn', 'pepper', 'cream cheese', 'gar...",creamy corn 2 16 ounce package frozen corn 1 8...
3,"['graham cracker crumbs', 'powdered sugar', 'p...",reeses cup candy 1 cup peanut butter 0.75 cup ...
4,"['sour cream', 'bacon', 'pepper', 'extra lean ...",cheeseburger potato soup 6 baking potatoes 1 p...


In [7]:
len(data)

1930209

In [14]:
data=data.head(1000)

In [15]:
texts = data['normalized_combined'].tolist()
type(texts)

list

In [16]:
ner_tags = data['NER'].apply(eval).tolist()
ner_tags[0]

['bite size shredded rice biscuits',
 'vanilla',
 'brown sugar',
 'nuts',
 'milk',
 'butter']

In [78]:
# Create a dictionary of terms
terms = {}
patterns = []
nlp = spacy.blank("en")

for tags in ner_tags:
    for tag in tags:
        if tag not in terms and tag!='mix':
            terms[tag] = {'label': 'INGREDIENT'}
            patterns.append(nlp(tag))

# Initialize the PhraseMatcher
ingredient_matcher = PhraseMatcher(nlp.vocab)  # nlp.vocab ~ A storage class for vocabulary and other data shared across a language
ingredient_matcher.add("INGREDIENT", None, *patterns)

In [79]:
nlp.analyze_pipes()

{'summary': {}, 'problems': {}, 'attrs': {}}

In [51]:
@Language.component("ingredient_extractor")
def ingredient_extractor(doc):
    matches = ingredient_matcher(doc)
    spans = [Span(doc, start, end, label='INGREDIENT') for match_id, start, end in matches]

    # Resolve overlaps by keeping the longest span
    filtered_spans = spacy.util.filter_spans(spans)

    doc.ents = filtered_spans
    return doc
# Add the custom component to the pipeline
nlp.add_pipe("ingredient_extractor", last=True)


import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher

@Language.component("quantity_extractor")
def quantity_extractor(doc):
    matcher = Matcher(nlp.vocab)
    pattern = [
        {"LIKE_NUM": True},  # Match numbers
        {"LIKE_NUM": True, "OP": "?"},  # Match the second number (e.g., 8), but optional
        {"LOWER": {"IN": ["cup", "tablespoon", "teaspoon", "ounce", "pound", "gram", "kilogram","package", "quart", "liter", "milliliter"]}}
    ]
    matcher.add("QUANTITY", [pattern])
    matches = matcher(doc)
    spans = [Span(doc, start, end, label="QUANTITY") for match_id, start, end in matches]
    filtered_spans = spacy.util.filter_spans(spans)

    # Ensure no overlapping entities
    new_ents = [ent for ent in doc.ents if ent.label_ != "QUANTITY"]
    doc.ents = new_ents + filtered_spans
    return doc
nlp.add_pipe("quantity_extractor", last=True)

In [136]:
import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.language import Language

nlp = spacy.blank("en")

# Create a dictionary of terms
terms = {}
patterns = []

for tags in ner_tags:
    for tag in tags:
        if tag not in terms and tag!='mix':
            terms[tag] = {'label': 'INGREDIENT'}
            patterns.append(nlp(tag))

# Initialize the PhraseMatcher
ingredient_matcher = PhraseMatcher(nlp.vocab)  # nlp.vocab ~ A storage class for vocabulary and other data shared across a language
ingredient_matcher.add("INGREDIENT", None, *patterns)

# Quantity extractor component
@Language.component("quantity_extractor")
def quantity_extractor(doc):
    # Extract quantities
    matcher = Matcher(nlp.vocab)
    pattern = [
        {"LIKE_NUM": True},  # Match numbers
        {"LIKE_NUM": True, "OP": "?"},  # Match the second number (e.g., 8), but optional
        {"LOWER": {"IN": ["cup", "tablespoon", "teaspoon", "ounce", "pound", "gram", "kilogram", "package", "quart", "liter", "milliliter"]}}
    ]
    matcher.add("QUANTITY", [pattern])
    matches = matcher(doc)
    quantity_spans = [Span(doc, start, end, label="QUANTITY") for match_id, start, end in matches]

    # Ensure no duplicate spans and keep unique ones
#    seen_spans = set()
#    unique_quantity_spans = []
#    for span in quantity_spans:
#        span_text = span.text.strip()

#        if span_text not in seen_spans:
#            unique_quantity_spans.append(span)
#            seen_spans.add(span_text)

    filtered_spans = spacy.util.filter_spans(quantity_spans)
   # Filter out existing QUANTITY entities
    new_ents = [ent for ent in doc.ents if ent.label_ != "QUANTITY"]
    #print(new_ents)
    # Add the unique quantity spans to the new_ents list
    doc.ents = new_ents + filtered_spans  # Add unique quantity spans

    return doc
nlp.add_pipe("quantity_extractor", last=True)  # Quantity extractor runs first


In [137]:


# Initialize the PhraseMatcher
ingredient_matcher = PhraseMatcher(nlp.vocab)  # nlp.vocab ~ A storage class for vocabulary and other data shared across a language
ingredient_matcher.add("INGREDIENT", None, *patterns)

# Ingredient extractor component
@Language.component("ingredient_extractor")
def ingredient_extractor(doc):
    # Extract ingredients after quantity extraction
    matches = ingredient_matcher(doc)
    spans = [Span(doc, start, end, label='INGREDIENT') for match_id, start, end in matches]

#    seen_spans = set()
    unique_spans = []

    # Check if the ingredient overlaps with any quantity span before adding
    quantity_spans = [ent for ent in doc.ents if ent.label_ == "QUANTITY"]

    for span in spans:
 #       span_text = span.text.strip()
        overlap_found = False

        # Check if the ingredient span overlaps with any quantity span
        for quantity in quantity_spans:
            if span.start < quantity.end and span.end > quantity.start:  # If overlap occurs
                overlap_found = True
                break  # Skip adding this ingredient if it overlaps with a quantity

        # Add ingredient span if there's no overlap with any quantity span
        if not overlap_found:
 #           if span_text not in seen_spans:
                unique_spans.append(span)
 #               seen_spans.add(span_text)

    # Resolve overlaps and filter out duplicate ingredient spans
    filtered_spans = spacy.util.filter_spans(unique_spans)
    new_ents = [ent for ent in doc.ents if ent.label_ != "INGREDIENT"]
    doc.ents = new_ents + filtered_spans  # Add unique ingredient spans, excluding overlapping ones

    return doc

# Add the custom components to the pipeline (reversed order)
nlp.add_pipe("ingredient_extractor", last=True)  # Ingredient extractor runs second



In [138]:
nlp.analyze_pipes()

{'summary': {'quantity_extractor': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'ingredient_extractor': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'quantity_extractor': [], 'ingredient_extractor': []},
 'attrs': {}}

In [139]:
from spacy.tokens import DocBin

train_data = [(text, {"entities": []}) for text in texts]

for i, (text, annotations) in enumerate(train_data):
    doc = nlp(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    train_data[i] = (text, {"entities": entities})


In [140]:
train_data

[('no-bake nut cookies 1 cup firmly packed brown sugar 0.5 cup evaporated milk 0.5 teaspoon vanilla 0.5 cup broken nuts pecans 2 tablespoon butter or margarine 3.5 cup bite size shredded rice biscuits in a heavy 2 quart saucepan mix brown sugar nuts evaporated milk and butter or margarine stir over medium heat until mixture bubbles all over top boil and stir 5 minutes more take off heat stir in vanilla and cereal mix well using 2 teaspoon drop and shape into 30 clusters on wax paper let stand until firm about 30 minutes',
  {'entities': [(20, 25, 'QUANTITY'),
    (40, 51, 'INGREDIENT'),
    (52, 59, 'QUANTITY'),
    (71, 75, 'INGREDIENT'),
    (76, 88, 'QUANTITY'),
    (89, 96, 'INGREDIENT'),
    (97, 104, 'QUANTITY'),
    (112, 116, 'INGREDIENT'),
    (117, 123, 'INGREDIENT'),
    (124, 136, 'QUANTITY'),
    (137, 143, 'INGREDIENT'),
    (147, 156, 'INGREDIENT'),
    (157, 164, 'QUANTITY'),
    (165, 197, 'INGREDIENT'),
    (209, 216, 'QUANTITY'),
    (230, 241, 'INGREDIENT'),
    (24

In [None]:
#To be used only with small dataset

def save_training_data(data, output_file):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is not None:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_file)

save_training_data(train_data, 'training_data.spacy')

In [None]:
#To be modified/used when working with complete data

#Save in batches

import spacy
from spacy.tokens import DocBin

def save_training_data(data, output_dir, chunk_size=5000):
    """Splits data into smaller DocBins and saves each as a separate file."""
    nlp = spacy.blank("en")

    for i in range(0, len(data), chunk_size):
        doc_bin = DocBin()
        batch = data[i:i + chunk_size]  # Get a chunk of data

        for text, annotations in batch:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label in annotations["entities"]:
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    ents.append(span)
            doc.ents = ents
            doc_bin.add(doc)

        output_file = f"{output_dir}/training_data_part_{i // chunk_size}.spacy"
        doc_bin.to_disk(output_file)
        print(f" Saved {output_file} with {len(batch)} samples.")

# Save training data in smaller chunks
save_training_data(train_data, 'training_data', chunk_size=5000)


In [None]:
#To be used when training data is saved in batches
#Load training data

import glob
import spacy
from spacy.tokens import DocBin

def load_training_data(directory):
    """Loads multiple DocBin files and merges them into a single DocBin."""
    doc_bin = DocBin()  # Create an empty DocBin

    # Load all .spacy files from the directory
    for file in glob.glob(f"{directory}/*.spacy"):
        print(f"📂 Loading {file} ...")
        temp_bin = DocBin().from_disk(file)  # Load individual file
        for doc in temp_bin.get_docs(nlp.vocab):  # Add docs to main DocBin
            doc_bin.add(doc)

    return doc_bin

# Load all training data
nlp = spacy.blank("en")  # Initialize NLP pipeline
db = load_training_data("training_data")

In [142]:
import random
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load the training data
nlp = spacy.blank("en")
db = DocBin().from_disk("training_data.spacy")
docs = list(db.get_docs(nlp.vocab))

# Create the NER component and add it to the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)

# Add the labels to the NER component
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Disable other pipes during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(5):  # 5 iterations
        random.shuffle(docs)
        losses = {}
        batches = minibatch(docs, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            for doc in batch:
                example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
                nlp.update([example], drop=0.5, losses=losses)
        print(f"Iteration {itn}, Losses: {losses}")

# Save the trained model to disk
nlp.to_disk("ner_model")

Iteration 0, Losses: {'ner': 12361.166180420714}
Iteration 1, Losses: {'ner': 4542.573362809907}
Iteration 2, Losses: {'ner': 3426.910563268032}
Iteration 3, Losses: {'ner': 2811.8487050707386}
Iteration 4, Losses: {'ner': 2402.452960068132}


In [143]:
# Test the custom NER
doc = nlp("Add 2 cups of flour and 1 tablespoon of sugar.")
for ent in doc.ents:
    print(ent.text, ent.label_)

flour INGREDIENT
1 tablespoon QUANTITY
sugar INGREDIENT


In [144]:
type(train_data)

list

In [145]:
def evaluate_model(model, test_data):
    examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in test_data]
    scorer = model.evaluate(examples)
    return scorer

In [146]:
evaluation_results = evaluate_model(nlp, train_data)
print("Evaluation Results:", evaluation_results)

Evaluation Results: {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9617726502769766, 'ents_r': 0.9631837146394021, 'ents_f': 0.9624776652769506, 'ents_per_type': {'QUANTITY': {'p': 0.9996599217820099, 'r': 0.9994899693981639, 'f': 0.9995749383660633}, 'INGREDIENT': {'p': 0.947813557198346, 'r': 0.9497771360411827, 'f': 0.9487943306889091}}, 'speed': 26060.79946733934}


In [159]:
test_texts = "In a heavy 2-quart saucepan, mix 2 lbs brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper.Let stand until firm, about 30 minutes."
doc = nlp(test_texts)

In [160]:
from spacy import displacy

displacy.render(doc, style='ent')

In [157]:
# Test the model on some examples
test_texts = "In a heavy 2 quart saucepan, mix 2 pound brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoon, drop and shape into 30 clusters on wax paper.Let stand until firm, about 30 minutes."
doc = nlp(test_texts)

In [158]:
from spacy import displacy

displacy.render(doc, style='ent')

In [164]:
real_text = """Peel potatoes and keep aside
Heat oil in a wok
Crackle mustard seeds and urad dal
Add Kari Patta
soute 1 to 2 tablespoon ginger garlic paste and chopped onion
Aad potatoes and soute for 1 to 2minutes
Add all dry masala
Cook with lid on slow flame for 10 minutes
Garnish with chopped coriander
Serve hot with roti or poori
"""

In [165]:
doc2 = nlp(real_text)

displacy.render(doc2, style='ent')

In [166]:
import json


entities = []
for ent in doc.ents:
    if ent.label_ == "QUANTITY":
        for i_ent in doc.ents:
            if i_ent.label_ == "INGREDIENT" and i_ent.start == ent.end:
                entities.append({"ingredient": i_ent.text, "quantity": ent.text})
                break
    elif ent.label_ == "INGREDIENT":
        if not any(e["ingredient"] == ent.text for e in entities):
            entities.append({"ingredient": ent.text, "quantity": ""})

entities_json = json.dumps(entities, indent=2)

print(entities_json)

[
  {
    "ingredient": "brown sugar",
    "quantity": ""
  },
  {
    "ingredient": "nuts",
    "quantity": ""
  },
  {
    "ingredient": "milk",
    "quantity": ""
  },
  {
    "ingredient": "butter",
    "quantity": ""
  },
  {
    "ingredient": "margarine",
    "quantity": ""
  },
  {
    "ingredient": "vanilla",
    "quantity": ""
  },
  {
    "ingredient": "cereal",
    "quantity": ""
  }
]
