In [33]:
import pandas as pd
import spacy
import spacy_transformers
from spacy.pipeline import EntityRuler
from spacy.lang.en import English

In [2]:
# Load spaCy's small English model with pre-trained NER
# nlp = spacy.load("en_core_web_trf")
nlp = English()

In [17]:
ingredients_list = pd.read_csv("../data/output/ingredients-list.csv")

In [24]:
# Define custom patterns
patterns = [
    {"label": "QUANTITY", "pattern": [{"LIKE_NUM": True}, {"LOWER": {"IN": ["ml", "g", "tbsp", "tsp", "cup", "oz", "g", "kg"]}}]},
    {"label": "INGREDIENT", "pattern": [{"LOWER": {"IN": ingredients_list.ingredient.tolist()}}]},
]

In [29]:
# Create a custom EntityRuler
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

In [3]:
ingredients = pd.read_csv("../data/output/ingredients-sample.csv")
ingredients.head()

Unnamed: 0,ingredient
0,150ml whipping cream
1,50g caster sugar
2,"2 tbsp icing sugar, sifted"
3,350g self-raising flour
4,"1 red chilli, deseeded and finely chopped"


In [4]:
ingredients_sample = ingredients.head(10)

In [31]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [32]:
doc = nlp("150ml whipping cream")

print([(ent.text, ent.label_) for ent in doc.ents])

[]


In [15]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

150ml 150ml NOUN NN compound dddxx False False
whipping whipping NOUN NN amod xxxx True False
cream cream NOUN NN ROOT xxxx True False


In [16]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

150ml whipping cream cream ROOT cream


In [8]:
def nlp_extract_ingredient_info(ingredient):
    doc = nlp(ingredient)
    ingredient_name = ''
    amount = ''
    preparation_instructions = ''

    # Iterate through the entities recognized by spaCy
    for ent in doc.ents:
        if ent.label_ in ['QUANTITY', 'CARDINAL', 'ORDINAL']:
            amount += ent.text + ' '
        elif ent.label_ == 'GPE' or ent.label_ == 'ORG' or ent.label_ == 'PERSON':  # Handle misclassifications
            ingredient_name += ent.text + ' '
        else:
            ingredient_name += ent.text + ' '

    # Refinement
    if ',' in ingredient:
        preparation_instructions = ingredient.split(',')[1].strip()

    return {
        'ingredient_name': ingredient_name.strip(),
        'amount': amount.strip(),
        'preparation_instructions': preparation_instructions
    }

In [9]:
# Apply NLP extraction to each row
nlp_structured_data = ingredients_sample['ingredient'].apply(nlp_extract_ingredient_info)
nlp_structured_df = pd.DataFrame(nlp_structured_data.tolist())

In [13]:
nlp_structured_df

Unnamed: 0,ingredient_name,amount,preparation_instructions
0,,,
1,,50.0,
2,,2.0,sifted
3,,350.0,
4,,1.0,deseeded and finely chopped
5,,1.0,
6,,1.0,peeled and diced
7,,,
8,,,
9,,,
