In [2]:
from parser_1 import read_recipe_from_url
import re
import sys
import os
import spacy

nlp = spacy.load("en_core_web_lg")

folder_path = os.path.join(os.getcwd(), 'ourtypes')
sys.path.append(folder_path)

from step import Step, Steps
from ingredient import Ingredient

In [3]:
soup = read_recipe_from_url("https://www.allrecipes.com/pasta-e-ceci-italian-chickpea-and-pasta-stew-recipe-8400705")

In [4]:
ingredients = soup.find("div", {"id":"mntl-structured-ingredients_1-0"})
for child in ingredients.find_all("li"):
    text = child.text
    text = text.replace("\n", "").strip()
    print(text)
    

1 cup dry chickpeas
1 tablespoon baking soda
1 yellow onion
1 stalk celery
1 small zucchini
1 carrot
2 tablespoons olive oil
1 sprig fresh sage leaves
1 sprig fresh rosemary
1 sprig fresh thyme
3 cups chicken broth, or more as needed
2 peeled whole canned tomatoes
8 1/2 ounces ditalini pasta, such as Barilla® Gluten Free Ditalini
salt and freshly ground black pepper to taste
1 fresh cayenne pepper, seeded and minced, or more to taste
1 tablespoon extra virgin olive oil


In [11]:
ingredient_objects = {}
for child in ingredients.find_all("li"):
    text = " ".join(child.text.replace("\n", "").split()[2:]).split(",")[0]
    ingredient_objects[text] = text

steps = soup.find("div", {"id":"recipe__steps-content_1-0"})
steps_collection = Steps()

for child in steps.find_all("p"):
    step_text = child.text.replace("\n", "").strip()
    
    #find the time mentioned for in the step
    time_match = re.findall(r'(\d+\s+to\s+\d+|\d+)\s*(hours?|hrs?|minutes?|mins?|seconds?|secs?)(\s+and\s+\d+\s*(hours?|hrs?|minutes?|mins?|seconds?|secs?))?', step_text)
    time = []
    if time_match:
        for t in time_match:
            time.append(' '.join(filter(None, t)))
    print(time)
    #find the temperature for that step
    temp_match = re.search(r'(\d+ degrees)', step_text)
    if temp_match:
        temp = temp_match.group(1) 
    else:
        temp = None
    
    doc = nlp(step_text)
    tools = []
    methods = []

    for sent in doc.sents:
        for token in sent:
            #check if token is a verb
            if token.pos_ == 'VERB':
                #check if it's the first token in a sentence 
                if token.i == sent.start:
                    methods.append(token.lemma_)
            
                #check if itcomes after punctuation or 'and'
                else:
                    prev_token = token.nbor(-1)
                    if prev_token.is_punct or (prev_token.text.lower() == 'and'):
                        methods.append(token.lemma_)

    #extract compound nouns for full tool names
    for chunk in doc.noun_chunks:
        tokens = chunk.text.split()
        if len(tokens) > 1 and tokens[0].lower() in ['a', 'an']:
            #check if the last word is a noun
            if nlp(chunk.text)[-1].pos_ == 'NOUN':
                tools.append(chunk.text)
        elif chunk.root.dep_ in ['dobj', 'pobj']:  #direct object or object of preposition
            #check if the current chunk is a compound noun
            if any(child.dep_ == 'compound' for child in chunk.root.children):
                tools.append(chunk.text)
            elif chunk.root.pos_ == 'NOUN':
                tools.append(chunk.text)
    #identify ingredients present in this step
    step_ingredients = []
    for ingred in ingredient_objects:
        words = ingred.split()
        for word in words:
            if word in step_text:
                step_ingredients.append(ingred)
                break

    #remove tools are ingredients, do not end in a noun and do not start with "a" or "an":
    # tools = [tool for tool in tools if all(ingr not in tool for ingr in ingredient_objects)]
    tools = [tool for tool in tools if nlp(tool)[-1].pos_ == 'NOUN' and tool.split()[0] in ["a", "an"]]
    
    #create Step object
    step_obj = Step(step_ingredients, tools=tools, methods=methods, description=step_text, time=time, temp=temp)

    #add Step object to Steps collection
    steps_collection.add_step(step_obj)

print(steps_collection)

['8 hours']
[]
[]
['1 minute', '5 minutes']
['1 hour']
['8 to 10 minutes']
Step 1: Step(Ingredients: ['dry chickpeas', 'baking soda', 'freshly ground black pepper to taste'], Tools: ['a large stew pot'], Methods: ['rinse', 'pick', 'rub', 'cover', 'add', 'leave'], Time: ['8 hours'], Temp: None, Description: 'Rinse chickpeas under cool water. Pick up handfuls of chickpeas and rub them together with your hands under the water to wash chickpeas. Place chickpeas in a large stew pot, cover with water and add baking soda; swirl chickpeas and water to distribute baking soda. Leave to soak for 8 hours or overnight.')
Step 2: Step(Ingredients: ['dry chickpeas'], Tools: ['a strainer'], Methods: ['pour', 'rinse', 'eliminate', 'set'], Time: [], Temp: None, Description: 'The next day, pour chickpeas through a strainer and rinse well under cold running water, eliminating any skins or discolored chickpeas; set aside.')
Step 3: Step(Ingredients: ['onion', 'celery', 'zucchini'], Tools: ['a cheese grater