## Scraping - www.allrecipes.com 

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import time


import sys

sys.path.insert(0, "../")  # needed for using the utils file in the notebook.
from utils import nutrition_facts_parser

from rich import inspect
from tqdm.notebook import tqdm
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains

from selenium_stealth import stealth


import matplotlib.pyplot as plt

# Lift pandas' display truncation so frames are shown with every column and row.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [4]:
def scrape_all_recipes_nutrition_facts(url):
    """Scrape the raw nutrition-facts label text of a single recipe page.

    A fresh headless Chrome session is opened for the page and is always shut
    down before the function returns, whether scraping succeeded or failed.

    Parameters
    ----------
    url : str
        Recipe address WITHOUT the scheme (``https://`` is prepended here),
        e.g. ``"www.allrecipes.com/recipe/10319/colossal-cookies/"``.

    Returns
    -------
    str or float
        The text of the nutrition-facts element, or ``np.nan`` when the label
        could not be located or opened.

    Raises
    ------
    Exception
        Errors raised while creating the driver or loading the page are not
        handled here and propagate to the caller. ``KeyboardInterrupt`` is
        never swallowed, so a manual interrupt reaches the caller as well.
    """
    print(f"\n {url}")
    driver = setup_web_driver(headless=True)

    # try/finally guarantees the browser is released on every exit path,
    # including the successful `return` below (which used to skip the cleanup
    # and leave one Chrome process behind per scraped recipe).
    try:
        driver.get(f"https://{url}")

        # NOTE: np.random.randint's upper bound is exclusive, so randint(2, 3)
        # always yields 2 and randint(1, 2) always yields 1; only this first
        # delay actually varies (1 or 2 seconds).
        time.sleep(np.random.randint(1, 3))

        # Best effort: dismiss the cookie banner if one is shown.
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt through.
        try:
            driver.find_element(By.ID, "onetrust-reject-all-handler").click()
        except Exception:
            print("nop reject all detected!")

        time.sleep(np.random.randint(2, 3))

        try:
            # Presumably the toggle that expands the full nutrition label.
            button_url = driver.find_element(
                By.XPATH,
                "//*[@id='mntl-nutrition-facts-label_1-0']/button/span[1]",
            )

            time.sleep(np.random.randint(2, 3))

            driver.execute_script("arguments[0].scrollIntoView();", button_url)

            # Scroll back up 100px after bringing the button into view
            # (presumably so a fixed page header does not cover it -- TODO confirm).
            driver.execute_script("window.scrollBy(0,-100)", "")

            time.sleep(np.random.randint(1, 2))

            button_url.click()

            time.sleep(np.random.randint(1, 2))

            nutrition_facts_url = driver.find_element(
                By.XPATH, "//*[@id='mntl-nutrition-facts-label_1-0']/div"
            )

            return nutrition_facts_url.text
        except Exception:
            # Any scraping failure for this page is reported and mapped to NaN.
            print("this recipe is doomed!")

        return np.nan
    finally:
        # quit() closes every window and ends the WebDriver session.
        driver.quit()


def setup_web_driver(headless: bool, chrome_driver_path=None):
    """Create a Chrome WebDriver instance.

    Parameters
    ----------
    headless : bool
        When true, Chrome is started with the ``--headless=new`` argument.
    chrome_driver_path : str, optional
        Path to the chromedriver executable. When omitted, the
        ``CHROMEDRIVER_PATH`` environment variable is used if set; otherwise
        the original hard-coded location is used, so existing callers keep
        working unchanged.

    Returns
    -------
    selenium.webdriver.Chrome
        The started driver; the caller is responsible for quitting it.
    """
    import os  # local import keeps this cell self-contained

    if chrome_driver_path is None:
        chrome_driver_path = os.environ.get(
            "CHROMEDRIVER_PATH",
            "/Users/man-top/Downloads/chromedriver_mac_arm64/chromedriver",
        )

    service = Service(chrome_driver_path)
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless=new")

    driver = webdriver.Chrome(service=service, options=options, keep_alive=True)
    return driver

In the following cell I had already started the scraping, but I updated it so that the data obtained so far is saved automatically whenever I interrupt the cell manually or the internet connection gets cut off.
Every time the cell stops, we need to check the last index whose nutrition facts were scraped and resume scraping from the following index:

In [9]:
all_recipes_source_df = pd.read_csv(
    "all_recipes_data_w_nutrition_facts_unstructured.csv"
)

# Links from row position 42173 onwards that still have no nutrition facts.
pending_links = (
    all_recipes_source_df.iloc[42173:]
    .query(" Nutrition_facts_unstructured.isnull() ")
    .link.values
)

try:
    for link in tqdm(pending_links):
        # A boolean mask (instead of an f-string query) also works for links
        # that contain quote characters.
        all_recipes_source_df.loc[
            all_recipes_source_df["link"] == link,
            "Nutrition_facts_unstructured",
        ] = scrape_all_recipes_nutrition_facts(url=link)
except (KeyboardInterrupt, WebDriverException) as stop_reason:
    # Manual interrupts and browser/connection failures end the run quietly,
    # as before; the progress is written out in the `finally` clause.
    print(f"Scraping stopped early ({type(stop_reason).__name__}).")
finally:
    # Always persist what has been scraped so far: on interruption, on any
    # unexpected error, and also when the loop simply runs to completion
    # (previously the file was only written after an interrupt/driver error).
    all_recipes_source_df.to_csv(
        "all_recipes_data_w_nutrition_facts_unstructured.csv", index=False
    )

  0%|          | 0/19224 [00:00<?, ?it/s]


 www.allrecipes.com/recipe/246352/sugar-free-black-bean-brownies/

 www.allrecipes.com/recipe/10319/colossal-cookies/

 www.allrecipes.com/recipe/34912/shrimp-red-thai-curry/

 www.allrecipes.com/recipe/255509/sweet-and-spicy-cranberry-chutney/

 www.allrecipes.com/recipe/13469/cranberry-black-cherry-gelatin-salad/

 www.allrecipes.com/recipe/223209/blueberry-grapefruit-salsa/

 www.allrecipes.com/recipe/229963/squash-stuffed-with-dates-and-onion/

 www.allrecipes.com/recipe/263091/puff-pastry-pinwheels-with-smoked-salmon-and-cream-cheese/

 www.allrecipes.com/recipe/12821/firehouse-clam-bake-new-england-style/

 www.allrecipes.com/recipe/79793/grandma-leachs-fruitcake/

 www.allrecipes.com/recipe/44767/dads-double-whole-grain-pancakes/

 www.allrecipes.com/recipe/24555/easy-refrigerator-yeast-rolls/

 www.allrecipes.com/recipe/45947/savory-corn-flatbread/

 www.allrecipes.com/recipe/53513/grilled-hawaiians/

 www.allrecipes.com/recipe/241657/portobello-mushroom-pizzas/

 www.allrecip

In [None]:
# Spot-check one recipe: select the row(s) whose link matches exactly.
all_recipes_source_df.loc[
    all_recipes_source_df["link"].eq(
        "www.allrecipes.com/recipe/14121/chaat-dahi-batata-puri/"
    )
]

Unnamed: 0,link,NER,Nutrition_facts_unstructured
5245,www.allrecipes.com/recipe/14121/chaat-dahi-bat...,"['garbanzo beans', 'plain yogurt', 'ginger', '...",Nutrition Facts\nServings Per Recipe 4\nCalori...


In [8]:
# Index label of the last row that already has nutrition facts.
all_recipes_source_df.loc[
    all_recipes_source_df["Nutrition_facts_unstructured"].notnull()
].index[-1]

42173

## Adding the nutrition facts as new features

We look for the longest nutrition-facts text, since it is the one most likely to contain the largest number of nutrient fields.

In [None]:
# Character count of every scraped label (NaN where nothing was scraped).
facts_text = all_recipes_source_df["Nutrition_facts_unstructured"]
facts_length = facts_text.str.len()
max_str_length = facts_length.max()

# First label whose length equals the maximum.
longest_text = facts_text[facts_length == max_str_length].values[0]
print(longest_text)

Nutrition Facts
Servings Per Recipe 1
Calories 5974
% Daily Value *
Total Fat 492g 630%
Saturated Fat 287g 1,434%
Cholesterol 1615mg 538%
Sodium 7191mg 313%
Total Carbohydrate 319g 116%
Dietary Fiber 20g 72%
Total Sugars 264g
Protein 100g
Vitamin C 27mg 133%
Calcium 625mg 48%
Iron 0mg 2%
Potassium 775mg 16%
* Percent Daily Values are based on a 2,000 calorie diet. Your daily values may be higher or lower depending on your calorie needs.
** Nutrient information is not available for all ingredients. Amount is based on available nutrient data.
(-) Information is not currently available for this nutrient. If you are following a medically restrictive diet, please consult your doctor or registered dietitian before preparing this recipe for personal consumption.
Powered by the ESHA Research Database © 2018, ESHA Research, Inc. All Rights Reserved


In [None]:
# Run the project's parser (imported from utils) on the longest label text;
# the cell output shows the resulting mapping of nutrient names to numbers.
nutrition_facts_parser(longest_text)

{'servings': 1,
 'calories': 5974,
 'total_fat_g': 492,
 'total_fat_prct_daily': 630,
 'saturated_fat_g': 287,
 'saturated_fat_prct_daily': 1434,
 'sodium_mg': 7191,
 'sodium_prct_daily': 313,
 'cholesterol_mg': 1615,
 'cholesterol_prct_daily': 538,
 'total_carbs_g': 319,
 'total_carbs_prct_daily': 116,
 'dietary_fiber_g': 20,
 'dietary_fiber_prct_daily': 72,
 'total_sugars_g': 264,
 'protein_g': 100,
 'vitamin_c_mg': 27,
 'vitamin_c_prct_daily': 133,
 'calcium_mg': 625,
 'calcium_prct_daily': 48,
 'iron_mg': 0,
 'iron_prct_daily': 2,
 'potassium_mg': 775,
 'potassium_prct_daily': 16}