In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

### Kidney Kitchen Scraper Functions

In [3]:
def get_file_name(url):
    """Return the recipe slug of a recipe URL, used as that recipe's file name.

    The slug is the text that follows the first ``recipe/`` in ``url`` (up to
    any later ``recipe/``), with leading and trailing slashes removed.

    Raises IndexError when ``url`` does not contain ``recipe/``.
    """
    pieces = url.split('recipe/')
    slug = pieces[1]
    return slug.strip('/')

In [14]:
def extract_urls_from_soup(soup):
    """Collect the individual recipe URLs from the parsed recipe-listing page.

    Every ``<div class="right">`` is expected to carry an ``onclick`` attribute
    of the form ``javascript:window.location.href = '<url>'``; the URL is the
    text after the first ``=`` with the quotes and surrounding whitespace
    removed.

    The first matching div is skipped, exactly as before (presumably it is not
    a recipe card -- TODO confirm against the page markup).

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find_all(name, attrs)`` method.

    Returns
    -------
    list of str
        One URL per div after the first, in document order.

    Raises
    ------
    KeyError
        If one of those divs has no ``onclick`` attribute.
    """
    # Query the document once; the previous version repeated this search on
    # every loop iteration.
    cards = soup.find_all('div', {'class': 'right'})

    urls = []
    for card in cards[1:]:
        # maxsplit=1 keeps URLs that themselves contain '=' (query strings) intact.
        target = card.attrs['onclick'].split('=', 1)[1]
        urls.append(target.replace("'", "").strip())
    return urls

In [124]:
def extract_recipe_info(url, omit_tags=(), delay=3, timeout=30):
    """Fetch one recipe page and extract its serving size, ingredients and nutrition.

    Parameters
    ----------
    url : str
        Address of an individual recipe page.
    omit_tags : iterable of str, optional
        Nutrient tags (for example ``'high-sodium'``). If the recipe carries
        any of them, the recipe is skipped. Defaults to no filtering.
    delay : float, optional
        Seconds to pause before issuing the request (default 3, the pause the
        function always used).
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up (default 30),
        so a stalled connection cannot hang the scrape indefinitely.

    Returns
    -------
    tuple
        ``(serving_size, ingredients_text, nutrition)`` where ``serving_size``
        is a stripped string, ``ingredients_text`` is a list of strings and
        ``nutrition`` is a dict with the keys ``'calories'``,
        ``'saturated fat'``, ``'protein'``, ``'sodium'``, ``'potassium'`` and
        ``'phosphorus'`` (``None`` for any value not found on the page; found
        values are the raw element text, not stripped).
        ``(None, None, None)`` when the recipe is skipped because of
        ``omit_tags``.

    Raises
    ------
    IndexError
        If the page has no nutrient tag list, serving-size element or
        ingredients element.
    """
    time.sleep(delay)
    print(f"Getting information from: {url}")

    # make request and soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=timeout)
    if res.status_code != 200:
        # Report the failure but keep going, as before; parsing below will
        # raise if the expected elements are missing.
        print(f"{url} request status: {res.status_code}")
    soup = BeautifulSoup(res.content)

    # Nutrient tags: the first CSS class of each <li> in the first
    # 'recipe-nutrients' list.
    tag_list = soup.find_all('ul', {'class': 'recipe-nutrients'})[0]
    tags = [li['class'][0] for li in tag_list.find_all('li')]

    # if tags are not ideal, skip the rest
    if any(tag in tags for tag in omit_tags):
        print(f"Alert: Skipped recipe at url: {url} due to non-ideal nutrient tags")
        return None, None, None

    # Serving size. attrs is passed as a dict ({'class': ...}); the previous
    # set literal {'class', ...} was not an attribute mapping.
    serving_box = soup.find_all('div', {'class': 'elementor-element elementor-element-79fbba0c elementor-widget elementor-widget-text-editor'})[0]
    serving_size = serving_box.find('div').text.strip()

    # Ingredients: look for <p> items in the first ingredients element and
    # fall back to <li> items when there are none.
    ingredients_box = soup.find_all('div', {'class': 'elementor-element elementor-element-6eb817d elementor-widget elementor-widget-text-editor'})[0]
    ingredient_items = ingredients_box.find_all('p') or ingredients_box.find_all('li')
    ingredients_text = [item.text for item in ingredient_items]

    # Nutritional info: titles and values are parallel lists of <p> elements;
    # keep only the nutrients of interest (a later duplicate title wins).
    nutrition = dict.fromkeys(
        ('calories', 'saturated fat', 'protein', 'sodium', 'potassium', 'phosphorus')
    )
    nutrition_titles = soup.find_all('p', {'class': 'nutritional-item-title'})
    nutrition_data = soup.find_all('p', {'class': 'nutritional-item-data'})
    for title, value in zip(nutrition_titles, nutrition_data):
        key = title.text.lower().strip()
        if key in nutrition:
            nutrition[key] = value.text

    return serving_size, ingredients_text, nutrition

In [6]:
def create_files_from_urls(urls, output_relative_directory):
    """Write one UTF-8 text file of ingredients per recipe URL.

    For every URL the recipe page is scraped with ``extract_recipe_info`` and
    its ingredients are written, one per line, to a file named after the
    recipe (see ``get_file_name``) inside ``output_relative_directory``.
    A file is written even when the ingredient list is empty; a warning is
    printed in that case.

    Parameters
    ----------
    urls : iterable of str
        Individual recipe page URLs.
    output_relative_directory : str
        Directory that receives the files; it is created when missing.
    """
    # An empty string means "current directory"; os.makedirs('') would fail.
    if output_relative_directory:
        os.makedirs(output_relative_directory, exist_ok=True)

    for u in urls:
        file_name = get_file_name(u)
        # The previous call targeted `extract_ingredients`, which is not
        # defined in this notebook. extract_recipe_info returns
        # (serving_size, ingredients, nutrition); only the ingredients are used.
        _, item_ingredients, _ = extract_recipe_info(u)
        # A skipped recipe yields None; treat it like an empty list.
        item_ingredients = item_ingredients or []

        if len(item_ingredients) > 0:
            print('Printing: ' + file_name)
        else:
            print(f"WARNING: {file_name} ingredients list is empty")

        # create text file in the output directory
        with open(os.path.join(output_relative_directory, file_name), 'w', encoding='utf-8') as f:
            f.writelines(f"{i}\n" for i in item_ingredients)

Below, Selenium was used to expand the infinitely scrolling recipe list in its entirety so that the full HTML could be extracted. The extracted HTML is included for convenience.

In [10]:
# browser = webdriver.Chrome()
# url = "https://kitchen.kidneyfund.org/find-recipes/"
# browser.get(url)
# time.sleep(2)
# click = 0
# browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# time.sleep(4)
# while browser.find_element(By.CSS_SELECTOR, "button.wpgb-button.wpgb-load-more"):
#     next_button = browser.find_element(By.CSS_SELECTOR, "button.wpgb-button.wpgb-load-more")
#     next_button.click()
#     click+=1
#     print(f"Completed click: {click}")
#     time.sleep(3)
#     browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
#     time.sleep(2)
#     html = browser.page_source
#     with open('kidney_kitchen_html.txt','w', encoding='utf-8') as f:
#         f.write(html)



Load in the HTML extracted and created as part of the Selenium venture

In [8]:
# Read back the listing-page HTML saved by the Selenium step and parse it.
# `html` keeps the raw text; `soup` is the parsed listing page.
with open('../data/kidney-kitchen/kidney_kitchen_html.txt', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html)

Get the list of individual recipe URLs for next step

In [15]:
recipes = extract_urls_from_soup(soup)

### Extract ingredients from each individual URL

In [106]:
all_recipes = []

In [17]:
# uncomment below if scraping the recipes into a DataFrame is necessary
# warning: may need to take note of any failures and resume where the error occurred
# bad_tags = ['high-protein', 'high-sodium', 'high-potassium']
# for url in recipes:
#     serving, ingredients, nutrition = extract_recipe_info(url, bad_tags)
#     if not any(value == None for value in [serving, ingredients,nutrition]):
#         all_recipes.append({'url': url,
#                             'serving_size': serving,
#                             'ingredients_raw': ingredients,
#                             'calories': nutrition['calories'],
#                             'saturated fat': nutrition['saturated fat'],
#                             'protein': nutrition['protein'],
#                             'sodium': nutrition['sodium'],
#                             'potassium': nutrition['potassium'],
#                             'phosphorus': nutrition['phosphorus'] 
#                            })

In [None]:
# all_recipes_df = pd.DataFrame(all_recipes)
# all_recipes_df.to_csv('kidney-kitchen-recipes-with-ingredients-and-nutrition.csv')

In [19]:
# Download a single recipe page (sweet chiffon pie) to prototype the parsing on.
# Note: this rebinds `soup`, which until here held the parsed listing page.
url = "https://kitchen.kidneyfund.org/recipe/sweet-chiffon-pie/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content)

In [43]:
# Nutrient tags of the sample recipe: the first CSS class of every <li>
# inside the first 'recipe-nutrients' list.
tags = [li['class'][0] for li in soup.find_all('ul', {'class':'recipe-nutrients'})[0].find_all('li')]

In [45]:
tags

['low-phosphorus', 'low-potassium', 'low-protein', 'low-sodium']

In [52]:
test_tags = ['high-protein', 'high-sodium']

In [55]:
# The recipe is acceptable only when none of the unwanted tags is present.
if all(string not in tags for string in test_tags):
    print('Good to go')

Good to go


In [38]:
# Serving size of the sample recipe: stripped text of the first <div> inside
# the serving-size element. attrs is given as a dict ({'class': ...}); the
# previous set literal {'class', ...} was not an attribute mapping.
soup.find_all('div', {'class': 'elementor-element elementor-element-79fbba0c elementor-widget elementor-widget-text-editor'})[0].find('div').text.strip()

'12 servings'

In [30]:
# `tags` already holds the class-name strings collected above, so print them
# directly; indexing a string with ['class'] raises TypeError.
for tag in tags:
    print(tag)

low-phosphorus
low-potassium
low-protein
low-sodium


In [56]:
# Positions of every recipe URL mentioning 'tarragon'. enumerate reports the
# true position of each match; list.index would rescan the list and return
# the first occurrence when a URL appears more than once.
for position, recipe_url in enumerate(recipes):
    if 'tarragon' in recipe_url:
        print(position)

24
359


In [91]:
# Try the extractor on one known recipe. The function defined in this notebook
# is `extract_recipe_info`; `extract_ingredients` does not exist.
url = 'https://kitchen.kidneyfund.org/recipe/sweet-chiffon-pie/'
test_tags = ['high-protein', 'high-sodium']
serving, ingredients, nutrition = extract_recipe_info(url, test_tags)

In [99]:
all_recipes = []

In [95]:
pd.DataFrame(all_recipes)

Unnamed: 0,url,serving_size,ingredients_raw,calories,saturated fat,protein,sodium,potassium,phosphorus
0,https://kitchen.kidneyfund.org/recipe/sweet-ch...,12 servings,"[9-inch-deep dish pie crust, 1 ½ cups (about ½...",\n213\n,\n4 g\n,\n3 g\n,\n91 mg\n,\n197 mg\n,\n49 mg\n


In [59]:
serving

'12 servings'

In [60]:
ingredients

['9-inch-deep dish pie crust',
 '1 ½ cups (about ½ pound) sweet potato, peeled and chopped into small chunks',
 '¼ cup cold water',
 '1 envelope unflavored gelatin',
 '2 tablespoons sugar',
 '1 teaspoon pumpkin pie spice',
 '¼ teaspoon nutmeg',
 '¼ cup honey',
 '2 large eggs, yolk and whites separated',
 '¼ cup heavy cream mixed with ½ cup water',
 '1 ¼ teaspoon vanilla, divided',
 '2 tablespoons unsalted butter, melted and cooled',
 '¼ cup caster sugar',
 '⅛ teaspoon cornstarch',
 'Whipped cream (optional)',
 '\xa0',
 'Special Equipment',
 'Blender',
 'Baking beads',
 'Electric mixer']

In [74]:
# Print every nutrition title followed by its value. attrs is given as a dict
# ({'class': ...}); the previous set literal was not an attribute mapping.
nutrition_titles = soup.find_all('p', {'class': 'nutritional-item-title'})
nutrition_data = soup.find_all('p', {'class': 'nutritional-item-data'})
for i,j in zip(nutrition_titles, nutrition_data):
    print(i.text)
    print(j.text)

Calories

213

Fat         

10 g

Saturated Fat

4 g

Trans Fat 

0 g

Cholesterol 

42 mg

Carbohydrates 

26 g

Sugar 

14 g

Fiber 

2 g

Protein 

3 g

Sodium 

91 mg

Calcium

26 mg

Phosphorus

49 mg

Potassium 

197 mg



In [71]:
# Print only the <span> text of each nutrition value, which omits the
# surrounding newlines. attrs is given as a dict ({'class': ...}); the
# previous set literal was not an attribute mapping.
nutrition_data = soup.find_all('p', {'class': 'nutritional-item-data'})
for i in nutrition_data:
    print(i.find('span').text)

213
10 g
4 g
0 g
42 mg
26 g
14 g
2 g
3 g
91 mg
26 mg
49 mg
197 mg


In [64]:

# Display the scraped recipe URLs. The cell previously held an unfinished
# `for` loop with no body, which is a SyntaxError.
recipes
    

['https://kitchen.kidneyfund.org/recipe/poultry-gravy-and-wings/',
 'https://kitchen.kidneyfund.org/recipe/sweet-chiffon-pie/',
 'https://kitchen.kidneyfund.org/recipe/sweet-carrots/',
 'https://kitchen.kidneyfund.org/recipe/sugar-free-pumpkin-cheese-pie/',
 'https://kitchen.kidneyfund.org/recipe/pork-with-glazed-plums/',
 'https://kitchen.kidneyfund.org/recipe/non-alcoholic-eggnog/',
 'https://kitchen.kidneyfund.org/recipe/non-alcoholic-champagne/',
 'https://kitchen.kidneyfund.org/recipe/gingerbread-1/',
 'https://kitchen.kidneyfund.org/recipe/roasted-asparagus-and-wild-mushroom/',
 'https://kitchen.kidneyfund.org/recipe/carrot-leek-and-goat-cheese-tarts/',
 'https://kitchen.kidneyfund.org/recipe/soft-pretzels/',
 'https://kitchen.kidneyfund.org/recipe/shrimp-spread-with-crackers/',
 'https://kitchen.kidneyfund.org/recipe/shrimp-scampi-linguine/',
 'https://kitchen.kidneyfund.org/recipe/raspberry-muffins-with-streusel/',
 'https://kitchen.kidneyfund.org/recipe/popcorn-munch/',
 'http

In [15]:
#soup.find_all('div', {'class' : 'right'})[1].attrs['onclick']

"javascript:window.location.href = 'https://kitchen.kidneyfund.org/recipe/poultry-gravy-and-wings/'"

In [16]:
# Rebuild the recipe URL list. `cards` only exists inside
# extract_urls_from_soup, and `soup` was rebound to a single recipe page
# earlier, so re-parse the cached listing HTML and reuse the helper.
recipes = extract_urls_from_soup(BeautifulSoup(html))
    

In [21]:
sub_url = recipes[0]
sub_res = requests.get(sub_url, headers=headers)
sub_res.status_code


200

In [22]:
sub_soup = BeautifulSoup(sub_res.content)

In [26]:
divs = sub_soup.find_all('div', {'class': 'elementor-element elementor-element-6eb817d elementor-widget elementor-widget-text-editor'})

In [27]:
len(divs)

1

In [35]:
divs[0].find_all('p')

[<p>½ teaspoon canola oil</p>,
 <p>1 ¼ cups onion, diced</p>,
 <p>1 cup celery, diced</p>,
 <p>1 ½ pounds chicken wings</p>,
 <p>4 cups water</p>,
 <p>¾ teaspoon poultry seasoning</p>,
 <p>¼ teaspoon sage</p>,
 <p>¼ teaspoon white pepper</p>,
 <p>1 teaspoon chicken bouillon</p>,
 <p>¼ cup cornstarch</p>,
 <p>½ cup water</p>]

In [37]:
# Ingredients of the first recipe. `extract_description_and_ingredients` is not
# defined in this notebook; extract_recipe_info returns
# (serving_size, ingredients, nutrition), so take the second element.
extract_recipe_info(recipes[0])[1]

['½ teaspoon canola oil',
 '1 ¼ cups onion, diced',
 '1 cup celery, diced',
 '1 ½ pounds chicken wings',
 '4 cups water',
 '¾ teaspoon poultry seasoning',
 '¼ teaspoon sage',
 '¼ teaspoon white pepper',
 '1 teaspoon chicken bouillon',
 '¼ cup cornstarch',
 '½ cup water']