In [2]:
import numpy as np
import pandas as pd
import requests
import time
import os
from bs4 import BeautifulSoup

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# import time

## Functions

In [75]:
def extract_visible_urls(source):
    """Collect the link target of the first anchor inside each item.

    Args:
        source: Iterable of parsed elements exposing ``find('a')`` (for
            example the recipe-title ``h2`` tags found on a listing page).

    Returns:
        List of ``href`` strings, in the order the items were given. Items
        with no anchor, or whose anchor has no (or an empty) ``href``, are
        skipped instead of raising ``KeyError``.
    """
    recipe_urls = []
    for item in source:
        # Look the anchor up once rather than once for the test and once for the read.
        anchor = item.find('a')
        if anchor is None:
            continue
        link = anchor.attrs.get('href')
        if link:
            recipe_urls.append(link)
    return recipe_urls

In [76]:
def get_file_name(url):
    """Derive a file-name-like identifier from a recipe URL.

    The identifier is whatever follows the first 'recipes/' in the URL (up to
    the next 'recipes/', if any), with leading and trailing slashes removed.
    Raises IndexError when the URL does not contain 'recipes/'.
    """
    pieces = url.split('recipes/')
    slug = pieces[1]
    return slug.strip('/')

In [67]:
# Todo: delete as it is obsolete
# def extract_description_and_ingredients(url):
    
#     ingredients_text = []
#     description_text = []
#     # make request and soup
#     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
#     res = requests.get(url, headers=headers)
#     if res.status_code != 200:
#         print(f"{url} request status: {res.status_code}")
#     soup = BeautifulSoup(res.content)
    
#     # get the descriptions from the top of the individual recipes
#     description_items = soup.find_all('div', {'class' : 'col-md-6 col-sm-12 d-flex flex-column'})[0].find_all('p')
#     [description_text.append(p.text) for p in description_items if len(p) > 0]
    
#     # focus in on the element that contains ingredients using 'li' first
#     ingredients_div = soup.find_all('div', {'class': lambda x: 'ingredients' in x if x else False})
#     ingredient_items = [ result.find_all('li') for result in ingredients_div]
    
#     # if first pass above yields no results, try for 'p' elements instead
#     if len(ingredient_items[0]) == 0:
#         ingredient_items = [ result.find_all('p') for result in ingredients_div]
        
#     [ingredients_text.append(l.text) for l in ingredient_items[0]]
#     return description_text, ingredients_text

In [103]:
def extract_recipe_info(url, omit_tags=None):
    """Fetch one recipe page and pull out its serving size, ingredients and nutrition.

    Sleeps 3 seconds, requests ``url`` with a browser-like User-Agent and
    parses the response with BeautifulSoup.

    Args:
        url: Address of an individual recipe page.
        omit_tags: Optional iterable of tag texts. If any of them appears among
            the links of the page's 'cat-links' span the recipe is skipped.
            Defaults to no filtering.

    Returns:
        A tuple ``(serving_size, ingredients_text, nutrition)``:
          * serving_size: text of the 'servings' paragraph inside the
            'diet-analysis' div, or None when it is absent.
          * ingredients_text: list of ingredient strings (empty when the page
            has no ingredients container).
          * nutrition: dict with the keys 'calories', 'saturated fat',
            'protein', 'sodium', 'potassium' and 'phosphorus'; a value is None
            when the page does not list that nutrient.
        ``(None, None, None)`` is returned when the response status is not 200
        or when the recipe carries one of ``omit_tags``.

    Raises:
        requests.RequestException: if the request fails or exceeds the timeout.
    """
    if omit_tags is None:
        omit_tags = ()

    def class_contains(fragment):
        # Class filter for BeautifulSoup; tolerates tags without a class (value is None).
        return lambda value: fragment in value if value else False

    # Fixed pause before every request (presumably to go easy on the server).
    time.sleep(3)
    print(f"Getting information from: {url}")

    # make request and soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=30)
    if res.status_code != 200:
        print(f"{url} request status: {res.status_code}")
        # An error page has none of the expected markup; do not try to parse it.
        return None, None, None
    soup = BeautifulSoup(res.content)

    # get the tags
    tags = []
    tag_span = soup.find('span', {'class': 'cat-links'})
    if tag_span is not None:
        tags = [a.text for a in tag_span.find_all('a')]

    # if tags are not ideal, skip the rest
    if any(unwanted in tags for unwanted in omit_tags):
        print(f"Alert: Skipped recipe at url: {url} due to non-ideal nutrient tags")
        return None, None, None

    # get the serving size (None when either the container or the paragraph is missing)
    serving_size = None
    analysis_div = soup.find('div', {'class': class_contains('diet-analysis')})
    if analysis_div is not None:
        serving_tag = analysis_div.find('p', string=lambda t: 'servings' in t.lower() if t else False)
        if serving_tag is not None:
            serving_size = serving_tag.text

    # Ingredients: only the first matching container is used; try 'li' items
    # first and fall back to 'p' elements when there are none.
    ingredients_text = []
    ingredients_divs = soup.find_all('div', {'class': class_contains('ingredients')})
    if ingredients_divs:
        first_div = ingredients_divs[0]
        ingredient_items = first_div.find_all('li') or first_div.find_all('p')
        ingredients_text = [entry.text for entry in ingredient_items]

    # Nutrition: rows look like "Label: value"; map the page label to the output key.
    # NOTE(review): the page's "Total Fat" figure is stored under 'saturated fat'.
    # The key is kept because callers read it by that name, but the label is misleading.
    label_to_key = {'calories': 'calories',
                    'total fat': 'saturated fat',
                    'protein': 'protein',
                    'sodium': 'sodium',
                    'potassium': 'potassium',
                    'phosphorus': 'phosphorus'}
    nutrition = dict.fromkeys(label_to_key.values())

    nutrition_lists = soup.find_all('ul', {'class': 'list-group'})
    nutrition_rows = nutrition_lists[0].find_all('li') if nutrition_lists else []
    for row in nutrition_rows:
        label, separator, value = row.text.partition(':')
        if not separator:
            # Row without "label: value" structure; nothing to record.
            continue
        key = label_to_key.get(label.lower().strip())
        if key is not None:
            nutrition[key] = value

    return serving_size, ingredients_text, nutrition

In [118]:

# primary function
# intended to be used once per page
def scrape_page(url, omit_tags):
    """Scrape every recipe linked from one listing page.

    Requests ``url``, finds the 'entry-title card-title' ``h2`` blocks, extracts
    the recipe link from each and calls ``extract_recipe_info`` on every link.

    Args:
        url: Address of a recipe listing page.
        omit_tags: Tag texts passed through to ``extract_recipe_info``; recipes
            carrying any of them are skipped.

    Returns:
        List of dicts, one per kept recipe, with the keys 'url',
        'serving_size', 'ingredients_raw', 'calories', 'saturated fat',
        'protein', 'sodium', 'potassium' and 'phosphorus'. Recipes for which
        any of serving size, ingredients or nutrition came back as None are
        left out, as are recipes whose page could not be fetched or parsed.
    """
    page_recipes = []
    # make the primary request and create soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    # A timeout keeps a stalled connection from hanging the crawl indefinitely.
    res = requests.get(url, headers=headers, timeout=30)
    print(f"Primary call status: {res.status_code}")
    primary_soup = BeautifulSoup(res.content)

    # get h2s (the clickable recipe blocks)
    h2s = primary_soup.find_all('h2', {'class' : 'entry-title card-title'})

    # pass h2s to secondary url extraction func and get list of recipe urls for that page
    visible_urls = extract_visible_urls(h2s)

    # run the ingredient extractor secondary func
    for u in visible_urls:
        try:
            serving, ingredients, nutrition = extract_recipe_info(u, omit_tags)
        except (AttributeError, IndexError, KeyError, requests.RequestException) as err:
            # One page with unexpected markup or a failed request should not
            # discard the recipes already collected from this listing page.
            print(f"Alert: Failed to extract recipe at url: {u} ({type(err).__name__}: {err})")
            continue

        if serving is None or ingredients is None or nutrition is None:
            continue

        # NOTE(review): 'saturated fat' is filled from the page's "Total Fat"
        # row by extract_recipe_info; the column name is kept as-is here.
        page_recipes.append({'url': u,
                             'serving_size': serving,
                             'ingredients_raw': ingredients,
                             'calories': nutrition['calories'],
                             'saturated fat': nutrition['saturated fat'],
                             'protein': nutrition['protein'],
                             'sodium': nutrition['sodium'],
                             'potassium': nutrition['potassium'],
                             'phosphorus': nutrition['phosphorus']
                            })
    return page_recipes
        

In [115]:
# url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/steamed-whole-fish-sea-bass-%e9%b1%b8%e9%ad%9a-or-tilapia-%e7%be%85%e9%9d%9e%e9%ad%9a/'
# name, items = extract_ingredients(url)

# with open(os.path.join('./recipes', name), 'w', encoding='utf-8') as f:
#     for i in items:
#         line = f"{i}\n"
#         f.write(line)

## Main function call

In [119]:
# Accumulator for the recipe dicts returned by scrape_page across all listing pages.
# Re-running the scraping cells below without re-running this one appends duplicates.
all_recipes = []

In [120]:
# Recipes tagged with any of these categories are skipped by the scraper.
bad_tags = ['High Protein', 'High Sodium', 'High Potassium']
# First listing page; later pages reuse this address with a '?_paged=' suffix.
initial_url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/'
page_results = scrape_page(initial_url, bad_tags)
all_recipes.extend(page_results)


Primary call status: 200
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/roasted-red-pepper-chickpea-hummus-2/
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/shrimp-and-fish-creole-style-stew/
Alert: Skipped recipe at url: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/shrimp-and-fish-creole-style-stew/ due to non-ideal nutrient tags
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/steamed-whole-fish-sea-bass-%e9%b1%b8%e9%ad%9a-or-tilapia-%e7%be%85%e9%9d%9e%e9%ad%9a/
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/tofu-and-veggie-frittata-vegan/
Alert: Skipped recipe at url: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/tofu-and-veggie-frittata-vegan/ due to non-ideal nutrient tags
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/strawberry-rhubarb-lemonade/
Getting information from: https://ww

In [121]:
# Walk listing pages 2 through 14 and add every kept recipe to all_recipes.
for page_number in range(2, 15):
    print(f"Page: {page_number}")
    page_results = scrape_page(f"{initial_url}?_paged={page_number}", bad_tags)
    all_recipes.extend(page_results)

Page: 2
Primary call status: 200
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/spiced-apple-coffee-cake/
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/no-bake-blueberry-pie/
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/kidney-friendly-roasted-red-bell-pepper-cauliflower-pizza/
Alert: Skipped recipe at url: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/kidney-friendly-roasted-red-bell-pepper-cauliflower-pizza/ due to non-ideal nutrient tags
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/acadian-fricot-with-dumplings/
Alert: Skipped recipe at url: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/acadian-fricot-with-dumplings/ due to non-ideal nutrient tags
Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/kidney-friendly-chicken-and-ginger-congee/
Alert: Skipped recipe at url: https://www.ki

In [122]:
# One row per kept recipe; the columns come from the keys of the dicts built in scrape_page.
all_recipes_df = pd.DataFrame(all_recipes)

In [123]:
# Write the scraped recipes to disk. No index argument is given, so pandas'
# default applies and the row index is written as an extra unnamed first column.
all_recipes_df.to_csv('kidney-ca-recipes-with-ingredients-and-nutrition.csv')

In [None]:
# NOTE(review): same list as the one assigned before the first scrape_page call;
# this cell has no execution count, so it appears never to have been run — candidate for removal.
bad_tags = ['High Protein', 'High Sodium', 'High Potassium']


## Scratch Pad / Debugging

In [3]:
# Scratch fetch of a single recipe page; the cells below inspect `soup`.
# The address is held in a variable so the status message can report it
# (the message previously referenced a name that was not defined in this cell).
debug_url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/shrimp-and-fish-creole-style-stew/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
res = requests.get(debug_url, headers=headers)
if res.status_code != 200:
    print(f"{debug_url} request status: {res.status_code}")
soup = BeautifulSoup(res.content)

In [20]:
# Show the anchors inside the first 'cat-links' span of the fetched page.
soup.find_all('span', {'class': 'cat-links'})[0].find_all('a')#[0].text

[<a href="https://www.kidneycommunitykitchen.ca/dietcategory/high-protein/" rel="tag">High Protein</a>,
 <a href="https://www.kidneycommunitykitchen.ca/dietcategory/low-phosphorus/" rel="tag">Low Phosphorus</a>]

In [21]:
# Print the text of each anchor in the first 'cat-links' span.
for a in soup.find_all('span', {'class': 'cat-links'})[0].find_all('a'):
    print(a.text)

High Protein
Low Phosphorus


In [39]:
# Serving-size lookup: first div whose class contains 'diet-analysis', then the
# paragraph in it whose string contains 'servings' (case-insensitive).
soup.find('div', {'class': lambda name : 'diet-analysis' in name if name else False}).find('p', string = lambda t : 'servings' in t.lower() if t else False).text

'Servings per recipe: 5'

In [45]:
# Split each <li> of the first 'list-group' <ul> on ':' to get [label, value] pairs.
n_test = [item.text.split(':') for item in soup.find_all('ul', {'class':'list-group'})[0].find_all('li')]
n_test

[['Calories', ' 292 KCal'],
 ['Protein', ' 20.2 g'],
 ['Carbohydrates', ' 43.6 g'],
 ['Fibre', ' 2.2 g'],
 ['Total Fat', ' 3.9 g'],
 ['Sodium', ' 139 mg'],
 ['Phosphorus', ' 251 mg'],
 ['Potassium', ' 536 mg']]

In [92]:
# Check that the page has a 'cat-links' span before indexing into it.
if soup.find('span', {'class': 'cat-links'}):
    print('good to go')

good to go


In [47]:
# Prototype of the nutrition lookup over the [label, value] pairs in n_test.
# Only the 'calories' branch is live; the other branches are commented out.
# `calories` is assigned only if a matching row exists.
for i in n_test:
    if i[0].lower().strip() == 'calories':
        calories = i[1]
    # elif i.text.lower().strip() == 'saturated fat':
    #     satfat = j.text
    # elif i.text.lower().strip() == 'protein':
    #     protein = j.text
    # elif i.text.lower().strip() == 'sodium':
    #     sodium = j.text
    # elif i.text.lower().strip() == 'potassium':
    #     potassium = j.text
    # elif i.text.lower().strip() == 'phosphorus':
    #     phosphorus = j.text

print(calories)

 292 KCal


In [62]:
# Try extract_recipe_info on a single recipe; s, i and n receive the serving
# size, the ingredient list and the nutrition dict, in that order.
url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/roasted-red-pepper-chickpea-hummus-2/'
bad_tags = ['High Protein', 'High Sodium', 'High Potassium']
s, i, n = extract_recipe_info(url, bad_tags)

Getting information from: https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/roasted-red-pepper-chickpea-hummus-2/
['Low Potassium']


In [63]:
# Serving-size text returned by the extract_recipe_info call above.
s

'Servings per recipe: 5'

In [64]:
# Ingredient strings returned by the extract_recipe_info call above.
i

['½ cup canned chickpeas rinsed and drained',
 '2 cups red pepper cut in 2-inch wedges',
 '2 garlic cloves',
 '2 tablespoons extra-virgin olive oil \xa0',
 '2 tablespoons lemon juice',
 '1 tablespoon of tahini',
 'Black pepper to taste',
 'You may also add cumin or paprika or chili pepper if you like']

In [65]:
# Nutrition dict returned by the extract_recipe_info call above.
n

{'calories': ' 102 KCal',
 'saturated fat': ' 6.9 g',
 'protein': ' 1.4 g',
 'sodium': ' 36.4 mg',
 'potassium': ' 122.4 mg',
 'phosphorus': None}

In [21]:
# Print the text of every non-empty element in flavor_text_ps.
# NOTE(review): flavor_text_ps is not defined in any cell shown here — presumably
# left over from a deleted cell; this fails on a fresh kernel. Confirm or remove.
for p in flavor_text_ps:
    if len(p) > 0:
        print(p.text)

Recipe from The Canadian Association of Nephrology Dietitians’ Kidney Friendly cookbook


In [29]:
# NOTE(review): create_files is not defined in any cell shown here — presumably an
# earlier version of the scraper that has since been removed; this fails on a fresh kernel.
create_files('https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/?_paged=2', test=True)

Primary call status: 200
Printing: spiced-apple-coffee-cake
Printing: no-bake-blueberry-pie
Printing: kidney-friendly-roasted-red-bell-pepper-cauliflower-pizza
Printing: acadian-fricot-with-dumplings
Printing: kidney-friendly-chicken-and-ginger-congee
Printing: shiitake-broth
Printing: korean-style-short-ribs
Printing: asian-inspired-summer-salad-with-a-lime-ginger-vinaigrette
Printing: fish-tacos
Printing: low-sodium-dinner-rolls-hamburger-buns
Printing: cucumber-dill-salad
Printing: classic-waldorf-salad
Printing: the-classic-nicoise
Printing: zaatar-chicken-with-garlic-yogurt-sauce
Printing: new-orleans-style-pain-perdu
Printing: zucchini-brownies
Printing: balsamic-vinaigrette
Printing: warm-falafel-wraps


In [144]:
# url = "https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/"
# ingredients_from_visible(url)

Primary call status: 200
Printing: roasted-red-pepper-chickpea-hummus-2
Printing: shrimp-and-fish-creole-style-stew
Printing: steamed-whole-fish-sea-bass-%e9%b1%b8%e9%ad%9a-or-tilapia-%e7%be%85%e9%9d%9e%e9%ad%9a
Printing: tofu-and-veggie-frittata-vegan
Printing: strawberry-rhubarb-lemonade
Printing: soya-sauce-substitute
Printing: lemon-curd
Printing: mini-pavlovas
Printing: kidney-friendly-vegan-kimchi
Printing: instant-pot-vegan-and-13-bean-chili
Printing: pink-lemonade
Printing: apple-cider-smash
Printing: cranberry-mint-mocktail
Printing: pineapple-punch
Printing: lime-and-mint-soda
Printing: cranberry-margarita
Printing: molasses-ginger-cookies
Printing: carrot-and-apple-soup


In [130]:
# soup.find_all('h2', {'class' : 'entry-title card-title'})[0].find('a').attrs['href']

'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/roasted-red-pepper-chickpea-hummus-2/'

In [43]:
# soup.find_all('div', {'class': lambda x: 'ingredients' in x if x else False})[0].find_all('li')[0].text

'½ cup canned chickpeas rinsed and drained'

### After obtaining the high-protein recipes with the main function call, the following was used to remove those recipes (by filename) from a more comprehensive list, leaving only recipes that are NOT high in protein

In [35]:
# for file in os.listdir('./high_protein_ingredients_and_descriptions/'):
#     if os.path.exists(os.path.join('./no_high_protein_ingredients_and_descriptions', file)):
#         os.remove(os.path.join('./no_high_protein_ingredients_and_descriptions', file))

## Backup experiment with Selenium

In [152]:
# browser = webdriver.Chrome()
# url = "https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/"
# browser.get(url)
# time.sleep(2)

# browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# time.sleep(2)
    
# while browser.find_element(By.CSS_SELECTOR, "a.facetwp-page.next"):
#     next_button = browser.find_element(By.CSS_SELECTOR, "a.facetwp-page.next")
#     next_button.click()
#     time.sleep(2)