In [1]:
import numpy as np
import pandas as pd
import requests
import time
import os
from bs4 import BeautifulSoup

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# import time

## Functions

In [23]:
def extract_visible_urls(source):
    """Collect the link target of the first anchor inside each element.

    Parameters
    ----------
    source : iterable
        Elements exposing ``find('a')`` (for example the BeautifulSoup ``h2``
        tags of the recipe cards).

    Returns
    -------
    list
        The ``href`` of the first ``<a>`` found in each element, in input
        order. Elements without an anchor, or whose anchor carries no
        ``href`` attribute, are skipped.
    """
    recipe_urls = []
    for item in source:
        # Look the anchor up once instead of searching the element twice.
        anchor = item.find('a')
        if anchor is None:
            continue
        # An anchor without an href would otherwise raise KeyError and abort the page.
        link = anchor.attrs.get('href')
        if link is not None:
            recipe_urls.append(link)
    return recipe_urls

In [5]:
def get_file_name(url):
    """Derive a file name from a recipe URL.

    The name is the text that follows the first ``'recipes/'`` in ``url``
    (up to the next ``'recipes/'``, if there is one), with any leading or
    trailing ``'/'`` removed. Raises ``IndexError`` when ``url`` does not
    contain ``'recipes/'``.
    """
    pieces = url.split('recipes/')
    slug = pieces[1]
    return slug.strip('/')

In [25]:
def extract_description_and_ingredients(url):
    """Fetch one recipe page and pull out its description and ingredient lines.

    Parameters
    ----------
    url : str
        Address of an individual recipe page.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(description_text, ingredients_text)``. Either list is empty when
        the corresponding section is not found on the page.
    """
    # make request and soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    # A timeout keeps one unresponsive page from hanging the whole scrape.
    res = requests.get(url, headers=headers, timeout=30)
    if res.status_code != 200:
        print(f"{url} request status: {res.status_code}")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    # get the descriptions from the top of the individual recipes
    description_text = []
    description_blocks = soup.find_all('div', {'class': 'col-md-6 col-sm-12 d-flex flex-column'})
    if description_blocks:
        # len(tag) counts the tag's children, so this drops empty <p> elements
        description_text = [p.text for p in description_blocks[0].find_all('p') if len(p) > 0]

    # focus in on the first element whose class mentions 'ingredients'
    ingredients_text = []
    ingredients_div = soup.find_all('div', {'class': lambda x: 'ingredients' in x if x else False})
    if ingredients_div:
        first_block = ingredients_div[0]
        ingredient_items = first_block.find_all('li')
        # if the 'li' pass yields no results, try for 'p' elements instead
        if len(ingredient_items) == 0:
            ingredient_items = first_block.find_all('p')
        ingredients_text = [entry.text for entry in ingredient_items]

    return description_text, ingredients_text

In [31]:
# primary function
# intended to be used once per page, ideally selenium clicks button and this runs again
def create_files(url, output_relative_directory):
    """Scrape one recipe listing page and write one text file per recipe.

    Parameters
    ----------
    url : str
        Address of a listing page whose recipe cards are
        ``<h2 class="entry-title card-title">`` elements wrapping a link.
    output_relative_directory : str
        Folder the text files are written to; created if it does not exist.

    Each file is named by ``get_file_name`` and contains the recipe's
    description lines followed by its ingredient lines, one per line.
    """
    # make the primary request and create soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    # A timeout keeps an unresponsive server from hanging the run.
    res = requests.get(url, headers=headers, timeout=30)
    print(f"Primary call status: {res.status_code}")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    primary_soup = BeautifulSoup(res.content, 'html.parser')

    # get h2s (the clickable recipe blocks)
    h2s = primary_soup.find_all('h2', {'class' : 'entry-title card-title'})

    # pass h2s to secondary url extraction func and get list of recipe urls
    visible_urls = extract_visible_urls(h2s)

    # open() does not create missing folders, so make sure the target exists
    os.makedirs(output_relative_directory, exist_ok=True)

    # run the description/ingredient extractor on every recipe found
    for u in visible_urls:
        file_name = get_file_name(u)
        # the extractor is named extract_description_and_ingredients;
        # the old name extract_ingredients is not defined in this notebook
        item_description, item_ingredients = extract_description_and_ingredients(u)
        print('Printing: ' + file_name)

        # write description lines first, then ingredient lines
        with open(os.path.join(output_relative_directory, file_name), 'w', encoding='utf-8') as f:
            for text in item_description:
                f.write(f"{text}\n")
            for text in item_ingredients:
                f.write(f"{text}\n")

In [115]:
# url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/steamed-whole-fish-sea-bass-%e9%b1%b8%e9%ad%9a-or-tilapia-%e7%be%85%e9%9d%9e%e9%ad%9a/'
# name, items = extract_ingredients(url)

# with open(os.path.join('./recipes', name), 'w', encoding='utf-8') as f:
#     for i in items:
#         line = f"{i}\n"
#         f.write(line)

## Main function call

In [34]:
# Scrape the unpaged listing first, then pages 2 through 5 of the same filter.
initial_url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/?_diet_types=high-protein&'
output_dir = './high_protein_ingredients_and_descriptions'
page_urls = [initial_url] + [initial_url + '_paged=' + str(page) for page in range(2, 6)]
for page_url in page_urls:
    create_files(page_url, output_dir)

Primary call status: 200
Printing: shrimp-and-fish-creole-style-stew
Printing: tofu-and-veggie-frittata-vegan
Printing: kidney-friendly-roasted-red-bell-pepper-cauliflower-pizza
Printing: acadian-fricot-with-dumplings
Printing: kidney-friendly-chicken-and-ginger-congee
Printing: korean-style-short-ribs
Printing: zaatar-chicken-with-garlic-yogurt-sauce
Printing: warm-falafel-wraps
Printing: madras-spiced-baked-tilapia
Printing: savory-crustless-quiche
Printing: red-lentil-dahl
Printing: tuna-macaroni-salad
Printing: summer-pot-pie
Printing: old-fashioned-canadian-stew
Printing: jamaican-steamed-fish
Printing: lime-grilled-turkey
Printing: ma-pos-tofu-bean-curd
Printing: spicy-shrimp-linguine
Primary call status: 200
Printing: honey-grilled-pork-and-pear-kabobs
Printing: curried-turkey-casserole
Printing: one-pot-mediterranean-chicken-pasta
Printing: barbecue-chicken-salad
Printing: persian-chicken
Printing: chicken-in-mushroom-sauce
Printing: cranberry-glazed-pork-chops
Printing: beef-s

## Scratch Pad / Debugging

In [7]:
# Fetch a single recipe page for interactive inspection.
# `url` is defined here because the status message below refers to it;
# without it the non-200 branch raises NameError on a fresh kernel.
url = 'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/shrimp-and-fish-creole-style-stew/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
res = requests.get(url, headers=headers)
if res.status_code != 200:
    print(f"{url} request status: {res.status_code}")
sub_soup = BeautifulSoup(res.content)

In [21]:
# `flavor_text_ps` was not defined anywhere in the notebook, so this cell
# failed on a fresh kernel.
# NOTE(review): reconstructed with the description selector used in
# extract_description_and_ingredients — confirm this is what was intended.
flavor_text_blocks = sub_soup.find_all('div', {'class' : 'col-md-6 col-sm-12 d-flex flex-column'})
flavor_text_ps = flavor_text_blocks[0].find_all('p') if flavor_text_blocks else []
for p in flavor_text_ps:
    # len(tag) counts the tag's children, so empty <p> elements are skipped
    if len(p) > 0:
        print(p.text)

Recipe from The Canadian Association of Nephrology Dietitians’ Kidney Friendly cookbook


In [29]:
# Debug run against page 2 of the unfiltered recipe listing.
# create_files takes (url, output_relative_directory) and has no `test`
# parameter, so the files go to a scratch folder instead.
# NOTE(review): the folder name is a placeholder — pick whatever suits.
scratch_dir = './scratch_recipes'
os.makedirs(scratch_dir, exist_ok=True)
create_files('https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/?_paged=2', scratch_dir)

Primary call status: 200
Printing: spiced-apple-coffee-cake
Printing: no-bake-blueberry-pie
Printing: kidney-friendly-roasted-red-bell-pepper-cauliflower-pizza
Printing: acadian-fricot-with-dumplings
Printing: kidney-friendly-chicken-and-ginger-congee
Printing: shiitake-broth
Printing: korean-style-short-ribs
Printing: asian-inspired-summer-salad-with-a-lime-ginger-vinaigrette
Printing: fish-tacos
Printing: low-sodium-dinner-rolls-hamburger-buns
Printing: cucumber-dill-salad
Printing: classic-waldorf-salad
Printing: the-classic-nicoise
Printing: zaatar-chicken-with-garlic-yogurt-sauce
Printing: new-orleans-style-pain-perdu
Printing: zucchini-brownies
Printing: balsamic-vinaigrette
Printing: warm-falafel-wraps


In [144]:
# url = "https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/"
# ingredients_from_visible(url)

Primary call status: 200
Printing: roasted-red-pepper-chickpea-hummus-2
Printing: shrimp-and-fish-creole-style-stew
Printing: steamed-whole-fish-sea-bass-%e9%b1%b8%e9%ad%9a-or-tilapia-%e7%be%85%e9%9d%9e%e9%ad%9a
Printing: tofu-and-veggie-frittata-vegan
Printing: strawberry-rhubarb-lemonade
Printing: soya-sauce-substitute
Printing: lemon-curd
Printing: mini-pavlovas
Printing: kidney-friendly-vegan-kimchi
Printing: instant-pot-vegan-and-13-bean-chili
Printing: pink-lemonade
Printing: apple-cider-smash
Printing: cranberry-mint-mocktail
Printing: pineapple-punch
Printing: lime-and-mint-soda
Printing: cranberry-margarita
Printing: molasses-ginger-cookies
Printing: carrot-and-apple-soup


In [130]:
# soup.find_all('h2', {'class' : 'entry-title card-title'})[0].find('a').attrs['href']

'https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/roasted-red-pepper-chickpea-hummus-2/'

In [43]:
# soup.find_all('div', {'class': lambda x: 'ingredients' in x if x else False})[0].find_all('li')[0].text

'½ cup canned chickpeas rinsed and drained'

### After obtaining the high-protein recipes with the main function call, the following was used to remove those recipes (by filename) from a more comprehensive list, yielding the recipes that are NOT high in protein

In [35]:
# for file in os.listdir('./high_protein_ingredients_and_descriptions/'):
#     if os.path.exists(os.path.join('./no_high_protein_ingredients_and_descriptions', file)):
#         os.remove(os.path.join('./no_high_protein_ingredients_and_descriptions', file))

## Backup experiment with Selenium

In [152]:
# browser = webdriver.Chrome()
# url = "https://www.kidneycommunitykitchen.ca/kkcookbook/recipes/"
# browser.get(url)
# time.sleep(2)

# browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# time.sleep(2)
    
# while browser.find_element(By.CSS_SELECTOR, "a.facetwp-page.next"):
#     next_button = browser.find_element(By.CSS_SELECTOR, "a.facetwp-page.next")
#     next_button.click()
#     time.sleep(2)