In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import re

In [175]:
# example scraping functions found online
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET and return the raw response body.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up.
            Without a timeout a stalled server would block the notebook
            cell indefinitely; a timeout surfaces as a RequestException
            and is therefore handled like any other request failure.

    Returns:
        The response content (bytes) when `is_good_response` accepts the
        response, otherwise None. None is also returned, after the error
        has been passed to `log_error`, when the request itself fails.
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected or reading the body fails.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    without a Content-Type header is treated as not HTML instead of
    raising KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers a header that
    # is present but set to None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def log_error(e):
    """
    Report an error.

    The error is rendered as text and printed; swap the body for a real
    logger if the messages should go somewhere else.
    """
    message = str(e)
    print(message)

In [174]:
# function used to create nutrition table from recipe

def create_nutrition_table(url):
    """
    Scrape the nutrition label of a recipe page into a DataFrame.

    The page is downloaded with `simple_get`, and the first
    'wprm-nutrition-label-container-simple' div is read. Nutrient names,
    amounts and units are taken from three separate lists of spans and
    matched up by position.

    Parameters:
        url: address of the recipe page.

    Returns:
        DataFrame with columns 'Nutrient', 'Recipe Amount' and
        'Recipe Unit'. Amounts and units are kept as the scraped text.
        If the three span lists differ in length, the shorter columns are
        padded with NaN.

    Raises:
        ValueError: if the page could not be downloaded or contains no
            nutrition label container.
    """
    raw_html = simple_get(url)
    if raw_html is None:
        # simple_get signals failure with None; stop here with a clear
        # message instead of handing None to the parser.
        raise ValueError('Could not retrieve HTML from {0}'.format(url))

    html = BeautifulSoup(raw_html, 'html.parser')
    nutrition_info = html.find_all('div', {'class': 'wprm-nutrition-label-container-simple'})
    if not nutrition_info:
        raise ValueError('No nutrition label found at {0}'.format(url))
    label = nutrition_info[0]

    def span_texts(css_class):
        # Text of every span inside the label carrying the given class.
        return [span.text for span in label.find_all('span', {'class': css_class})]

    nutrients = [
        text.replace(': ', '')
        for text in span_texts('wprm-nutrition-label-text-nutrition-label wprm-block-text-bold')
    ]
    amounts = span_texts('wprm-nutrition-label-text-nutrition-value')
    units = span_texts('wprm-nutrition-label-text-nutrition-unit')

    # Build the frame in one step. Wrapping each list in a Series lets
    # pandas align them on position and pad unequal lengths with NaN.
    nutrition_table = pd.DataFrame({
        'Nutrient': pd.Series(nutrients, dtype=object),
        'Recipe Amount': pd.Series(amounts, dtype=object),
        'Recipe Unit': pd.Series(units, dtype=object),
    })

    # adjustments for given recipe sites to allow for better joins to daily value table
    renames = {
        'Carbohydrates': 'Total carbohydrate',
        'Fiber': 'Dietary Fiber',
        'Saturated Fat': 'Saturated fat',
    }
    for site_name, daily_value_name in renames.items():
        nutrition_table.loc[nutrition_table['Nutrient'] == site_name, 'Nutrient'] = daily_value_name

    return nutrition_table

# Demo: scrape one recipe page (performs a live HTTP request). As the last
# expression in the cell, the returned table is displayed as the cell output.
create_nutrition_table('https://minimalistbaker.com/garlicky-kale-salad-with-crispy-chickpeas/')

Unnamed: 0,Nutrient,Recipe Amount,Recipe Unit
0,Serving,1.0,serving
1,Calories,562.0,
2,Total carbohydrate,37.3,g
3,Protein,15.0,g
4,Fat,40.8,g
5,Saturated fat,5.7,g
6,Polyunsaturated Fat,11.48,g
7,Monounsaturated Fat,21.04,g
8,Trans Fat,0.0,g
9,Cholesterol,0.0,mg


In [173]:
# function used to create daily value table

def create_daily_value_table():
    """
    Scrape the FDA daily value reference table into a DataFrame.

    The first <table> on the FDA daily value page is read row by row. The
    first cell of each row is taken as the nutrient name and the second
    as the daily value text (e.g. '1300mg'), which is then split into a
    numeric part and a unit part.

    Returns:
        DataFrame with columns 'Nutrient', 'DV Amount' and 'DV Unit'.
        'DV Amount' is kept as text with thousands separators removed.
        Surrounding whitespace is stripped from all three values. Rows
        whose daily value contains no number get NaN in both DV columns.

    Raises:
        ValueError: if the page could not be downloaded or contains no
            table.
    """
    # using static webpage with average daily values
    url = 'https://www.fda.gov/food/nutrition-facts-label/daily-value-nutrition-and-supplement-facts-labels'
    raw_html = simple_get(url)
    if raw_html is None:
        # simple_get signals failure with None; stop here with a clear
        # message instead of handing None to the parser.
        raise ValueError('Could not retrieve HTML from {0}'.format(url))

    html = BeautifulSoup(raw_html, 'html.parser')
    tables = html.find_all('table')
    if not tables:
        raise ValueError('No table found at {0}'.format(url))

    # Collect plain records first and build the frame once; concatenating
    # inside the loop re-copies the whole frame for every row.
    records = []
    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Header rows (no <td>) and malformed rows have nothing to read.
            continue
        records.append({
            'Nutrient': cells[0].text.strip(),
            'Daily Value': cells[1].text.strip(),
        })

    daily_value_table = pd.DataFrame(records, columns=['Nutrient', 'Daily Value'])

    # extract() always yields exactly two columns (number, rest), unlike
    # split() whose column count depends on how many numbers a cell holds.
    parts = daily_value_table['Daily Value'].str.extract(r'((?:\d[\d,]*)?\.?\d+)\s*(.*)')
    daily_value_table['DV Amount'] = parts[0].str.replace(',', '', regex=False)
    daily_value_table['DV Unit'] = parts[1]

    return daily_value_table.drop(columns=['Daily Value'])

# Demo: fetch the daily value table (performs a live HTTP request). As the
# last expression in the cell, the returned table is displayed as the output.
create_daily_value_table()

Unnamed: 0,Nutrient,DV Amount,DV Unit
0,Added sugars,50.0,g
1,Biotin,30.0,mcg
2,Calcium,1300.0,mg
3,Chloride,2300.0,mg
4,Choline,550.0,mg
5,Cholesterol,300.0,mg
6,Chromium,35.0,mcg
7,Copper,0.9,mg
8,Dietary Fiber,28.0,g
9,Fat,78.0,g


In [172]:
def join_and_format_nutrition_tables(nutrition_table, daily_value_table):
    """
    Combine a recipe's nutrition table with the daily value table.

    Every recipe row is kept (left join on 'Nutrient'); nutrients without
    a daily value get NaN in the daily value columns.

    Parameters:
        nutrition_table: DataFrame with at least 'Nutrient' and
            'Recipe Amount' columns.
        daily_value_table: DataFrame with at least 'Nutrient' and
            'DV Amount' columns.

    Returns:
        The merged DataFrame with 'Recipe Amount' and 'DV Amount' converted
        to float and a new 'Percent DV' column
        (Recipe Amount / DV Amount * 100). Amounts that cannot be parsed as
        numbers become NaN rather than aborting the whole table. Units are
        not compared, so the percentage is only meaningful when the recipe
        and daily value units agree.
    """
    combined_nutrition_table = nutrition_table.merge(daily_value_table, how='left', on='Nutrient')

    for amount_column in ('Recipe Amount', 'DV Amount'):
        # errors='coerce' turns unparseable scraped text (e.g. '' or 'n/a')
        # into NaN; astype(float) keeps the dtype float even when every
        # value happens to be a whole number.
        combined_nutrition_table[amount_column] = pd.to_numeric(
            combined_nutrition_table[amount_column], errors='coerce'
        ).astype(float)

    combined_nutrition_table['Percent DV'] = (
        combined_nutrition_table['Recipe Amount'] / combined_nutrition_table['DV Amount']
    ) * 100

    return combined_nutrition_table


# End-to-end demo: scrape the recipe and the daily value reference (two live
# HTTP requests), then join them. The joined table is the cell's last
# expression and is therefore displayed as its output.
nutrition_table = create_nutrition_table('https://minimalistbaker.com/garlicky-kale-salad-with-crispy-chickpeas/')
daily_value_table = create_daily_value_table()
join_and_format_nutrition_tables(nutrition_table, daily_value_table)

               Nutrient  Recipe Amount Recipe Unit  DV Amount  DV Unit  \
0               Serving           1.00     serving        NaN      NaN   
1              Calories         562.00                    NaN      NaN   
2    Total carbohydrate          37.30           g      275.0        g   
3               Protein          15.00           g       50.0        g   
4                   Fat          40.80           g       78.0        g   
5         Saturated fat           5.70           g       20.0        g   
6   Polyunsaturated Fat          11.48           g        NaN      NaN   
7   Monounsaturated Fat          21.04           g        NaN      NaN   
8             Trans Fat           0.00           g        NaN      NaN   
9           Cholesterol           0.00          mg      300.0       mg   
10               Sodium         277.00          mg     2300.0       mg   
11            Potassium         441.00          mg     4700.0       mg   
12        Dietary Fiber          14.00