In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import time
import pickle
import os

In [2]:
# Base URL of the site. Listing pages are requested as f"{root_url}{page_num}",
# so the trailing slash is required.
root_url = "https://us.openfoodfacts.org/"

In [3]:
# Fetch and parse the home page.
# The timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the real home page.
home_response = requests.get(root_url, timeout=30)
home_response.raise_for_status()
soup = BeautifulSoup(home_response.text, 'html.parser')

In [4]:
# Highest page number advertised inside the element with id="pages".
# Each anchor is stringified and its first run of digits is read as a page
# number. Anchors that contain no digits are skipped instead of crashing on
# `None.group()`.
pagination = soup.find(id="pages")
if pagination is None:
    raise RuntimeError('Could not find the element with id="pages" on the home page.')
page_number_matches = [re.search(r'\d+', str(atag)) for atag in pagination.find_all("a")]
max_page = np.max([int(match.group()) for match in page_number_matches if match is not None])
max_page

3925

In [5]:
def is_valid_product_link(link):
    """Tell whether `link` points at a product page.

    `link` is indexed with ['href']. The result is True when that value
    contains "/product/", and False when the lookup raises KeyError (no href).
    """
    try:
        href = link['href']
    except KeyError:
        return False
    return "/product/" in href

In [6]:
# Show what is currently stored in ./data (the progress pickles live there).
os.listdir("./data")

['completed_urls.pkl', 'product_links.pkl']

In [7]:
# Resume state for the listing-page scrape:
#   completed_urls - page numbers that have already been scraped
#   product_links  - product hrefs collected from those pages
product_links = []
completed_urls = []

# os.path.isfile() replaces a scan of os.listdir("./data/") so that a checkout
# without a ./data directory starts from empty lists instead of raising
# FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook wrote itself.
if os.path.isfile("./data/completed_urls.pkl"):
    print("Found previous url list.")
    with open("./data/completed_urls.pkl", "rb") as f:
        completed_urls = pickle.load(f)

if os.path.isfile("./data/product_links.pkl"):
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as f:
        product_links = pickle.load(f)

print("\nSo far:")
print(f"{len(completed_urls)} urls scraped.")
print(f"{len(product_links)} product links discovered.")

Found previous url list.
Found previous product link list.

So far:
15 urls scraped.
1500 product links discovered.


In [8]:
# Last listing page (inclusive) that the listing scrape loop requests.
max_scrape_index = 20
# Progress is pickled whenever the loop position (page number in the listing
# loop, link counter in the product loop) is a multiple of this value.
save_interval = 5

In [10]:
# Scrape listing pages 1..max_scrape_index, skipping pages already recorded in
# completed_urls, and append every product href found to product_links.
os.makedirs("./data", exist_ok=True)  # the progress pickles are written here


def _pickle_link_progress():
    """Write completed_urls and product_links to their pickle files in ./data."""
    print("Pickling current scraping progress...")
    with open("./data/completed_urls.pkl", "wb") as f:
        pickle.dump(completed_urls, f)
        print("Pickled completed_urls.pkl successfully.")
    with open("./data/product_links.pkl", "wb") as f:
        pickle.dump(product_links, f)
        print("Pickled product_links.pkl successfully.")


unsaved_pages = 0  # pages scraped since the last pickle
try:
    for page_num in range(1, max_scrape_index + 1):
        if page_num in completed_urls:
            continue
        print(f"Fetching {root_url}{page_num}")
        start_time = time.time()
        try:
            # Without a timeout a stalled connection would block the loop forever.
            r = requests.get(f"{root_url}{page_num}", timeout=30)
        except requests.RequestException as exc:
            # The page is not marked as completed, so a later run retries it.
            print(f"Failed to fetch {root_url}{page_num}: {exc}")
            time.sleep(1)
            continue
        if r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            new_links = [link['href'] for link in page_soup.find_all("a") if is_valid_product_link(link)]
            product_links.extend(new_links)
            stop_time = time.time()
            # ".2f" (two decimals); a bare ".2" means two significant digits and
            # switches to scientific notation for long requests.
            print(f"Scraped {len(new_links)} new product links, {len(product_links)} total links. ({stop_time-start_time:.2f}s)")
            completed_urls.append(page_num)
            unsaved_pages += 1
        else:
            print(f"Failed to fetch {root_url}{page_num}, status code {r.status_code}")

        if page_num % save_interval == 0:
            _pickle_link_progress()
            unsaved_pages = 0
        time.sleep(1)  # be polite to the server between requests
finally:
    # Flush pages scraped after the last checkpoint. This covers a
    # max_scrape_index that is not a multiple of save_interval, and also an
    # interrupted or failed run.
    if unsaved_pages:
        _pickle_link_progress()

Fetching https://us.openfoodfacts.org/18
Scraped 100 new product links, 1800 total links. (2.9s)
Fetching https://us.openfoodfacts.org/19
Scraped 100 new product links, 1900 total links. (2.9s)
Fetching https://us.openfoodfacts.org/20
Scraped 100 new product links, 2000 total links. (2.7s)
Pickling current scraping progress...
Pickled completed_urls.pkl successfully.
Pickled product_links.pkl successfully.


In [11]:
# Number of product links collected so far.
len(product_links)

2000

## Individual product ingredients and classification

In [12]:
# Base URL for product pages; requested as f"{product_base_url}{link}".
# No trailing slash here - presumably because the collected links already start
# with "/product/..." (they were kept only if they contain "/product/").
product_base_url = "https://us.openfoodfacts.org"

In [24]:
# Resume state for the per-product scrape:
#   product_links - product hrefs to visit
#   item_info     - dict keyed by product link holding the scraped fields
#   bad_links     - links whose page could not be parsed into an item
product_links = []
item_info = {}
bad_links = []

# os.path.isfile() replaces repeated scans of os.listdir("./data/") so that a
# missing ./data directory yields empty containers instead of raising
# FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook wrote itself.
if os.path.isfile("./data/product_links.pkl"):
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as f:
        product_links = pickle.load(f)

if os.path.isfile("./data/bad_links.pkl"):
    print("Found previous bad link list.")
    with open("./data/bad_links.pkl", "rb") as f:
        bad_links = pickle.load(f)

if os.path.isfile("./data/item_info.pkl"):
    print("Found previous item info list.")
    with open("./data/item_info.pkl", "rb") as f:
        item_info = pickle.load(f)

print("\nSo far:")
print(f"{len(product_links)} product links discovered.")
print(f"{len(bad_links)} links are bad.")
print(f"{len(item_info)} items scraped.")

Found previous product link list.

So far:
2000 product links discovered.
0 links are bad.
0 items scraped.


In [27]:
# Visit every product link that is in neither item_info nor bad_links, and
# record its name, ingredient text and vegan flag in item_info.
os.makedirs("./data", exist_ok=True)  # the progress pickles are written here


def _parse_product_page(page_soup):
    """Extract the item fields from a parsed product page.

    Returns a dict with "name", "ingredients" and "is_vegan", or None when the
    page has no <title>, no div#ingredients_list, no p#ingredients_analysis, or
    no span.alert label inside that paragraph.
    """
    title = page_soup.title
    ingredients_div = page_soup.find("div", id="ingredients_list")
    analysis = page_soup.find("p", id="ingredients_analysis")
    if title is None or ingredients_div is None or analysis is None:
        return None
    # The labels are collected once and used for both the emptiness check and
    # the vegan flag.
    labels = [child.text.strip() for child in analysis.find_all("span", class_="alert")]
    if not labels:
        return None
    return {
        "name": title.text.lower(),
        "ingredients": ingredients_div.text,
        # NOTE(review): only a label that is exactly "Vegan" counts as vegan; any
        # other label text falls into "not vegan" - confirm that is intended.
        "is_vegan": "Vegan" in labels,
    }


def _pickle_item_info():
    """Write item_info to ./data/item_info.pkl."""
    print("Pickling current scraping progress...")
    with open("./data/item_info.pkl", "wb") as f:
        pickle.dump(item_info, f)
        print("Pickled item_info.pkl successfully.")


counter = 0
unsaved_items = 0  # items scraped since the last item_info pickle
try:
    for counter, link in enumerate(product_links, start=1):
        if link in item_info or link in bad_links:
            continue
        print(f"Fetching {product_base_url}{link}")
        start_time = time.time()
        try:
            # Without a timeout a stalled connection would block the loop forever.
            r = requests.get(f"{product_base_url}{link}", timeout=30)
        except requests.RequestException as exc:
            # The link is recorded nowhere, so a later run retries it.
            print(f"Failed to fetch {product_base_url}{link}: {exc}")
            time.sleep(1)
            continue
        if r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            new_data = _parse_product_page(page_soup)
            if new_data is None:
                print("Bad link! Skipping.")
                bad_links.append(link)
                # Bad links are persisted immediately so they are never refetched.
                with open("./data/bad_links.pkl", "wb") as f:
                    pickle.dump(bad_links, f)
                    print("Pickled bad_links.pkl successfully.")
            else:
                item_info[link] = new_data
                unsaved_items += 1
                is_vegan_formatted = "Vegan" if new_data["is_vegan"] else "Not Vegan"
                stop_time = time.time()
                # ".2f" (two decimals); a bare ".2" means two significant digits.
                print(f"Scraped {new_data['name']} ({is_vegan_formatted}), {len(item_info)} total items. ({stop_time-start_time:.2f}s)")
        else:
            print(f"Failed to fetch {product_base_url}{link}, status code {r.status_code}")

        if counter % save_interval == 0:
            _pickle_item_info()
            unsaved_items = 0
        time.sleep(1)  # be polite to the server between requests
finally:
    # Flush items scraped since the last checkpoint. This runs at normal
    # completion and also when the cell is interrupted (KeyboardInterrupt) or
    # fails part-way.
    if unsaved_items:
        _pickle_item_info()

Fetching https://us.openfoodfacts.org/product/3564707135856/coeur-fondant-chocolat-noir-bio-x-2-bio-village
Scraped coeur fondant chocolat noir bio x 2 - bio village - 180 g (Not Vegan), 294 total items. (3.1s)
Fetching https://us.openfoodfacts.org/product/00985535/kinder-bueno
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/4334011082660/tip
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/8719128117485/schwip-schwap-zero-pepsico
Scraped schwip schwap zero - pepsico - 440ml (Not Vegan), 295 total items. (1.0s)
Fetching https://us.openfoodfacts.org/product/85170100/orgain
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0043647440013/old-times-orange-fine-cut-marmalade-wilkin-sons-ltd
Scraped 'old times' orange fine cut marmalade - wilkin sons ltd - 340 g 

Fetching https://us.openfoodfacts.org/product/0602652171680/kind-bar-dark-chocolate-nuts-and-sea-salt
Scraped kind bar, dark chocolate nuts and sea salt (Not Vegan), 321 total items. (1.2s)
Fetching https://us.openfoodfacts.org/product/20003395/creamy-almond-butter-belbake
Scraped creamy almond butter - belbake - 200g (Vegan), 322 total items. (1.8s)
Fetching https://us.openfoodfacts.org/product/5412514931889/gorditas-soft-tacos-poco-loco
Scraped gorditas soft tacos - poco loco (Not Vegan), 323 total items. (1.6s)
Fetching https://us.openfoodfacts.org/product/5413110013160/red-curry-poulet-et-riz-parfume-isali
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/01212901/pepsi
Scraped pepsi - 591 ml (Not Vegan), 324 total items. (0.96s)
Fetching https://us.openfoodfacts.org/product/07811403/boisson-americaine-au-gingembre-canada-dry-ginger-ale
Scraped boisson améri

Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/3564700004807/petits-pois-carottes-notre-jardin
Scraped petits pois carottes - notre jardin - 3 x 130 g (Not Vegan), 349 total items. (0.93s)
Fetching https://us.openfoodfacts.org/product/7622300735838/dairy-milk-chocolate-bar-cadbury
Scraped dairy milk chocolate bar - cadbury - 200 g (Not Vegan), 350 total items. (1.1s)
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/8411320234006/ortiz-white-tuna-in-olive-oil
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/3760128849352/creme-dessert-au-chocolat-yabon
Scraped creme dessert au chocolat - yabon - 500 g (Not Vegan), 351 total items. (1.6s)
Fetching https://us.openfoodfacts.org/product/7501011110335/frit-os-limon-y-sal-sabritas
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfac

Fetching https://us.openfoodfacts.org/product/3252540949746/cantal-jeune-aop-ets-schoepfer
Scraped cantal jeune aop - ets schoepfer - 180 g (Not Vegan), 369 total items. (1.0s)
Fetching https://us.openfoodfacts.org/product/3261570000136/la-biere-du-demon-goudale
Scraped la bière du démon,goudale - 33 cl (Vegan), 370 total items. (1.2s)
Fetching https://us.openfoodfacts.org/product/3553460000143/spiced-sardines-in-vegetable-oil-hot-titus
Scraped spiced sardines in vegetable oil - hot titus - 125 g (Not Vegan), 371 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/3859888858336/cream-cheese
Scraped cream cheese (Not Vegan), 372 total items. (1.0s)
Fetching https://us.openfoodfacts.org/product/5010029219494/super-smooth-porridge-weetabix
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/7613033014673/barszcz-czerwony-instant-winiary
Scraped barszcz 

Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/20045463/chimichurri-cauliflower-steaks-le-cesarin-gourmand
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/20153144/pepinillos-con-miel-freshona
Scraped pepinillos con miel - freshona (Not Vegan), 394 total items. (2.8s)
Fetching https://us.openfoodfacts.org/product/3263670128710/connetable-sardines-genereuses-a-la-catalane
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/5000183501108/milk-chocolate-spread-cadbury
Scraped milk chocolate spread - cadbury - 400 g (Not Vegan), 395 total items. (0.99s)
Fetching https://us.openfoodfacts.org/product/5060336505049/original-pickle-branston
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successful

Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/8008343200424/conchiglie-rigate-n-42-rummo
Scraped conchiglie rigate n° 42 - rummo - 500 g (Vegan), 415 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/0012000286186/lipton-pure-leaf-lemon-iced-tea-in-bottle
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0016000122505/chex-mix-traditional-less-fat-than-regular-potato-chips
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0021000047079/crunchy-soft-taco-dinner-kit-toco-bell
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0040000513018/mm-s-peanut-butter-m-m-s
Scraped mm's peanut butter - m&m's - 272 g (Not Vegan), 416 total items. (1.5s)
Fetching https://us.openfoodfacts.org/product/0043

Fetching https://us.openfoodfacts.org/product/4061458056205/frischkasezuberaitung-cremi-alpenmark
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/02915708/southwest-chopped-salad-kit
Scraped southwest chopped salad kit (Not Vegan), 448 total items. (1.7s)
Fetching https://us.openfoodfacts.org/product/0087684000953/pacific-cooler-mixed-fruit-flavored-juice-drink-blend-caprisun
Scraped pacific cooler mixed fruit flavored juice drink blend - caprisun (Not Vegan), 449 total items. (1.7s)
Fetching https://us.openfoodfacts.org/product/07933925/tiny-tangy-crunchy-candy-nerds-wild-about-nerds
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0016229909109/pomegranate-nectar-foco
Scraped pomegranate nectar - foco - 11.8 fl oz, 350 ml (Vegan), 450 total items. (1.1s)
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/022838

Fetching https://us.openfoodfacts.org/product/0049000036473/diet-coke-lime
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0049733091015/original-hot-sauce-original-cholula
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0070847010463/energy-iced-tea-energy-drink-monster
Scraped energy iced tea energy drink - monster - 15.5 fl oz (Not Vegan), 475 total items. (1.6s)
Fetching https://us.openfoodfacts.org/product/0073141152310/pocky-chocolate-covered-biscuit-sticks-glico
Scraped pocky chocolate covered biscuit sticks - glico - 1.37 oz (39 g) (Not Vegan), 476 total items. (2.0s)
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0074780000703/carbonated-mineral-water-perrier
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0076406021505/apricot-nectar-from-concen

Fetching https://us.openfoodfacts.org/product/0041449402277/supreme-muffin-mix-krusteaz
Scraped supreme muffin mix - krusteaz - 436 gr (Not Vegan), 498 total items. (2.0s)
Fetching https://us.openfoodfacts.org/product/0041498166083/whole-natural-almonds-southern-grove
Scraped whole natural almonds - southern grove (Vegan), 499 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/0041508800129/sparkling-natural-mineral-water-san-pellegrino
Scraped sparkling natural mineral water - san pellegrino - 25.3 fl. oz (1 pt 9.3 fl. oz) 750 ml (Vegan), 500 total items. (1.5s)
Fetching https://us.openfoodfacts.org/product/0049000075328/mccafe-frappe-caramel-coffee-bottle-13-7-fl-oz-mcdonald-s
Scraped mccafe frappe caramel coffee bottle, 13.7 fl oz - mcdonald's - 405 ml (Not Vegan), 501 total items. (1.7s)
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0049283802185/s-pellegrino
Bad link! Skipping.
Pickled bad_lin

KeyboardInterrupt: 

In [28]:
# Share of scraped items whose "is_vegan" flag is set.
# Guarded so that an empty item_info prints a message instead of raising
# ZeroDivisionError.
if item_info:
    vegan_count = np.sum([item_info[item]['is_vegan'] for item in item_info])
    print(f"{100*(vegan_count/len(item_info)):.2f}% of {len(item_info)} items are vegan")
else:
    print("No items scraped yet.")

33.46% of 508 items are vegan
