In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import time
import pickle
import os

In [2]:
# Base URL of the Open Food Facts site being scraped; listing-page numbers are
# appended directly to it to build the URLs that get fetched.
root_url = "https://us.openfoodfacts.org/"

In [3]:
# Fetch the home page and parse it into a soup.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page.
home_response = requests.get(root_url, timeout=30)
home_response.raise_for_status()
soup = BeautifulSoup(home_response.text, 'html.parser')

In [4]:
# Take the first run of digits found in each anchor of the element with
# id="pages" (the search runs over the anchor's full string form) and keep the
# largest value. Anchors without any digits are skipped, so they no longer
# raise AttributeError from calling .group() on a failed match.
page_number_matches = (
    re.search(r'\d+', str(atag)) for atag in soup.find(id="pages").find_all("a")
)
max_page = np.max([int(match.group()) for match in page_number_matches if match is not None])
max_page

3925

In [5]:
def is_valid_product_link(link):
    """Return True when ``link`` has an ``href`` containing ``"/product/"``.

    ``link`` is indexed with ``link['href']``; if that lookup raises
    ``KeyError`` (no ``href`` present) the link is reported as not being a
    product link.
    """
    try:
        href = link['href']
    except KeyError:
        return False
    return "/product/" in href

In [6]:
# Make sure the progress directory exists (a fresh checkout has none, and both
# this listing and the pickling cells would fail), then show what is saved.
os.makedirs("./data", exist_ok=True)
os.listdir("./data")

['completed_urls.pkl', 'product_links.pkl']

In [7]:
# Resume state for the listing-page scrape: the page numbers already fetched
# and the product links found on them. Both start empty and are replaced by
# the pickled versions when those files exist.
product_links = []
completed_urls = []

# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook wrote itself.
# os.path.isfile (rather than scanning os.listdir) also copes with a missing
# ./data directory, so a first run starts from empty lists instead of raising
# FileNotFoundError.
if os.path.isfile("./data/completed_urls.pkl"):
    print("Found previous url list.")
    with open("./data/completed_urls.pkl", "rb") as f:
        completed_urls = pickle.load(f)

if os.path.isfile("./data/product_links.pkl"):
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as f:
        product_links = pickle.load(f)

print("\nSo far:")
print(f"{len(completed_urls)} urls scraped.")
print(f"{len(product_links)} product links discovered.")

Found previous url list.
Found previous product link list.

So far:
15 urls scraped.
1500 product links discovered.


In [8]:
# Highest listing-page number the scraping loop will fetch (pages 1..max_scrape_index).
max_scrape_index = 20
# Progress is pickled whenever the loop counter is a multiple of this value.
save_interval = 5

In [10]:
# Scrape product links from listing pages 1..max_scrape_index, skipping pages
# already recorded in completed_urls, and pickle progress along the way.
request_timeout = 30  # seconds before a stalled request is abandoned


def _save_listing_progress():
    """Pickle the scraped page numbers and the discovered product links to ./data."""
    print("Pickling current scraping progress...")
    with open("./data/completed_urls.pkl", "wb") as f:
        pickle.dump(completed_urls, f)
        print("Pickled completed_urls.pkl successfully.")
    with open("./data/product_links.pkl", "wb") as f:
        pickle.dump(product_links, f)
        print("Pickled product_links.pkl successfully.")


unsaved_pages = 0  # pages scraped since the last pickle
try:
    for page_num in range(1, max_scrape_index + 1):
        if page_num in completed_urls:
            continue
        print(f"Fetching {root_url}{page_num}")
        start_time = time.time()
        try:
            r = requests.get(f"{root_url}{page_num}", timeout=request_timeout)
        except requests.RequestException as exc:
            # A network error on one page should not abort the whole run; the
            # page stays out of completed_urls so a later run retries it.
            print(f"Failed to fetch {root_url}{page_num}: {exc}")
            r = None

        if r is not None and r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            new_links = [link['href'] for link in page_soup.find_all("a") if is_valid_product_link(link)]
            product_links.extend(new_links)
            stop_time = time.time()
            # ".1f" rather than ".2": the latter means two significant digits
            # and prints durations of 10s or more in scientific notation.
            print(f"Scraped {len(new_links)} new product links, {len(product_links)} total links. ({stop_time-start_time:.1f}s)")
            completed_urls.append(page_num)
            unsaved_pages += 1
        elif r is not None:
            print(f"Failed to fetch {root_url}{page_num}, status code {r.status_code}")

        if page_num % save_interval == 0:
            _save_listing_progress()
            unsaved_pages = 0
        time.sleep(1)  # pause between requests
finally:
    # Persist pages scraped after the last periodic save: covers a
    # max_scrape_index that is not a multiple of save_interval as well as an
    # interrupted run.
    if unsaved_pages:
        _save_listing_progress()

Fetching https://us.openfoodfacts.org/18
Scraped 100 new product links, 1800 total links. (2.9s)
Fetching https://us.openfoodfacts.org/19
Scraped 100 new product links, 1900 total links. (2.9s)
Fetching https://us.openfoodfacts.org/20
Scraped 100 new product links, 2000 total links. (2.7s)
Pickling current scraping progress...
Pickled completed_urls.pkl successfully.
Pickled product_links.pkl successfully.


In [11]:
# Total number of product links collected so far.
len(product_links)

2000

## Individual product ingredients and classification

In [12]:
# Site root without a trailing slash; each scraped product link is appended
# directly to it to build the product-page URL that gets fetched.
product_base_url = "https://us.openfoodfacts.org"

In [18]:
# Resume state for the product-page scrape. Each container starts empty and is
# replaced by its pickled version when that file exists.
product_links = []  # product links to visit
item_info = {}      # link -> scraped fields for that product
bad_links = []      # links that were skipped as bad

# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook wrote itself.
# os.path.isfile (rather than scanning os.listdir) also copes with a missing
# ./data directory, so a first run starts from empty containers instead of
# raising FileNotFoundError.
if os.path.isfile("./data/product_links.pkl"):
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as f:
        product_links = pickle.load(f)

if os.path.isfile("./data/bad_links.pkl"):
    print("Found previous bad link list.")
    with open("./data/bad_links.pkl", "rb") as f:
        bad_links = pickle.load(f)

if os.path.isfile("./data/item_info.pkl"):
    print("Found previous item info list.")
    with open("./data/item_info.pkl", "rb") as f:
        item_info = pickle.load(f)

print("\nSo far:")
print(f"{len(product_links)} product links discovered.")
print(f"{len(bad_links)} links are bad.")
print(f"{len(item_info)} items scraped.")

Found previous product link list.
Found previous bad link list.

So far:
2000 product links discovered.
116 links are bad.
0 items scraped.


In [None]:
# Visit every product link not yet scraped or marked bad, and extract the
# name, nutriscore, ingredients and vegan flag from its page. Links whose page
# lacks any of those elements are recorded in bad_links.
request_timeout = 30  # seconds before a stalled request is abandoned


def _save_item_info():
    """Pickle the scraped product data to ./data/item_info.pkl."""
    print("Pickling current scraping progress...")
    with open("./data/item_info.pkl", "wb") as f:
        pickle.dump(item_info, f)
        print("Pickled item_info.pkl successfully.")


unsaved_items = 0  # items scraped since the last pickle
try:
    # counter also advances for skipped links, matching each link's position
    # in product_links.
    for counter, link in enumerate(product_links, start=1):
        if link in item_info or link in bad_links:
            continue
        print(f"Fetching {product_base_url}{link}")
        start_time = time.time()
        try:
            r = requests.get(f"{product_base_url}{link}", timeout=request_timeout)
        except requests.RequestException as exc:
            # A network error is not evidence of a bad link; leave the link
            # unmarked so a later run retries it.
            print(f"Failed to fetch {product_base_url}{link}: {exc}")
            r = None

        if r is not None and r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            try:
                # A missing element makes find() return None, so the chained
                # attribute access raises AttributeError and the link is
                # treated as bad below.
                analysis_labels = [
                    child.text.strip()
                    for child in page_soup.find("p", id="ingredients_analysis").find_all("span", class_="alert")
                ]
                if len(analysis_labels) == 0:
                    # No analysis labels at all: vegan status cannot be determined.
                    raise AttributeError

                new_data = {
                    "name": page_soup.title.text.lower(),
                    "nutriscore": page_soup.find("tr", id="nutriment_nutriscore_tr").find("td", class_="nutriment_value").text.strip(),
                    "ingredients": page_soup.find("div", id="ingredients_list").text,
                    "is_vegan": "Vegan" in analysis_labels
                }

                item_info[link] = new_data
                unsaved_items += 1
                is_vegan_formatted = "Vegan" if new_data["is_vegan"] else "Not Vegan"
                stop_time = time.time()
                # ".1f" rather than ".2": the latter means two significant
                # digits and prints durations of 10s or more in scientific notation.
                print(f"Scraped {new_data['name']} ({is_vegan_formatted}), {len(item_info)} total items. ({stop_time-start_time:.1f}s)")
            except AttributeError:
                print("Bad link! Skipping.")
                bad_links.append(link)
                with open("./data/bad_links.pkl", "wb") as f:
                    pickle.dump(bad_links, f)
                    print("Pickled bad_links.pkl successfully.")
        elif r is not None:
            print(f"Failed to fetch {product_base_url}{link}, status code {r.status_code}")

        if counter % save_interval == 0:
            _save_item_info()
            unsaved_items = 0
        time.sleep(1)  # pause between requests
finally:
    # Persist items scraped after the last periodic save: covers the tail of
    # the list as well as an interrupted run.
    if unsaved_items:
        _save_item_info()

Fetching https://us.openfoodfacts.org/product/20008017/zumo-de-manzana-solevita
Scraped zumo de manzana - solevita - 1,5 l (Vegan), 114 total items. (1.6s)
Fetching https://us.openfoodfacts.org/product/3700749302157/soupe-butternut-patate-douce-marcel-bio
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/4005500087038/delikatess-mayonnaise-thomy
Scraped delikatess mayonnaise - thomy - 250ml (Not Vegan), 115 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/5601378850300/tremocos-macaroco
Scraped tremoços - maçaroco - 550 gr (Vegan), 116 total items. (1.8s)
Fetching https://us.openfoodfacts.org/product/7622210307460/original-philadelphia
Scraped original - philadelphia - 180 g (Not Vegan), 117 total items. (1.7s)
Fetching https://us.openfoodfacts.org/product/8000500026731/kinder-surprise-3-pack
Scraped kinder surprise 3 pack - 3 x 20g (Not Vegan), 118 total items. (1.1s)
Pickling current scraping progress...
Pickled ite

Fetching https://us.openfoodfacts.org/product/3564700282793/torsades-cuisson-rapide-turini
Scraped torsades cuisson rapide - turini - 500 g (Vegan), 147 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/5400141234800/sirop-d-agave-bio-boni
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/4388860638110/erdbeer-banane-smoothie-rewe-to-go
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Fetching https://us.openfoodfacts.org/product/0012345678905/sphere-plaisir
Scraped sphère plaisir - 1 (Not Vegan), 148 total items. (1.3s)
Fetching https://us.openfoodfacts.org/product/0024463061095/sriracha-chili-sauce-huy-fong
Bad link! Skipping.
Pickled bad_links.pkl successfully.
Pickling current scraping progress...
Pickled item_info.pkl successfully.
Fetching https://us.openfoodfacts.org/product/27029114/delinut-light
Scraped delinut light - 200 g (Not Vegan), 149 total items. (1.2s)
Fetching https://us.openfoodfacts.org/prod

In [None]:
# Report the percentage of scraped items flagged as vegan.
# Guard the empty case: dividing by len(item_info) == 0 would otherwise print
# "nan%" together with a RuntimeWarning.
if item_info:
    vegan_share = np.mean([info['is_vegan'] for info in item_info.values()])
    print(f"{100*vegan_share:.2f}% of {len(item_info)} items are vegan")
else:
    print("No items scraped yet; cannot compute the vegan share.")