In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import time
import pickle
import os

In [2]:
# Base URL of the US Open Food Facts site. Listing page numbers are appended
# directly to it (e.g. f"{root_url}{page_num}"), hence the trailing slash.
root_url = "https://us.openfoodfacts.org/"

In [3]:
# Fetch the site's home page and parse it into a BeautifulSoup tree.
# The timeout keeps a stalled connection from hanging the notebook forever.
home_response = requests.get(root_url, timeout=30)
# Fail here on an HTTP error rather than parsing an error page and tripping
# over a missing element further down.
home_response.raise_for_status()
soup = BeautifulSoup(home_response.text, 'html.parser')

In [4]:
# Largest number found among the pagination links of the home page.
# For each <a> inside the element with id="pages", the first run of digits in
# the tag's HTML is taken as its page number.
pagination_anchors = soup.find(id="pages").find_all("a")
page_numbers = []
for anchor in pagination_anchors:
    first_digits = re.search(r'\d+', str(anchor))
    page_numbers.append(int(first_digits.group()))
max_page = np.max(page_numbers)
max_page

3925

In [5]:
def is_valid_product_link(link):
    """Tell whether an anchor points at a product page.

    `link` is indexed with 'href'; if that lookup raises KeyError (the anchor
    has no href) the link is not a product link. Otherwise the result is
    whether the href contains "/product/".
    """
    try:
        href = link['href']
    except KeyError:
        return False
    return "/product/" in href

In [6]:
# Show what is currently stored in ./data (progress pickled by earlier runs, if any).
os.listdir("./data")

['completed_urls.pkl',
 '.DS_Store',
 'bad_links.pkl',
 'product_links.pkl',
 'item_info.pkl']

In [7]:
# Resume support for the listing-page crawl: start from empty lists and replace
# them with whatever an earlier run pickled into ./data.
# The pickles read here are the ones this notebook writes itself.
product_links = []
completed_urls = []

saved_files = os.listdir("./data/")

if "completed_urls.pkl" in saved_files:
    print("Found previous url list.")
    with open("./data/completed_urls.pkl", "rb") as urls_file:
        completed_urls = pickle.load(urls_file)

if "product_links.pkl" in saved_files:
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as links_file:
        product_links = pickle.load(links_file)

# Summary of the restored state.
print("\nSo far:")
print(f"{len(completed_urls)} urls scraped.")
print(f"{len(product_links)} product links discovered.")

Found previous url list.
Found previous product link list.

So far:
20 urls scraped.
2000 product links discovered.


In [19]:
# Highest listing page number to scrape; the crawl covers pages 1..max_scrape_index.
max_scrape_index = 50
# Controls how often scraping progress is pickled to ./data.
save_interval = 5

In [20]:
# Crawl listing pages 1..max_scrape_index and collect the href of every product
# link found on them. Pages already recorded in completed_urls are skipped, so
# re-running this cell resumes where the previous run stopped.

def _pickle_link_progress():
    """Write completed_urls and product_links to ./data so a later run can resume."""
    print(f"Pickling current scraping progress...")
    with open("./data/completed_urls.pkl", "wb") as f:
        pickle.dump(completed_urls, f)
        print(f"Pickled completed_urls.pkl successfully.")
    with open("./data/product_links.pkl", "wb") as f:
        pickle.dump(product_links, f)
        print(f"Pickled product_links.pkl successfully.")


unsaved_pages = 0  # pages scraped since the last pickle
try:
    for page_num in range(1, max_scrape_index + 1):
        if page_num in completed_urls:
            continue
        page_url = f"{root_url}{page_num}"
        print(f"Fetching {page_url}")
        start_time = time.time()
        try:
            # Without a timeout a single stalled connection blocks the crawl forever.
            r = requests.get(page_url, timeout=30)
        except requests.RequestException as exc:
            # Treat network errors like a bad status code: report and move on.
            # The page stays out of completed_urls, so a later run retries it.
            r = None
            print(f"Failed to fetch {page_url}: {exc}")

        if r is not None and r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            new_links = [link['href'] for link in page_soup.find_all("a") if is_valid_product_link(link)]
            # Record the links and the page together so the two lists stay in step.
            product_links.extend(new_links)
            completed_urls.append(page_num)
            unsaved_pages += 1
            stop_time = time.time()
            # ':.2f' = two decimals; a bare ':.2' means two significant digits and
            # switches to exponent notation for durations of 10 s or more.
            print(f"Scraped {len(new_links)} new product links, {len(product_links)} total links. ({stop_time-start_time:.2f}s)")
        elif r is not None:
            print(f"Failed to fetch {page_url}, status code {r.status_code}")

        # Checkpoint after every save_interval newly scraped pages.
        if unsaved_pages >= save_interval:
            _pickle_link_progress()
            unsaved_pages = 0
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
finally:
    # Runs on normal completion, on an unexpected error and on a kernel
    # interrupt, so pages scraped since the last checkpoint are never lost.
    if unsaved_pages:
        _pickle_link_progress()

Fetching https://us.openfoodfacts.org/31
Scraped 100 new product links, 3100 total links. (2.0s)
Fetching https://us.openfoodfacts.org/32
Scraped 100 new product links, 3200 total links. (2.3s)
Fetching https://us.openfoodfacts.org/33
Scraped 100 new product links, 3300 total links. (1.7s)
Fetching https://us.openfoodfacts.org/34
Scraped 100 new product links, 3400 total links. (1.9s)
Fetching https://us.openfoodfacts.org/35
Scraped 100 new product links, 3500 total links. (1.8s)
Pickling current scraping progress...
Pickled completed_urls.pkl successfully.
Pickled product_links.pkl successfully.
Fetching https://us.openfoodfacts.org/36
Scraped 100 new product links, 3600 total links. (2.2s)
Fetching https://us.openfoodfacts.org/37
Scraped 100 new product links, 3700 total links. (2.4s)
Fetching https://us.openfoodfacts.org/38
Scraped 100 new product links, 3800 total links. (2.3s)
Fetching https://us.openfoodfacts.org/39
Scraped 100 new product links, 3900 total links. (2.5s)
Fetching

In [21]:
# Number of product links collected so far.
len(product_links)

5000

## Individual product ingredients and classification

In [22]:
# Site root without a trailing slash: product links are appended to it as-is,
# and the scraped hrefs shown in this notebook already begin with "/product/".
product_base_url = "https://us.openfoodfacts.org"

In [23]:
# Resume support for the product-page crawl: start from empty containers and
# replace them with whatever an earlier run pickled into ./data.
# The pickles read here are the ones this notebook writes itself.
product_links = []
item_info = {}
bad_links = []

saved_files = os.listdir("./data/")

if "product_links.pkl" in saved_files:
    print("Found previous product link list.")
    with open("./data/product_links.pkl", "rb") as links_file:
        product_links = pickle.load(links_file)

if "bad_links.pkl" in saved_files:
    print("Found previous bad link list.")
    with open("./data/bad_links.pkl", "rb") as bad_file:
        bad_links = pickle.load(bad_file)

if "item_info.pkl" in saved_files:
    print("Found previous item info list.")
    with open("./data/item_info.pkl", "rb") as items_file:
        item_info = pickle.load(items_file)

# Summary of the restored state.
print("\nSo far:")
print(f"{len(product_links)} product links discovered.")
print(f"{len(bad_links)} links are bad.")
print(f"{len(item_info)} items scraped.")

Found previous product link list.
Found previous bad link list.
Found previous item info list.

So far:
5000 product links discovered.
329 links are bad.
552 items scraped.


In [None]:
# Visit every product link that has not been handled yet and record its name,
# ingredient text and vegan flag in item_info (keyed by the link). Links whose
# page lacks the expected elements are remembered in bad_links and never retried.

def _pickle_item_info():
    """Write item_info to ./data so a later run can resume."""
    print(f"Pickling current scraping progress...")
    with open("./data/item_info.pkl", "wb") as f:
        pickle.dump(item_info, f)
        print(f"Pickled item_info.pkl successfully.")


unsaved_items = 0  # items scraped since the last pickle of item_info
try:
    for link in product_links:
        if link in item_info or link in bad_links:
            continue
        product_url = f"{product_base_url}{link}"
        print(f"Fetching {product_url}")
        start_time = time.time()
        try:
            # Without a timeout a single stalled connection blocks the crawl forever.
            r = requests.get(product_url, timeout=30)
        except requests.RequestException as exc:
            # Treat network errors like a bad status code: report and move on.
            # The link is stored nowhere, so a later run retries it.
            r = None
            print(f"Failed to fetch {product_url}: {exc}")

        if r is not None and r.status_code == 200:
            page_soup = BeautifulSoup(r.text, 'html.parser')
            try:
                # Labels of the "alert" spans in the ingredients analysis paragraph,
                # parsed once and reused for both the emptiness check and the flag.
                analysis_labels = [
                    child.text.strip()
                    for child in page_soup.find("p", id="ingredients_analysis").find_all("span", class_="alert")
                ]
                # A page without any analysis label cannot be classified; route it
                # through the same bad-link path as a page with missing elements.
                if len(analysis_labels) == 0:
                    raise AttributeError

                new_data = {
                    "name": page_soup.title.text.lower(),
                    "ingredients": page_soup.find("div", id="ingredients_list").text,
                    "is_vegan": "Vegan" in analysis_labels
                }

                item_info[link] = new_data
                unsaved_items += 1
                is_vegan_formatted = "Vegan" if new_data["is_vegan"] else "Not Vegan"
                stop_time = time.time()
                # ':.2f' = two decimals; a bare ':.2' means two significant digits and
                # switches to exponent notation for durations of 10 s or more.
                print(f"Scraped {new_data['name']} ({is_vegan_formatted}), {len(item_info)} total items. ({stop_time-start_time:.2f}s)")
            except AttributeError:
                # Raised when find()/title returned None (element missing) or by
                # the explicit check above.
                print(f"Bad link! Skipping.")
                bad_links.append(link)
                with open("./data/bad_links.pkl", "wb") as f:
                    pickle.dump(bad_links, f)
                    print(f"Pickled bad_links.pkl successfully.")
        elif r is not None:
            print(f"Failed to fetch {product_url}, status code {r.status_code}")

        # Checkpoint after every save_interval newly scraped items.
        if unsaved_items >= save_interval:
            _pickle_item_info()
            unsaved_items = 0
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
finally:
    # Runs on normal completion, on an unexpected error and on a kernel
    # interrupt, so items scraped since the last checkpoint are never lost.
    if unsaved_items:
        _pickle_item_info()

In [30]:
# Share of scraped items flagged as vegan, as a percentage of all scraped items.
vegan_flags = [record['is_vegan'] for record in item_info.values()]
vegan_percent = 100*(np.sum(vegan_flags)/len(item_info))
print(f"{vegan_percent:.2f}% of {len(item_info)} items are vegan")

28.52% of 1897 items are vegan


In [40]:
# Display the scraped records: a dict keyed by product link, each value holding
# 'name', 'ingredients' and 'is_vegan'.
item_info

{'/product/3274080005003/spring-water-cristaline': {'name': 'spring water - cristaline - 1,5\xa0l',
  'ingredients': 'spring water',
  'is_vegan': True},
 '/product/7622210449283/prince-chocolat-lu': {'name': 'prince chocolat - lu - 300\xa0g e',
  'ingredients': 'cereal 50,7% (blé flour 35%, whole wheat flour 15,7%), sugar, vegetable oils (palm, rapeseed), lean cocoa powder 4,5%, glucose syrup, blé starch, rise powder (ammonium acid carbonate, sodium acid carbonate, disodium diphosphate), emulsifiers (soja lecithin, sunflower lecithin), salt, milk skimmed powder, lactose and milk proteins, flavours,',
  'is_vegan': False},
 '/product/3256540000698/pains-au-lait-pasquier': {'name': 'pains au lait - pasquier - 350\xa0g',
  'ingredients': 'blé flour 41%, sourdough 20% (blé flour 11%, water, salt), freshufs 11%, sugar, butter pastry, rapeseed oil, yeast, milk skimmed powder (equivalent to 10% of reconstituted skimmed milk), emulsifier: mono and digests of fatty acids, salt, blé protein, bl