In [1]:
from openfoodfacts import API, APIVersion, Country, Environment, Flavor
import json
import pandas as pd 

In [None]:
# https://openfoodfacts.github.io/openfoodfacts-python/

In [2]:

user_agent = "CocinEco/1.0 (maria.a3isp@gmail.com)"

# User-Agent is mandatory
api = API(user_agent=user_agent, 
          username=None,
          password=None,
          country=Country.world, # can we set a specific country here?
          flavor=Flavor.off,
          version=APIVersion.v2,
          environment=Environment.org)

In [None]:
code = "3017620422003"
api.product.get(code, fields=["code", "product_name"])
# {'code': '3017620422003', 'product_name': 'Nutella'}


In [None]:
api.product.text_search("mineral water")
# {"count": 3006628, "page": 1, "page_count": 20, "page_size": 20, "products": [{...}], "skip": 0}



In [None]:
res = api.product.text_search("mineral water")
# {"count": 3006628, "page": 1, "page_count": 20, "page_size": 20, "products": [{...}], "skip": 0}

len(res)


In [None]:
print(type(res))
print(res.keys())

In [None]:
for k, v in res.items():
    print(f"{k}: {v}")

In [3]:
# this filter probably is not returning correct results 
res = api.product.text_search("Italy")
# {"count": 3006628, "page": 1, "page_count": 20, "page_size": 20, "products": [{...}], "skip": 0}
for k, v in res.items():
    print(f"{k}: {v}")


count: 8174
page: 1
page_count: 20
page_size: 20
skip: 0


In [4]:
# results from one page only? only 20 products?
# products key has 20 items 
# each product has 335 keys 
len(res["products"])



20

In [None]:
with open("product_example_1.json", "w") as f:
    json.dump(res["products"][0], f, indent=4)
with open("product_example_2.json", "w") as f:
    json.dump(res["products"][1], f, indent=4)

In [11]:
# "nutrient_levels" key -> this turns into 4 categories: fat, saturated-fat, sugars, salt 
# here we take all the potential values of this key for all the products, to have standardized columns in the DB
nutrient_levels_all_cat = []

for i, product in enumerate(res["products"]):
    nutrient_levels = product.get("nutrient_levels")
    
    if nutrient_levels is not None:
        nutrient_levels_cat = list(nutrient_levels.keys())

    nutrient_levels_all_cat.extend(nutrient_levels_cat)

nutrient_levels_all_cat = list(set(nutrient_levels_all_cat))
nutrient_levels_all_cat = [f"nutrient_levels_{c}" for c in nutrient_levels_all_cat]
print(len(nutrient_levels_all_cat))
nutrient_levels_all_cat


4


['nutrient_levels_sugars',
 'nutrient_levels_fat',
 'nutrient_levels_saturated-fat',
 'nutrient_levels_salt']

In [10]:
# "nutriments" key -> this turns into several other categories
# here we take all the potential values of this key for all the products, to have standardized columns in the DB
# (this key can have different subkeys for different products)
nutriments_levels_all_cat = []

for i, product in enumerate(res["products"]):
    nutrient_levels = product.get("nutriments")
    
    if nutrient_levels is not None:
        nutrient_levels_cat = list(nutrient_levels.keys())

    nutrient_levels_all_cat.extend(nutrient_levels_cat)

nutriments_levels_all_cat = list(set(nutrient_levels_all_cat))
nutriments_levels_all_cat = [f"nutriments_{c}" for c in nutriments_levels_all_cat]

print(len(nutriments_levels_all_cat))
nutriments_levels_all_cat



131


['nutriments_nutrient_levels_saturated-fat',
 'nutriments_nutrient_levels_fat',
 'nutriments_nutrition-score-fr_100g',
 'nutriments_sugars_value',
 'nutriments_fruits-vegetables-nuts-estimate-from-ingredients_serving',
 'nutriments_iron_unit',
 'nutriments_sodium_100g',
 'nutriments_fiber_100g',
 'nutriments_energy-kj_value',
 'nutriments_iron_label',
 'nutriments_iron',
 'nutriments_energy-kcal_prepared_unit',
 'nutriments_alpha-linolenic-acid_value',
 'nutriments_salt_value',
 'nutriments_magnesium',
 'nutriments_fat_unit',
 'nutriments_carbohydrates_100g',
 'nutriments_calcium_100g',
 'nutriments_magnesium_serving',
 'nutriments_fruits-vegetables-nuts_prepared_unit',
 'nutriments_saturated-fat',
 'nutriments_saturated-fat_unit',
 'nutriments_carbohydrates_value',
 'nutriments_phosphore_100g',
 'nutriments_carbon-footprint-from-known-ingredients_serving',
 'nutriments_energy-kj_value_computed',
 'nutriments_phosphorus_serving',
 'nutriments_energy-kcal_100g',
 'nutriments_carbohydrat

In [19]:
keys_to_keep = [
    "product_name", 
    "categories_tags", 
    "ingredients_text",
    "manufacturing_places",
    "stores",
    "stores_tags"
]

print(len(nutrient_levels_all_cat))
print(len(nutriments_levels_all_cat))
print(len(keys_to_keep))

keys_to_keep.extend(nutrient_levels_all_cat)
#keys_to_keep.extend(nutriments_levels_all_cat)

print(len(keys_to_keep))
keys_to_keep


4
131
6
10


['product_name',
 'categories_tags',
 'ingredients_text',
 'manufacturing_places',
 'stores',
 'stores_tags',
 'nutrient_levels_sugars',
 'nutrient_levels_fat',
 'nutrient_levels_saturated-fat',
 'nutrient_levels_salt']

In [86]:
from collections import defaultdict

def get_simple_keys(product, keys):

    result = {k: None for k in keys}

    for k in keys:
        if k in product:
            result[k] = product[k]

    return result


def get_sub_dict_results(product, key, all_subkeys):
        
    result = {k: None for k in all_subkeys}

    if key in product:
        sub_dict = product[key]
   
        for k in all_subkeys:

            subkey_updated = k.replace(f"{key}_", "")

            if subkey_updated in sub_dict:
                result[k] = sub_dict[subkey_updated]
  
    return result

In [87]:
all_res = []

# TODO: generalize this code for all possible dict/list that we want to keep 

simple_keys = ['product_name',
 'categories_tags',
 'ingredients_text',
 'manufacturing_places',
 'stores',
 'stores_tags'
 ]

nutrient_levels_subkeys = nutrient_levels_all_cat.copy()
nutriments_levels_subkeys = nutriments_levels_all_cat.copy()

for i, product in enumerate(res["products"]):
    product_res_dict = {}
    simple_results = get_simple_keys(product, simple_keys)
    nutrients_results = get_sub_dict_results(product, key="nutrient_levels", all_subkeys=nutrient_levels_subkeys)
    nutriments_levels_results = get_sub_dict_results(product, key="nutriments", all_subkeys=nutriments_levels_subkeys)

    product_res_dict.update(simple_results)
    product_res_dict.update(nutrients_results)
    product_res_dict.update(nutriments_levels_results)

    all_res.append(product_res_dict)

df_full = pd.DataFrame(all_res)
df_full.head()

Unnamed: 0,product_name,categories_tags,ingredients_text,manufacturing_places,stores,stores_tags,nutrient_levels_sugars,nutrient_levels_fat,nutrient_levels_saturated-fat,nutrient_levels_salt,...,nutriments_zinc_100g,nutriments_salt_prepared_unit,nutriments_calcium_unit,nutriments_energy-kcal_serving,nutriments_energy_unit,nutriments_calcium_value,nutriments_energy-kj_unit,nutriments_carbon-footprint_prepared_unit,nutriments_energy-kcal_value_computed,nutriments_nutrient_levels_salt
0,Pesto Genovese 190g Barilla,"[en:condiments, en:sauces, en:pasta-sauces, en...","sunflower oil, fresh basil 30%, cashew nuts, p...",Italy,"carrefour.fr,Coop Obs!,Denner AG,Carrefour,Sup...","[carrefour-fr, coop-obs, denner-ag, carrefour,...",moderate,high,high,high,...,,g,,234.0,kJ,,kJ,g,491.8,
1,Bio-Almond-Mandelmilch,"[en:beverages-and-beverages-preparations, en:p...","Bio-Getränk auf Mandelbasis, ohne Zucker. : Wa...",Italie,Lidl,[lidl],low,low,low,low,...,,,,27.0,kJ,,kJ,,26.4,
2,BjORG AMANDE ALMOND SANS SUGRES- NO SUGAR,"[en:plant-based-foods-and-beverages, en:bevera...","Lait d'amandes (eau, amandes 2,8%), amidon de ...",Italie,"Magasins U,Intermarché,Auchan,carrefour.fr","[magasins-u, intermarche, auchan, carrefour-fr]",low,moderate,low,low,...,,,mg,,kJ,60.0,kJ,,25.5,
3,Organic Piadina 4 Wholeblend Flatbreads with E...,"[en:plant-based-foods-and-beverages, en:plant-...","Wholemeal Wheat Flour (35%)*, Wheat Flour&quot...",,sainsburys,[sainsburys],low,moderate,moderate,high,...,,,,,kJ,,kJ,,304.7,
4,Classic Torinesi Breadsticks,"[en:plant-based-foods-and-beverages, en:plant-...","Wheat Flour, Extra Virgin Olive Oil (7%), Malt...",,"Waitrose,Tesco","[waitrose, tesco]",low,moderate,low,high,...,,,,,kJ,,kJ,,440.1,


In [88]:
for idx, row in df_full.iterrows():
    print("------")
    print(row.product_name)
    print(row.categories_tags)
    print(row.ingredients_text)
    print(row.manufacturing_places)
    print(row.stores)
    print(row.stores_tags)
    print(row.nutrient_levels_fat)
    print("\n")

------
Pesto Genovese 190g Barilla
['en:condiments', 'en:sauces', 'en:pasta-sauces', 'en:pestos', 'en:green-pestos', 'en:groceries']
sunflower oil, fresh basil 30%, cashew nuts, parmigiano reggiano pdo cheese 5%, (milk), maize fibre, whey powder (milk), salt, milk protein, extra virgin olive oil, sugar, basil extract, natural flavourings (milk), acidity regulator: lactic acid, garlic
Italy
carrefour.fr,Coop Obs!,Denner AG,Carrefour,Super U
['carrefour-fr', 'coop-obs', 'denner-ag', 'carrefour', 'super-u']
high


------
Bio-Almond-Mandelmilch
['en:beverages-and-beverages-preparations', 'en:plant-based-foods-and-beverages', 'en:beverages', 'en:plant-based-foods', 'en:dairy-substitutes', 'en:milk-substitutes', 'en:nuts-and-their-products', 'en:plant-based-beverages', 'en:plant-based-milk-alternatives', 'en:nut-based-drinks', 'en:almond-based-drinks', 'en:fruhstucke', 'en:getranke', 'en:nusse-und-nussprodukte', 'en:pflanzliche-getranke', 'en:pflanzliche-lebensmittel-und-getranke']
Bio-Geträ

In [89]:
df_full.to_csv("openfoodfacts_data.csv", index=False)