# Exploring the data downloaded from USDA FoodData Central

See the download here: https://fdc.nal.usda.gov/download-datasets.html

This notework works with the JSON version of the data.





In [1]:
# Append the upper level directory to sys
import os
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from pathlib import Path

import torch

from utils.misc import sort_dict_by_values

# Get config
from configs.default_config import config

args = config

# Connect to GCP
from utils.gcp_utils import set_gcp_credentials, test_gcp_connection
set_gcp_credentials(path_to_key="../utils/google-storage-key.json")
test_gcp_connection()

import wandb

# Initialize a new run
from utils.wandb_utils import wandb_load_artifact, wandb_download_and_load_labels

notes = f"autolabel new images"

run = wandb.init(project=args.wandb_project, 
                 job_type=args.wandb_job_type,
                 tags=['auto_label_new_images'],
                 notes=notes)

annotations, class_names, class_dict, reverse_class_dict, labels_path = wandb_download_and_load_labels(wandb_run=run,
wandb_labels_artifact_name=args.wandb_labels_artifact)

len(annotations.label.unique())

[INFO] GCP credentials set!
[INFO] GCP connection successful! Access to GCP for saving/loading data and models available.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrdbourke[0m. Use [1m`wandb login --relogin`[0m to force relogin


[INFO] Labels directory: ./artifacts/food_vision_labels:v77
[INFO] Labels path: artifacts/food_vision_labels:v77/annotations.csv
[INFO] Working with: 331 classes


331

In [2]:
import gspread
import pandas as pd

# Open the sheet with Google Sheets
gc = gspread.oauth()

# See the link here: https://docs.google.com/spreadsheets/d/1fgNtqfuaBSbPBPQfyzYAmRb1UUHdsRphoAqXHfP-dD0/edit?usp=sharing 
sheet = gc.open("nutrify-name-to-fdc-id")
worksheet = sheet.sheet1
rows = worksheet.get_all_records()
nutrify_to_fdcid = pd.DataFrame(rows)

# Replace all blank rows in fdc_id with NaN
nutrify_to_fdcid['fdc_id'] = nutrify_to_fdcid['fdc_id'].replace('', None)

# Go through class_names and add classes that don't appear in the nutrify_to_fdcid dataframe to the Google Sheet
classes_added_to_sheet = []
for class_name in class_names:
    if class_name not in nutrify_to_fdcid['class_name'].values:
        print(f"Adding row for: {class_name}")
        classes_added_to_sheet.append(class_name)
        worksheet.append_row([0, class_name, None])
if len(classes_added_to_sheet) == 0:
    print(f"Looks like all the classes are already in the Google Sheet! Onwards!")
else:
    print(f"Added {len(classes_added_to_sheet)} classes to the Google Sheet! See:\n{classes_added_to_sheet}")

# Get the latest version of the sheet
# See the link here: https://docs.google.com/spreadsheets/d/1fgNtqfuaBSbPBPQfyzYAmRb1UUHdsRphoAqXHfP-dD0/edit?usp=sharing 
sheet = gc.open("nutrify-name-to-fdc-id")
worksheet = sheet.sheet1
rows = worksheet.get_all_records()
nutrify_to_fdcid = pd.DataFrame(rows)

# Replace all blank rows in fdc_id with NaN
nutrify_to_fdcid['fdc_id'] = nutrify_to_fdcid['fdc_id'].replace('', None)

# Delete duplicates
nutrify_to_fdcid = nutrify_to_fdcid.drop_duplicates(subset=['class_name'])
nutrify_to_fdcid.head()

Looks like all the classes are already in the Google Sheet! Onwards!


Unnamed: 0,number,class_name,fdc_id
0,0,achacha,
1,1,almond_butter,2262074.0
2,2,almonds,2346393.0
3,3,apple_custard,168175.0
4,4,apple_green,1750342.0


In [3]:
# How many rows have an FDC ID?
rows_with_fdcid = nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].notna()]
rows_without_fdcid = nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].isna()]

print(f"Rows with FDC ID: {len(rows_with_fdcid)}")
print(f"Rows without FDC ID: {len(rows_without_fdcid)}")
# nutrify_to_fdcid.isna()

Rows with FDC ID: 280
Rows without FDC ID: 51


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm.auto import tqdm

from pathlib import Path

## Get FDC Data

Download data from here: https://fdc.nal.usda.gov/download-datasets.html

In [5]:
targ_dir = "data/2022/JSON"

# Get all the files in the directory
data_files = sorted(list(Path(targ_dir).glob("*.json")))
data_files

# Combine them into one big JSON?
data = {}
for file in tqdm(data_files):
    with open(file) as f:
        data = {**data, **json.load(f)}

dataset_names = list(data.keys())
print(f"Dataset names: {dataset_names}")

print(f"Length of Foundation Foods: {len(data['FoundationFoods'])}")
print(f"Length of Legacy Foods: {len(data['SRLegacyFoods'])}")
print(f"Length of Survey Foods: {len(data['SurveyFoods'])}")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset names: ['FoundationFoods', 'SRLegacyFoods', 'SurveyFoods']
Length of Foundation Foods: 210
Length of Legacy Foods: 7793
Length of Survey Foods: 5624


In [6]:
# nutrify_to_fdcid_keys = [int(key) for key in fdcid_to_nutrify_name.keys() if key is not None]
nutrify_to_fdcid_keys = nutrify_to_fdcid["fdc_id"].dropna().tolist()
print(nutrify_to_fdcid_keys[:10])

# Create list of nutrify food items
nutrify_food_items = []
all_nutrients = []

# Get a list of all descriptions and map to FDC ID
fdcid_to_description_list = []

# Get a list of all FDC IDs (to make sure the Nutrify FDC IDs are correct FDC IDs)
fdcid_list = []

for dataset_name in dataset_names:
    for item in data[dataset_name]:
        fdcid = item["fdcId"]
        description = item["description"]
        fdcid_to_description_dict = {}
        fdcid_to_description_dict["fdcid"] = fdcid
        fdcid_to_description_dict["description"] = description
        fdcid_to_description_list.append(fdcid_to_description_dict)

        fdcid_list.append(fdcid)
        # fdcid_to_description_list.append((fdcid, description))

        if item["fdcId"] in nutrify_to_fdcid_keys:
            nutrify_food_items.append(item)
            for nutrient in item["foodNutrients"]:
                all_nutrients.append(nutrient["nutrient"]["name"])

# Make sure all nutrify_to_fdcid_keys are in the fdcid_list
for fdcid in nutrify_to_fdcid_keys:
    if fdcid not in fdcid_list:
        print(f"Nutrify FDC ID {fdcid} is not in the FDC ID list! Please update before continuing.")
        # Find the row it belongs to
        row = nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"] == fdcid]
        print(f"Key {fdcid} belongs to row: {row}")
        
# Sort and set the all_nutrients
all_nutrients = sorted(list(set(all_nutrients)))
print(f"Number of Nutrify foods that line up with FDC IDs: {len(nutrify_food_items)}")
print(f"Number of unique nutrients: {len(all_nutrients)}")

[2262074, 2346393, 168175, 1750342, 1750339, 171697, 169205, 168389, 171705, 749420]
Number of Nutrify foods that line up with FDC IDs: 279
Number of unique nutrients: 236


In [7]:
fdcid_to_description_df = pd.DataFrame(fdcid_to_description_list)

# Drop duplicates based on FDC ID
fdcid_to_description_df = fdcid_to_description_df.drop_duplicates(subset=["fdcid"])

# Drop duplicates based on description
fdcid_to_description_df = fdcid_to_description_df.drop_duplicates(subset=["description"])
len(fdcid_to_description_df)

13475

In [8]:
# TODO: sort all nutrients into different categories, e.g. minerals, vitamins, macronutrients, etc.
# See here for a list of essential nutrients: https://www.britannica.com/science/human-nutrition/Essential-nutrients 
all_nutrients

[' Ergosta-5,7-dienol',
 ' Ergosta-7,22-dienol',
 '10-Formyl folic acid (10HCOFA)',
 '25-hydroxycholecalciferol',
 '5-Formyltetrahydrofolic acid (5-HCOH4',
 '5-methyl tetrahydrofolate (5-MTHF)',
 'Alanine',
 'Alcohol, ethyl',
 'Arginine',
 'Ash',
 'Aspartic acid',
 'Beta-glucan',
 'Beta-sitostanol',
 'Beta-sitosterol',
 'Betaine',
 'Biotin',
 'Boron, B',
 'Brassicasterol',
 'Caffeine',
 'Calcium, Ca',
 'Campestanol',
 'Campesterol',
 'Carbohydrate, by difference',
 'Carbohydrate, by summation',
 'Carotene, alpha',
 'Carotene, beta',
 'Carotene, gamma',
 'Cholesterol',
 'Choline, free',
 'Choline, from glycerophosphocholine',
 'Choline, from phosphocholine',
 'Choline, from phosphotidyl choline',
 'Choline, from sphingomyelin',
 'Choline, total',
 'Citric acid',
 'Cobalt, Co',
 'Copper, Cu',
 'Cryptoxanthin, alpha',
 'Cryptoxanthin, beta',
 'Cysteine',
 'Cystine',
 'Daidzein',
 'Daidzin',
 'Delta-5-avenasterol',
 'Delta-7-Stigmastenol',
 'Energy',
 'Energy (Atwater General Factors)',
 '

In [9]:
import random
nutrify_food_items[random.randint(0, len(nutrify_food_items))].keys()

dict_keys(['foodClass', 'description', 'foodNutrients', 'foodAttributes', 'foodCode', 'startDate', 'endDate', 'wweiaFoodCategory', 'foodPortions', 'publicationDate', 'inputFoods', 'fdcId', 'dataType'])

In [10]:
target_key = "foodNutrients"
target_food = nutrify_food_items[random.randint(0, len(nutrify_food_items))]

print(f"Target key: {target_key}")
print(f"Target food: {target_food['description']}")
print(f"Target food fdc id: {target_food['fdcId']}")
print(f"Target food data type: {target_food['dataType']}")
print(f"Target food with key:\n{target_food[target_key][0].keys()}")

Target key: foodNutrients
Target food: Beans, Dry, Black (0% moisture)
Target food fdc id: 747444
Target food data type: Foundation
Target food with key:
dict_keys(['type', 'id', 'nutrient', 'dataPoints', 'foodNutrientDerivation', 'max', 'min', 'median', 'amount'])


Details about the keys:
- `fdcId`: Unique identifier for a food item
- `description`: Description of the food item
- `wweiaFoodCategory`: 
```
{'wweiaFoodCategoryCode': 2643393,
 'wweiaFoodCategoryDescription': 'Poultry mixed dishes'}
```
* `foodPortions`: The amount of a certain food you'd have in a portion, e.g. 1 cup of corn (the default is 100g but some foods also have information per serving)
* `dataType`: Where the data comes from, e.g. `Foundation` or `SR Legacy` or `Survey (FNDDS)`
* `foodNutrients`: The nutrients of the food (macronutrients, micronutrients, etc.)
* `foodClass`: Similar to the `dataType` but shows simpler category e.g. `Survey`

In [11]:
# Get a list of macronutrients
macronutrients = ["Protein", 
                  "Total lipid (fat)", 
                  "Carbohydrate, by difference",
                  "Alcohol, ethyl"]

macronutrients_to_key = {"Protein": "protein",
                          "Total lipid (fat)": "fat",
                          "Carbohydrate, by difference": "carbohydrate",
                          "Alcohol, ethyl": "alcohol"}

# Create a dictionary mapping macronutrients to their energy density (kcal/g)
energy_dict = {"protein": 4,
               "fat": 9,
               "carbohydrate": 4,
               "alcohol": 7}

## Create a list of micronutrients to get started (see: https://www.hsph.harvard.edu/nutritionsource/vitamins/)
# Create a list of minerals (from FDC)
minerals = [
    "Calcium, Ca",
    "Copper, Cu",
    "Fluoride, F",
    "Iodine, I", 
    "Iron, Fe",
    "Magnesium, Mg",
    "Manganese, Mn",
    "Molybdenum, Mo",
    "Nickel, Ni",
    "Phosphorus, P",
    "Potassium, K",
    "Selenium, Se",
    "Sodium, Na",
    "Zinc, Zn",
]

# Create a list of vitamins (from FDC)
vitamins = [
    "Vitamin A", # IU = international units = 0.3 mcg retinol activity equivalents (RAE)
    "Thiamin", # vitamin B1
    "Riboflavin", # vitamin B2
    "Niacin", # vitamin B3
    "Pantothenic acid", # vitamin B5
    "Vitamin B-6", # (pyridoxal, pyridoxine, pyridoxamine)
    "Biotin", # vitamin B7
    "Folate, total", # vitamin B9, folic acid
    "Vitamin B-12", # cobalamin
    "Vitamin C, total ascorbic acid", # ascorbic acid
    "Choline, total", # choline
    "Vitamin D (D2 + D3)", # calciferol
    "Vitamin E (alpha-tocopherol)", # alpha-tocopherol
    "Vitamin K (phylloquinone)", # phylloquinone
]

# Create a list of other
other_food_nutrients = [
    "Caffeine",
    "Citric acid"
]

# Oils (e.g. olive oil) have a different layout to other kinds of foods, as they are mostly fats rather than 
# carbohydrates and protein.
lipids = {"Fatty acids, total saturated": "SFA",
          "Fatty acids, total monounsaturated": "MUFA",
          "Fatty acids, total polyunsaturated": "PUFA"} 

lipids_to_key = {"Fatty acids, total saturated": "saturated_fat",
                 "Fatty acids, total monounsaturated": "monounsaturated_fat",
                 "Fatty acids, total polyunsaturated": "polyunsaturated_fat"}


# Get oil names from Foundation Foods (can make this more general later, e.g. semantic search across the whole database)
foundation_food_oil_types = ["Oil, canola", "Oil, coconut", "Oil, corn", "Oil, peanut", "Oil, safflower", "Oil, soybean", "Oil, sunflower", "Oil, olive, extra light", "Oil, olive, extra virgin"]
oil_items = [item for item in data["FoundationFoods"] if item["description"] in foundation_food_oil_types]
oil_names = [item["description"] for item in oil_items]
print(oil_names)

['Oil, coconut', 'Oil, canola', 'Oil, corn', 'Oil, soybean', 'Oil, olive, extra virgin', 'Oil, peanut', 'Oil, sunflower', 'Oil, safflower', 'Oil, olive, extra light']


In [12]:
import random
target_item = nutrify_food_items[random.randint(0, len(nutrify_food_items))]
print(target_item["fdcId"])
print(target_item["description"])

for item in target_item["foodNutrients"]:
    if item["nutrient"]["name"] in macronutrients:
        print(item["nutrient"]["name"])

target_item

2342931
Soybean soup, miso broth
Protein
Total lipid (fat)
Carbohydrate, by difference
Alcohol, ethyl


{'foodClass': 'Survey',
 'description': 'Soybean soup, miso broth',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 28681639,
   'nutrient': {'id': 1003,
    'number': '203',
    'name': 'Protein',
    'rank': 600,
    'unitName': 'g'},
   'amount': 2.5},
  {'type': 'FoodNutrient',
   'id': 28681640,
   'nutrient': {'id': 1004,
    'number': '204',
    'name': 'Total lipid (fat)',
    'rank': 800,
    'unitName': 'g'},
   'amount': 1.56},
  {'type': 'FoodNutrient',
   'id': 28681641,
   'nutrient': {'id': 1005,
    'number': '205',
    'name': 'Carbohydrate, by difference',
    'rank': 1110,
    'unitName': 'g'},
   'amount': 2.27},
  {'type': 'FoodNutrient',
   'id': 28681642,
   'nutrient': {'id': 1008,
    'number': '208',
    'name': 'Energy',
    'rank': 300,
    'unitName': 'kcal'},
   'amount': 32.0},
  {'type': 'FoodNutrient',
   'id': 28681643,
   'nutrient': {'id': 1018,
    'number': '221',
    'name': 'Alcohol, ethyl',
    'rank': 18200,
    'unitName': 'g'},
   'amoun

In [13]:
import json

json.dump(target_item, open("target_item.json", "w"), indent=4)

In [14]:
for i, item in enumerate(nutrify_food_items):
    if item["fdcId"] == 748608: # Olive oil, see: https://fdc.nal.usda.gov/fdc-app.html#/food-details/748608/nutrients
        print(i)
        print(item)

25
{'foodClass': 'FinalFood', 'description': 'Oil, olive, extra virgin', 'foodNutrients': [{'type': 'FoodNutrient', 'id': 8529145, 'nutrient': {'id': 1264, 'number': '612', 'name': 'SFA 14:0', 'rank': 10500, 'unitName': 'g'}, 'dataPoints': 36, 'foodNutrientDerivation': {'code': 'A', 'description': 'Analytical', 'foodNutrientSource': {'id': 1, 'code': '1', 'description': 'Analytical or derived from analytical'}}, 'max': 0.021, 'min': 0.009, 'median': 0.012, 'amount': 0.013}, {'type': 'FoodNutrient', 'id': 8529146, 'nutrient': {'id': 1265, 'number': '613', 'name': 'SFA 16:0', 'rank': 10700, 'unitName': 'g'}, 'dataPoints': 36, 'foodNutrientDerivation': {'code': 'A', 'description': 'Analytical', 'foodNutrientSource': {'id': 1, 'code': '1', 'description': 'Analytical or derived from analytical'}}, 'max': 15.8, 'min': 9.48, 'median': 12, 'amount': 12.1}, {'type': 'FoodNutrient', 'id': 8529147, 'nutrient': {'id': 1266, 'number': '614', 'name': 'SFA 18:0', 'rank': 10900, 'unitName': 'g'}, 'dat

In [15]:
# Get info for Olive oil
nutrify_food_items[25]

# Get info for Yoghurt
# nutrify_food_items[47]

{'foodClass': 'FinalFood',
 'description': 'Oil, olive, extra virgin',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 8529145,
   'nutrient': {'id': 1264,
    'number': '612',
    'name': 'SFA 14:0',
    'rank': 10500,
    'unitName': 'g'},
   'dataPoints': 36,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 0.021,
   'min': 0.009,
   'median': 0.012,
   'amount': 0.013},
  {'type': 'FoodNutrient',
   'id': 8529146,
   'nutrient': {'id': 1265,
    'number': '613',
    'name': 'SFA 16:0',
    'rank': 10700,
    'unitName': 'g'},
   'dataPoints': 36,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 15.8,
   'min': 9.48,
   'median': 12,
   'amount': 12.1},
  {'type': 'FoodNutrie

In [16]:
source_url_base = "https://fdc.nal.usda.gov/fdc-app.html#/food-details/{}/nutrients"

nutrify_food_items_nutrients = []
for item in tqdm(nutrify_food_items):
    calories_string = "total_energy_calories"

    item_dict = {}
    item_dict["fdcId"] = item["fdcId"]
    # print(f"On item: {item['fdcId']}")
    item_dict["description"] = item["description"]
    item_dict["default_size"] = 100
    item_dict["unit"] = "g"
    item_dict[calories_string] = 0

    # Get the macronutrients + water + micronutrients
    item_macronutrients = {}

    # Create empty lists for various minerals and vitamins
    mineral_list = []
    vitamin_list = []
    other_nutrient_list = []

    # Setup micronutrients dictionary
    item_dict["micronutrients"] = {"vitamins": "",
                                   "minerals": ""}

    # Setup other nutrients dictionary
    item_other_nutrients = {} 
    
    for nutrient in item["foodNutrients"]:

        nutrient_name = nutrient["nutrient"]["name"]

        # Get the macronutrients
        if nutrient_name in macronutrients:
            macronutrient_name = macronutrients_to_key[nutrient_name]
            item_macronutrients[macronutrient_name] = nutrient["amount"]
            item_dict["macronutrients"] = item_macronutrients
        
        # Get the water
        if nutrient_name == "Water":
            item_dict["Water"] = nutrient["amount"]

        # Get the vitamin micronutrients
        if nutrient_name in vitamins:
            item_vitamin_micronutrients = {}
            vitamin_name = nutrient["nutrient"]["name"]
            # print(f"On vitamin name: {vitamin_name}")
            try:
                vitamin_amount = nutrient["amount"]
                vitamin_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for vitamin: {vitamin_name}")
                if nutrient_name == "Vitamin A": # replace for Retinol
                    vitamin_amount = [nutrient["amount"] for nutrient in item["foodNutrients"] if nutrient["nutrient"]["name"] == "Retinol"][0]
                    vitamin_unit = [nutrient["nutrient"]["unitName"] for nutrient in item["foodNutrients"] if nutrient["nutrient"]["name"] == "Retinol"][0]
                    
            item_vitamin_micronutrients["name"] = vitamin_name
            item_vitamin_micronutrients["amount"] = vitamin_amount
            item_vitamin_micronutrients["unit"] = vitamin_unit

            vitamin_list.append(item_vitamin_micronutrients)
            # print(len(vitamin_list))
        
        # Get the mineral micronutrients
        if nutrient_name in minerals:
            item_mineral_micronutrients = {}
            mineral_name = nutrient["nutrient"]["name"]
            # print(f"On mineral name: {mineral_name}")
            try:
                mineral_amount = nutrient["amount"]
                mineral_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for mineral: {mineral_name}")
            
            item_mineral_micronutrients["name"] = mineral_name
            item_mineral_micronutrients["amount"] = mineral_amount
            item_mineral_micronutrients["unit"] = mineral_unit

            mineral_list.append(item_mineral_micronutrients)
            # print(len(mineral_list))
        
        # Get the other nutrients
        if nutrient_name in other_food_nutrients:
            item_other_nutrients = {}
            other_nutrient_name = nutrient["nutrient"]["name"]
            # print(f"On other nutrient name: {other_nutrient_name}")
            try:
                other_nutrient_amount = nutrient["amount"]
                other_nutrient_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for other nutrient: {other_nutrient_name}")

            item_other_nutrients["name"] = other_nutrient_name
            item_other_nutrients["amount"] = other_nutrient_amount
            item_other_nutrients["unit"] = other_nutrient_unit

            other_nutrient_list.append(item_other_nutrients)
            # print(len(other_nutrient_list))
            

    item_dict["micronutrients"]["vitamins"] = vitamin_list
    item_dict["micronutrients"]["minerals"] = mineral_list
    item_dict["other_nutrients"] = other_nutrient_list
    
    # Handle oils
    if item["description"] in oil_names:
        for nutrient in item["foodNutrients"]:
            if nutrient["nutrient"]["name"] in lipids:
                target_lipid_name = lipids_to_key[nutrient["nutrient"]["name"]]
                item_macronutrients[target_lipid_name] = nutrient["amount"]
                item_dict["lipids"] = item_macronutrients
        lipid_items = item_dict["lipids"].keys()
        for lipid in lipid_items:
            item_dict[calories_string] += round(item_dict["lipids"][lipid] * energy_dict["fat"]) # all lipids are fat

    # Calculate the total energy (this will fail on items like Table Salt + Olive Oil, their values in the FDC are different to others, see: 746775 and https://fdc.nal.usda.gov/fdc-app.html#/food-details/748608/nutrients)
    for nutrient in macronutrients:
        try:
            target_macronutrient = macronutrients_to_key[nutrient]
            if target_macronutrient in item_dict["macronutrients"]:
                item_dict[calories_string] += round(item_dict["macronutrients"][target_macronutrient] * energy_dict[target_macronutrient])
        except Exception as e:
            print(e)
            print(item_dict)
    
    # Add the source url
    item_dict["fdc_source_url"] = source_url_base.format(item["fdcId"])
    
    nutrify_food_items_nutrients.append(item_dict)

  0%|          | 0/279 [00:00<?, ?it/s]

'macronutrients'
{'fdcId': 746775, 'description': 'Salt, table, iodized', 'default_size': 100, 'unit': 'g', 'total_energy_calories': 0, 'micronutrients': {'vitamins': [], 'minerals': [{'name': 'Iron, Fe', 'amount': 0, 'unit': 'mg'}, {'name': 'Magnesium, Mg', 'amount': 0, 'unit': 'mg'}, {'name': 'Phosphorus, P', 'amount': 0, 'unit': 'mg'}, {'name': 'Sodium, Na', 'amount': 38700, 'unit': 'mg'}, {'name': 'Copper, Cu', 'amount': 0, 'unit': 'mg'}, {'name': 'Manganese, Mn', 'amount': 0.032, 'unit': 'mg'}, {'name': 'Calcium, Ca', 'amount': 50, 'unit': 'mg'}, {'name': 'Potassium, K', 'amount': 2, 'unit': 'mg'}, {'name': 'Zinc, Zn', 'amount': 0, 'unit': 'mg'}, {'name': 'Iodine, I', 'amount': 5080, 'unit': 'µg'}]}, 'Water': 0.42, 'other_nutrients': []}
'macronutrients'
{'fdcId': 746775, 'description': 'Salt, table, iodized', 'default_size': 100, 'unit': 'g', 'total_energy_calories': 0, 'micronutrients': {'vitamins': [], 'minerals': [{'name': 'Iron, Fe', 'amount': 0, 'unit': 'mg'}, {'name': 'Magn

In [22]:
nutrify_food_items_nutrients[random.randint(0, len(nutrify_food_items_nutrients))]

{'fdcId': 2342418,
 'description': 'Cuban sandwich',
 'default_size': 100,
 'unit': 'g',
 'total_energy_calories': 225,
 'micronutrients': {'vitamins': [{'name': 'Vitamin E (alpha-tocopherol)',
    'amount': 0.3,
    'unit': 'mg'},
   {'name': 'Vitamin D (D2 + D3)', 'amount': 0.4, 'unit': 'µg'},
   {'name': 'Vitamin C, total ascorbic acid', 'amount': 0.6, 'unit': 'mg'},
   {'name': 'Thiamin', 'amount': 0.365, 'unit': 'mg'},
   {'name': 'Riboflavin', 'amount': 0.279, 'unit': 'mg'},
   {'name': 'Niacin', 'amount': 3.29, 'unit': 'mg'},
   {'name': 'Vitamin B-6', 'amount': 0.21, 'unit': 'mg'},
   {'name': 'Folate, total', 'amount': 32.0, 'unit': 'µg'},
   {'name': 'Vitamin B-12', 'amount': 0.68, 'unit': 'µg'},
   {'name': 'Choline, total', 'amount': 45.5, 'unit': 'mg'},
   {'name': 'Vitamin K (phylloquinone)', 'amount': 3.9, 'unit': 'µg'}],
  'minerals': [{'name': 'Calcium, Ca', 'amount': 159, 'unit': 'mg'},
   {'name': 'Iron, Fe', 'amount': 1.75, 'unit': 'mg'},
   {'name': 'Magnesium, Mg'

In [18]:
len(nutrify_food_items_nutrients)

279

In [19]:
# Save to JSON
with open("nutrify_foodvision_items_nutrients.json", "w") as f:
    json.dump(nutrify_food_items_nutrients, f)

In [20]:
# Upload to Google Storage
!gsutil cp nutrify_foodvision_items_nutrients.json gs://food_vision_bucket_with_object_versioning/nutrition_information/nutrify_foodvision_items_nutrients.json

Copying file://nutrify_foodvision_items_nutrients.json [Content-Type=application/json]...
- [1 files][407.7 KiB/407.7 KiB]                                                
Operation completed over 1 objects/407.7 KiB.                                    


In [None]:
# TODO: 
# back the nutrition information up to Google Storage ✅
    # could also track this as an Artifact in Weights & Biases? 
# Merge the Nutrition information with the Nutrify Food Item names ✅
# For foods without an FDC ID - could match the closest food item based on the name and then use the nutrition information from that food item
    # Could do the same with every other food, e.g. encode the text and show links to the closest food items (e.g. apple_red -> red delicious apple + more, apple_green -> granny smith apple + more, etc.)
    # Could just do something with ChatGPT in terms of similar foods, just generate similar foods and show the closest ones (e.g. what types of red apple are there, then provide the closest ones)
    # This would be the same with text-based foods as well, just include the most similar foods based on the text lookup (e.g. canola oil -> canola oil + more, olive oil -> olive oil + more, etc.)

## Find a match for foods with no exact match

In [227]:
# TODO: 
# UPTOHERE
# Find classes which don't have nutrients
# Display options for the *most* similar items
    # E.g. if someone types in "Cheese, blue" and there is no exact match, display the closest matches
    # Could also just do this for the existing foods (find the most similar foods via name)

In [39]:
# Which rows don't have FDC IDs?
nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].isna()]

Unnamed: 0,number,class_name,fdc_id
0,0,achacha,
11,11,bacon_and_egg_burger,
15,15,banana_bread,
22,22,beef_diced,
23,23,beef_kebab,
30,30,biltong,
31,31,black_pepper,
48,48,cape_gooseberries,
54,54,carrot_purple,
68,68,chicken_stir_fry,


In [40]:
# Create a function to similarity match the class names (e.g. code which string is most like another string)
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import dot_score
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_list_of_classes(class_names: list, model: SentenceTransformer):
    """
    Embeds a list of class names.
    """

    # Map the class_name to the embedding
    class_name_to_embedding = {class_name: embedding for class_name, embedding in zip(class_names, model.encode(class_names))}

    return class_name_to_embedding

# Create a function to similarity match the class names (e.g. code which string is most like another string)
def find_most_similar_class_name(target_class_name, class_name_embedding_dict, num_similar=3):
    """
    Finds the most similar class name to the class_name provided.
    """
    # Get the embedding of the target_class_name
    target_class_name_embedding = model.encode([target_class_name])[0]

    # Create a dictionary of the class_name and the dot score between the target_class_name and the class_name
    class_name_embedding_dict = {class_name: dot_score(embedding, target_class_name_embedding) for class_name, embedding in class_name_embedding_dict.items()}

    # Sort the dictionary by the dot score
    most_similar_class_names = {k: v[0].item() for k, v in sorted(class_name_embedding_dict.items(), key=lambda item: item[1], reverse=True)}

    # Only return the top num_similar
    # most_similar_class_names = list(most_similar_class_names.items())[:num_similar]

    return most_similar_class_names

In [41]:
# Get a list of the unique descriptions
unique_descriptions = fdcid_to_description_df["description"].unique().tolist()
len(unique_descriptions)

13475

In [42]:
# Create an embedding of all the unique descriptions
class_name_embedding_dict_unique_descriptions = embed_list_of_classes(unique_descriptions, model)

In [43]:
food_item_without_fdcid = rows_without_fdcid.class_name.to_list()
food_item_without_fdcid[:5]

['achacha', 'bacon_and_egg_burger', 'banana_bread', 'beef_diced', 'beef_kebab']

In [48]:
# target_food_item = random.choice(food_item_without_fdcid)
target_food_item = "red apple"
print(f"Showing the most similar class names to: {target_food_item}")
most_similar_class_names = find_most_similar_class_name(target_food_item, class_name_embedding_dict_unique_descriptions, num_similar=10)
most_similar_class_names

Showing the most similar class names to: red apple


{'Apples, red delicious, with skin, raw': 0.7135860919952393,
 'Rose-apples, raw': 0.6315896511077881,
 'Apple, dried': 0.6260167360305786,
 'Crisp, apple': 0.6217784881591797,
 'Apple, raw': 0.6178090572357178,
 'Applesauce, regular': 0.6107631921768188,
 'Apple, candied': 0.5980645418167114,
 'Apple, baked': 0.5918879508972168,
 'Applesauce, flavored': 0.5883489847183228,
 'Pie, apple': 0.5762279629707336,
 'Pears, raw, red anjou': 0.5739957690238953,
 "Apples, raw, red delicious, with skin (Includes foods for USDA's Food Distribution Program)": 0.5658494830131531,
 'Apple cider': 0.5499900579452515,
 'Applesauce, unsweetened': 0.5496532917022705,
 'Apple juice, 100%': 0.5481484532356262,
 'Babyfood, juice, apple - cherry': 0.5468789935112,
 'Apples, raw, golden delicious, with skin': 0.5462113618850708,
 'Apples, raw, without skin': 0.5439801812171936,
 'Baby Toddler juice, apple': 0.5428827404975891,
 'Babyfood, juice, apple': 0.5408956408500671,
 'Babyfood, juice, orange and apple

In [117]:
import openai

openai.api_key_path = "../foodvision/utils/openai_api_key.txt"

def chat_complete(prompt: str, **openai_kwargs) -> str:
    """Generate completion from OpenAI ChatGPT API"""
    default_kwargs = {"model": "gpt-3.5-turbo", "max_tokens": 256, "temperature": 0.2, "top_p": 1}
    openai_kwargs = {**default_kwargs, **openai_kwargs}
    response = openai.ChatCompletion.create(
        **openai_kwargs,
        messages=[
                {"role": "system", "content": "You are a helpful assistant who is very good at designing apps and making them look good."},
                {"role": "user", "content": prompt},
            ]
        )
    return response

base_prompt_for_sorting_food_names = """You are a food scientist and nutritionist very knowledgable on all different kinds of foods.\n
Given the following list of foods and the target text, please sort the list of foods into which ones relate most to the target text.\n
For example, the list ['apples, red delicious, with skin, raw',
 'apples, gala, with skin, raw',
 'apples, honeycrisp, with skin, raw',
 'apples, granny smith, with skin, raw',
 'applesauce, unsweetened, with added vitamin c',
 ...]
and the target text 'apple_red' would be sorted as a JSON dictionary,
 '1': 'apples, red delicious, with skin, raw', 
 '2': 'apples, gala, with skin, raw', 
 '3': 'apples, honeycrisp, with skin, raw', 
 '4': 'apples, granny smith, with skin, raw', 
 '5': 'applesauce, unsweetened, with added vitamin c', 
 ....\n
Please do not include any extra foods or text, only return the list of foods in the target food list.\n
Put a favour on similar types of foods, e.g. if the target text contains "capsicum_green" then "pepper, red" should be higher than "cabbage, green".\n
If the target list of foods to sort has N items, you should return a list of N items.\n
For example, if the target list of foods to sort has 10 items, you should return a list of 10 items.\n
Target text: {target_text}\n
List of foods to sort: {target_food_list}\n
Sorted list of food as properly formatted JSON dictionary:\n
"""

def sort_list_of_foods(base_prompt: str, target_text: str, target_food_list: list) -> list:
    # print(base_prompt)
    target_text_dict = {"target_text": target_text, "target_food_list": target_food_list}
    target_text_prompt = base_prompt.format(**target_text_dict)
    # target_text_prompt = base_prompt.format(target_text, target_food_list)
    # print(target_text_prompt)
    answer = chat_complete(target_text_prompt)
    answer_formatted = str(answer.choices[0].message.content)
    # Remove "\n"
    # print(f"Display name generated: {target_text} -> {answer_formatted}")
    return answer_formatted

# target_text = "capsicum_red"
target_text = random.choice(food_item_without_fdcid)
print(f"Predicting the most similar class names to: {target_text}")

most_similar_class_names = find_most_similar_class_name(target_text, class_name_embedding_dict_unique_descriptions)

# Get the top 10 most similar class names
target_food_list = list(most_similar_class_names.keys())[:10]
print(len(target_food_list))

# Turn target_food_list into a dictionary with keys in order of 1, 2, 3...
target_food_list = {i+1: food_name for i, food_name in enumerate(target_food_list)}
print(target_food_list)
# print(target_food_list)

sorted_list_of_foods = sort_list_of_foods(base_prompt=base_prompt_for_sorting_food_names, target_text=target_text, target_food_list=target_food_list)
sorted_list_of_foods


Predicting the most similar class names to: melon_spanish
10
{1: 'Melon, banana (Navajo)', 2: 'Honeydew melon, raw', 3: 'Melons, honeydew, raw', 4: 'Winter melon, cooked', 5: 'Melons, casaba, raw', 6: 'Bitter melon, cooked', 7: 'Melons, cantaloupe, raw', 8: 'Horned melon (Kiwano)', 9: 'Waxgourd, (chinese preserving melon), raw', 10: 'Waxgourd, (chinese preserving melon), cooked, boiled, drained, with salt'}


'{\n    "1": "Melons, honeydew, raw",\n    "2": "Honeydew melon, raw",\n    "3": "Melons, cantaloupe, raw",\n    "4": "Melons, casaba, raw",\n    "5": "Horned melon (Kiwano)",\n    "6": "Waxgourd, (chinese preserving melon), raw",\n    "7": "Waxgourd, (chinese preserving melon), cooked, boiled, drained, with salt",\n    "8": "Winter melon, cooked",\n    "9": "Bitter melon, cooked",\n    "10": "Melon, banana (Navajo)"\n}'

In [193]:
import json
sorted_list_of_foods = json.loads(sorted_list_of_foods)
sorted_list_of_foods

{'1': 'peppers, bell, red, raw',
 '2': 'cabbage, red, raw',
 '3': 'cherries, sweet, dark red, raw',
 '4': 'lettuce, leaf, red, raw',
 '5': 'onions, red, raw',
 '6': 'peppers, bell, orange, raw',
 '7': 'peppers, bell, yellow, raw',
 '8': 'peppers, bell, green, raw',
 '9': 'mustard, prepared, yellow',
 '10': 'beans, dry, medium red (0% moisture)'}