# Exploring the data downloaded from USDA FoodData Central

See the download here: https://fdc.nal.usda.gov/download-datasets.html

This notework works with the JSON version of the data.





In [305]:
import gspread
import pandas as pd

# Open the sheet with Google Sheets
gc = gspread.oauth()

# See the link here: https://docs.google.com/spreadsheets/d/1fgNtqfuaBSbPBPQfyzYAmRb1UUHdsRphoAqXHfP-dD0/edit?usp=sharing 
sheet = gc.open("nutrify-name-to-fdc-id")
worksheet = sheet.sheet1
rows = worksheet.get_all_records()

nutrify_to_fdcid = pd.DataFrame(rows)

# Replace all blank rows in fdc_id with NaN
nutrify_to_fdcid['fdc_id'] = nutrify_to_fdcid['fdc_id'].replace('', None)
nutrify_to_fdcid.head()

Unnamed: 0,number,class_name,fdc_id
0,0,achacha,
1,1,almond_butter,2262074.0
2,2,almonds,2346393.0
3,3,apple_custard,168175.0
4,4,apple_green,1750342.0


In [306]:
# How many rows have an FDC ID?
rows_with_fdcid = nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].notna()]
rows_without_fdcid = nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].isna()]

print(f"Rows with FDC ID: {len(rows_with_fdcid)}")
print(f"Rows without FDC ID: {len(rows_without_fdcid)}")
# nutrify_to_fdcid.isna()

Rows with FDC ID: 275
Rows without FDC ID: 48


In [307]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm.auto import tqdm

from pathlib import Path

## Get FDC Data

Download data from here: https://fdc.nal.usda.gov/download-datasets.html

In [308]:
targ_dir = "data/2022/JSON"

# Get all the files in the directory
data_files = sorted(list(Path(targ_dir).glob("*.json")))
data_files

# Combine them into one big JSON?
data = {}
for file in tqdm(data_files):
    with open(file) as f:
        data = {**data, **json.load(f)}

dataset_names = list(data.keys())
print(f"Dataset names: {dataset_names}")

print(f"Length of Foundation Foods: {len(data['FoundationFoods'])}")
print(f"Length of Legacy Foods: {len(data['SRLegacyFoods'])}")
print(f"Length of Survey Foods: {len(data['SurveyFoods'])}")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset names: ['FoundationFoods', 'SRLegacyFoods', 'SurveyFoods']
Length of Foundation Foods: 210
Length of Legacy Foods: 7793
Length of Survey Foods: 5624


In [309]:
nutrify_to_fdcid["fdc_id"]

0         None
1      2262074
2      2346393
3       168175
4      1750342
        ...   
318    2346264
319     168572
320    2259793
321     169291
322     171714
Name: fdc_id, Length: 323, dtype: object

In [310]:
# nutrify_to_fdcid_keys = [int(key) for key in fdcid_to_nutrify_name.keys() if key is not None]
nutrify_to_fdcid_keys = nutrify_to_fdcid["fdc_id"].dropna().tolist()
print(nutrify_to_fdcid_keys[:10])

# Create list of nutrify food items
nutrify_food_items = []
all_nutrients = []

# Get a list of all descriptions and map to FDC ID
fdcid_to_description_list = []

for dataset_name in dataset_names:
    for item in data[dataset_name]:
        fdcid = item["fdcId"]
        description = item["description"]
        fdcid_to_description_dict = {}
        fdcid_to_description_dict["fdcid"] = fdcid
        fdcid_to_description_dict["description"] = description
        fdcid_to_description_list.append(fdcid_to_description_dict)
        # fdcid_to_description_list.append((fdcid, description))

        if item["fdcId"] in nutrify_to_fdcid_keys:
            nutrify_food_items.append(item)
            for nutrient in item["foodNutrients"]:
                all_nutrients.append(nutrient["nutrient"]["name"])

# Sort and set the all_nutrients
all_nutrients = sorted(list(set(all_nutrients)))
print(f"Number of Nutrify foods that line up with FDC IDs: {len(nutrify_food_items)}")
print(f"Number of unique nutrients: {len(all_nutrients)}")

[2262074, 2346393, 168175, 1750342, 1750339, 171697, 169205, 168389, 171705, 749420]
Number of Nutrify foods that line up with FDC IDs: 274
Number of unique nutrients: 236


In [311]:
len(fdcid_to_description_list)

13627

In [312]:
fdcid_to_description_df = pd.DataFrame(fdcid_to_description_list)

# Drop duplicates based on FDC ID
fdcid_to_description_df = fdcid_to_description_df.drop_duplicates(subset=["fdcid"])

# Drop duplicates based on description
fdcid_to_description_df = fdcid_to_description_df.drop_duplicates(subset=["description"])
len(fdcid_to_description_df)

13475

In [313]:
# TODO: sort all nutrients into different categories, e.g. minerals, vitamins, macronutrients, etc.
# See here for a list of essential nutrients: https://www.britannica.com/science/human-nutrition/Essential-nutrients 
all_nutrients

[' Ergosta-5,7-dienol',
 ' Ergosta-7,22-dienol',
 '10-Formyl folic acid (10HCOFA)',
 '25-hydroxycholecalciferol',
 '5-Formyltetrahydrofolic acid (5-HCOH4',
 '5-methyl tetrahydrofolate (5-MTHF)',
 'Alanine',
 'Alcohol, ethyl',
 'Arginine',
 'Ash',
 'Aspartic acid',
 'Beta-glucan',
 'Beta-sitostanol',
 'Beta-sitosterol',
 'Betaine',
 'Biotin',
 'Boron, B',
 'Brassicasterol',
 'Caffeine',
 'Calcium, Ca',
 'Campestanol',
 'Campesterol',
 'Carbohydrate, by difference',
 'Carbohydrate, by summation',
 'Carotene, alpha',
 'Carotene, beta',
 'Carotene, gamma',
 'Cholesterol',
 'Choline, free',
 'Choline, from glycerophosphocholine',
 'Choline, from phosphocholine',
 'Choline, from phosphotidyl choline',
 'Choline, from sphingomyelin',
 'Choline, total',
 'Citric acid',
 'Cobalt, Co',
 'Copper, Cu',
 'Cryptoxanthin, alpha',
 'Cryptoxanthin, beta',
 'Cysteine',
 'Cystine',
 'Daidzein',
 'Daidzin',
 'Delta-5-avenasterol',
 'Delta-7-Stigmastenol',
 'Energy',
 'Energy (Atwater General Factors)',
 '

In [314]:
import random
nutrify_food_items[random.randint(0, len(nutrify_food_items))].keys()

dict_keys(['foodClass', 'description', 'foodNutrients', 'scientificName', 'foodAttributes', 'nutrientConversionFactors', 'isHistoricalReference', 'ndbNumber', 'foodCategory', 'fdcId', 'dataType', 'inputFoods', 'publicationDate', 'foodPortions'])

In [315]:
target_key = "foodNutrients"
target_food = nutrify_food_items[random.randint(0, len(nutrify_food_items))]

print(f"Target key: {target_key}")
print(f"Target food: {target_food['description']}")
print(f"Target food fdc id: {target_food['fdcId']}")
print(f"Target food data type: {target_food['dataType']}")
print(f"Target food with key:\n{target_food[target_key][0].keys()}")

Target key: foodNutrients
Target food: Apples, red delicious, with skin, raw
Target food fdc id: 1750339
Target food data type: Foundation
Target food with key:
dict_keys(['type', 'id', 'nutrient', 'dataPoints', 'foodNutrientDerivation', 'max', 'min', 'median', 'amount'])


Details about the keys:
- `fdcId`: Unique identifier for a food item
- `description`: Description of the food item
- `wweiaFoodCategory`: 
```
{'wweiaFoodCategoryCode': 2643393,
 'wweiaFoodCategoryDescription': 'Poultry mixed dishes'}
```
* `foodPortions`: The amount of a certain food you'd have in a portion, e.g. 1 cup of corn (the default is 100g but some foods also have information per serving)
* `dataType`: Where the data comes from, e.g. `Foundation` or `SR Legacy` or `Survey (FNDDS)`
* `foodNutrients`: The nutrients of the food (macronutrients, micronutrients, etc.)
* `foodClass`: Similar to the `dataType` but shows simpler category e.g. `Survey`

In [352]:
# Get a list of macronutrients
macronutrients = ["Protein", 
                  "Total lipid (fat)", 
                  "Carbohydrate, by difference",
                  "Alcohol, ethyl"]

macronutrients_to_key = {"Protein": "protein",
                          "Total lipid (fat)": "fat",
                          "Carbohydrate, by difference": "carbohydrate",
                          "Alcohol, ethyl": "alcohol"}

# Create a dictionary mapping macronutrients to their energy density (kcal/g)
energy_dict = {"protein": 4,
               "fat": 9,
               "carbohydrate": 4,
               "alcohol": 7}

## Create a list of micronutrients to get started (see: https://www.hsph.harvard.edu/nutritionsource/vitamins/)
# Create a list of minerals (from FDC)
minerals = [
    "Calcium, Ca",
    "Copper, Cu",
    "Fluoride, F",
    "Iodine, I", 
    "Iron, Fe",
    "Magnesium, Mg",
    "Manganese, Mn",
    "Molybdenum, Mo",
    "Nickel, Ni",
    "Phosphorus, P",
    "Potassium, K",
    "Selenium, Se",
    "Sodium, Na",
    "Zinc, Zn",
]

# Create a list of vitamins (from FDC)
vitamins = [
    "Vitamin A", # IU = international units = 0.3 mcg retinol activity equivalents (RAE)
    "Thiamin", # vitamin B1
    "Riboflavin", # vitamin B2
    "Niacin", # vitamin B3
    "Pantothenic acid", # vitamin B5
    "Vitamin B-6", # (pyridoxal, pyridoxine, pyridoxamine)
    "Biotin", # vitamin B7
    "Folate, total", # vitamin B9, folic acid
    "Vitamin B-12", # cobalamin
    "Vitamin C, total ascorbic acid", # ascorbic acid
    "Choline, total", # choline
    "Vitamin D (D2 + D3)", # calciferol
    "Vitamin E (alpha-tocopherol)", # alpha-tocopherol
    "Vitamin K (phylloquinone)", # phylloquinone
]

# Create a list of other
other_food_nutrients = [
    "Caffeine",
    "Citric acid"
]

# Oils (e.g. olive oil) have a different layout to other kinds of foods, as they are mostly fats rather than 
# carbohydrates and protein.
lipids = {"Fatty acids, total saturated": "SFA",
          "Fatty acids, total monounsaturated": "MUFA",
          "Fatty acids, total polyunsaturated": "PUFA"} 

lipids_to_key = {"Fatty acids, total saturated": "saturated_fat",
                 "Fatty acids, total monounsaturated": "monounsaturated_fat",
                 "Fatty acids, total polyunsaturated": "polyunsaturated_fat"}


# Get oil names from Foundation Foods (can make this more general later, e.g. semantic search across the whole database)
foundation_food_oil_types = ["Oil, canola", "Oil, coconut", "Oil, corn", "Oil, peanut", "Oil, safflower", "Oil, soybean", "Oil, sunflower", "Oil, olive, extra light", "Oil, olive, extra virgin"]
oil_items = [item for item in data["FoundationFoods"] if item["description"] in foundation_food_oil_types]
oil_names = [item["description"] for item in oil_items]
print(oil_names)

['Oil, coconut', 'Oil, canola', 'Oil, corn', 'Oil, soybean', 'Oil, olive, extra virgin', 'Oil, peanut', 'Oil, sunflower', 'Oil, safflower', 'Oil, olive, extra light']


In [353]:
len(oil_names)

9

In [354]:
import random
target_item = nutrify_food_items[random.randint(0, len(nutrify_food_items))]
print(target_item["fdcId"])
print(target_item["description"])

for item in target_item["foodNutrients"]:
    if item["nutrient"]["name"] in macronutrients:
        print(item["nutrient"]["name"])
    # print(item["nutrient"]["name"])

target_item

2344649
Soup, mostly noodles
Protein
Total lipid (fat)
Carbohydrate, by difference
Alcohol, ethyl


{'foodClass': 'Survey',
 'description': 'Soup, mostly noodles',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 28793309,
   'nutrient': {'id': 1003,
    'number': '203',
    'name': 'Protein',
    'rank': 600,
    'unitName': 'g'},
   'amount': 1.55},
  {'type': 'FoodNutrient',
   'id': 28793310,
   'nutrient': {'id': 1004,
    'number': '204',
    'name': 'Total lipid (fat)',
    'rank': 800,
    'unitName': 'g'},
   'amount': 2.68},
  {'type': 'FoodNutrient',
   'id': 28793311,
   'nutrient': {'id': 1005,
    'number': '205',
    'name': 'Carbohydrate, by difference',
    'rank': 1110,
    'unitName': 'g'},
   'amount': 9.17},
  {'type': 'FoodNutrient',
   'id': 28793312,
   'nutrient': {'id': 1008,
    'number': '208',
    'name': 'Energy',
    'rank': 300,
    'unitName': 'kcal'},
   'amount': 67.0},
  {'type': 'FoodNutrient',
   'id': 28793313,
   'nutrient': {'id': 1018,
    'number': '221',
    'name': 'Alcohol, ethyl',
    'rank': 18200,
    'unitName': 'g'},
   'amount':

In [355]:
import json

json.dump(target_item, open("target_item.json", "w"), indent=4)

In [356]:
for i, item in enumerate(nutrify_food_items):
    if item["fdcId"] == 748608: # Olive oil, see: https://fdc.nal.usda.gov/fdc-app.html#/food-details/748608/nutrients
        print(i)
        print(item)

25
{'foodClass': 'FinalFood', 'description': 'Oil, olive, extra virgin', 'foodNutrients': [{'type': 'FoodNutrient', 'id': 8529145, 'nutrient': {'id': 1264, 'number': '612', 'name': 'SFA 14:0', 'rank': 10500, 'unitName': 'g'}, 'dataPoints': 36, 'foodNutrientDerivation': {'code': 'A', 'description': 'Analytical', 'foodNutrientSource': {'id': 1, 'code': '1', 'description': 'Analytical or derived from analytical'}}, 'max': 0.021, 'min': 0.009, 'median': 0.012, 'amount': 0.013}, {'type': 'FoodNutrient', 'id': 8529146, 'nutrient': {'id': 1265, 'number': '613', 'name': 'SFA 16:0', 'rank': 10700, 'unitName': 'g'}, 'dataPoints': 36, 'foodNutrientDerivation': {'code': 'A', 'description': 'Analytical', 'foodNutrientSource': {'id': 1, 'code': '1', 'description': 'Analytical or derived from analytical'}}, 'max': 15.8, 'min': 9.48, 'median': 12, 'amount': 12.1}, {'type': 'FoodNutrient', 'id': 8529147, 'nutrient': {'id': 1266, 'number': '614', 'name': 'SFA 18:0', 'rank': 10900, 'unitName': 'g'}, 'dat

In [357]:
# Get info for Olive oil
nutrify_food_items[25]

# Get info for Yoghurt
# nutrify_food_items[47]

{'foodClass': 'FinalFood',
 'description': 'Oil, olive, extra virgin',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 8529145,
   'nutrient': {'id': 1264,
    'number': '612',
    'name': 'SFA 14:0',
    'rank': 10500,
    'unitName': 'g'},
   'dataPoints': 36,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 0.021,
   'min': 0.009,
   'median': 0.012,
   'amount': 0.013},
  {'type': 'FoodNutrient',
   'id': 8529146,
   'nutrient': {'id': 1265,
    'number': '613',
    'name': 'SFA 16:0',
    'rank': 10700,
    'unitName': 'g'},
   'dataPoints': 36,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 15.8,
   'min': 9.48,
   'median': 12,
   'amount': 12.1},
  {'type': 'FoodNutrie

In [359]:
source_url_base = "https://fdc.nal.usda.gov/fdc-app.html#/food-details/{}/nutrients"

nutrify_food_items_nutrients = []
for item in tqdm(nutrify_food_items):
    calories_string = "total_energy_calories"

    item_dict = {}
    item_dict["fdcId"] = item["fdcId"]
    print(f"On item: {item['fdcId']}")
    item_dict["description"] = item["description"]
    item_dict["default_size"] = 100
    item_dict["unit"] = "g"
    item_dict[calories_string] = 0

    # Get the macronutrients + water + micronutrients
    item_macronutrients = {}

    # Create empty lists for various minerals and vitamins
    mineral_list = []
    vitamin_list = []
    other_nutrient_list = []

    # Setup micronutrients dictionary
    item_dict["micronutrients"] = {"vitamins": "",
                                   "minerals": ""}

    # Setup other nutrients dictionary
    item_other_nutrients = {} 
    
    for nutrient in item["foodNutrients"]:

        nutrient_name = nutrient["nutrient"]["name"]

        # Get the macronutrients
        if nutrient_name in macronutrients:
            macronutrient_name = macronutrients_to_key[nutrient_name]
            item_macronutrients[macronutrient_name] = nutrient["amount"]
            item_dict["macronutrients"] = item_macronutrients
        
        # Get the water
        if nutrient_name == "Water":
            item_dict["Water"] = nutrient["amount"]

        # Get the vitamin micronutrients
        if nutrient_name in vitamins:
            item_vitamin_micronutrients = {}
            vitamin_name = nutrient["nutrient"]["name"]
            # print(f"On vitamin name: {vitamin_name}")
            try:
                vitamin_amount = nutrient["amount"]
                vitamin_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for vitamin: {vitamin_name}")
                if nutrient_name == "Vitamin A": # replace for Retinol
                    vitamin_amount = [nutrient["amount"] for nutrient in item["foodNutrients"] if nutrient["nutrient"]["name"] == "Retinol"][0]
                    vitamin_unit = [nutrient["nutrient"]["unitName"] for nutrient in item["foodNutrients"] if nutrient["nutrient"]["name"] == "Retinol"][0]
                    
            item_vitamin_micronutrients["name"] = vitamin_name
            item_vitamin_micronutrients["amount"] = vitamin_amount
            item_vitamin_micronutrients["unit"] = vitamin_unit

            vitamin_list.append(item_vitamin_micronutrients)
            # print(len(vitamin_list))
        
        # Get the mineral micronutrients
        if nutrient_name in minerals:
            item_mineral_micronutrients = {}
            mineral_name = nutrient["nutrient"]["name"]
            # print(f"On mineral name: {mineral_name}")
            try:
                mineral_amount = nutrient["amount"]
                mineral_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for mineral: {mineral_name}")
            
            item_mineral_micronutrients["name"] = mineral_name
            item_mineral_micronutrients["amount"] = mineral_amount
            item_mineral_micronutrients["unit"] = mineral_unit

            mineral_list.append(item_mineral_micronutrients)
            # print(len(mineral_list))
        
        # Get the other nutrients
        if nutrient_name in other_food_nutrients:
            item_other_nutrients = {}
            other_nutrient_name = nutrient["nutrient"]["name"]
            # print(f"On other nutrient name: {other_nutrient_name}")
            try:
                other_nutrient_amount = nutrient["amount"]
                other_nutrient_unit = nutrient["nutrient"]["unitName"]
            except:
                print(f"Could not get amount for other nutrient: {other_nutrient_name}")

            item_other_nutrients["name"] = other_nutrient_name
            item_other_nutrients["amount"] = other_nutrient_amount
            item_other_nutrients["unit"] = other_nutrient_unit

            other_nutrient_list.append(item_other_nutrients)
            # print(len(other_nutrient_list))
            

    item_dict["micronutrients"]["vitamins"] = vitamin_list
    item_dict["micronutrients"]["minerals"] = mineral_list
    item_dict["other_nutrients"] = other_nutrient_list
    
    # Handle oils
    if item["description"] in oil_names:
        for nutrient in item["foodNutrients"]:
            if nutrient["nutrient"]["name"] in lipids:
                target_lipid_name = lipids_to_key[nutrient["nutrient"]["name"]]
                item_macronutrients[target_lipid_name] = nutrient["amount"]
                item_dict["lipids"] = item_macronutrients
        lipid_items = item_dict["lipids"].keys()
        for lipid in lipid_items:
            item_dict[calories_string] += round(item_dict["lipids"][lipid] * energy_dict["fat"]) # all lipids are fat

    # Calculate the total energy (this will fail on items like Table Salt + Olive Oil, their values in the FDC are different to others, see: 746775 and https://fdc.nal.usda.gov/fdc-app.html#/food-details/748608/nutrients)
    for nutrient in macronutrients:
        try:
            target_macronutrient = macronutrients_to_key[nutrient]
            if target_macronutrient in item_dict["macronutrients"]:
                item_dict[calories_string] += round(item_dict["macronutrients"][target_macronutrient] * energy_dict[target_macronutrient])
        except Exception as e:
            print(e)
            print(item_dict)
    
    # Add the source url
    item_dict["fdc_source_url"] = source_url_base.format(item["fdcId"])
    
    nutrify_food_items_nutrients.append(item_dict)

  0%|          | 0/274 [00:00<?, ?it/s]

On item: 321360
On item: 323121
On item: 323505
On item: 324653
On item: 325430
On item: 327046
On item: 327357
On item: 328637
On item: 331897
On item: 331960
On item: 332397
On item: 334194
On item: 334536
On item: 335240
On item: 746762
On item: 746768
On item: 746769
On item: 746771
On item: 746773
On item: 746775
'macronutrients'
{'fdcId': 746775, 'description': 'Salt, table, iodized', 'default_size': 100, 'unit': 'g', 'total_energy_calories': 0, 'micronutrients': {'vitamins': [], 'minerals': [{'name': 'Iron, Fe', 'amount': 0, 'unit': 'mg'}, {'name': 'Magnesium, Mg', 'amount': 0, 'unit': 'mg'}, {'name': 'Phosphorus, P', 'amount': 0, 'unit': 'mg'}, {'name': 'Sodium, Na', 'amount': 38700, 'unit': 'mg'}, {'name': 'Copper, Cu', 'amount': 0, 'unit': 'mg'}, {'name': 'Manganese, Mn', 'amount': 0.032, 'unit': 'mg'}, {'name': 'Calcium, Ca', 'amount': 50, 'unit': 'mg'}, {'name': 'Potassium, K', 'amount': 2, 'unit': 'mg'}, {'name': 'Zinc, Zn', 'amount': 0, 'unit': 'mg'}, {'name': 'Iodine, I'

In [360]:
for item in nutrify_food_items_nutrients:
    if "olive," in item["description"].lower():
        print(item)

{'fdcId': 748608, 'description': 'Oil, olive, extra virgin', 'default_size': 100, 'unit': 'g', 'total_energy_calories': 844, 'micronutrients': {'vitamins': [], 'minerals': []}, 'other_nutrients': [], 'lipids': {'saturated_fat': 15.4, 'monounsaturated_fat': 69.2, 'polyunsaturated_fat': 9.07}, 'fdc_source_url': 'https://fdc.nal.usda.gov/fdc-app.html#/food-details/748608/nutrients'}


In [361]:
nutrify_food_items_nutrients[0]

{'fdcId': 321360,
 'description': 'Tomatoes, grape, raw',
 'default_size': 100,
 'unit': 'g',
 'total_energy_calories': 31,
 'micronutrients': {'vitamins': [{'name': 'Vitamin C, total ascorbic acid',
    'amount': 27.2,
    'unit': 'mg'},
   {'name': 'Choline, total', 'amount': 9.8, 'unit': 'mg'},
   {'name': 'Vitamin E (alpha-tocopherol)', 'amount': 0.98, 'unit': 'mg'},
   {'name': 'Vitamin K (phylloquinone)', 'amount': 4.2, 'unit': 'µg'},
   {'name': 'Riboflavin', 'amount': 0.065, 'unit': 'mg'},
   {'name': 'Thiamin', 'amount': 0.075, 'unit': 'mg'},
   {'name': 'Vitamin B-6', 'amount': 0.06, 'unit': 'mg'},
   {'name': 'Folate, total', 'amount': 10, 'unit': 'µg'},
   {'name': 'Niacin', 'amount': 0.805, 'unit': 'mg'}],
  'minerals': [{'name': 'Calcium, Ca', 'amount': 11, 'unit': 'mg'},
   {'name': 'Copper, Cu', 'amount': 0.058, 'unit': 'mg'},
   {'name': 'Iron, Fe', 'amount': 0.33, 'unit': 'mg'},
   {'name': 'Magnesium, Mg', 'amount': 11.9, 'unit': 'mg'},
   {'name': 'Manganese, Mn', '

In [362]:
len(nutrify_food_items_nutrients)

274

In [363]:
# Save to JSON
with open("nutrify_foodvision_items_nutrients.json", "w") as f:
    json.dump(nutrify_food_items_nutrients, f)

In [364]:
# Upload to Google Storage
!gsutil cp nutrify_foodvision_items_nutrients.json gs://food_vision_bucket_with_object_versioning/nutrition_information/nutrify_foodvision_items_nutrients.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Copying file://nutrify_foodvision_items_nutrients.json [Content-Type=application/json]...
- [1 files][399.4 KiB/399.4 KiB]                                                
Operation completed over 1 objects/399.4 KiB.                                    


In [None]:
# TODO: 
# back the nutrition information up to Google Storage - could also track this as an Artifact in Weights & Biases? 
# Merge the Nutrition information with the Nutrify Food Item names
# Perhaps it's time to start getting a code to match all of the Nutrify food item names?
# Match the metadata of the Nutrify food items with the nutrition information
# For foods without an FDC ID - could match the closest food item based on the name and then use the nutrition information from that food item
# Could do the same with every other food, e.g. encode the text and show links to the closest food items (e.g. apple_red -> red delicious apple + more, apple_green -> granny smith apple + more, etc.)
# This would be the same with text-based foods as well, just include the most similar foods based on the text lookup (e.g. canola oil -> canola oil + more, olive oil -> olive oil + more, etc.)

In [164]:
# TODO: end goal

"""
{
    "nutrify_name": "Cheese, blue",
    "nutrition_information": 
    {"fdcId": 1000001,
     "description": "Cheese, blue",
     "size": 100,
     "unit": "g",
     "Macronutrients": {"Protein": 21.4,
                        "Total lipid (fat)": 28.74,
                        "Carbohydrate, by difference": 3.06},
     "Total energy (calories)": 376,
     "Micronutrients": {"Minerals": {"name": "Calcium, Ca",
                                     "unitName": "mg",
                                     "amount": 673.0},
                        "Vitamins": {"name": "Vitamin A, IU",
                                     "unitName": "mg",
                                     "amount": 2499.0}},
    "source_url": "https://fdc.nal.usda.gov/fdc-app.html#/food-details/1000001/nutrients"}
} 
"""

'\n{\n    "nutrify_name": "Cheese, blue",\n    "nutrition_information": \n    {"fdcId": 1000001,\n     "description": "Cheese, blue",\n     "size": 100,\n     "unit": "g",\n     "Macronutrients": {"Protein": 21.4,\n                        "Total lipid (fat)": 28.74,\n                        "Carbohydrate, by difference": 3.06},\n     "Total energy (calories)": 376,\n     "Micronutrients": {"Minerals": {"name": "Calcium, Ca",\n                                     "unit": "mg",\n                                     "amount": 673.0},\n                        "Vitamins": {"name"Vitamin A, IU": 2499.0}},\n    "source_url": "https://fdc.nal.usda.gov/fdc-app.html#/food-details/1000001/nutrients"}\n} \n'

## Find a match for foods with no exact match

In [227]:
## TODO: 
# UPTOHERE
# Find classes which don't have nutrients
# Display options for the *most* similar items
    # E.g. if someone types in "Cheese, blue" and there is no exact match, display the closest matches
    # Could also just do this for the existing foods (find the most similar foods via name)

In [105]:
# Which rows don't have FDC IDs?
nutrify_to_fdcid[nutrify_to_fdcid["fdc_id"].isna()]

Unnamed: 0,number,class_name,fdc_id
0,0,achacha,
11,11,bacon_and_egg_burger,
15,15,banana_bread,
22,22,beef_diced,
23,23,beef_kebab,
30,30,biltong,
31,31,black_pepper,
48,48,cape_gooseberries,
54,54,carrot_purple,
68,68,chicken_stir_fry,


In [106]:
# Create a function to similarity match the class names (e.g. code which string is most like another string)
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import dot_score
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_list_of_classes(class_names: list, model: SentenceTransformer):
    """
    Embeds a list of class names.
    """

    # Map the class_name to the embedding
    class_name_to_embedding = {class_name: embedding for class_name, embedding in zip(class_names, model.encode(class_names))}

    return class_name_to_embedding

# Create a function to similarity match the class names (e.g. code which string is most like another string)
def find_most_similar_class_name(target_class_name, class_name_embedding_dict, num_similar=3):
    """
    Finds the most similar class name to the class_name provided.
    """
    # Get the embedding of the target_class_name
    target_class_name_embedding = model.encode([target_class_name])[0]

    # Create a dictionary of the class_name and the dot score between the target_class_name and the class_name
    class_name_embedding_dict = {class_name: dot_score(embedding, target_class_name_embedding) for class_name, embedding in class_name_embedding_dict.items()}

    # Sort the dictionary by the dot score
    most_similar_class_names = {k: v[0].item() for k, v in sorted(class_name_embedding_dict.items(), key=lambda item: item[1], reverse=True)}

    # Only return the top num_similar
    # most_similar_class_names = list(most_similar_class_names.items())[:num_similar]

    return most_similar_class_names

In [107]:
# Get a list of the unique descriptions
unique_descriptions = fdcid_to_description_df["description"].unique().tolist()
len(unique_descriptions)

13475

In [108]:
# Create an embedding of all the unique descriptions
class_name_embedding_dict_unique_descriptions = embed_list_of_classes(unique_descriptions, model)

In [109]:
food_item_without_fdcid = rows_without_fdcid.class_name.to_list()
food_item_without_fdcid[:5]

['achacha', 'bacon_and_egg_burger', 'banana_bread', 'beef_diced', 'beef_kebab']

In [120]:
# target_food_item = random.choice(food_item_without_fdcid)
target_food_item = "breadfruit"
print(f"Showing the most similar class names to: {target_food_item}")
most_similar_class_names = find_most_similar_class_name(target_food_item, class_name_embedding_dict_unique_descriptions, num_similar=10)
most_similar_class_names

Showing the most similar class names to: breadfruit


{'Breadfruit, cooked': 0.8408920168876648,
 'Breadfruit, raw': 0.7962300181388855,
 'Bread, pumpkin': 0.7196508049964905,
 'Bread, fruit': 0.7077605724334717,
 'Bread, vegetable': 0.7036866545677185,
 'Bread, Italian': 0.6614352464675903,
 'Bread, cheese': 0.6606635451316833,
 'Bread, wheat': 0.6581324338912964,
 'Bread, vegetable, toasted': 0.6531142592430115,
 'Bread, dough, fried': 0.649786114692688,
 'Bread, sour dough, toasted': 0.6474003791809082,
 'Bread, potato': 0.6473690271377563,
 'Bread, white': 0.6419284343719482,
 'Bread, rye': 0.6404439210891724,
 'Bread stuffing': 0.6403353214263916,
 'Bread, cheese, toasted': 0.6380453109741211,
 'Bread, pumpernickel, toasted': 0.6380175352096558,
 'Bread, Irish soda': 0.6379777789115906,
 'Bread, pumpernickel': 0.6369190216064453,
 'Bread, barley': 0.633845865726471,
 'Bread, zucchini': 0.6325340270996094,
 'Bread, naan': 0.6309512853622437,
 'Bread, soy': 0.6300880312919617,
 'Bread, sour dough': 0.6282667517662048,
 'Bread, wheat, t

In [117]:
import openai

openai.api_key_path = "../foodvision/utils/openai_api_key.txt"

def chat_complete(prompt: str, **openai_kwargs) -> str:
    """Generate completion from OpenAI ChatGPT API"""
    default_kwargs = {"model": "gpt-3.5-turbo", "max_tokens": 256, "temperature": 0.2, "top_p": 1}
    openai_kwargs = {**default_kwargs, **openai_kwargs}
    response = openai.ChatCompletion.create(
        **openai_kwargs,
        messages=[
                {"role": "system", "content": "You are a helpful assistant who is very good at designing apps and making them look good."},
                {"role": "user", "content": prompt},
            ]
        )
    return response

base_prompt_for_sorting_food_names = """You are a food scientist and nutritionist very knowledgable on all different kinds of foods.\n
Given the following list of foods and the target text, please sort the list of foods into which ones relate most to the target text.\n
For example, the list ['apples, red delicious, with skin, raw',
 'apples, gala, with skin, raw',
 'apples, honeycrisp, with skin, raw',
 'apples, granny smith, with skin, raw',
 'applesauce, unsweetened, with added vitamin c',
 ...]
and the target text 'apple_red' would be sorted as a JSON dictionary,
 '1': 'apples, red delicious, with skin, raw', 
 '2': 'apples, gala, with skin, raw', 
 '3': 'apples, honeycrisp, with skin, raw', 
 '4': 'apples, granny smith, with skin, raw', 
 '5': 'applesauce, unsweetened, with added vitamin c', 
 ....\n
Please do not include any extra foods or text, only return the list of foods in the target food list.\n
Put a favour on similar types of foods, e.g. if the target text contains "capsicum_green" then "pepper, red" should be higher than "cabbage, green".\n
If the target list of foods to sort has N items, you should return a list of N items.\n
For example, if the target list of foods to sort has 10 items, you should return a list of 10 items.\n
Target text: {target_text}\n
List of foods to sort: {target_food_list}\n
Sorted list of food as properly formatted JSON dictionary:\n
"""

def sort_list_of_foods(base_prompt: str, target_text: str, target_food_list: list) -> list:
    # print(base_prompt)
    target_text_dict = {"target_text": target_text, "target_food_list": target_food_list}
    target_text_prompt = base_prompt.format(**target_text_dict)
    # target_text_prompt = base_prompt.format(target_text, target_food_list)
    # print(target_text_prompt)
    answer = chat_complete(target_text_prompt)
    answer_formatted = str(answer.choices[0].message.content)
    # Remove "\n"
    # print(f"Display name generated: {target_text} -> {answer_formatted}")
    return answer_formatted

# target_text = "capsicum_red"
target_text = random.choice(food_item_without_fdcid)
print(f"Predicting the most similar class names to: {target_text}")

most_similar_class_names = find_most_similar_class_name(target_text, class_name_embedding_dict_unique_descriptions)

# Get the top 10 most similar class names
target_food_list = list(most_similar_class_names.keys())[:10]
print(len(target_food_list))

# Turn target_food_list into a dictionary with keys in order of 1, 2, 3...
target_food_list = {i+1: food_name for i, food_name in enumerate(target_food_list)}
print(target_food_list)
# print(target_food_list)

sorted_list_of_foods = sort_list_of_foods(base_prompt=base_prompt_for_sorting_food_names, target_text=target_text, target_food_list=target_food_list)
sorted_list_of_foods


Predicting the most similar class names to: melon_spanish
10
{1: 'Melon, banana (Navajo)', 2: 'Honeydew melon, raw', 3: 'Melons, honeydew, raw', 4: 'Winter melon, cooked', 5: 'Melons, casaba, raw', 6: 'Bitter melon, cooked', 7: 'Melons, cantaloupe, raw', 8: 'Horned melon (Kiwano)', 9: 'Waxgourd, (chinese preserving melon), raw', 10: 'Waxgourd, (chinese preserving melon), cooked, boiled, drained, with salt'}


'{\n    "1": "Melons, honeydew, raw",\n    "2": "Honeydew melon, raw",\n    "3": "Melons, cantaloupe, raw",\n    "4": "Melons, casaba, raw",\n    "5": "Horned melon (Kiwano)",\n    "6": "Waxgourd, (chinese preserving melon), raw",\n    "7": "Waxgourd, (chinese preserving melon), cooked, boiled, drained, with salt",\n    "8": "Winter melon, cooked",\n    "9": "Bitter melon, cooked",\n    "10": "Melon, banana (Navajo)"\n}'

In [193]:
import json
sorted_list_of_foods = json.loads(sorted_list_of_foods)
sorted_list_of_foods

{'1': 'peppers, bell, red, raw',
 '2': 'cabbage, red, raw',
 '3': 'cherries, sweet, dark red, raw',
 '4': 'lettuce, leaf, red, raw',
 '5': 'onions, red, raw',
 '6': 'peppers, bell, orange, raw',
 '7': 'peppers, bell, yellow, raw',
 '8': 'peppers, bell, green, raw',
 '9': 'mustard, prepared, yellow',
 '10': 'beans, dry, medium red (0% moisture)'}

In [20]:
unique_categories = food["food_category_id"].unique()
len(unique_categories)

19

19 different food categories... I wonder what these are?

In [21]:
food["food_category_id"].value_counts()

11.0    10819
1.0      9057
9.0      8558
16.0     4575
4.0      2892
14.0     2889
12.0     2769
20.0     1949
5.0      1503
15.0      913
7.0       795
10.0      613
6.0       568
18.0      488
25.0      474
13.0      454
2.0       386
19.0       54
Name: food_category_id, dtype: int64

In [23]:
# Get food categories
food_cats = pd.read_csv("data/2022/FoodData_Central_Supporting_Data_csv_2022-10-28/food_category.csv")
food_cats

Unnamed: 0,id,code,description
0,1,100,Dairy and Egg Products
1,2,200,Spices and Herbs
2,3,300,Baby Foods
3,4,400,Fats and Oils
4,5,500,Poultry Products
5,6,600,"Soups, Sauces, and Gravies"
6,7,700,Sausages and Luncheon Meats
7,8,800,Breakfast Cereals
8,9,900,Fruits and Fruit Juices
9,10,1000,Pork Products


## 10 foods we want

To keep things simple, we will reduce the databases from FoodData Central to 10 different foods.

Why these foods?

Because we have images for those foods ready to go.

```python
# These aren't whole foods so we don't want them yet, let's get another list and get those
ten_foods = ["chicken_curry", 
"chicken_wings", 
"fried_rice", 
"grilled_salmon", 
"humburger", 
"ice_cream", 
"pizza",
"ramen", 
"steak", 
"sushi"]

# We want these... (they're whole foods) 
ten_whole_foods = ["chicken_wings",
    "apple",
    "banana",
    "beef", # steak, etc
    "carrots",
    "egg", # whole egg
    "strawberries",
    "blueberries",
    "mushrooms",
    "honey"
]
```

In [24]:
ten_whole_foods = ['apple',
 'banana',
 'beef', # steak etc
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg', # whole egg
 'honey',
 'mushrooms',
 'strawberries']
ten_whole_foods

['apple',
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg',
 'honey',
 'mushrooms',
 'strawberries']

In [61]:
food.head()

AttributeError: 'str' object has no attribute 'head'

In [60]:
food.data_type.value_counts()

AttributeError: 'str' object has no attribute 'data_type'

In [26]:
# Foundation food is the ground truth for a certain type of food, excludes some details about the food
# E.g. the data_type foundation_food for Chicken will the the original unique ID for chicken
foundation_food = food[(food["data_type"] == "foundation_food") | (food["data_type"] == "survey_fndds_food")]
len(foundation_food)

5905

survey_fndds_food    5624
foundation_food       281
Name: data_type, dtype: int64

In [27]:
foundation_food[foundation_food["description"].str.contains("blue")]

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
42186,2263889,foundation_food,"blueberries, raw",9.0,2022-04-28
43441,2346411,foundation_food,"blueberries, raw",9.0,2022-10-28
348,2341108,survey_fndds_food,"cheese, blue or roquefort",,2022-10-28
2720,2343480,survey_fndds_food,"pie, blueberry",,2022-10-28
3933,2344693,survey_fndds_food,"blueberries, dried",,2022-10-28
4009,2344769,survey_fndds_food,"blueberries, raw",,2022-10-28
4011,2344771,survey_fndds_food,"blueberries, frozen",,2022-10-28
4012,2344772,survey_fndds_food,blueberry pie filling,,2022-10-28
4057,2344817,survey_fndds_food,blueberry juice,,2022-10-28
4994,2345754,survey_fndds_food,blue or roquefort cheese dressing,,2022-10-28


In [29]:
foundation_foods = foundation_food["description"]
foundation_foods.iloc[20:40]

4153               peanut butter, smooth style, with salt
4329                             cheese, parmesan, grated
4491    cheese, pasteurized process, american, vitamin...
4580    grapefruit juice, white, canned or bottled, un...
4723                                 peaches, yellow, raw
4817    seeds, sunflower seed kernels, dry roasted, wi...
4951      sausage, italian, pork, mild, cooked, pan-fried
5164                  bread, white, commercially prepared
5285          sausage, turkey, breakfast links, mild, raw
5428                                        cheese, swiss
5489    kale, frozen, cooked, boiled, drained, without...
5751    carrots, frozen, unprepared (includes foods fo...
5991                            mustard, prepared, yellow
6198                                figs, dried, uncooked
6339                                kiwifruit, green, raw
6491                              melons, cantaloupe, raw
6650                                      nectarines, raw
6794    orange

In [30]:
# Found a list of the foundation foods we're going to start with!
foundation_foods_list = list(foundation_foods)
for food in foundation_foods_list:
    if "blue" in food:
        print(food)

blueberries, raw
blueberries, raw
cheese, blue or roquefort
pie, blueberry
blueberries, dried
blueberries, raw
blueberries, frozen
blueberry pie filling
blueberry juice
blue or roquefort cheese dressing
blue or roquefort cheese dressing, light
blue or roquefort cheese dressing, fat free
blueberry syrup


In [31]:
# food.loc[(food["description"].str.contains("chicken", case=False)) & (food["description"].str.contains("drumstick", case=False))][-10:]
# Find chicken in foundation food
for food in foundation_foods:
    if "chicken" in food.lower():
        print(food)

chicken, broilers or fryers, drumstick, meat only, cooked, braised
chicken, broiler or fryers, breast, skinless, boneless, meat only, cooked, braised
mock chicken legs
chicken, ns as to part and cooking method, ns as to skin eaten
chicken, ns as to part and cooking method, skin eaten
chicken, ns as to part and cooking method, skin not eaten
chicken, ns as to part, baked, broiled, or roasted, ns as to skin eaten
chicken, ns as to part, baked, broiled, or roasted, skin eaten
chicken, ns as to part, baked, broiled, or roasted, skin not eaten
chicken, ns as to part, rotisserie, ns as to skin eaten
chicken, ns as to part, rotisserie, skin eaten
chicken, ns as to part, rotisserie, skin not eaten
chicken, ns as to part, stewed, ns as to skin eaten
chicken, ns as to part, stewed, skin eaten
chicken, ns as to part, stewed, skin not eaten
chicken, ns as to part, grilled without sauce, ns as to skin eaten
chicken, ns as to part, grilled without sauce, skin eaten
chicken, ns as to part, grilled wi

In [32]:
chicken_wing_id = int(foundation_food.loc[foundation_food["description"].str.contains("Chicken", case=False)].iloc[0]["fdc_id"])
chicken_wing_id

331897

In [33]:
food_nutrient[food_nutrient["fdc_id"] == chicken_wing_id]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,min_year_acquired,nutrient_name
41650,2259068,331897,1303,0.003,5.0,1.0,0.002,0.004,0.003,,2010.0,,tfa 16:1 t
41651,2259065,331897,1280,0.008,5.0,1.0,0.008,0.009,0.008,,2010.0,,pufa 22:5 n-3 (dpa)
41652,2259076,331897,1404,0.045,5.0,1.0,0.035,0.059,0.042,,2010.0,,"pufa 18:3 n-3 c,c,c (ala)"
41653,2259059,331897,1261,0.002,5.0,1.0,0.001,0.003,0.002,,2010.0,,sfa 8:0
41654,2259106,331897,1109,0.170,1.0,1.0,,,0.170,,2010.0,,vitamin e (alpha-tocopherol)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41740,2259066,331897,1300,0.006,5.0,1.0,0.005,0.007,0.006,,2010.0,,sfa 17:0
41741,2259121,331897,1271,0.088,5.0,1.0,0.083,0.094,0.087,,2010.0,,pufa 20:4
41742,2259112,331897,1167,5.050,5.0,1.0,4.890,5.240,5.050,,2010.0,,niacin
41743,2259074,331897,1329,0.021,,4.0,,,,,,,"fatty acids, total trans-monoenoic"


## Get protein, carb, fat IDs

See this document for info on foundation foods and their nutrients - https://fdc.nal.usda.gov/docs/Foundation_Foods_Documentation_Apr2021.pdf

* Carbohydrate, by difference = total carbohydrates


In [34]:
nutrient[(nutrient["name"].str.contains("protein", case=False)) | \
         (nutrient["name"].str.contains("carbohydrate", case=False)) | \
         (nutrient["name"].str.contains("fat", case=False))]

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
4,1003,Protein,G,203.0,600.0
5,1004,Total lipid (fat),G,204.0,800.0
6,1005,"Carbohydrate, by difference",G,205.0,1110.0
50,1049,"Solids, non-fat",G,253.0,999999.0
51,1050,"Carbohydrate, by summation",G,205.2,1120.0
54,1053,Adjusted Protein,G,257.0,700.0
73,1072,"Carbohydrate, other",G,284.0,
86,1085,Total fat (NLEA),G,298.0,900.0
258,1257,"Fatty acids, total trans",G,605.0,15400.0
259,1258,"Fatty acids, total saturated",G,606.0,9700.0


In [35]:
target_nutrients = nutrient[nutrient["name"].isin(["Protein", "Total lipid (fat)", "Carbohydrate, by difference"])]
target_nutrients

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
4,1003,Protein,G,203.0,600.0
5,1004,Total lipid (fat),G,204.0,800.0
6,1005,"Carbohydrate, by difference",G,205.0,1110.0


In [36]:
target_nutrient_dict = {1003: "protein",
    1004: "fat",
    1005: "carbohydrate"
}

## Get target food protein, fat, carbohydrates

We want to now index on the target foods and the target nutrients and retrieve their values for each food/nutrient.

E.g.

```python
{"food_1": {"protein": 100,
            "carbohydrate": 50,
            "fat": 20},
 "food_2": ...

...}
```

In [37]:
list(target_nutrient_dict.keys())

[1003, 1004, 1005]

In [38]:
food_nutrient

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,min_year_acquired,nutrient_name
0,2201847,319877,1051,56.300,1.0,1.0,,,,,,,water
1,2201845,319877,1002,1.280,1.0,1.0,,,,,,,nitrogen
2,2201846,319877,1004,19.000,1.0,1.0,,,,,,,total lipid (fat)
3,2201844,319877,1007,1.980,1.0,1.0,,,,,,,ash
4,2201852,319878,1091,188.000,1.0,1.0,,,,,,,"phosphorus, p"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
365555,28906044,2346383,334,0.000,,,,,,,,,
365556,28906068,2346383,612,0.048,,,,,,,,,
365557,28906042,2346383,323,10.500,,,,,,,,,
365558,28906052,2346383,417,0.000,,,,,,,,,


In [39]:
food_nutrient[(food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,min_year_acquired,nutrient_name
2,2201846,319877,1004,19.00,1.0,1.0,,,,,,,total lipid (fat)
16,2201859,319882,1004,18.70,1.0,1.0,,,,,,,total lipid (fat)
28,2201873,319892,1004,16.60,1.0,1.0,,,,,,,total lipid (fat)
43,2201886,319899,1004,19.10,1.0,1.0,,,,,,,total lipid (fat)
97,2201942,319908,1004,18.20,1.0,1.0,,,,,,,total lipid (fat)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129408,28911795,2352627,1004,0.12,1.0,1.0,,,,,,,total lipid (fat)
129433,28911820,2352652,1004,0.19,1.0,1.0,,,,,,,total lipid (fat)
129458,28911845,2352677,1004,0.17,1.0,1.0,,,,,,,total lipid (fat)
129485,28911872,2352704,1004,0.18,1.0,1.0,,,,,,,total lipid (fat)


In [40]:
food_nutrient.dtypes

id                     int64
fdc_id                 int64
nutrient_id            int64
amount               float64
data_points          float64
derivation_id        float64
min                  float64
max                  float64
median               float64
footnote              object
min_year_acqured     float64
min_year_acquired    float64
nutrient_name         object
dtype: object

In [41]:
# Find nutrition for chicken_wing_id (protein, fat, carb)
food_nutrient[(food_nutrient["fdc_id"] == chicken_wing_id) & (food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,min_year_acquired,nutrient_name
41686,2259098,331897,1004,5.95,6.0,1.0,5.54,6.33,5.93,,2010.0,,total lipid (fat)
41718,2259079,331897,1003,23.9,,49.0,23.0,24.6,24.1,,,,protein
41729,2259099,331897,1005,0.0,,49.0,,,,,,,"carbohydrate, by difference"


In [42]:
sorted(list(foundation_foods))

['abalone',
 'adobo, with noodles',
 'adobo, with rice',
 'agave liquid sweetener',
 'alcoholic malt beverage',
 'alcoholic malt beverage, sweetened',
 'alexander',
 'alfalfa sprouts, raw',
 'alfredo sauce',
 'alfredo sauce with added vegetables',
 'alfredo sauce with meat',
 'alfredo sauce with meat and added vegetables',
 'alfredo sauce with poultry',
 'alfredo sauce with poultry and added vegetables',
 'alfredo sauce with seafood',
 'alfredo sauce with seafood and added vegetables',
 'almond butter',
 'almond butter and jelly sandwich, on wheat bread',
 'almond butter and jelly sandwich, on white bread',
 'almond butter sandwich, on wheat bread',
 'almond butter sandwich, on white bread',
 'almond butter, creamy',
 'almond butter, lower sodium',
 'almond chicken',
 'almond milk, sweetened',
 'almond milk, sweetened, chocolate',
 'almond milk, unsweetened',
 'almond milk, unsweetened, chocolate',
 'almond milk, unsweetened, plain, refrigerated',
 'almond milk, unsweetened, plain, she

In [43]:
ten_whole_foods = ["chicken_wings",
    "apple",
    "banana",
    "beef", # steak, etc
    "carrots",
    "egg", # whole egg
    "strawberries",
    "blueberries",
    "mushrooms",
    "honey"
]

In [44]:
ten_whole_foods

['chicken_wings',
 'apple',
 'banana',
 'beef',
 'carrots',
 'egg',
 'strawberries',
 'blueberries',
 'mushrooms',
 'honey']

## Get ten whole foods `food_id`

Everything except blueberries and honey are available in `foundation_food`. 

For blueberries and honey, we'll have to dig into the survery data: `data_exploration/data/FoodData_Central_survey_food_csv_2020-10-30`

In [45]:
# Get all food ids from foundation_food (honey and blueberries in another dataset)
target_whole_foods = ['apple', # removed chicken wings... can come back later...
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken',
 'egg',
 'honey',
 'strawberries',
 'mushrooms']

In [46]:
# str.contains can search on regex - https://stackoverflow.com/a/17973255/7900723
pattern = "|".join([f"(?i){food}" for food in target_whole_foods])
pattern

'(?i)apple|(?i)banana|(?i)beef|(?i)blueberries|(?i)carrots|(?i)chicken|(?i)egg|(?i)honey|(?i)strawberries|(?i)mushrooms'

In [47]:
foundation_food[foundation_food["description"].str.contains(pattern, case=False)].sort_values(by=["description"])

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
1481,2342241,survey_fndds_food,almond chicken,,2022-10-28
2208,2342968,survey_fndds_food,"almonds, honey roasted",,2022-10-28
4053,2344813,survey_fndds_food,apple cider,,2022-10-28
5405,2346165,survey_fndds_food,"apple juice beverage, 40-50% juice, light",,2022-10-28
4054,2344814,survey_fndds_food,"apple juice, 100%",,2022-10-28
...,...,...,...,...,...
1338,2342098,survey_fndds_food,"venison or deer, noodles, and vegetables inclu...",,2022-10-28
1337,2342097,survey_fndds_food,"venison or deer, potatoes, and vegetables excl...",,2022-10-28
1336,2342096,survey_fndds_food,"venison or deer, potatoes, and vegetables incl...",,2022-10-28
2248,2343008,survey_fndds_food,"walnuts, excluding honey roasted",,2022-10-28


In [48]:
foundation_food[foundation_food["description"].str.contains("honey")]

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
20191,1105547,foundation_food,"apples, honeycrisp, with skin, raw",9.0,2020-10-30
20547,1750343,foundation_food,"apples, honeycrisp, with skin, raw",9.0,2020-10-30
2208,2342968,survey_fndds_food,"almonds, honey roasted",,2022-10-28
2215,2342975,survey_fndds_food,"cashews, honey roasted",,2022-10-28
2228,2342988,survey_fndds_food,"mixed nuts, honey roasted",,2022-10-28
2237,2342997,survey_fndds_food,"peanuts, honey roasted",,2022-10-28
2242,2343002,survey_fndds_food,"pecans, honey roasted",,2022-10-28
2248,2343008,survey_fndds_food,"walnuts, excluding honey roasted",,2022-10-28
2249,2343009,survey_fndds_food,"walnuts, honey roasted",,2022-10-28
2757,2343517,survey_fndds_food,"sopaipilla, without syrup or honey",,2022-10-28


In [49]:
# Found this earlier
chicken_wing_id

331897

In [50]:
# Map foods to food_id (these have been filtered from larger quantities to smaller quantities)
# For example, if there were 5 kinds of apple, only one was chosen
whole_foods_id_map = {1750339: "apple", # red delicious
    1105314: "banana", # Bananas, ripe and slightly ripe, raw
    1102702: "blueberries", # blueberries, raw	
    746763: "beef", # t-bone steak 
    746764: "carrots", # frozen unprepared
    331897: "chicken_wings", # Chicken, broilers or fryers, drumstick, meat o...	
    329490: "egg", # Egg, whole, dried	
    1103956: "honey", # Honey
    1750347: "mushrooms", # Mushrooms, white button
    747448: "strawberries" # strawberries, raw
}

In [51]:
list(whole_foods_id_map.keys())

[1750339,
 1105314,
 1102702,
 746763,
 746764,
 331897,
 329490,
 1103956,
 1750347,
 747448]

In [52]:
# Find nutrition for eight whole foods
target_whole_foods_df = food_nutrient[(food_nutrient["fdc_id"].isin(list(whole_foods_id_map.keys()))) & \
    (food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))][["fdc_id", "nutrient_id", "amount"]]
target_whole_foods_df

Unnamed: 0,fdc_id,nutrient_id,amount
34265,329490,1004,39.8
34266,329490,1005,1.87
34270,329490,1003,48.1
41686,331897,1004,5.95
41718,331897,1003,23.9
41729,331897,1005,0.0
71052,746763,1003,27.3
71079,746763,1005,0.0
71097,746763,1004,11.4
71175,746764,1004,0.47


In [53]:
# Pivot the table to how we want it
target_whole_foods_df = target_whole_foods_df.pivot_table("amount", "fdc_id", "nutrient_id")
target_whole_foods_df

nutrient_id,1003,1004,1005
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
329490,48.1,39.8,1.87
331897,23.9,5.95,0.0
746763,27.3,11.4,0.0
746764,0.81,0.47,7.92
747448,0.64,0.22,7.63
1105314,0.74,0.29,23.0
1750339,0.1875,0.2125,14.7817
1750347,2.890625,0.3708,4.079375


In [54]:
len(whole_foods_id_map)

10

In [55]:
target_whole_foods_df = target_whole_foods_df.reset_index(drop=False).rename_axis(None, axis=1)
target_whole_foods_df

Unnamed: 0,fdc_id,1003,1004,1005
0,329490,48.1,39.8,1.87
1,331897,23.9,5.95,0.0
2,746763,27.3,11.4,0.0
3,746764,0.81,0.47,7.92
4,747448,0.64,0.22,7.63
5,1105314,0.74,0.29,23.0
6,1750339,0.1875,0.2125,14.7817
7,1750347,2.890625,0.3708,4.079375


In [56]:
target_nutrient_dict

{1003: 'protein', 1004: 'fat', 1005: 'carbohydrate'}

In [57]:
# Rename columns
target_whole_foods_df.rename(columns=target_nutrient_dict, inplace=True)
target_whole_foods_df

Unnamed: 0,fdc_id,protein,fat,carbohydrate
0,329490,48.1,39.8,1.87
1,331897,23.9,5.95,0.0
2,746763,27.3,11.4,0.0
3,746764,0.81,0.47,7.92
4,747448,0.64,0.22,7.63
5,1105314,0.74,0.29,23.0
6,1750339,0.1875,0.2125,14.7817
7,1750347,2.890625,0.3708,4.079375


In [58]:
# Add food names
target_whole_foods_df["food_name"] = target_whole_foods_df["fdc_id"].map(whole_foods_id_map)
target_whole_foods_df

Unnamed: 0,fdc_id,protein,fat,carbohydrate,food_name
0,329490,48.1,39.8,1.87,egg
1,331897,23.9,5.95,0.0,chicken_wings
2,746763,27.3,11.4,0.0,beef
3,746764,0.81,0.47,7.92,carrots
4,747448,0.64,0.22,7.63,strawberries
5,1105314,0.74,0.29,23.0,banana
6,1750339,0.1875,0.2125,14.7817,apple
7,1750347,2.890625,0.3708,4.079375,mushrooms


All amounts are per 100g.

## Export first 10 target food nutrition information

In [173]:
target_whole_foods_df.to_csv("target_ten_whole_food_nutrition_info.csv", index=False)

In [174]:
ten_whole_foods

['apple',
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg',
 'honey',
 'mushrooms',
 'strawberries']