# Exploring the data downloaded from USDA FoodData Central

See the download here: https://fdc.nal.usda.gov/download-datasets.html

Data available in `.data/`.

Data dictionary available in  `nutrify/data_exploration/data/FoodData_Central_foundation_food_csv_2021-04-28/Download & API Field Descriptions April 2021.pdf`





In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Get Data

In [2]:
# Import databases
food = pd.read_csv("data/FoodData_Central_foundation_food_csv_2021-04-28/food.csv")
food_survey = pd.read_csv("data/FoodData_Central_survey_food_csv_2020-10-30/food.csv")
nutrient = pd.read_csv("data/FoodData_Central_Supporting_Data_csv_2021-04-28/nutrient.csv")
food_nutrient = pd.read_csv("data/FoodData_Central_foundation_food_csv_2021-04-28/food_nutrient.csv")
food_nutrient_survey = pd.read_csv("data/FoodData_Central_survey_food_csv_2020-10-30/food_nutrient.csv")

print(len(food), len(food_survey), len(nutrient), len(food_nutrient), len(food_nutrient_survey))

# Combine food and food_survey and drop columns that don't have a description 
food = food.append(food_survey).dropna(subset=["description"])
food["description"] = food["description"].str.lower()
print(f"Combined food rows: {len(food)}")

# Combine food_nutrient and food_nutrient_survey
food_nutrient = food_nutrient.append(food_nutrient_survey)
food_nutrient["nutrient_name"] = food_nutrient["nutrient_id"].map(nutrient.set_index("id")["name"]).str.lower() 
print(f"Combined food nutrient rows: {len(food_nutrient)}")

  exec(code_obj, self.user_global_ns, self.user_ns)


27593 7083 463 105689 460395
Combined food rows: 34668
Combined food nutrient rows: 566084


In [4]:
food_nutrient.columns

Index(['id', 'fdc_id', 'nutrient_id', 'amount', 'data_points', 'derivation_id',
       'min', 'max', 'median', 'footnote', 'min_year_acqured', 'sf.footnote',
       'min_year_acquired', 'nutrient_name'],
      dtype='object')

In [8]:
len(food_nutrient)

566084

In [10]:
food.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,"hummus, sabra classic",16.0,2019-04-01
1,319875,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
2,319876,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
3,319877,sub_sample_food,hummus,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01


In [3]:
food_nutrient.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,sf.footnote,min_year_acquired,nutrient_name
0,2201847,319877,1051,56.3,1.0,1.0,,,,,,,,water
1,2201845,319877,1002,1.28,1.0,1.0,,,,,,,,nitrogen
2,2201846,319877,1004,19.0,1.0,1.0,,,,,,,,total lipid (fat)
3,2201844,319877,1007,1.98,1.0,1.0,,,,,,,,ash
4,2201852,319878,1091,188.0,1.0,1.0,,,,,,,,"phosphorus, p"


In [41]:
# How many unique?
unique_descriptions = food["description"].unique()
len(unique_descriptions)

18441

Beautiful, this gives us ~11368 foods to work with as a goal to model. But surely they can be split into less categories?

In [42]:
unique_descriptions[:10]

array(['HUMMUS, SABRA CLASSIC', 'Hummus', 'HUMMUS, OTHER',
       'Hummus - NFY12140O', 'Hummus - NFY12140P', 'Hummus - NFY12140Q',
       'Hummus - NFY12140R', 'Hummus - NFY12140S', 'Hummus - NFY12140F',
       'Hummus - NFY12140G'], dtype=object)

Where do these descriptions come from?

How can we reduce them down to like 10 unique foods and keep it simple...

In [43]:
unique_descriptions[-10:]

array(['Cauliflower, cooked, as ingredient',
       'Eggplant, cooked, as ingredient',
       'Green beans, cooked, as ingredient',
       'Summer squash, cooked, as ingredient',
       'Dark green vegetables as ingredient in omelet',
       'Tomatoes as ingredient in omelet',
       'Other vegetables as ingredient in omelet',
       'Vegetables as ingredient in curry',
       'Sauce as ingredient in hamburgers',
       'Industrial oil as ingredient in food'], dtype=object)

In [44]:
# Find random indexes of food to explore
import random
random_number = random.randint(0, len(unique_descriptions)-10)
unique_descriptions[random_number:random_number+10]

array(['Corned beef sandwich',
       'Reuben sandwich, corned beef sandwich with sauerkraut and cheese, with spread',
       'Pastrami sandwich', 'Roast beef sandwich',
       'Roast beef sandwich, with gravy',
       'Roast beef submarine sandwich, with lettuce, tomato and spread',
       'Roast beef submarine sandwich, with cheese, lettuce, tomato and spread',
       'Roast beef sandwich with cheese',
       'Roast beef sandwich with bacon and cheese sauce',
       'Roast beef submarine sandwich, on roll, au jus'], dtype=object)

### Food Categories

Let's dive into food categories. 

In [9]:
food.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,"hummus, sabra classic",16.0,2019-04-01
1,319875,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
2,319876,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
3,319877,sub_sample_food,hummus,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01


In [46]:
unique_categories = food["food_category_id"].unique()
len(unique_categories)

19

19 different food categories... I wonder what these are?

In [47]:
food["food_category_id"].value_counts()

1.0     6406
9.0     3982
11.0    3788
4.0     2924
16.0    2450
5.0     1503
14.0     918
15.0     913
7.0      795
10.0     613
20.0     588
6.0      568
18.0     488
25.0     474
13.0     454
2.0      386
12.0     267
19.0      54
Name: food_category_id, dtype: int64

In [48]:
# Get food categories
food_cats = pd.read_csv("data/FoodData_Central_Supporting_Data_csv_2021-04-28/food_category.csv")
food_cats

Unnamed: 0,id,code,description
0,1,100,Dairy and Egg Products
1,2,200,Spices and Herbs
2,3,300,Baby Foods
3,4,400,Fats and Oils
4,5,500,Poultry Products
5,6,600,"Soups, Sauces, and Gravies"
6,7,700,Sausages and Luncheon Meats
7,8,800,Breakfast Cereals
8,9,900,Fruits and Fruit Juices
9,10,1000,Pork Products


## 10 foods we want

To keep things simple, we will reduce the databases from FoodData Central to 10 different foods.

Why these foods?

Because we have images for those foods ready to go.

```python
# These aren't whole foods so we don't want them yet, let's get another list and get those
ten_foods = ["chicken_curry", 
"chicken_wings", 
"fried_rice", 
"grilled_salmon", 
"humburger", 
"ice_cream", 
"pizza",
"ramen", 
"steak", 
"sushi"]

# We want these... (they're whole foods) 
ten_whole_foods = ["chicken_wings",
    "apple",
    "banana",
    "beef", # steak, etc
    "carrots",
    "egg", # whole egg
    "strawberries",
    "blueberries",
    "mushrooms",
    "honey"
]
```

In [50]:
ten_whole_foods = ['apple',
 'banana',
 'beef', # steak etc
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg', # whole egg
 'honey',
 'mushrooms',
 'strawberries']
ten_whole_foods

['apple',
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg',
 'honey',
 'mushrooms',
 'strawberries']

In [97]:
food.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,"hummus, sabra classic",16.0,2019-04-01
1,319875,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
2,319876,market_acquisition,"hummus, sabra classic",16.0,2019-04-01
3,319877,sub_sample_food,hummus,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01


In [99]:
# Foundation food is the ground truth for a certain type of food, excludes some details about the food
# E.g. the data_type foundation_food for Chicken will the the original unique ID for chicken
foundation_food = food[(food["data_type"] == "foundation_food") | (food["data_type"] == "survey_fndds_food")]
len(foundation_food)

7278

In [100]:
foundation_food[foundation_food["description"].str.contains("blue")]

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
493,1098003,survey_fndds_food,"cheese, blue or roquefort",,2020-10-30
3560,1101070,survey_fndds_food,"pie, berry, not blackberry, blueberry, boysenb...",,2020-10-30
3561,1101071,survey_fndds_food,"pie, berry, not blackberry, blueberry, boysenb...",,2020-10-30
3562,1101072,survey_fndds_food,"pie, berry, not blackberry, blueberry, boysenb...",,2020-10-30
3563,1101073,survey_fndds_food,"pie, blueberry, two crust",,2020-10-30
3564,1101074,survey_fndds_food,"pie, blueberry, individual size or tart",,2020-10-30
3646,1101156,survey_fndds_food,"crisp, blueberry",,2020-10-30
4255,1101765,survey_fndds_food,cereal (malt-o-meal blueberry muffin tops),,2020-10-30
4295,1101805,survey_fndds_food,cereal (kellogg's special k blueberry),,2020-10-30
5116,1102626,survey_fndds_food,"blueberries, dried",,2020-10-30


In [101]:
foundation_foods = foundation_food["description"]
foundation_foods[20:40]

4153               peanut butter, smooth style, with salt
4329                             cheese, parmesan, grated
4491    cheese, pasteurized process, american, vitamin...
4580    grapefruit juice, white, canned or bottled, un...
4723                                 peaches, yellow, raw
4817    seeds, sunflower seed kernels, dry roasted, wi...
4951      sausage, italian, pork, mild, cooked, pan-fried
5164                  bread, white, commercially prepared
5285          sausage, turkey, breakfast links, mild, raw
5428                                        cheese, swiss
5489    kale, frozen, cooked, boiled, drained, without...
5751    carrots, frozen, unprepared (includes foods fo...
5991                            mustard, prepared, yellow
6198                                figs, dried, uncooked
6339                                kiwifruit, green, raw
6491                              melons, cantaloupe, raw
6650                                      nectarines, raw
6794    orange

In [102]:
# Found a list of the foundation foods we're going to start with!
foundation_foods_list = list(foundation_foods)
for food in foundation_foods_list:
    if "blue" in food:
        print(food)

cheese, blue or roquefort
pie, berry, not blackberry, blueberry, boysenberry, huckleberry, raspberry, or strawberry; two crust
pie, berry, not blackberry, blueberry, boysenberry, huckleberry, raspberry, or strawberry; one crust
pie, berry, not blackberry, blueberry, boysenberry, huckleberry, raspberry, or strawberry, individual size or tart
pie, blueberry, two crust
pie, blueberry, individual size or tart
crisp, blueberry
cereal (malt-o-meal blueberry muffin tops)
cereal (kellogg's special k blueberry)
blueberries, dried
blueberries, raw
blueberries, frozen
blueberry pie filling
blueberry juice
blueberry yogurt dessert, baby food, strained
blue or roquefort cheese dressing
blue or roquefort cheese dressing, light
blue or roquefort cheese dressing, fat free
blueberry syrup


In [212]:
# food.loc[(food["description"].str.contains("chicken", case=False)) & (food["description"].str.contains("drumstick", case=False))][-10:]
# Find chicken in foundation food
for food in foundation_foods:
    if "chicken" in food.lower():
        print(food)

Chicken, broilers or fryers, drumstick, meat only, cooked, braised
Chicken, broiler or fryers, breast, skinless, boneless, meat only, cooked, braised


In [147]:
chicken_wing_id = int(foundation_food.loc[foundation_food["description"].str.contains("Chicken", case=False)].iloc[0]["fdc_id"])
chicken_wing_id

331897

In [148]:
food_nutrient[food_nutrient["fdc_id"] == chicken_wing_id]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,sf.footnote,min_year_acquired,nutrient_name
41650,2259068,331897,1303,0.003,5.0,1.0,0.002,0.004,0.003,,2010.0,,,tfa 16:1 t
41651,2259065,331897,1280,0.008,5.0,1.0,0.008,0.009,0.008,,2010.0,,,pufa 22:5 n-3 (dpa)
41652,2259076,331897,1404,0.045,5.0,1.0,0.035,0.059,0.042,,2010.0,,,"pufa 18:3 n-3 c,c,c (ala)"
41653,2259059,331897,1261,0.002,5.0,1.0,0.001,0.003,0.002,,2010.0,,,sfa 8:0
41654,2259106,331897,1109,0.170,1.0,1.0,,,0.170,,2010.0,,,vitamin e (alpha-tocopherol)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41740,2259066,331897,1300,0.006,5.0,1.0,0.005,0.007,0.006,,2010.0,,,sfa 17:0
41741,2259121,331897,1271,0.088,5.0,1.0,0.083,0.094,0.087,,2010.0,,,pufa 20:4
41742,2259112,331897,1167,5.050,5.0,1.0,4.890,5.240,5.050,,2010.0,,,niacin
41743,2259074,331897,1329,0.021,,4.0,,,,,,,,"fatty acids, total trans-monoenoic"


## Get protein, carb, fat IDs

See this document for info on foundation foods and their nutrients - https://fdc.nal.usda.gov/docs/Foundation_Foods_Documentation_Apr2021.pdf

* Carbohydrate, by difference = total carbohydrates


In [149]:
nutrient[(nutrient["name"].str.contains("protein", case=False)) | \
         (nutrient["name"].str.contains("carbohydrate", case=False)) | \
         (nutrient["name"].str.contains("fat", case=False))]

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
2,1003,Protein,G,203.0,600.0
3,1004,Total lipid (fat),G,204.0,800.0
4,1005,"Carbohydrate, by difference",G,205.0,1110.0
48,1049,"Solids, non-fat",G,253.0,999999.0
49,1050,"Carbohydrate, by summation",G,205.2,1120.0
52,1053,Adjusted Protein,G,257.0,700.0
70,1072,"Carbohydrate, other",G,284.0,
83,1085,Total fat (NLEA),G,298.0,900.0
254,1257,"Fatty acids, total trans",G,605.0,15400.0
255,1258,"Fatty acids, total saturated",G,606.0,9700.0


In [150]:
target_nutrients = nutrient[nutrient["name"].isin(["Protein", "Total lipid (fat)", "Carbohydrate, by difference"])]
target_nutrients

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
2,1003,Protein,G,203.0,600.0
3,1004,Total lipid (fat),G,204.0,800.0
4,1005,"Carbohydrate, by difference",G,205.0,1110.0


In [151]:
target_nutrient_dict = {1003: "protein",
    1004: "fat",
    1005: "carbohydrate"
}

## Get target food protein, fat, carbohydrates

We want to now index on the target foods and the target nutrients and retrieve their values for each food/nutrient.

E.g.

```python
{"food_1": {"protein": 100,
            "carbohydrate": 50,
            "fat": 20},
 "food_2": ...

...}
```

In [152]:
list(target_nutrient_dict.keys())

[1003, 1004, 1005]

In [153]:
food_nutrient

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,sf.footnote,min_year_acquired,nutrient_name
0,2201847,319877,1051,56.300,1.0,1.0,,,,,,,,water
1,2201845,319877,1002,1.280,1.0,1.0,,,,,,,,nitrogen
2,2201846,319877,1004,19.000,1.0,1.0,,,,,,,,total lipid (fat)
3,2201844,319877,1007,1.980,1.0,1.0,,,,,,,,ash
4,2201852,319878,1091,188.000,1.0,1.0,,,,,,,,"phosphorus, p"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460390,13335598,1104592,1265,8.407,,,,,,,,,,sfa 16:0
460391,13335599,1104592,1266,23.263,,,,,,,,,,sfa 18:0
460392,13335595,1104592,1262,0.000,,,,,,,,,,sfa 10:0
460393,13335582,1104592,1178,0.000,,,,,,,,,,vitamin b-12


In [154]:
food_nutrient[(food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,sf.footnote,min_year_acquired,nutrient_name
2,2201846,319877,1004,19.00,1.0,1.0,,,,,,,,total lipid (fat)
16,2201859,319882,1004,18.70,1.0,1.0,,,,,,,,total lipid (fat)
28,2201873,319892,1004,16.60,1.0,1.0,,,,,,,,total lipid (fat)
43,2201886,319899,1004,19.10,1.0,1.0,,,,,,,,total lipid (fat)
97,2201942,319908,1004,18.20,1.0,1.0,,,,,,,,total lipid (fat)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460301,13335483,1104591,1003,1.34,,,,,,,,,,protein
460324,13335484,1104591,1004,22.85,,,,,,,,,,total lipid (fat)
460344,13335549,1104592,1004,100.00,,,,,,,,,,total lipid (fat)
460349,13335550,1104592,1005,0.00,,,,,,,,,,"carbohydrate, by difference"


In [155]:
food_nutrient.dtypes

id                     int64
fdc_id                 int64
nutrient_id            int64
amount               float64
data_points          float64
derivation_id        float64
min                  float64
max                  float64
median               float64
footnote              object
min_year_acqured     float64
sf.footnote          float64
min_year_acquired    float64
nutrient_name         object
dtype: object

In [156]:
# Find nutrition for chicken_wing_id (protein, fat, carb)
food_nutrient[(food_nutrient["fdc_id"] == chicken_wing_id) & (food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))]

Unnamed: 0,id,fdc_id,nutrient_id,amount,data_points,derivation_id,min,max,median,footnote,min_year_acqured,sf.footnote,min_year_acquired,nutrient_name
41686,2259098,331897,1004,5.95,6.0,1.0,5.54,6.33,5.93,,2010.0,,,total lipid (fat)
41718,2259079,331897,1003,23.9,,49.0,23.0,24.6,24.1,,,,,protein
41729,2259099,331897,1005,0.0,,49.0,,,,,,,,"carbohydrate, by difference"


In [158]:
sorted(list(foundation_foods))

['100 grand bar',
 '3 musketeers bar',
 '3 musketeers truffle crisp bar',
 'abalone, cooked, ns as to cooking method',
 'abalone, floured or breaded, fried',
 'abalone, steamed or poached',
 'adobo, with noodles',
 'adobo, with rice',
 'agave liquid sweetener',
 'air filled fritter or fried puff, without syrup, puerto rican style',
 'alcoholic malt beverage',
 'alcoholic malt beverage, higher alcohol, sweetened',
 'alcoholic malt beverage, sweetened',
 'alexander',
 'alfalfa sprouts, raw',
 'alfredo sauce',
 'alfredo sauce with added vegetables',
 'alfredo sauce with meat',
 'alfredo sauce with meat and added vegetables',
 'alfredo sauce with poultry',
 'alfredo sauce with poultry and added vegetables',
 'alfredo sauce with seafood',
 'alfredo sauce with seafood and added vegetables',
 'almond butter',
 'almond butter, lower sodium',
 'almond chicken',
 'almond milk, sweetened',
 'almond milk, sweetened, chocolate',
 'almond milk, unsweetened',
 'almond milk, unsweetened, chocolate',
 

In [301]:
ten_whole_foods = ["chicken_wings",
    "apple",
    "banana",
    "beef", # steak, etc
    "carrots",
    "egg", # whole egg
    "strawberries",
    "blueberries",
    "mushrooms",
    "honey"
]

In [302]:
ten_whole_foods

['chicken_wings',
 'apple',
 'banana',
 'beef',
 'carrots',
 'egg',
 'strawberries',
 'blueberries',
 'mushrooms',
 'honey']

## Get ten whole foods `food_id`

Everything except blueberries and honey are available in `foundation_food`. 

For blueberries and honey, we'll have to dig into the survery data: `data_exploration/data/FoodData_Central_survey_food_csv_2020-10-30`

In [159]:
# Get all food ids from foundation_food (honey and blueberries in another dataset)
target_whole_foods = ['apple', # removed chicken wings... can come back later...
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken',
 'egg',
 'honey',
 'strawberries',
 'mushrooms']

In [160]:
# str.contains can search on regex - https://stackoverflow.com/a/17973255/7900723
pattern = "|".join([f"(?i){food}" for food in target_whole_foods])
pattern

'(?i)apple|(?i)banana|(?i)beef|(?i)blueberries|(?i)carrots|(?i)chicken|(?i)egg|(?i)honey|(?i)strawberries|(?i)mushrooms'

In [161]:
foundation_food[foundation_food["description"].str.contains(pattern, case=False)].sort_values(by=["description"])

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
2159,1099669,survey_fndds_food,almond chicken,,2020-10-30
3003,1100513,survey_fndds_food,"almonds, honey roasted",,2020-10-30
5236,1102746,survey_fndds_food,apple cider,,2020-10-30
6855,1104365,survey_fndds_food,"apple juice beverage, 40-50% juice, light",,2020-10-30
5237,1102747,survey_fndds_food,"apple juice, 100%",,2020-10-30
...,...,...,...,...,...
2003,1099513,survey_fndds_food,"venison or deer, potatoes, and vegetables excl...",,2020-10-30
2002,1099512,survey_fndds_food,"venison or deer, potatoes, and vegetables incl...",,2020-10-30
3043,1100553,survey_fndds_food,"walnuts, excluding honey roasted",,2020-10-30
3044,1100554,survey_fndds_food,"walnuts, honey roasted",,2020-10-30


In [162]:
foundation_food[foundation_food["description"].str.contains("honey")]

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
20191,1105547,foundation_food,"apples, honeycrisp, with skin, raw",9.0,2020-10-30
20547,1750343,foundation_food,"apples, honeycrisp, with skin, raw",9.0,2020-10-30
3003,1100513,survey_fndds_food,"almonds, honey roasted",,2020-10-30
3010,1100520,survey_fndds_food,"cashews, honey roasted",,2020-10-30
3023,1100533,survey_fndds_food,"mixed nuts, honey roasted",,2020-10-30
3032,1100542,survey_fndds_food,"peanuts, honey roasted",,2020-10-30
3037,1100547,survey_fndds_food,"pecans, honey roasted",,2020-10-30
3043,1100553,survey_fndds_food,"walnuts, excluding honey roasted",,2020-10-30
3044,1100554,survey_fndds_food,"walnuts, honey roasted",,2020-10-30
3656,1101166,survey_fndds_food,"sopaipilla, without syrup or honey",,2020-10-30


In [163]:
# Found this earlier
chicken_wing_id

331897

In [164]:
# Map foods to food_id (these have been filtered from larger quantities to smaller quantities)
# For example, if there were 5 kinds of apple, only one was chosen
whole_foods_id_map = {1750339: "apple", # red delicious
    1105314: "banana", # Bananas, ripe and slightly ripe, raw
    1102702: "blueberries", # blueberries, raw	
    746763: "beef", # t-bone steak 
    746764: "carrots", # frozen unprepared
    331897: "chicken_wings", # Chicken, broilers or fryers, drumstick, meat o...	
    329490: "egg", # Egg, whole, dried	
    1103956: "honey", # Honey
    1750347: "mushrooms", # Mushrooms, white button
    747448: "strawberries" # strawberries, raw
}

In [165]:
list(whole_foods_id_map.keys())

[1750339,
 1105314,
 1102702,
 746763,
 746764,
 331897,
 329490,
 1103956,
 1750347,
 747448]

In [166]:
# Find nutrition for eight whole foods
target_whole_foods_df = food_nutrient[(food_nutrient["fdc_id"].isin(list(whole_foods_id_map.keys()))) & \
    (food_nutrient["nutrient_id"].isin(list(target_nutrient_dict.keys())))][["fdc_id", "nutrient_id", "amount"]]
target_whole_foods_df

Unnamed: 0,fdc_id,nutrient_id,amount
34265,329490,1004,39.8
34266,329490,1005,1.87
34270,329490,1003,48.1
41686,331897,1004,5.95
41718,331897,1003,23.9
41729,331897,1005,0.0
71052,746763,1003,27.3
71079,746763,1005,0.0
71097,746763,1004,11.4
71175,746764,1004,0.47


In [167]:
# Pivot the table to how we want it
target_whole_foods_df = target_whole_foods_df.pivot_table("amount", "fdc_id", "nutrient_id")
target_whole_foods_df

nutrient_id,1003,1004,1005
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
329490,48.1,39.8,1.87
331897,23.9,5.95,0.0
746763,27.3,11.4,0.0
746764,0.81,0.47,7.92
747448,0.64,0.22,7.63
1102702,0.74,0.33,14.49
1103956,0.3,0.0,82.4
1105314,0.74,0.29,23.0
1750339,0.1875,0.2125,14.7817
1750347,2.890625,0.3708,4.079375


In [168]:
len(whole_foods_id_map)

10

In [169]:
target_whole_foods_df = target_whole_foods_df.reset_index(drop=False).rename_axis(None, axis=1)
target_whole_foods_df

Unnamed: 0,fdc_id,1003,1004,1005
0,329490,48.1,39.8,1.87
1,331897,23.9,5.95,0.0
2,746763,27.3,11.4,0.0
3,746764,0.81,0.47,7.92
4,747448,0.64,0.22,7.63
5,1102702,0.74,0.33,14.49
6,1103956,0.3,0.0,82.4
7,1105314,0.74,0.29,23.0
8,1750339,0.1875,0.2125,14.7817
9,1750347,2.890625,0.3708,4.079375


In [170]:
target_nutrient_dict

{1003: 'protein', 1004: 'fat', 1005: 'carbohydrate'}

In [171]:
# Rename columns
target_whole_foods_df.rename(columns=target_nutrient_dict, inplace=True)
target_whole_foods_df

Unnamed: 0,fdc_id,protein,fat,carbohydrate
0,329490,48.1,39.8,1.87
1,331897,23.9,5.95,0.0
2,746763,27.3,11.4,0.0
3,746764,0.81,0.47,7.92
4,747448,0.64,0.22,7.63
5,1102702,0.74,0.33,14.49
6,1103956,0.3,0.0,82.4
7,1105314,0.74,0.29,23.0
8,1750339,0.1875,0.2125,14.7817
9,1750347,2.890625,0.3708,4.079375


In [172]:
# Add food names
target_whole_foods_df["food_name"] = target_whole_foods_df["fdc_id"].map(whole_foods_id_map)
target_whole_foods_df

Unnamed: 0,fdc_id,protein,fat,carbohydrate,food_name
0,329490,48.1,39.8,1.87,egg
1,331897,23.9,5.95,0.0,chicken_wings
2,746763,27.3,11.4,0.0,beef
3,746764,0.81,0.47,7.92,carrots
4,747448,0.64,0.22,7.63,strawberries
5,1102702,0.74,0.33,14.49,blueberries
6,1103956,0.3,0.0,82.4,honey
7,1105314,0.74,0.29,23.0,banana
8,1750339,0.1875,0.2125,14.7817,apple
9,1750347,2.890625,0.3708,4.079375,mushrooms


All amounts are per 100g.

## Export first 10 target food nutrition information

In [173]:
target_whole_foods_df.to_csv("target_ten_whole_food_nutrition_info.csv", index=False)

In [174]:
ten_whole_foods

['apple',
 'banana',
 'beef',
 'blueberries',
 'carrots',
 'chicken_wings',
 'egg',
 'honey',
 'mushrooms',
 'strawberries']