In this notebook, the USDA API is used to fetch nutritional data based on the name of each food.
Previously, the ingredients in the dataset were linked to entries in the USDA database through string matching and manual labeling.

In [1]:
import pandas as pd
from ast import literal_eval

def generic(x):
    """Parse the string form of a Python literal (list, dict, ...) read from a CSV cell."""
    return literal_eval(x)


# Converters for CSV columns that hold serialized Python literals.
# NOTE(review): `conv` is never passed to `pd.read_csv` in the visible cells - confirm it is still needed.
conv = {'nutrition' : generic, 'steps' : generic, 'ingredients' : generic, 'id_column' : generic, 'jaccard_similarity' : generic}

# Raw string: the path mixes "/" and "\", and sequences such as "\D" or "\d" are invalid
# escapes in a normal string literal (a SyntaxWarning on recent Python versions).
# The resulting path is character-for-character the same as before.
df = pd.read_csv(r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/ingredients_labels.csv")
df = df[['ingredient', 'frequency', 'Long_Desc', 'FdGrp_Desc']]
# Keep only ingredients whose frequency is above 49.
df = df[df.frequency>49]

In [2]:
def _usda_get_json(url, params, timeout=30):
    """GET `url` and return the decoded JSON body, or None when the server answers with an error status."""
    import warnings

    # Without a timeout a stalled connection would block the calling loop forever.
    response = requests.get(url, params=params, timeout=timeout)
    if not response.ok:
        # Only the bare URL is reported; `params` holds the API key and must not be logged.
        warnings.warn(f"USDA request to {url} failed with HTTP {response.status_code}")
        return None
    return response.json()


def get_nutritional_info(food_name, api_key):
    """Look up `food_name` in the USDA FoodData Central API and return its major nutrients.

    The first search hit is taken as the match and its detail record is fetched.

    Parameters
    ----------
    food_name : str
        Text sent as the search query.
    api_key : str
        Key sent as the `api_key` query parameter.

    Returns
    -------
    dict or None
        Maps the nutrient names listed in `major_nutrients` to the reported
        amount, for every such nutrient that has an `amount` in the response
        (possibly an empty dict). None when the search has no hits, a request
        is answered with an error status, or the detail record lists no
        nutrients.

    Exceptions raised by `requests.get` itself (for example on a timeout) are
    not caught here.
    """
    search_url = "https://api.nal.usda.gov/fdc/v1/foods/search"

    #Search for food_name in the USDA API
    results = _usda_get_json(search_url, {"api_key": api_key, "query": food_name})

    #Get the ID of the first result
    foods = results.get("foods") if isinstance(results, dict) else None
    if not foods:
        return None
    fdc_id = foods[0]["fdcId"]

    #Fetch nutritional info of the item
    nutrients_url = f"https://api.nal.usda.gov/fdc/v1/food/{fdc_id}"
    nutritional_data = _usda_get_json(nutrients_url, {"api_key": api_key})

    #Only want these entries as other entries are not consistently available and cosine similarity requires everything to be present
    major_nutrients = {
        "Energy": 1008,
        "Protein": 1003,
        "Total lipid (fat)": 1004,
        "Carbohydrate, by difference": 1005,
        "Fiber, total dietary": 1079,
        "Sugars, total including NLEA": 2000,
    }
    # Look up by nutrient id and report under the names above, so every food
    # uses the same keys even if the API labels a nutrient differently.
    name_by_id = {nutrient_id: name for name, nutrient_id in major_nutrients.items()}

    food_nutrients = nutritional_data.get("foodNutrients") if isinstance(nutritional_data, dict) else None
    if not food_nutrients:
        return None

    #Extract the wanted entries
    nutrients = {}
    for entry in food_nutrients:
        # Entries without a "nutrient" description are skipped instead of raising KeyError.
        nutrient_id = (entry.get("nutrient") or {}).get("id")
        if nutrient_id in name_by_id and "amount" in entry:
            nutrients[name_by_id[nutrient_id]] = entry["amount"]

    return nutrients


In [3]:
import pandas as pd
from tqdm import tqdm
import requests
import json


import os

# Raw string: "\D", "\B", "\d" and "\R" are invalid escapes in a normal literal
# (a SyntaxWarning on recent Python versions); the path itself is unchanged.
output_file = r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/nutrition.csv"
unique_df = pd.read_csv(output_file)

# Keep values already stored by an earlier (possibly interrupted) run. Resetting the
# column unconditionally would make the `pd.isna` check below always true and would
# throw away everything fetched so far.
if 'nutrition' not in unique_df.columns:
    unique_df['nutrition'] = None
else:
    # object dtype so JSON strings can be written into a column that was read as all-NaN
    unique_df['nutrition'] = unique_df['nutrition'].astype(object)

# Prefer a key supplied through the environment.
# NOTE(review): the fallback key is committed in the notebook - rotate it and drop the fallback.
api_key = os.environ.get("USDA_API_KEY", "2Gk6PKHQZJraaSdb9y9ghvMwTFCCLarDXogOcec7")

#For tracking progress
tqdm.pandas(desc="Fetching nutritional information")

#Get the nutritional information for each unique item that has none yet
for index, row in tqdm(unique_df.iterrows(), total=len(unique_df)):
    if pd.isna(row['nutrition']):
        nutrition_data = get_nutritional_info(row['Long_Desc'], api_key)
        #Convert to string; a failed or empty lookup is stored as missing
        nutrition_data_str = json.dumps(nutrition_data) if nutrition_data else None
        unique_df.loc[index, 'nutrition'] = nutrition_data_str
        #Update the CSV file after every lookup so an interruption loses no results
        unique_df.to_csv(output_file, index=False)

  0%|          | 0/1074 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# `df` was reduced to ingredient/frequency/Long_Desc/FdGrp_Desc when it was loaded, so it
# has no `nutrition` column; the fetched values are stored on `unique_df`.
unique_df['nutrition'].tolist()

In [None]:
import pandas as pd
# Reload the full label table (all columns, no frequency filter).
# Raw string: avoids the invalid "\D", "\B", "\d", "\R" escapes; the path is unchanged.
df = pd.read_csv(r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/ingredients_labels.csv")

In [None]:
# Number of rows per distinct value of the `label` column.
df['label'].value_counts()

In [None]:
# Raw string: avoids the invalid "\D", "\B", "\d", "\R" escapes; the path is unchanged.
output_file = r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/nutrition.csv"
# Read back the results written by the fetch loop.
nutrition_df = pd.read_csv(output_file)

In [None]:
# Rows for which no nutrition value was stored.
has_no_nutrition = nutrition_df['nutrition'].isna()
failed = nutrition_df[has_no_nutrition]

Some entries are still missing. These will be filled in manually from USDA or, where USDA has removed the entry from its API, from Nutrifox.

In [None]:
# Display the rows that still have no nutrition value.
failed

In [None]:
def get_nutritional_info(food_name, api_key):
    """Return the decoded JSON of a USDA food search for `food_name`, unprocessed.

    NOTE(review): this reuses the name of the nutrient-extracting function defined
    earlier in the notebook and replaces it once this cell runs. Re-running the
    fetch loop afterwards would store raw search results, so re-run the original
    definition first (or give this debugging helper its own name).

    Parameters
    ----------
    food_name : str
        Text sent as the search query.
    api_key : str
        Key sent as the `api_key` query parameter.
    """
    search_url = "https://api.nal.usda.gov/fdc/v1/foods/search"

    # Search for the food item
    search_params = {"api_key": api_key, "query": food_name}
    # A timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(search_url, params=search_params, timeout=30)
    return response.json()

In [None]:
# Query one of the items that could not be fetched and keep the result for inspection.
response = get_nutritional_info(
    food_name='PIZZA HUT 12" Cheese Pizza, Pan Crust',
    api_key=api_key,
)

Some values were missing from USDA. They were found through other sources, such as Nutrifox.

In [21]:
import json

# Nutrient names in the order used by every tuple in `_MANUAL_NUTRITION`; these are the
# same six names listed in `major_nutrients` inside `get_nutritional_info`.
_MANUAL_NUTRIENT_KEYS = (
    "Energy",
    "Protein",
    "Total lipid (fat)",
    "Carbohydrate, by difference",
    "Fiber, total dietary",
    "Sugars, total including NLEA",
)

# USDA `Long_Desc` -> (energy, protein, fat, carbohydrate, fiber, sugars), entered by hand
# for items the API lookup could not provide.
_MANUAL_NUTRITION = {
    'Beef, ground, 70% lean meat / 30% fat, raw': (332, 14.4, 30, 0.0, 0.0, 0.0),
    'Oil, canola': (884, 0, 100, 0.0, 0.0, 0.0),
    'Beef, ground, 93% lean meat / 7% fat, raw': (152, 20.8, 7, 0.0, 0.0, 0.0),
    'PEPPERIDGE FARM, Hamburger Buns w/Sesame': (280, 0, 4, 50, 3, 4),
    'Beef, round, bottom round, roast, separable lean only, trimmed to 0" fat, all grades, cooked, roasted': (185, 27.2, 7.63, 0, 0, 0),
    'Beef, ground, 70% lean meat / 30% fat, crumbles, cooked, pan-browned': (270, 25.6, 17.9, 0, 0, 0),
    'Sauce, pasta, spaghetti/marinara, ready-to-serve': (51, 1.41, 1.48, 8.06, 1.8, 5.5),
    'PAPA JOHN\'S 14" The Works Pizza, Original Crust': (240, 10, 10, 27, 3, 5),
    'Beef, flank, steak, separable lean only, trimmed to 0" fat, choice, raw': (149, 22, 6, 0, 0, 0),
    'Beef, chuck eye roast, boneless, America\'s Beef Roast, separable lean only, trimmed to 0" fat, choice, raw': (139, 21, 6, 0, 0, 0),
    'Beef, round, outside round, bottom round, steak, separable lean and fat, trimmed to 0" fat, choice, raw': (150, 21, 7, 0, 0, 0),
    'Beef, loin, top sirloin cap steak, boneless, separable lean only, trimmed to 1/8" fat, select, raw': (138, 21.3, 5.82, 0, 0, 0),
    'Beef, top sirloin, steak, separable lean and fat, trimmed to 1/8" fat, select, raw': (158, 22.2, 7.07, 0, 0, 0),
    'Soup, beef broth, less/reduced sodium, ready to serve': (6, 1.14, 0.07, 0.2, 0, 0.2),
    'PIZZA HUT 12" Cheese Pizza, Pan Crust': (280, 11.7, 12.6, 29.9, 1.7, 3.21),
    'Beef, loin, top sirloin cap steak, boneless, separable lean only, trimmed to 1/8" fat, choice, cooked, grilled': (189, 28, 8, 0, 0, 0),
    'Beef, tenderloin, steak, separable lean only, trimmed to 1/8" fat, choice, raw': (158, 22.2, 7.07, 0, 0, 0),
    'Lamb, domestic, leg, sirloin half, separable lean only, trimmed to 1/4" fat, choice, raw': (134, 21, 5, 0, 0, 0),
    'CAMPBELL\'S Red and White, Beef Consomme, condensed': (16, 3, 0, 1, 0, 1),
    'Beef, rib eye steak, bone-in, lip-on, separable lean and fat, trimmed to 1/8" fat, choice, cooked, grilled': (313, 23, 25, 0, 0, 0),
    'Pork sausage, link/patty, unprepared': (392, 13.5, 37.2, 0.69, 0.0, 0.53),
    'Beef, ground, 80% lean meat / 20% fat, raw': (254, 17.2, 20, 0, 0.0, 0.0),
    'Beef, chuck, short ribs, boneless, separable lean only, trimmed to 0" fat, choice, raw': (175, 19.4, 10.7, 0.29, 0.0, 0.0),
    'Beef, plate, inside skirt steak, separable lean and fat, trimmed to 0" fat, all grades, cooked, broiled': (268, 28.6, 17.1, 0.0, 0.0, 0.0),
    'Beef, chuck eye steak, boneless, separable lean only, trimmed to 0" fat, choice, raw': (277, 25, 19.6, 0.0, 0.0, 0.0),
    'Beef, round, top round, steak, separable lean only, trimmed to 1/8" fat, choice, raw': (160, 21, 8, 0.0, 0.0, 0.0),
    'Popcorn, sugar syrup/caramel, fat-free': (381, 2, 1.4, 90.1, 2.5, 64.7),
    'Beef, top sirloin, steak, separable lean only, trimmed to 1/8" fat, choice, raw': (131, 22.1, 4.08, 0, 0, 0),
    'Beef, flank, steak, separable lean only, trimmed to 0" fat, choice, cooked, broiled': (194, 27.8, 8.32, 0, 0, 0),
    'Beef, top sirloin, steak, separable lean and fat, trimmed to 0" fat, all grades, cooked, broiled': (183, 30.6, 5.79, 0, 0, 0),
    'Beef, rib, back ribs, bone-in, separable lean only, trimmed to 0" fat, choice, raw': (252, 19, 19, 1, 0, 0),
    'Beef, chuck eye steak, boneless, separable lean and fat, trimmed to 0" fat, choice, raw': (277, 25, 19.6, 0, 0, 0),
    'Lamb, domestic, shoulder, blade, separable lean only, trimmed to 1/4" fat, choice, raw': (279, 35.5, 14.1, 0, 0, 0),
    'Beef, chuck, arm pot roast, separable lean only, trimmed to 1/8" fat, choice, raw': (139, 22, 5.05, 0, 0, 0),
    'Beef, loin, top sirloin cap steak, boneless, separable lean only, trimmed to 1/8" fat, choice, raw': (138, 21.3, 5.82, 0, 0, 0),
}


def _fill_manual_nutrition(frame, manual_values):
    """Write the hand-entered nutrition JSON into `frame['nutrition']`, matching rows on `Long_Desc`.

    Every row whose `Long_Desc` equals a key of `manual_values` receives the JSON string
    built from `_MANUAL_NUTRIENT_KEYS` and that key's amounts. All descriptions are checked
    before anything is written, so a typo cannot leave the frame half-updated.

    Raises
    ------
    IndexError
        If any description has no matching row; the message lists all of them.
    """
    missing = [desc for desc in manual_values if not (frame['Long_Desc'] == desc).any()]
    if missing:
        raise IndexError(f"No row in the nutrition table has Long_Desc equal to: {missing}")

    for desc, amounts in manual_values.items():
        # json.dumps with default separators yields the same '{"Energy": 332, ...}' layout
        # as the strings stored by the fetch loop.
        frame.loc[frame['Long_Desc'] == desc, 'nutrition'] = json.dumps(dict(zip(_MANUAL_NUTRIENT_KEYS, amounts)))


_fill_manual_nutrition(nutrition_df, _MANUAL_NUTRITION)

In [22]:
# Recompute the rows without a nutrition value now that the manual entries are in place.
failed = nutrition_df.loc[nutrition_df['nutrition'].isna()]

In [24]:
# Display the nutrition table after the manual entries were written.
nutrition_df

Unnamed: 0,Long_Desc,FdGrp_Desc,nutrition
0,"Salt, table",Spices and Herbs,"{""Energy"": 0.0, ""Protein"": 0.0, ""Total lipid (..."
1,"Butter, without salt",Dairy and Egg Products,"{""Energy"": 717.0, ""Protein"": 0.85, ""Total lipi..."
2,"Sugars, granulated",Sweets,"{""Energy"": 385.0, ""Protein"": 0.0, ""Total lipid..."
3,"Onions, raw",Vegetables and Vegetable Products,"{""Protein"": 0.86, ""Total lipid (fat)"": 0.08, ""..."
4,"Beverages, water, tap, municipal",Beverages,"{""Energy"": 0.0, ""Protein"": 0.0, ""Total lipid (..."
...,...,...,...
2578,,,"{""Energy"": 252, ""Protein"": 19, ""Total lipid (f..."
2581,,,"{""Energy"": 277, ""Protein"": 25, ""Total lipid (f..."
2612,,,"{""Energy"": 279, ""Protein"": 35.5, ""Total lipid ..."
2648,,,"{""Energy"": 139, ""Protein"": 22, ""Total lipid (f..."


In [26]:
# Raw string: avoids the invalid "\D", "\B", "\d", "\R" escapes; the path is unchanged.
# index=False matches how the fetch loop writes this same file; writing the index here
# would add an "Unnamed: 0" column the next time the file is read with `pd.read_csv`.
nutrition_df.to_csv(r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/nutrition.csv", index=False)

In [37]:
# Raw string: avoids the invalid "\D", "\B", "\d", "\R" escapes; the path is unchanged.
df = pd.read_csv(r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/ingredients_labels.csv")
df = df[['ingredient', 'frequency', 'Long_Desc', 'FdGrp_Desc']]
# Keep only ingredients whose frequency is above 49.
df = df[df.frequency>49]

In [46]:
#Drop rows with NaN values in the merging column
df_nonan = df.dropna(subset=['Long_Desc'])

# Left join: every labelled ingredient is kept, with nutrition attached where available.
merged = df_nonan.merge(nutrition_df, on='Long_Desc', how='left')
# Add the ingredients without a USDA description back in. `DataFrame.append` is
# deprecated and was removed in pandas 2.0; `pd.concat` gives the same result.
merged = pd.concat([merged, df[df['Long_Desc'].isna()]], ignore_index=True)

# Both inputs carry FdGrp_Desc, so the merge produced _x/_y variants; keep the one
# from the ingredient table under the original column name.
merged['FdGrp_Desc'] = merged['FdGrp_Desc_x']
merged = merged.drop(columns=['FdGrp_Desc_x', 'FdGrp_Desc_y'])
merged

  merged = merged.append(df[df['Long_Desc'].isna()], ignore_index=True)


Unnamed: 0,ingredient,frequency,Long_Desc,nutrition,FdGrp_Desc
0,salt,85746.0,"Salt, table","{""Energy"": 0.0, ""Protein"": 0.0, ""Total lipid (...",Spices and Herbs
1,butter,54975.0,"Butter, without salt","{""Energy"": 717.0, ""Protein"": 0.85, ""Total lipi...",Dairy and Egg Products
2,sugar,44535.0,"Sugars, granulated","{""Energy"": 385.0, ""Protein"": 0.0, ""Total lipid...",Sweets
3,onion,39065.0,"Onions, raw","{""Protein"": 0.86, ""Total lipid (fat)"": 0.08, ""...",Vegetables and Vegetable Products
4,water,34914.0,"Beverages, water, tap, municipal","{""Energy"": 0.0, ""Protein"": 0.0, ""Total lipid (...",Beverages
...,...,...,...,...,...
2712,skewers,96.0,,,
2713,toothpicks,84.0,,,
2714,reynolds wrap foil,67.0,,,
2715,parchment paper,61.0,,,


This final dataframe contains each USDA item that has at least one ingredient from the recipe dataset linked to it, together with its nutritional values. Note: the few NaNs belong to items that are not foods.

In [48]:
# Raw string: avoids the invalid "\D", "\B", "\d", "\R" escapes; the path is unchanged.
# NOTE(review): the row index is written as the first, unnamed CSV column - confirm that
# consumers of this file expect it before adding index=False.
merged.to_csv(r"C:/Users/01din\Documents/University\BSc thesis\data\RAW_recipes.csv/ingredients/ingredients_nutrition.csv")