In [1]:
import numpy as np
import pandas as pd

# Food nutrient table preprocessing

In [2]:
# Load the four source tables into DataFrames.
# Every file is read with the same encoding, so it is named once here.
CSV_ENCODING = 'latin-1'
df_foodGroup = pd.read_csv("FOOD_GROUP.csv", encoding=CSV_ENCODING)
df_foodName = pd.read_csv("FOOD_NAME.csv", encoding=CSV_ENCODING)
df_nutrientAmt = pd.read_csv("NUTRIENT_AMOUNT.csv", encoding=CSV_ENCODING)
df_nutrientName = pd.read_csv("NUTRIENT_NAME.csv", encoding=CSV_ENCODING)

In [3]:
# Keep ten nutrients: protein, fat, saturated fat, trans fat, carbohydrate,
# energy (kcal), sugars, fibre, sodium and cholesterol.
NutrientIDs = [203,204,205,208,269,291,307,601,605,606]
is_selected = df_nutrientName["NutrientID"].isin(NutrientIDs)
df_nutrientName_filtered = df_nutrientName[is_selected]

# Attach the food group name to every food (default inner join on FoodGroupID).
df_food = df_foodName[["FoodID","FoodGroupID","FoodDescription"]].merge(
    df_foodGroup[["FoodGroupID","FoodGroupName"]],
    on=["FoodGroupID"],
)

# Attach food description and group to every nutrient measurement.
df_combined = df_nutrientAmt[["FoodID","NutrientID","NutrientValue"]].merge(
    df_food[["FoodID","FoodGroupName","FoodDescription"]],
    on=["FoodID"],
)

# Attach nutrient name and unit. Because this is an inner join against the
# filtered name table, it also drops every nutrient that was not selected.
# NutrientID is only needed as the join key, so it is removed afterwards and
# the rows are ordered by food description with a fresh 0..n-1 index.
df_combined = (
    df_combined[["FoodDescription","FoodGroupName","NutrientID","NutrientValue"]]
    .merge(
        df_nutrientName_filtered[["NutrientID","NutrientName","NutrientUnit"]],
        on=["NutrientID"],
    )
    .drop(columns=["NutrientID"])
    .sort_values("FoodDescription")
    .reset_index(drop=True)
)


*   **Total # of food sources**: 5689
*   **Total # of food groups**: 23
*   **Total # of available nutrients**: 10 (protein, fat, saturated fat, trans fat, carbohydrate, energy, sugars, sodium, cholesterol, fibre)



In [4]:
# Build a display string "<value> <unit>" for every measurement row.
nutrient_value_text = df_combined["NutrientValue"].astype(str)
df_combined["NutrientAmt"] = nutrient_value_text + " " + df_combined["NutrientUnit"]

In [86]:
# Distinct nutrient names left after filtering, in order of first appearance.
df_combined["NutrientName"].unique()

array(['SUGARS, TOTAL', 'FAT (TOTAL LIPIDS)', 'CHOLESTEROL',
       'ENERGY (KILOCALORIES)', 'SODIUM', 'PROTEIN',
       'CARBOHYDRATE, TOTAL (BY DIFFERENCE)',
       'FATTY ACIDS, SATURATED, TOTAL', 'FIBRE, TOTAL DIETARY',
       'FATTY ACIDS, TRANS, TOTAL'], dtype=object)

In [5]:
# convert dataframe format (set nutrient names as columns)

# Source nutrient name -> output column label, listed in output column order.
# Nutrients are selected by name, so the result does not depend on the order
# in which the rows of df_combined happen to be sorted.
NUTRIENT_COLUMNS = {
    "ENERGY (KILOCALORIES)": "Calories",
    "FAT (TOTAL LIPIDS)": "Fat",
    "FATTY ACIDS, SATURATED, TOTAL": "Saturated Fat",
    "FATTY ACIDS, TRANS, TOTAL": "Trans Fat",
    "CHOLESTEROL": "Cholesterol",
    "SODIUM": "Sodium",
    "CARBOHYDRATE, TOTAL (BY DIFFERENCE)": "Carbohydrate",
    "SUGARS, TOTAL": "Sugars",
    "FIBRE, TOTAL DIETARY": "Fibre",
    "PROTEIN": "Protein",
}


def _nutrients_to_wide(long_df, nutrient_columns=None):
    """Reshape one-row-per-(food, nutrient) data into one row per food.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Must provide the columns ``FoodDescription``, ``FoodGroupName``,
        ``NutrientName`` and ``NutrientAmt``.
    nutrient_columns : dict, optional
        Maps a ``NutrientName`` value to its output column label; the dict
        order is the output column order. Defaults to ``NUTRIENT_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Food``, ``Category``, ``Servings`` followed by one column per
        nutrient, with a 0..n-1 index. Foods keep their order of first
        appearance in ``long_df``. A nutrient that is not listed for a food is
        NaN. ``Servings`` is the constant ``"100 g"``.

    Notes
    -----
    * A food description that lists the same nutrient more than once is left
      out entirely, because its value would be ambiguous.
    * A food for which every nutrient column is missing is dropped.
    """
    if nutrient_columns is None:
        nutrient_columns = NUTRIENT_COLUMNS
    nutrient_names = list(nutrient_columns)
    output_columns = ["Food", "Category", "Servings", *nutrient_columns.values()]

    rows = long_df[
        long_df["NutrientName"].isin(nutrient_names)
        & long_df["FoodDescription"].notna()
    ]

    # Exclude foods whose description carries a repeated nutrient entry.
    is_repeat = rows.duplicated(["FoodDescription", "NutrientName"], keep=False)
    ambiguous_foods = rows.loc[is_repeat, "FoodDescription"].unique()
    rows = rows[~rows["FoodDescription"].isin(ambiguous_foods)]
    if rows.empty:
        return pd.DataFrame(columns=output_columns)

    # First row of every food: fixes the output order and supplies the category.
    first_rows = rows.drop_duplicates("FoodDescription")
    food_order = first_rows["FoodDescription"].tolist()
    categories = first_rows.set_index("FoodDescription")["FoodGroupName"]

    # A single pivot replaces the per-food filtering and row-by-row concat.
    wide = (
        rows.pivot(index="FoodDescription", columns="NutrientName", values="NutrientAmt")
        .reindex(index=food_order, columns=nutrient_names)
        .rename(columns=nutrient_columns)
        .dropna(how="all")
    )
    wide.columns.name = None
    wide.index.name = "Food"

    wide.insert(0, "Servings", "100 g")
    wide.insert(0, "Category", categories.reindex(wide.index))
    return wide.reset_index()[output_columns]


df_combined_clean = _nutrients_to_wide(df_combined)

In [92]:
# Fruit subset: blank out missing values, relabel the category as "Fruits",
# then display the rows as a list of dicts.
is_fruit = df_combined_clean["Category"] == "Fruits and fruit juices"
df_Fruits = (
    df_combined_clean.loc[is_fruit]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Fruits")
)
df_Fruits.to_dict(orient='records')

[{'Food': 'Abiyuch, raw',
  'Category': 'Fruits',
  'Servings': '100 g',
  'Calories': '69.0 kCal',
  'Fat': '0.1 g',
  'Saturated Fat': '0.014 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '20.0 mg',
  'Carbohydrate': '17.6 g',
  'Sugars': '8.55 g',
  'Fibre': '5.3 g',
  'Protein': '1.5 g'},
 {'Food': 'Acerola (West Indian cherry), raw',
  'Category': 'Fruits',
  'Servings': '100 g',
  'Calories': '32.0 kCal',
  'Fat': '0.3 g',
  'Saturated Fat': '0.068 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '7.0 mg',
  'Carbohydrate': '7.69 g',
  'Sugars': '',
  'Fibre': '1.1 g',
  'Protein': '0.4 g'},
 {'Food': 'Acerola juice, raw',
  'Category': 'Fruits',
  'Servings': '100 g',
  'Calories': '23.0 kCal',
  'Fat': '0.3 g',
  'Saturated Fat': '0.068 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '3.0 mg',
  'Carbohydrate': '4.8 g',
  'Sugars': '4.5 g',
  'Fibre': '0.3 g',
  'Protein': '0.4 g'},
 {'Food': 'Apple juice, canned or bottled, added vita

In [6]:
# Dairy subset: blank out missing values, relabel the category as "Dairy",
# then display the rows as a list of dicts.
is_dairy = df_combined_clean["Category"] == "Dairy and Egg Products"
df_Dairy = (
    df_combined_clean.loc[is_dairy]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Dairy")
)
df_Dairy.to_dict(orient='records')

[{'Food': 'Butter oil, anhydrous',
  'Category': 'Dairy',
  'Servings': '100 g',
  'Calories': '876.0 kCal',
  'Fat': '99.48 g',
  'Saturated Fat': '61.924 g',
  'Trans Fat': '',
  'Cholesterol': '256.0 mg',
  'Sodium': '2.0 mg',
  'Carbohydrate': '0.0 g',
  'Sugars': '0.0 g',
  'Fibre': '0.0 g',
  'Protein': '0.28 g'},
 {'Food': 'Butter, light, salted',
  'Category': 'Dairy',
  'Servings': '100 g',
  'Calories': '509.0 kCal',
  'Fat': '55.1 g',
  'Saturated Fat': '34.321 g',
  'Trans Fat': '',
  'Cholesterol': '106.0 mg',
  'Sodium': '450.0 mg',
  'Carbohydrate': '0.0 g',
  'Sugars': '0.0 g',
  'Fibre': '0.0 g',
  'Protein': '3.3 g'},
 {'Food': 'Butter, light, unsalted',
  'Category': 'Dairy',
  'Servings': '100 g',
  'Calories': '499.0 kCal',
  'Fat': '55.1 g',
  'Saturated Fat': '34.321 g',
  'Trans Fat': '',
  'Cholesterol': '106.0 mg',
  'Sodium': '36.0 mg',
  'Carbohydrate': '0.0 g',
  'Sugars': '0.0 g',
  'Fibre': '0.0 g',
  'Protein': '3.3 g'},
 {'Food': 'Butter, regular',
  'C

In [7]:
# Vegetable subset: blank out missing values, relabel the category as
# "Vegetables", then display the rows as a list of dicts.
is_vegetable = df_combined_clean["Category"] == "Vegetables and Vegetable Products"
df_Vegetables = (
    df_combined_clean.loc[is_vegetable]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Vegetables")
)
df_Vegetables.to_dict(orient='records')

[{'Food': 'Agave, cooked',
  'Category': 'Vegetables',
  'Servings': '100 g',
  'Calories': '135.0 kCal',
  'Fat': '0.29 g',
  'Saturated Fat': '',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '13.0 mg',
  'Carbohydrate': '32.0 g',
  'Sugars': '20.87 g',
  'Fibre': '10.6 g',
  'Protein': '0.99 g'},
 {'Food': 'Agave, dried',
  'Category': 'Vegetables',
  'Servings': '100 g',
  'Calories': '341.0 kCal',
  'Fat': '0.69 g',
  'Saturated Fat': '',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '14.0 mg',
  'Carbohydrate': '81.98 g',
  'Sugars': '50.7 g',
  'Fibre': '15.6 g',
  'Protein': '1.71 g'},
 {'Food': 'Agave, raw',
  'Category': 'Vegetables',
  'Servings': '100 g',
  'Calories': '68.0 kCal',
  'Fat': '0.15 g',
  'Saturated Fat': '',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '14.0 mg',
  'Carbohydrate': '16.23 g',
  'Sugars': '2.58 g',
  'Fibre': '6.6 g',
  'Protein': '0.52 g'},
 {'Food': 'Alfalfa seeds, sprouted, raw',
  'Category': 'Vegetables

In [8]:
# Source food groups that are merged into the single "Grains" category.
grains_list = [
    "Nuts and Seeds",
    "Legumes and Legume Products",
    "Baked Products",
    "Cereals, Grains and Pasta",
]
is_grain = df_combined_clean["Category"].isin(grains_list)
# Blank out missing values, relabel, then display the rows as a list of dicts.
df_Grains = (
    df_combined_clean.loc[is_grain]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Grains")
)
df_Grains.to_dict(orient='records')

[{'Food': 'Bagel, cinnamon-raisin',
  'Category': 'Grains',
  'Servings': '100 g',
  'Calories': '274.0 kCal',
  'Fat': '1.7 g',
  'Saturated Fat': '0.274 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '344.0 mg',
  'Carbohydrate': '55.2 g',
  'Sugars': '5.98 g',
  'Fibre': '2.3 g',
  'Protein': '9.8 g'},
 {'Food': 'Bagel, cinnamon-raisin, toasted',
  'Category': 'Grains',
  'Servings': '100 g',
  'Calories': '294.0 kCal',
  'Fat': '1.8 g',
  'Saturated Fat': '0.295 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '370.0 mg',
  'Carbohydrate': '59.3 g',
  'Sugars': '6.43 g',
  'Fibre': '2.5 g',
  'Protein': '10.6 g'},
 {'Food': 'Bagel, egg',
  'Category': 'Grains',
  'Servings': '100 g',
  'Calories': '278.0 kCal',
  'Fat': '2.1 g',
  'Saturated Fat': '0.421 g',
  'Trans Fat': '',
  'Cholesterol': '24.0 mg',
  'Sodium': '505.0 mg',
  'Carbohydrate': '53.0 g',
  'Sugars': '',
  'Fibre': '2.3 g',
  'Protein': '10.6 g'},
 {'Food': 'Bagel, oat bran',
  'Category

In [14]:
# Source food groups that are merged into the single "Proteins" category.
proteins_list = [
    "Poultry Products",
    "Sausages and Luncheon meats",
    "Pork Products",
    "Beef Products",
    "Finfish and Shellfish Products",
    "Lamb, Veal and Game",
]
is_protein = df_combined_clean["Category"].isin(proteins_list)
# Blank out missing values, relabel, then display the rows as a list of dicts.
df_proteins = (
    df_combined_clean.loc[is_protein]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Proteins")
)
df_proteins.to_dict(orient='records')

[{'Food': 'Agutuk, native, fish with shortening (Alaskan ice cream)',
  'Category': 'Proteins',
  'Servings': '100 g',
  'Calories': '470.0 kCal',
  'Fat': '43.5 g',
  'Saturated Fat': '8.6 g',
  'Trans Fat': '',
  'Cholesterol': '26.0 mg',
  'Sodium': '24.0 mg',
  'Carbohydrate': '10.5 g',
  'Sugars': '',
  'Fibre': '0.0 g',
  'Protein': '9.0 g'},
 {'Food': 'Agutuk, native, fish/berry with seal oil (Alaskan ice cream)',
  'Category': 'Proteins',
  'Servings': '100 g',
  'Calories': '354.0 kCal',
  'Fat': '31.8 g',
  'Saturated Fat': '7.7 g',
  'Trans Fat': '',
  'Cholesterol': '10.0 mg',
  'Sodium': '21.0 mg',
  'Carbohydrate': '13.4 g',
  'Sugars': '',
  'Fibre': '0.5 g',
  'Protein': '3.4 g'},
 {'Food': 'Ascidians, native (tunughnak)',
  'Category': 'Proteins',
  'Servings': '100 g',
  'Calories': '20.0 kCal',
  'Fat': '0.5 g',
  'Saturated Fat': '0.1 g',
  'Trans Fat': '',
  'Cholesterol': '7.0 mg',
  'Sodium': '656.0 mg',
  'Carbohydrate': '0.0 g',
  'Sugars': '0.0 g',
  'Fibre': 

In [15]:
# Beverage subset: blank out missing values, set the category label to
# "Beverages", then display the rows as a list of dicts.
is_beverage = df_combined_clean["Category"] == "Beverages"
df_beverages = (
    df_combined_clean.loc[is_beverage]
    .reset_index(drop=True)
    .fillna('')
    .assign(Category="Beverages")
)
df_beverages.to_dict(orient='records')

[{'Food': 'Alcohol, beer, light (4% alcohol by volume)',
  'Category': 'Beverages',
  'Servings': '100 g',
  'Calories': '29.0 kCal',
  'Fat': '0.0 g',
  'Saturated Fat': '0.0 g',
  'Trans Fat': '0.0 g',
  'Cholesterol': '0.0 mg',
  'Sodium': '4.0 mg',
  'Carbohydrate': '1.64 g',
  'Sugars': '0.09 g',
  'Fibre': '0.0 g',
  'Protein': '0.24 g'},
 {'Food': 'Alcohol, beer, regular, (5% alcohol by volume)',
  'Category': 'Beverages',
  'Servings': '100 g',
  'Calories': '43.0 kCal',
  'Fat': '0.0 g',
  'Saturated Fat': '0.0 g',
  'Trans Fat': '0.0 g',
  'Cholesterol': '0.0 mg',
  'Sodium': '4.0 mg',
  'Carbohydrate': '3.55 g',
  'Sugars': '0.0 g',
  'Fibre': '0.0 g',
  'Protein': '0.46 g'},
 {'Food': 'Alcohol, cocktail, daiquiri (rum), homemade',
  'Category': 'Beverages',
  'Servings': '100 g',
  'Calories': '186.0 kCal',
  'Fat': '0.06 g',
  'Saturated Fat': '0.006 g',
  'Trans Fat': '',
  'Cholesterol': '0.0 mg',
  'Sodium': '5.0 mg',
  'Carbohydrate': '6.94 g',
  'Sugars': '5.58 g',
  