In [2]:
import sys
print(sys.version)
import numpy as np
print(np.__version__)
import pandas as pd
print(pd.__version__)
import matplotlib.pyplot as plt
import re

3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]
1.11.1
0.18.1


In [3]:
# df is the full data set with every field; it originated from searching the
# foodfacts website for US products. We work from Nikki's csv, which includes
# her calories_100g field. The file is tab-separated, hence sep='\t'.

df = pd.read_csv("groomed_food_facts_data.csv", sep='\t')

# Nikki's adjustment for the fields that were being truncated in the display.
# NOTE(review): -1 is the spelling this notebook's pandas (0.18.1, per the
# version cell) was run with; newer pandas releases expect None instead --
# confirm before upgrading.

pd.set_option('display.max_colwidth', -1)
df.head()

Unnamed: 0.1,Unnamed: 0,code,url,creator,created_t,last_modified_t,product_name,generic_name,quantity,packaging,...,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,calories_100g
0,0,5010092093045,http://world.openfoodfacts.org/product/5010092093045,bcatelin,1389309305,1461479010,Soft white,White bread,800g,Plastic bag,...,,,,,,,125.0,-1.0,-1.0,232.4
1,1,44000030377,http://world.openfoodfacts.org/product/0044000030377,openfoodfacts-contributors,1385850411,1459174448,Wheat Thins Original,,258g,,...,,,,,,,,,,
2,2,7832309,http://world.openfoodfacts.org/product/07832309,openfoodfacts-contributors,1403210081,1458995984,Diet Dr Pepper,,,can,...,,,,,,,,,,
3,3,5099353000169,http://world.openfoodfacts.org/product/5099353000169,bcatelin,1385926289,1413659845,Eggs,Eggs,6,Cardbox,...,,,,,,,,,,
4,4,82592720153,http://world.openfoodfacts.org/product/0082592720153,openfoodfacts-contributors,1389308826,1459174499,Green Machine,,15.2 fl. oz (450 mL),,...,,,,,,,,,,58.532


In [4]:
# Columns we want to examine. The working data frame is built up from this
# list (construction) rather than by removing columns from the full set, which
# also lets us add or drop columns here as the analysis evolves.

target_list = [
    'code',
    'product_name',
    'generic_name',
    'carbohydrates_100g',
    'fat_100g',
    'proteins_100g',
    'serving_size',
    'calories_100g',
]


In [5]:
# How many rows carry a value in each target column? This shows how many
# items would remain if rows missing target data were screened out.

for column_name in target_list:
    populated_rows = df[column_name].count()
    print(column_name, ":", populated_rows)


code : 2820
product_name : 2356
generic_name : 592
carbohydrates_100g : 1292
fat_100g : 1295
proteins_100g : 1284
serving_size : 1362
calories_100g : 1270


In [6]:
# Working frame: only the columns named in target_list, selected from the full
# frame df. The last line previews the first rows.
df2 = df[target_list]
df2.head()

Unnamed: 0,code,product_name,generic_name,carbohydrates_100g,fat_100g,proteins_100g,serving_size,calories_100g
0,5010092093045,Soft white,White bread,44.6,2.0,9.0,50g,232.4
1,44000030377,Wheat Thins Original,,,,,,
2,7832309,Diet Dr Pepper,,,,,,
3,5099353000169,Eggs,Eggs,,,,,
4,82592720153,Green Machine,,13.8,0.0,0.833,8 fl oz (240 mL),58.532


In [8]:
# Have a look at some of the rows: a positional slice of df2 starting at
# position 580 and stopping before position 605.

df2.iloc[580:605]

Unnamed: 0,code,product_name,generic_name,carbohydrates_100g,fat_100g,proteins_100g,serving_size,calories_100g
580,12000130274,,,12.3,0.0,0.0,1 Bottle (20 fl oz) (591 mL),49.2
581,810165016415,,,,,,,
582,856820160048,,,,,,,
583,617237641564,,,,,,,
584,8000500026731,,,,,,,
585,38000576089,,,,,,,
586,8000500205167,,,,,,,
587,70896732057,,,,,,,
588,799857655371,,,,,,,
589,810675000225,,,,,,,


In [9]:
# Some of these rows look to be poor quality (many NaN fields); show the row
# at position 581 as an example.

df2.iloc[581]

code                  810165016415
product_name          NaN         
generic_name          NaN         
carbohydrates_100g    NaN         
fat_100g              NaN         
proteins_100g         NaN         
serving_size          NaN         
calories_100g         NaN         
Name: 581, dtype: object

In [10]:
# Drop the rows that have no calories_100g value. The filter is applied to
# df2 directly (instead of indexing df2 with a mask computed on df), so it
# does not rely on the two frames sharing the same index.
# Only calories_100g is checked here; other target columns can still hold NaN.

clean_df2 = df2.dropna(subset=['calories_100g'])
clean_df2.head()

Unnamed: 0,code,product_name,generic_name,carbohydrates_100g,fat_100g,proteins_100g,serving_size,calories_100g
0,5010092093045,Soft white,White bread,44.6,2.0,9.0,50g,232.4
4,82592720153,Green Machine,,13.8,0.0,0.833,8 fl oz (240 mL),58.532
6,9800895250,Nutella,Hazelnut spread with skim milk & cacao,56.8,32.4,5.41,2 tbsp (37 g),540.44
16,71443003903,,,76.2,9.52,2.38,42 g,400.0
20,36632036506,Activia light blueberry,Yaourt,9.73,0.0,3.54,1 container (113g),53.08


In [12]:
# This new frame should be fairly balanced, with not so many NaNs left; count
# the values present in each column to check.
# Note that generic_name was added back because some of the product_name
# values are unhelpful (e.g. "soft white" - ?).
# NOTE(review): the original comment was cut off here ("it's okay then that
# thi"); presumably it is acceptable that generic_name is sparsely filled --
# confirm intent.

clean_df2.count()

code                  1270
product_name          1223
generic_name          418 
carbohydrates_100g    1270
fat_100g              1270
proteins_100g         1270
serving_size          1215
calories_100g         1270
dtype: int64

In [14]:
# At this point we are well positioned to analyse calorie count for any subset
# of our data. Next question, the other nutrients: does a "basket" provide
# adequate nutrition, per IOM RDAs? Go back to the original data set and see
# which nutrient columns are available.
# (The same listing is also useful for picking columns to delete, should the
# working dataframe be built by eliminating columns.)

column_list = df.columns.tolist()

# Keep every column whose name ends in "_100g", then print one name per line.
nutrients_list = [name for name in column_list if re.search('_100g$', name)]

for nutrient_column in nutrients_list:
    print(nutrient_column)

energy_100g
energy-from-fat_100g
fat_100g
saturated-fat_100g
-butyric-acid_100g
-caproic-acid_100g
-caprylic-acid_100g
-capric-acid_100g
-lauric-acid_100g
-myristic-acid_100g
-palmitic-acid_100g
-stearic-acid_100g
-arachidic-acid_100g
-behenic-acid_100g
-lignoceric-acid_100g
-cerotic-acid_100g
-montanic-acid_100g
-melissic-acid_100g
monounsaturated-fat_100g
polyunsaturated-fat_100g
omega-3-fat_100g
-alpha-linolenic-acid_100g
-eicosapentaenoic-acid_100g
-docosahexaenoic-acid_100g
omega-6-fat_100g
-linoleic-acid_100g
-arachidonic-acid_100g
-gamma-linolenic-acid_100g
-dihomo-gamma-linolenic-acid_100g
omega-9-fat_100g
-oleic-acid_100g
-elaidic-acid_100g
-gondoic-acid_100g
-mead-acid_100g
-erucic-acid_100g
-nervonic-acid_100g
trans-fat_100g
cholesterol_100g
carbohydrates_100g
sugars_100g
-sucrose_100g
-glucose_100g
-fructose_100g
-lactose_100g
-maltose_100g
-maltodextrins_100g
starch_100g
polyols_100g
fiber_100g
proteins_100g
casein_100g
serum-proteins_100g
nucleotides_100g
salt_100g
sodium

In [None]:
# here's the website that lists the IOM RDAs, we can pull the data in from there
# https://www.consumerlab.com/RDAs/