In [4]:
# Some code to set up the problem
# Imports the analysis libraries (printing interpreter/library versions for
# reproducibility), loads the Open Food Facts export and takes a first look
# at the columns used in the rest of the notebook.

import sys
print(sys.version)
import numpy as np
print(np.__version__)
import pandas as pd
print(pd.__version__)
import matplotlib.pyplot as plt
import re
import html5lib
# The export is tab-separated despite the .csv extension.
df = pd.read_csv("openfoodfacts_search.csv", sep = '\t')
# NOTE(review): -1 is the legacy "no limit" value for this option; newer pandas
# releases expect None instead -- confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
df.head()



# It appears that pandas is truncating some cell contents when displaying the
# frame, so lift the column-width limit (same setting as already applied above).
pd.set_option('display.max_colwidth', -1)
df.head()
list(df.columns)
# Peek at the energy and carbohydrate columns ...
df['energy_100g'].head()
df['carbohydrates_100g'].head()
# ... and check how many rows have a non-missing value for each macronutrient.
df['carbohydrates_100g'].count()
df['fat_100g'].count()
df['proteins_100g'].count()
df['energy_100g'].count()

### Verify that the nutrition facts agree with the manufacturer's data.
### According to the site's documentation
### (http://world.openfoodfacts.org/data/data-fields.txt),
### a [nutrient]_100g column holds the amount of that nutrient per 100 g or 100 mL.
### Step 1: grab the data for an identifiable product.

df[['code', 'url', 'product_name', 'energy_100g','carbohydrates_100g',
    'fat_100g', 'proteins_100g','serving_size']].iloc[20]

# Step 2: verify the data on the manufacturer's own product page.
import html5lib
site = 'http://www.activia.us.com/probiotic-yogurt/products/activia-light-blueberry'
# Use the first HTML table found on the page, indexed by its
# 'Nutritional Facts' column so rows can be looked up by nutrient name.
# NOTE(review): this fetches a live URL; the page layout may have changed.
blueberry_yougurt = pd.read_html(site)[0].set_index('Nutritional Facts')
blueberry_yougurt

### The calories (energy) figure does not look like 220 kcal per 100 g, but the
### carbohydrate/protein/fat ratio looks about right.
### Convert the label's per-serving macronutrients (113 g serving, hence the
### division by 1.13) to per-100 g values for comparison with the food facts db.
for label_row in ('Protein', 'Total Carbohydrate', 'Total Fat'):
    # Label cells look like '<number>g'; drop the unit before converting.
    grams_per_serving = float(blueberry_yougurt.loc[label_row][0].strip('g'))
    print(grams_per_serving/1.13)

### The total macronutrients per 100 grams posted in the food facts db roughly
### match what is posted on the product's own site.
### Let's check another product, this time at random:
# Fixed seed so the same "random" row is selected on every run.
np.random.seed(4)
df[['code', 'url', 'product_name', 'energy_100g','carbohydrates_100g',
    'fat_100g', 'proteins_100g','serving_size']].iloc[np.random.randint(0,len(df))]

site = 'http://www.hunts.com/nutrition-label?upc=2700037909&inline=false'
# read_html returns a list with one DataFrame per HTML table on the page.
# NOTE(review): this fetches a live URL; the page layout may have changed.
hunts_tomatoes = pd.read_html(site)
#hunts_tomatoes = pd.read_html(site)[1].set_index('Nutrition Facts')
# Show column 0 of the second table found on the page.
hunts_tomatoes[1][0]

### This product's label isn't as nicely formatted, but we can still read the
### key figures off it: per 121 g serving there are 0 g of fat, 9 g of
### carbohydrate and 2 g of protein (45 is the per-serving calorie figure).
# Dividing a per-serving amount by 1.21 rescales it to a 100 g basis.
serving_scale = 1.21
carbs_per_100g, protein_per_100g, cals_per_100gr = (
    per_serving / serving_scale for per_serving in (9, 2, 45)
)
print('carbs: ',carbs_per_100g, 'protein: ', protein_per_100g, 'cals: ', cals_per_100gr)

### Again, we have parity except for the energy field.
### It makes the most sense to add a new calculated field holding total calories per 100 g.
### Calories are a function of the macronutrients and can be found using basic algebra:
### cals = 9*fat + 4*protein + 4*carbohydrates + 7*alcohol
### Let's add in the corrected calories per 100 grams.
def cals(w, x, y, alcohol=0):
    """Estimate kilocalories from macronutrient masses.

    Uses the formula 9*fat + 4*protein + 4*carbohydrate + 7*alcohol.

    Args:
        w: grams of fat.
        x: grams of protein.
        y: grams of carbohydrate.
        alcohol: grams of alcohol; defaults to 0 so existing three-argument
            calls return exactly what they did before.

    Returns:
        The estimated kilocalories for the same quantity of product that the
        inputs describe (e.g. per 100 g when given *_100g values). Only
        ``*`` and ``+`` are used, so scalars, NumPy arrays and pandas Series
        all work; a NaN in any supplied input yields NaN.
    """
    return (9*w) + (4*(x + y)) + (7*alcohol)

# cals() is plain arithmetic (* and +), so it can be applied to whole columns
# at once. This replaces np.vectorize, which is only a per-element Python loop
# and raises on empty input. Rows with a missing macronutrient stay NaN.
df['calories_100g'] = cals(df['fat_100g'], df['proteins_100g'], df['carbohydrates_100g'])

#verify against a known product:
df[['code', 'url', 'product_name', 'energy_100g','carbohydrates_100g',
    'fat_100g', 'proteins_100g','serving_size', 'calories_100g']].iloc[20]

# Scale the per-100 g estimate back up to the label's 113 g serving.
df['calories_100g'].iloc[20]*1.13

### This value is nearly identical to the value listed for 113 grams on Activia's website,
### so it is safe to use this calorie value in place of the energy_100g field.
df.head()

df['calories_100g'].describe()
df['proteins_100g'].describe()
df['fat_100g'].describe()
df['carbohydrates_100g'].describe()
### Save the df with the new calories column (tab-separated).
# NOTE(review): the '.cs' extension looks like a typo for '.csv' (or '.tsv');
# left unchanged so the output path stays the same -- confirm before renaming.
df.to_csv('groomed_food_facts_data.cs', sep = '\t')

### According to the source material, the following column gives information
### about the total nutritive value of a product:
###   nutrition-score-uk_100g
### From the Food Standards Agency (food.gov.uk):
###   A food is classified as 'less healthy' where it scores 4 points or more.
###   A drink is classified as 'less healthy' where it scores 1 point or more.
### So a product's score is like golf: the lower the better.
### The score can be verified using the information in this document:
### http://www.food.gov.uk/sites/default/files/multimedia/pdfs/techguidenutprofiling.pdf
df['nutrition-score-uk_100g'].describe()

### Let's check whether a score exists for the yogurt example (row 20).
# Verify against a known product:
df[['code', 'url', 'product_name', 'energy_100g','carbohydrates_100g',
    'fat_100g', 'proteins_100g','serving_size', 'calories_100g', 'nutrition-score-uk_100g']].iloc[20]

# Investigate quantity: does it contain the number of servings per unit or the
# total size of the unit?
df['quantity'].head(20)
df['quantity'].value_counts()
# Investigate categories
df['categories'].value_counts()

df['categories_tags'].value_counts()

### How often are the categories present?
# Series.count() already ignores missing values. The earlier `!= np.nan` mask
# was a no-op because NaN compares unequal to everything (including itself),
# so every row passed the filter; the resulting number is unchanged.
df['categories'].count()
# Subset the data to only include rows with macronutrients and categories
df.head()

list(df.columns)

### Pare the frame down to the columns that matter for the analysis:
### identifiers, free-text fields, macro/micronutrients, the UK nutrition
### score and the derived calories column.
keep_cols = ['code', 'product_name', 'generic_name','categories','categories_tags', 'ingredients_text','main_category',
            'fat_100g', 'saturated-fat_100g','monounsaturated-fat_100g', 'polyunsaturated-fat_100g','omega-3-fat_100g',
            'omega-6-fat_100g', 'omega-9-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g','fiber_100g',
            'proteins_100g','salt_100g', 'sodium_100g', 'alcohol_100g', 'vitamin-a_100g','vitamin-c_100g','potassium_100g',
             'chloride_100g', 'calcium_100g', 'iron_100g', 'fruits-vegetables-nuts_100g', 'nutrition-score-uk_100g', 'calories_100g'
            ]

df_keep = df[keep_cols]
# NOTE(review): 'sdf_keep' looks like a typo for 'df_keep'; as written the
# NaN-filtered frame is never used, and df_keep is replaced by the file read
# on the next line anyway. Left as-is pending confirmation of the intent.
sdf_keep = df_keep.dropna(subset=['calories_100g'])
df_keep = pd.read_csv("valid_entries.tsv", sep = "\t")
# Split one product's ingredient list (row label 1) into upper-cased pieces.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); on the
# default integer index produced by read_csv both select the same row.
df_ing = df_keep.loc[1]["ingredients_text"].upper().split(',')



3.5.2 |Anaconda 4.1.1 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
1.11.1
0.18.1
3.5398230088495577
9.734513274336283
0.0
carbs:  7.43801652892562 protein:  1.6528925619834711 cals:  37.1900826446281




In [5]:
# Solution to Problem 2
# Flags every product whose ingredient list mentions one of the keywords and
# reports per-keyword and overall frequencies.


def _scan_ingredients(ingredient_texts, keywords):
    """Scan ingredient strings once and tally case-insensitive keyword hits.

    Args:
        ingredient_texts: iterable of ingredient-list strings, one per
            product. A value equal to 'x' (any case) is the placeholder for
            a missing ingredient list.
        keywords: keyword strings to look for.

    Returns:
        A tuple ``(flags, key_counts, total, missing)`` where ``flags`` has a
        1 for every product containing at least one keyword (else 0),
        ``key_counts[k]`` is the number of products containing
        ``keywords[k]``, ``total`` is the number of flagged products and
        ``missing`` is the number of placeholder entries.
    """
    needles = [kw.upper() for kw in keywords]
    flags = []
    key_counts = [0] * len(needles)
    missing = 0
    for text in ingredient_texts:
        # A keyword must occur inside a single comma-separated ingredient.
        parts = text.upper().split(',')
        if parts == ['X']:
            missing += 1
        hit = 0
        for k, needle in enumerate(needles):
            if any(needle in part for part in parts):
                key_counts[k] += 1
                hit = 1
        flags.append(hit)
    return flags, key_counts, sum(flags), missing


def _percent(count, total):
    """Return 100*count/total as a float, or NaN when total is 0."""
    if not total:
        return float('nan')
    return float(100)*float(count)/float(total)


# Missing values become the placeholder 'x' so .upper() works on every row.
# NOTE(review): this fills *all* columns (numeric ones included) with 'x';
# kept as-is because later cells may rely on it.
df_keep.fillna('x', inplace=True)

#Insert Keyword Here
keywords = ["corn syrup","hydrogenated", "high fructose"]

# One pass over the ingredients column instead of re-reading every row via the
# removed .ix indexer once per keyword.
flag, KeyCounter, TotalCounter, NumberNANs = _scan_ingredients(
    df_keep["ingredients_text"], keywords)

# Print Flag array 0 means no Keyword and 1 contains keyword
print(flag)

# Number of products that actually have an ingredient list.
valid_products = df_keep.shape[0]-NumberNANs

# Print Statistics
print('Total number of ingredients (without NANs) in this database: ', valid_products)
#print('Number of invalid ingredients in this database: ', NumberNANs )
for keyword, count in zip(keywords, KeyCounter):
    print('Frequency of', keyword, ' is: ', count)
#print('Frequency of corn syrup,hydrogenated, high fructose in the database : ',KeyCounter)
print('TotalKeys: ', TotalCounter )


for keyword, count in zip(keywords, KeyCounter):
    print('Percentage of recipes containing ', keyword,' is: ', _percent(count, valid_products),'%')
#print('Frequency of corn syrup,hydrogenated, high fructose in the database : ',KeyCounter)
print('Percentage of recipes containing either ', keywords, ' is ', _percent(TotalCounter, valid_products),'%' )




[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 