# Exploratory Data Analysis - World Foods Dataset

## Reading in the Data

In [18]:
import pandas as pd
import numpy as np

In [19]:
food = pd.read_csv("en.openfoodfacts.org.products.tsv", sep='\t',low_memory=False)

In [20]:
food.shape

(333824, 162)

In [21]:
cols = list(food.columns)
cols

['code',
 'url',
 'creator',
 'created_t',
 'created_datetime',
 'last_modified_t',
 'last_modified_datetime',
 'product_name',
 'generic_name',
 'quantity',
 'packaging',
 'packaging_tags',
 'brands',
 'brands_tags',
 'categories',
 'categories_tags',
 'categories_en',
 'origins',
 'origins_tags',
 'manufacturing_places',
 'manufacturing_places_tags',
 'labels',
 'labels_tags',
 'labels_en',
 'emb_codes',
 'emb_codes_tags',
 'first_packaging_code_geo',
 'cities',
 'cities_tags',
 'purchase_places',
 'stores',
 'countries',
 'countries_tags',
 'countries_en',
 'ingredients_text',
 'allergens',
 'allergens_en',
 'traces',
 'traces_tags',
 'traces_en',
 'serving_size',
 'no_nutriments',
 'additives_n',
 'additives',
 'additives_tags',
 'additives_en',
 'ingredients_from_palm_oil_n',
 'ingredients_from_palm_oil',
 'ingredients_from_palm_oil_tags',
 'ingredients_that_may_be_from_palm_oil_n',
 'ingredients_that_may_be_from_palm_oil',
 'ingredients_that_may_be_from_palm_oil_tags',
 'nutritio

In [22]:
food.count().sort_values(ascending = False)

last_modified_t                            333824
last_modified_datetime                     333824
creator                                    333822
created_t                                  333821
created_datetime                           333815
code                                       333800
url                                        333800
states_en                                  333776
states_tags                                333776
states                                     333776
countries_tags                             333559
countries_en                               333559
countries                                  333559
product_name                               316169
brands                                     304629
brands_tags                                304617
energy_100g                                272870
proteins_100g                              271651
salt_100g                                  267230
sodium_100g                                267183


The data is very sparse and there are a lot of missing values.

One thing to look at is the meaning of NaN - For example, not all foods have caffeine, so a NaN under caffeine may just mean that food does not have caffeine.

Let's check against some known foods with caffeine.

In [23]:
foodnames = food.product_name.dropna()

foodnames[foodnames.str.contains('kale')]

108670    Veggie-Go's, Tropikale Organic Chewy Fruit & V...
118446                   Crispy cacao & cinnamon kale chips
118447                   Crispy cacao & cinnamon kale chips
124570                                           Green kale
199947     Juice apple pineapple mango passion fruit & kale
263787                                Jus kale-pomme-laitue
267371    Rahm Gemüse rustikale Karotten mit Rosenkohl &...
270681    Geschenkpackung Adventskalender Winterlandscha...
303841                           Rustikales Filet-Pfännchen
306927                                  Sweet potato & kale
333461               Chewy Fruit & Veggie Snacks, Tropikale
Name: product_name, dtype: object

In [24]:
# Take 3 coffees
coffees = foodnames[foodnames.str.contains('coffee')].index.values

food.iloc[coffees]['caffeine_100g']

14056    NaN
66044    NaN
86084    NaN
109446   NaN
110074   NaN
112282   NaN
112854   NaN
155749   NaN
170699   NaN
180321   NaN
193046   NaN
210244   NaN
238047   NaN
269676   NaN
270929   NaN
279192   NaN
286994   NaN
294225   NaN
312181   NaN
314591   NaN
324653   NaN
325175   NaN
326239   NaN
326241   NaN
329232   NaN
330466   NaN
330592   NaN
331770   NaN
332284   NaN
333163   NaN
333600   NaN
Name: caffeine_100g, dtype: float64

This means that the data are generally incomplete, and that NaN should not be treated as a 0.

Some other difficulties with the data

In [25]:
foodnames[foodnames.str.contains('acai')]

119051                                      Seasoned Zhacai
250594    Biscuiterie de Provence Petit Biscuits Voyageu...
Name: product_name, dtype: object

If the name of a food is contained in another food, it may show up. In this instance, acai is a 'superfood' from Brazil. But zhacai is a pickled Chinese vegetable.

Since this is a world food database, foods are in all different languages.

In [26]:
foodnames[foodnames.str.contains('')].head()

0                Farine de blé noir
1    Banana Chips Sweetened (Whole)
2                           Peanuts
3            Organic Salted Nut Mix
4                   Organic Polenta
Name: product_name, dtype: object

Some foods use the same word, but they have different meanings. In British English a biscuit usually refers to what Americans consider a cookie, while American biscuits are more breadlike.

A more extreme case is the same word in different languages. For example, in English, Paprika is a spice, but in German, paprika is the word for bell pepper.

In [27]:
foodnames[foodnames.str.contains('paprika')].head()

112278                                       smoked paprika
113193                    Boulettes de bœuf oignons paprika
115362                  Papillotes poulet paprika et tomate
176608           Bœuf au paprika, rouleaux de riz et daïkon
176609    Bœuf au paprika, daïkon, chou pak choï et caro...
Name: product_name, dtype: object

In [28]:
def display(i):
    print(cols[i], i, food[cols[i]].count())

In [29]:
display(151)

taurine_100g 151 31


Some columns that would be of interest
product_name, index = 7, count = 316169: Product Name, in language, 
generic_name, index = 8, count = 55063: Sometimes a translation for the product
quantity, index = 9, count = 110551, weight
categories_en, index = 16, count = 90786: Tags of what the food is, separated by commas
labels_en, index = 23, count = 51157: Certain features of the food, like vegetarian, gluten-free, etc.,
countries_en, index = 33, count = 333559: Country of Origin
ingredients_text, index = 34, count = 260990: Ingredients in the food, separated by commas, in native language
traces_en, index = 39, count = 25876: Ingredients it contains traces of, in English
serving_size, index = 40, count = 213266: Serving Size
additives_n, index = 42, count = 260966: Perhaps some additive value
additives, index = 43, count = 260929: Shows additives (?) seems like foods
additives_en, index = 45, count = 159101: Shows chemical additives of foods
ingredients_from_palm_oil_n, index = 46, count = 260966
nutrition_grade_fr, index = 53, count = 232858: Some French nutrition grade
pnns_groups_1, index = 54, count = 104284: Food under certain categories - sugar snacks, fish meat eggs, beverages, etc.,
pnns_groups_2, index = 55, count = 107472: Usually a further clarification of groups_1
main_category_en, index = 60, count = 90740: Sometimes less clear than pnns_groups_2

## energy_100g, index = 63, count = 272870: calories
energy-from-fat_100g, index = 64, count = 858: cals from fat
## fat_100g, index = 65, count = 256087: fat
## saturated-fat_100g 66 241295
## monounsaturated-fat_100g 81 22893
## polyunsaturated-fat_100g 82 22926
omega-3-fat_100g 83 869
trans-fat_100g 99 143328
cholesterol_100g 100 144100
## carbohydrates_100g 101 255794
## sugars_100g 102 256687
## fiber_100g 111 207889
## proteins_100g 112 271651
## salt_100g 116 267230
## sodium_100g 117 267183
alcohol_100g 118 4165
vitamin-a_100g 119 137592
vitamin-d_100g 121 7102
vitamin-e_100g 122 1378
vitamin-k_100g 123 921
vitamin-c_100g 124 140928
vitamin-b1_100g 125 11194
vitamin-b2_100g 126 10859
vitamin-pp_100g 127 11764
vitamin-b6_100g 128 6821
vitamin-b9_100g 129 5265
folates_100g 130 3045
vitamin-b12_100g 131 5336
pantothenic-acid_100g 133 2504
potassium_100g 136 24781
calcium_100g 138 141196
phosphorus_100g 139 5875
iron_100g 140 140522
magnesium_100g 141 6301
zinc_100g 142 3941
copper_100g 143 2112
manganese_100g 144 1626
selenium_100g 146 1171
fruits-vegetables-nuts_100g 153 3155
cocoa_100g 155 1009
## nutrition-score-fr_100g 158 232859
## nutrition-score-uk_100g 159 232859

biotin_100g 132 336

In [30]:
-alpha-linolenic-acid_100g 84 189
omega-6-fat_100g 87 192
-linoleic-acid_100g 88 154
-lactose_100g 106 266
starch_100g 109 275
polyols_100g 110 431

chloride_100g 137 167
iodine_100g 149 271

SyntaxError: invalid syntax (<ipython-input-30-7328dc65c4c9>, line 1)

In [None]:
food2 = food[np.isfinite(food['energy_100g'])]
food2 = food2[np.isfinite(food2['fat_100g'])]
food2 = food2[np.isfinite(food2['saturated-fat_100g'])]
food2 = food2[np.isfinite(food2['monounsaturated-fat_100g'])]
food2 = food2[np.isfinite(food2['polyunsaturated-fat_100g'])]
food2 = food2[np.isfinite(food2['carbohydrates_100g'])]
food2 = food2[np.isfinite(food2['sugars_100g'])]
food2 = food2[np.isfinite(food2['fiber_100g'])]
food2 = food2[np.isfinite(food2['proteins_100g'])]
food2 = food2[np.isfinite(food2['sodium_100g'])]


In [None]:
dff = food2[['product_name', 'generic_name', 'countries_en', 'energy_100g', 
       'fat_100g', 'saturated-fat_100g', 'monounsaturated-fat_100g', 'polyunsaturated-fat_100g',
       'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'sodium_100g', 'salt_100g',
       'ingredients_from_palm_oil_n', 'cholesterol_100g', 'trans-fat_100g', 'iron_100g', 'calcium_100g',
       'vitamin-a_100g', 'vitamin-c_100g', 'potassium_100g', 'vitamin-pp_100g', 'vitamin-b2_100g',
       'vitamin-b1_100g', 'vitamin-b9_100g', 'vitamin-b6_100g', 'phosphorus_100g', 'magnesium_100g',
       'vitamin-d_100g', 'vitamin-b12_100g', 'zinc_100g']]

dff['product_name'] = dff['product_name'].str.lower()

In [None]:
dff.shape

In [None]:
dff.median()

In [None]:
dff.mean()

In [None]:
dff['product_name']

In [None]:
foodnames2[foodnames2.str.contains('Green Tea')]

In [None]:
dff = dff[pd.notnull(dff['product_name'])]

In [None]:
dff['healthy'] = 0

# Healthy Food List + 'Superfoods'

Done
'berr'
kiwi
apple
pineapple
mango
lemon
lime
orange
banana
watermelon
coconut
ginger

green tea

tomato
spinach
kale
artichoke
mushroom
broccoli
asparagus
beet
pepper
chard





acai
quinoa
flax
chia
avocado
bok choy
#oat
salmon
edamame
lentil
greek yogurt
kefir
barley
egg
pistachio
almond
ginger
bean
pumpkin
garlic
cauliflower
hemp
wheatgrass
walnut
turkey
herring
sardine
mackerel
soy
barley
brazil nut
seaweed
goji
sauerkraut
cherr
spirulina
parsley
dark chocolate
olive oil
grapefruit
carrot
cilantro
brussels sprout
buckwheat
onion
honey
sweet potato
turmeric
apricot
pumpkin

sprout

Not in data
papaya

In [None]:
dff

In [None]:
# clean out the sweet fruits

dff.loc[(dff.product_name.str.contains('berr') | dff.product_name.str.contains('kiwi')
      | dff.product_name.str.contains('mango') | dff.product_name.str.contains('pineapple')
      | dff.product_name.str.contains('orange') | dff.product_name.str.contains('banana')
      | dff.product_name.str.contains('watermelon') | dff.product_name.str.contains('coconut')
      | dff.product_name.str.contains('ginger') | dff.product_name.str.contains('pomegranate')  
      | dff.product_name.str.contains('cherr') | dff.product_name.str.contains('apricot') 
      | dff.product_name.str.contains('grapefruit') | dff.product_name.str.contains('acai')
            ) &
        
    (~dff.product_name.str.contains('muffin')
    & ~dff.product_name.str.contains('cake') & ~dff.product_name.str.contains('waffle')
    & ~dff.product_name.str.contains('biscuit') & ~dff.product_name.str.contains('bagel')
    & ~dff.product_name.str.contains('scone') & ~dff.product_name.str.contains('pie')
    & ~dff.product_name.str.contains('sugar') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('donut') & ~dff.product_name.str.contains('strudel')
    & ~dff.product_name.str.contains('cream') & ~dff.product_name.str.contains('pastries')
    & ~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('kake')
    & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('tart')
    & ~dff.product_name.str.contains('dessert') & ~dff.product_name.str.contains('jam')
    & ~dff.product_name.str.contains('icee') & ~dff.product_name.str.contains('drink mix')
    & ~dff.product_name.str.contains('peanut butter') & ~dff.product_name.str.contains('granola')
    & ~dff.product_name.str.contains('danish') & ~dff.product_name.str.contains('sweetened')
    & ~dff.product_name.str.contains('pudding') & ~dff.product_name.str.contains('bread')
    & ~dff.product_name.str.contains('cereal') & ~dff.product_name.str.contains('trail mix')
    & ~dff.product_name.str.contains('vinaigrette') & ~dff.product_name.str.contains('bars')
    & ~dff.product_name.str.contains('bar') & ~dff.product_name.str.contains('dressing')
    & ~dff.product_name.str.contains('pretzels') & ~dff.product_name.str.contains('flakes')
    & ~dff.product_name.str.contains('macaron') & ~dff.product_name.str.contains('macaroon')               
    & ~dff.product_name.str.contains('crunch') & ~dff.product_name.str.contains('bun')
    & ~dff.product_name.str.contains('lollipop') & ~dff.product_name.str.contains('syrup')
    & ~dff.product_name.str.contains('preserve') & ~dff.product_name.str.contains('nectar')
    & ~dff.product_name.str.contains('pastry') & ~dff.product_name.str.contains('spread')  
    & ~dff.product_name.str.contains('bites') & ~dff.product_name.str.contains('crisp')   
    & ~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('candied') 
    & ~dff.product_name.str.contains('milk chocolate') & ~dff.product_name.str.contains('fruit snacks')
    & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('crumbler')
    & ~dff.product_name.str.contains('sorbetto') & ~dff.product_name.str.contains('juice')     
    & ~dff.product_name.str.contains('snaks') & ~dff.product_name.str.contains('vinegar')
    & ~dff.product_name.str.contains('white chocolate') & ~dff.product_name.str.contains('rings')
    & ~dff.product_name.str.contains('drink') & ~dff.product_name.str.contains('pop')
    & ~dff.product_name.str.contains('smoothie') & ~dff.product_name.str.contains('special k')
    & ~dff.product_name.str.contains('snack') & ~dff.product_name.str.contains('snap')
    & ~dff.product_name.str.contains('delight') & ~dff.product_name.str.contains('soup')
    & ~dff.product_name.str.contains('twist') & ~dff.product_name.str.contains('crumble')
    & ~dff.product_name.str.contains('maraschino') & ~dff.product_name.str.contains('marmalade')
    & ~dff.product_name.str.contains('square')
                                     
                                    )
        , 'healthy'] = 1

# clean out veggies
dff.loc[(dff.product_name.str.contains('tomato') | dff.product_name.str.contains('spinach')
    | dff.product_name.str.contains('kale') | dff.product_name.str.contains('carrot')
    | dff.product_name.str.contains('artichoke') | dff.product_name.str.contains('mushroom')        
    | dff.product_name.str.contains('broccoli') | dff.product_name.str.contains('chard') 
    | dff.product_name.str.contains('asparagus') | dff.product_name.str.contains('beet')
    | dff.product_name.str.contains('pepper') | dff.product_name.str.contains('chard')
    | dff.product_name.str.contains('edamame') | dff.product_name.str.contains('pumpkin')
    | dff.product_name.str.contains('cauliflower') | dff.product_name.str.contains('chard')
    | dff.product_name.str.contains('cilantro') | dff.product_name.str.contains('sweet potato')         
    | dff.product_name.str.contains('pumpkin') | dff.product_name.str.contains('sprout')
    | dff.product_name.str.contains('onion') | dff.product_name.str.contains('garlic') 
    | dff.product_name.str.contains('sauerkraut') 
             
            )
    &
    (~dff.product_name.str.contains('juice') & ~dff.product_name.str.contains('chip')
    & ~dff.product_name.str.contains('ravioli') & ~dff.product_name.str.contains('cake')
    & ~dff.product_name.str.contains('soup') & ~dff.product_name.str.contains('pasta')
    & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('cream')
    & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('penne')
    & ~dff.product_name.str.contains('wrap') & ~dff.product_name.str.contains('ketchup')
    & ~dff.product_name.str.contains('puff') & ~dff.product_name.str.contains('sausage')
    & ~dff.product_name.str.contains('pizza') & ~dff.product_name.str.contains('popcorn')
    & ~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('ring') & ~dff.product_name.str.contains('frie')
    & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('crisp')
    & ~dff.product_name.str.contains('bar') & ~dff.product_name.str.contains('stick')
    & ~dff.product_name.str.contains('cocktail') & ~dff.product_name.str.contains('cracker')
    & ~dff.product_name.str.contains('dressing') & ~dff.product_name.str.contains('bite')
    & ~dff.product_name.str.contains('bread') & ~dff.product_name.str.contains('rind')
    & ~dff.product_name.str.contains('toast') & ~dff.product_name.str.contains('granola')
    & ~dff.product_name.str.contains('tortilla') & ~dff.product_name.str.contains('dip')
    & ~dff.product_name.str.contains('patties') & ~dff.product_name.str.contains('patty')
    & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('muffin')
    & ~dff.product_name.str.contains('salsa') & ~dff.product_name.str.contains('loaf')
    & ~dff.product_name.str.contains('snack') & ~dff.product_name.str.contains('bagel')
    & ~dff.product_name.str.contains('flour') & ~dff.product_name.str.contains('bun')
    & ~dff.product_name.str.contains('fudge') & ~dff.product_name.str.contains('brownie')
    & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('paste')
    & ~dff.product_name.str.contains('knot') & ~dff.product_name.str.contains('naan')
    & ~dff.product_name.str.contains('seasoning') & ~dff.product_name.str.contains('macaroni')
    & ~dff.product_name.str.contains('creme') & ~dff.product_name.str.contains('trail mix')
    & ~dff.product_name.str.contains('baguette') & ~dff.product_name.str.contains('donut')
    & ~dff.product_name.str.contains('chocolate') & ~dff.product_name.str.contains('cheese')
    & ~dff.product_name.str.contains('burger') & ~dff.product_name.str.contains('pie')
    & ~dff.product_name.str.contains('risotto') & ~dff.product_name.str.contains('sandwich')
    ), 'healthy'] = 1

# fruits with savory potential
dff.loc[(dff.product_name.str.contains('lemon') | dff.product_name.str.contains('lime')
    | dff.product_name.str.contains('apple') | dff.product_name.str.contains('avocado') )
    &
    (~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('cake')
    & ~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('noodle soup')
    & ~dff.product_name.str.contains('dessert') & ~dff.product_name.str.contains('drink mix')
    & ~dff.product_name.str.contains('pie') & ~dff.product_name.str.contains('ice cream')
    & ~dff.product_name.str.contains('kake') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('ramen') & ~dff.product_name.str.contains('drop')
    & ~dff.product_name.str.contains('macaroon') & ~dff.product_name.str.contains('macaron')
    & ~dff.product_name.str.contains('tart') & ~dff.product_name.str.contains('lemonade')  
    & ~dff.product_name.str.contains('vinaigrette') & ~dff.product_name.str.contains('salsa')
    & ~dff.product_name.str.contains('cereal') & ~dff.product_name.str.contains('snack')
    & ~dff.product_name.str.contains('bagel') & ~dff.product_name.str.contains('sausage')
    & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('bar')
    & ~dff.product_name.str.contains('crisp') & ~dff.product_name.str.contains('nectar')
    & ~dff.product_name.str.contains('danish') & ~dff.product_name.str.contains('streusel')
    & ~dff.product_name.str.contains('crunch') & ~dff.product_name.str.contains('dressing')        
    & ~dff.product_name.str.contains('granola') & ~dff.product_name.str.contains('roll')
    & ~dff.product_name.str.contains('milk chocolate') & ~dff.product_name.str.contains('donut')
    & ~dff.product_name.str.contains('taco') & ~dff.product_name.str.contains('cracker')
    & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('thin')
    & ~dff.product_name.str.contains('flour') & ~dff.product_name.str.contains('bread')
    & ~dff.product_name.str.contains('smoothie') & ~dff.product_name.str.contains('juice')
    & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('cobbler')
    & ~dff.product_name.str.contains('pasta')      
                                     
                                    )            , 'healthy'] = 1

dff.loc[dff.product_name.str.contains('green tea') , 'healthy'] = 1

dff.loc[(dff.product_name.str.contains('seaweed')) & (~dff.product_name.str.contains('crackers'))
            , 'healthy'] = 1

# Proteins

dff.loc[(dff.product_name.str.contains('salmon') | dff.product_name.str.contains('turkey')
    | dff.product_name.str.contains('herring') | dff.product_name.str.contains('sardine')
    | dff.product_name.str.contains('mackerel') | dff.product_name.str.contains('egg')                     
            )
    &
    (~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('dip')
     & ~dff.product_name.str.contains('candied') & ~dff.product_name.str.contains('noodle')
     & ~dff.product_name.str.contains('veggie') & ~dff.product_name.str.contains('chiips')
     & ~dff.product_name.str.contains('croissant') & ~dff.product_name.str.contains('sandwich')
     & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('sausage')
     & ~dff.product_name.str.contains('wrap') & ~dff.product_name.str.contains('stuffing')
     & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('sausage')
     & ~dff.product_name.str.contains('pasta') & ~dff.product_name.str.contains('macaroni')
     & ~dff.product_name.str.contains('salad') & ~dff.product_name.str.contains('bun')
     & ~dff.product_name.str.contains('gravy') & ~dff.product_name.str.contains('bologna')
     & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('ham')
     & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('cream')
    ), 'healthy'] = 1

# Everything else

dff.loc[(dff.product_name.str.contains('quinoa') | dff.product_name.str.contains('flax')
    | dff.product_name.str.contains('chia') | dff.product_name.str.contains('oat')
    | dff.product_name.str.contains('lentil') | dff.product_name.str.contains('greek yogurt')
    | dff.product_name.str.contains('barley') | dff.product_name.str.contains('pistachio') 
    | dff.product_name.str.contains('almond') | dff.product_name.str.contains('bean') 
    | dff.product_name.str.contains('hemp') | dff.product_name.str.contains('walnut')
    | dff.product_name.str.contains('soy') | dff.product_name.str.contains('brazil nut') 
    | dff.product_name.str.contains('goji') | dff.product_name.str.contains('dark chocolate')
    | dff.product_name.str.contains('olive oil') | dff.product_name.str.contains('buckwheat') 
    | dff.product_name.str.contains('turmeric')
            )
    &
    (~dff.product_name.str.contains('cheerios') & ~dff.product_name.str.contains('patties')
     & ~dff.product_name.str.contains('graham') & ~dff.product_name.str.contains('cereal')
     & ~dff.product_name.str.contains('marshmallow') & ~dff.product_name.str.contains('cookie')
     & ~dff.product_name.str.contains('creamer') & ~dff.product_name.str.contains('bar')
     & ~dff.product_name.str.contains('trail mix') & ~dff.product_name.str.contains('fudge')
     & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('cake')
     & ~dff.product_name.str.contains('bread') & ~dff.product_name.str.contains('chips')
     & ~dff.product_name.str.contains('muffin') & ~dff.product_name.str.contains('granola')
     & ~dff.product_name.str.contains('bun') & ~dff.product_name.str.contains('sauce')
     & ~dff.product_name.str.contains('sugar') & ~dff.product_name.str.contains('fried')
     & ~dff.product_name.str.contains('muffin') & ~dff.product_name.str.contains('granola')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('square')
     & ~dff.product_name.str.contains('dressing') & ~dff.product_name.str.contains('waffle')
     & ~dff.product_name.str.contains('mayo') & ~dff.product_name.str.contains('vanilla')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('square')
     & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('honey')
     & ~dff.product_name.str.contains('baked bean') & ~dff.product_name.str.contains('burger')
     & ~dff.product_name.str.contains('creme') & ~dff.product_name.str.contains('gelato')
     & ~dff.product_name.str.contains('cracker') & ~dff.product_name.str.contains('cobbler')
     & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('biscuit')
     & ~dff.product_name.str.contains('pudding') & ~dff.product_name.str.contains('pie')
     & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('tequila')
     & ~dff.product_name.str.contains('biscotti') & ~dff.product_name.str.contains('brownie')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('scone')
     & ~dff.product_name.str.contains('caramel') & ~dff.product_name.str.contains('dip')
    ), 'healthy'] = 1

In [None]:
set(dff.loc[(dff.product_name.str.contains('berr') | dff.product_name.str.contains('kiwi')
      | dff.product_name.str.contains('mango') | dff.product_name.str.contains('pineapple')
      | dff.product_name.str.contains('orange') | dff.product_name.str.contains('banana')
      | dff.product_name.str.contains('watermelon') | dff.product_name.str.contains('coconut')
      | dff.product_name.str.contains('ginger') | dff.product_name.str.contains('pomegranate')  
      | dff.product_name.str.contains('cherr') | dff.product_name.str.contains('apricot') 
      | dff.product_name.str.contains('grapefruit') | dff.product_name.str.contains('acai')
            ) &
        
    (~dff.product_name.str.contains('muffin')
    & ~dff.product_name.str.contains('cake') & ~dff.product_name.str.contains('waffle')
    & ~dff.product_name.str.contains('biscuit') & ~dff.product_name.str.contains('bagel')
    & ~dff.product_name.str.contains('scone') & ~dff.product_name.str.contains('pie')
    & ~dff.product_name.str.contains('sugar') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('donut') & ~dff.product_name.str.contains('strudel')
    & ~dff.product_name.str.contains('cream') & ~dff.product_name.str.contains('pastries')
    & ~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('kake')
    & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('tart')
    & ~dff.product_name.str.contains('dessert') & ~dff.product_name.str.contains('jam')
    & ~dff.product_name.str.contains('icee') & ~dff.product_name.str.contains('drink mix')
    & ~dff.product_name.str.contains('peanut butter') & ~dff.product_name.str.contains('granola')
    & ~dff.product_name.str.contains('danish') & ~dff.product_name.str.contains('sweetened')
    & ~dff.product_name.str.contains('pudding') & ~dff.product_name.str.contains('bread')
    & ~dff.product_name.str.contains('cereal') & ~dff.product_name.str.contains('trail mix')
    & ~dff.product_name.str.contains('vinaigrette') & ~dff.product_name.str.contains('bars')
    & ~dff.product_name.str.contains('bar') & ~dff.product_name.str.contains('dressing')
    & ~dff.product_name.str.contains('pretzels') & ~dff.product_name.str.contains('flakes')
    & ~dff.product_name.str.contains('macaron') & ~dff.product_name.str.contains('macaroon')               
    & ~dff.product_name.str.contains('crunch') & ~dff.product_name.str.contains('bun')
    & ~dff.product_name.str.contains('lollipop') & ~dff.product_name.str.contains('syrup')
    & ~dff.product_name.str.contains('preserve') & ~dff.product_name.str.contains('nectar')
    & ~dff.product_name.str.contains('pastry') & ~dff.product_name.str.contains('spread')  
    & ~dff.product_name.str.contains('bites') & ~dff.product_name.str.contains('crisp')   
    & ~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('candied') 
    & ~dff.product_name.str.contains('milk chocolate') & ~dff.product_name.str.contains('fruit snacks')
    & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('crumbler')
    & ~dff.product_name.str.contains('sorbetto') & ~dff.product_name.str.contains('juice')     
    & ~dff.product_name.str.contains('snaks') & ~dff.product_name.str.contains('vinegar')
    & ~dff.product_name.str.contains('white chocolate') & ~dff.product_name.str.contains('rings')
    & ~dff.product_name.str.contains('drink') & ~dff.product_name.str.contains('pop')
    & ~dff.product_name.str.contains('smoothie') & ~dff.product_name.str.contains('special k')
    & ~dff.product_name.str.contains('snack') & ~dff.product_name.str.contains('snap')
    & ~dff.product_name.str.contains('delight') & ~dff.product_name.str.contains('soup')
    & ~dff.product_name.str.contains('twist') & ~dff.product_name.str.contains('crumble')
    & ~dff.product_name.str.contains('maraschino') & ~dff.product_name.str.contains('marmalade')
    & ~dff.product_name.str.contains('square')
                                     
                                    )
        , 'product_name'])

In [None]:
set(dff.loc[(dff.product_name.str.contains('sauerkraut') 
    ) 
    &
    (~dff.product_name.str.contains('awejioewjg') 
    )
            , 'product_name'])


In [None]:
set(dff.loc[(dff.product_name.str.contains('tomato') | dff.product_name.str.contains('spinach')
    | dff.product_name.str.contains('kale') | dff.product_name.str.contains('carrot')
    | dff.product_name.str.contains('artichoke') | dff.product_name.str.contains('mushroom')        
    | dff.product_name.str.contains('broccoli') | dff.product_name.str.contains('chard') 
    | dff.product_name.str.contains('asparagus') | dff.product_name.str.contains('beet')
    | dff.product_name.str.contains('pepper') | dff.product_name.str.contains('chard')
    | dff.product_name.str.contains('edamame') | dff.product_name.str.contains('pumpkin')
    | dff.product_name.str.contains('cauliflower') | dff.product_name.str.contains('chard')
    | dff.product_name.str.contains('cilantro') | dff.product_name.str.contains('sweet potato')         
    | dff.product_name.str.contains('pumpkin') | dff.product_name.str.contains('sprout')
    | dff.product_name.str.contains('onion') | dff.product_name.str.contains('garlic') 
    | dff.product_name.str.contains('sauerkraut') 
             
            )
    &
    (~dff.product_name.str.contains('juice') & ~dff.product_name.str.contains('chip')
    & ~dff.product_name.str.contains('ravioli') & ~dff.product_name.str.contains('cake')
    & ~dff.product_name.str.contains('soup') & ~dff.product_name.str.contains('pasta')
    & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('cream')
    & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('penne')
    & ~dff.product_name.str.contains('wrap') & ~dff.product_name.str.contains('ketchup')
    & ~dff.product_name.str.contains('puff') & ~dff.product_name.str.contains('sausage')
    & ~dff.product_name.str.contains('pizza') & ~dff.product_name.str.contains('popcorn')
    & ~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('ring') & ~dff.product_name.str.contains('frie')
    & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('crisp')
    & ~dff.product_name.str.contains('bar') & ~dff.product_name.str.contains('stick')
    & ~dff.product_name.str.contains('cocktail') & ~dff.product_name.str.contains('cracker')
    & ~dff.product_name.str.contains('dressing') & ~dff.product_name.str.contains('bite')
    & ~dff.product_name.str.contains('bread') & ~dff.product_name.str.contains('rind')
    & ~dff.product_name.str.contains('toast') & ~dff.product_name.str.contains('granola')
    & ~dff.product_name.str.contains('tortilla') & ~dff.product_name.str.contains('dip')
    & ~dff.product_name.str.contains('patties') & ~dff.product_name.str.contains('patty')
    & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('muffin')
    & ~dff.product_name.str.contains('salsa') & ~dff.product_name.str.contains('loaf')
    & ~dff.product_name.str.contains('snack') & ~dff.product_name.str.contains('bagel')
    & ~dff.product_name.str.contains('flour') & ~dff.product_name.str.contains('bun')
    & ~dff.product_name.str.contains('fudge') & ~dff.product_name.str.contains('brownie')
    & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('paste')
    & ~dff.product_name.str.contains('knot') & ~dff.product_name.str.contains('naan')
    & ~dff.product_name.str.contains('seasoning') & ~dff.product_name.str.contains('macaroni')
    & ~dff.product_name.str.contains('creme') & ~dff.product_name.str.contains('trail mix')
    & ~dff.product_name.str.contains('baguette') & ~dff.product_name.str.contains('donut')
    & ~dff.product_name.str.contains('chocolate') & ~dff.product_name.str.contains('cheese')
    & ~dff.product_name.str.contains('burger') & ~dff.product_name.str.contains('pie')
    & ~dff.product_name.str.contains('risotto') & ~dff.product_name.str.contains('sandwich')
    ), 'product_name'])

In [None]:
# clean out fruits with savory
set(dff.loc[(dff.product_name.str.contains('lemon') | dff.product_name.str.contains('lime')
    | dff.product_name.str.contains('apple') | dff.product_name.str.contains('avocado') )
    &
    (~dff.product_name.str.contains('cookie') & ~dff.product_name.str.contains('cake')
    & ~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('noodle soup')
    & ~dff.product_name.str.contains('dessert') & ~dff.product_name.str.contains('drink mix')
    & ~dff.product_name.str.contains('pie') & ~dff.product_name.str.contains('ice cream')
    & ~dff.product_name.str.contains('kake') & ~dff.product_name.str.contains('wafer')
    & ~dff.product_name.str.contains('ramen') & ~dff.product_name.str.contains('drop')
    & ~dff.product_name.str.contains('macaroon') & ~dff.product_name.str.contains('macaron')
    & ~dff.product_name.str.contains('tart') & ~dff.product_name.str.contains('lemonade')  
    & ~dff.product_name.str.contains('vinaigrette') & ~dff.product_name.str.contains('salsa')
    & ~dff.product_name.str.contains('cereal') & ~dff.product_name.str.contains('snack')
    & ~dff.product_name.str.contains('bagel') & ~dff.product_name.str.contains('sausage')
    & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('bar')
    & ~dff.product_name.str.contains('crisp') & ~dff.product_name.str.contains('nectar')
    & ~dff.product_name.str.contains('danish') & ~dff.product_name.str.contains('streusel')
    & ~dff.product_name.str.contains('crunch') & ~dff.product_name.str.contains('dressing')        
    & ~dff.product_name.str.contains('granola') & ~dff.product_name.str.contains('roll')
    & ~dff.product_name.str.contains('milk chocolate') & ~dff.product_name.str.contains('donut')
    & ~dff.product_name.str.contains('taco') & ~dff.product_name.str.contains('cracker')
    & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('thin')
    & ~dff.product_name.str.contains('flour') & ~dff.product_name.str.contains('bread')
    & ~dff.product_name.str.contains('smoothie') & ~dff.product_name.str.contains('juice')
    & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('cobbler')
    & ~dff.product_name.str.contains('pasta')      
                                     
                                    )                             , 'product_name'])

In [None]:
dff.loc[(dff.product_name.str.contains('brazil nut') 
    ) 
    &
    (~dff.product_name.str.contains('crackers') 
    )
            , 'healthy']

In [None]:
set(dff.loc[(dff.product_name.str.contains('salmon') | dff.product_name.str.contains('turkey')
    | dff.product_name.str.contains('herring') | dff.product_name.str.contains('sardine')
    | dff.product_name.str.contains('mackerel') | dff.product_name.str.contains('egg')                     
            )
    &
    (~dff.product_name.str.contains('chip') & ~dff.product_name.str.contains('dip')
     & ~dff.product_name.str.contains('candied') & ~dff.product_name.str.contains('noodle')
     & ~dff.product_name.str.contains('veggie') & ~dff.product_name.str.contains('chiips')
     & ~dff.product_name.str.contains('croissant') & ~dff.product_name.str.contains('sandwich')
     & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('sausage')
     & ~dff.product_name.str.contains('wrap') & ~dff.product_name.str.contains('stuffing')
     & ~dff.product_name.str.contains('fettucine') & ~dff.product_name.str.contains('sausage')
     & ~dff.product_name.str.contains('pasta') & ~dff.product_name.str.contains('macaroni')
     & ~dff.product_name.str.contains('salad') & ~dff.product_name.str.contains('bun')
     & ~dff.product_name.str.contains('gravy') & ~dff.product_name.str.contains('bologna')
     & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('ham')
     & ~dff.product_name.str.contains('sauce') & ~dff.product_name.str.contains('cream')
    ), 'product_name'])

In [None]:
set(dff.loc[(dff.product_name.str.contains('quinoa') | dff.product_name.str.contains('flax')
    | dff.product_name.str.contains('chia') | dff.product_name.str.contains('oat')
    | dff.product_name.str.contains('lentil') | dff.product_name.str.contains('greek yogurt')
    | dff.product_name.str.contains('barley') | dff.product_name.str.contains('pistachio') 
    | dff.product_name.str.contains('almond') | dff.product_name.str.contains('bean') 
    | dff.product_name.str.contains('hemp') | dff.product_name.str.contains('walnut')
    | dff.product_name.str.contains('soy') | dff.product_name.str.contains('brazil nut') 
    | dff.product_name.str.contains('goji') | dff.product_name.str.contains('dark chocolate')
    | dff.product_name.str.contains('olive oil') | dff.product_name.str.contains('buckwheat') 
    | dff.product_name.str.contains('turmeric')
            )
    &
    (~dff.product_name.str.contains('cheerios') & ~dff.product_name.str.contains('patties')
     & ~dff.product_name.str.contains('graham') & ~dff.product_name.str.contains('cereal')
     & ~dff.product_name.str.contains('marshmallow') & ~dff.product_name.str.contains('cookie')
     & ~dff.product_name.str.contains('creamer') & ~dff.product_name.str.contains('bar')
     & ~dff.product_name.str.contains('trail mix') & ~dff.product_name.str.contains('fudge')
     & ~dff.product_name.str.contains('pretzel') & ~dff.product_name.str.contains('cake')
     & ~dff.product_name.str.contains('bread') & ~dff.product_name.str.contains('chips')
     & ~dff.product_name.str.contains('muffin') & ~dff.product_name.str.contains('granola')
     & ~dff.product_name.str.contains('bun') & ~dff.product_name.str.contains('sauce')
     & ~dff.product_name.str.contains('sugar') & ~dff.product_name.str.contains('fried')
     & ~dff.product_name.str.contains('muffin') & ~dff.product_name.str.contains('granola')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('square')
     & ~dff.product_name.str.contains('dressing') & ~dff.product_name.str.contains('waffle')
     & ~dff.product_name.str.contains('mayo') & ~dff.product_name.str.contains('vanilla')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('square')
     & ~dff.product_name.str.contains('bacon') & ~dff.product_name.str.contains('honey')
     & ~dff.product_name.str.contains('baked bean') & ~dff.product_name.str.contains('burger')
     & ~dff.product_name.str.contains('creme') & ~dff.product_name.str.contains('gelato')
     & ~dff.product_name.str.contains('cracker') & ~dff.product_name.str.contains('cobbler')
     & ~dff.product_name.str.contains('roll') & ~dff.product_name.str.contains('biscuit')
     & ~dff.product_name.str.contains('pudding') & ~dff.product_name.str.contains('pie')
     & ~dff.product_name.str.contains('candy') & ~dff.product_name.str.contains('tequila')
     & ~dff.product_name.str.contains('biscotti') & ~dff.product_name.str.contains('brownie')
     & ~dff.product_name.str.contains('ice cream') & ~dff.product_name.str.contains('scone')
     & ~dff.product_name.str.contains('caramel') & ~dff.product_name.str.contains('dip')
    ), 'product_name'])

In [None]:
set(dff.countries_en)

In [None]:
dff.counts()

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

X = dff.iloc[:, 3:]
X = X.iloc[:,1:].apply(lambda x: x.fillna(round(x.median(),0)),axis=0)
y = dff.healthy

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=4444)

In [None]:
def summary_stats(clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy - %.3f' % accuracy_score(y_test, y_pred))
    print('Precision - %.3f' % metrics.precision_score(y_test, y_pred, average = 'macro'))
    print('Recall - %.3f' % metrics.recall_score(y_test, y_pred, average = 'macro'))
    print('F1 score - %.3f' % metrics.f1_score(y_test, y_pred, average = 'macro'))
    print('\n')

In [None]:
print('kNN')
summary_stats(KNeighborsClassifier(3))

print('Logistic Regression')
summary_stats(LogisticRegression())

print('Gaussian Naive Bayes')
summary_stats(GaussianNB())

print('SVC')
summary_stats(SVC())

print('Decision Tree')
summary_stats(DecisionTreeClassifier())

print('Random Forest')
summary_stats(RandomForestClassifier())

In [None]:
dff.count().sort_values(ascending = False)

In [None]:
clf = KNeighborsClassifier(3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
dff.loc[X_test.index,'product_name']

In [None]:
dff.groupby('countries_en')['countries_en'].count().sort_values(ascending = False)