In [20]:
%matplotlib inline
import math
import numpy as np
import pandas
import matplotlib.pyplot as plt

# Load the relevant CSV files from https://fineli.fi/fineli/fi/avoin-data.
# They are part of the 1.5MB zip file download option on the Fineli website.
#
# All files are semicolon separated. The component values use a decimal comma,
# and the food / food name files are read as latin1.
component_value = pandas.read_csv("component_value.csv", sep=';', decimal=',')
food = pandas.read_csv("food.csv", sep=';', encoding='latin1')
foodname = pandas.read_csv("foodname_EN.csv", sep=';', encoding='latin1')
fuclass = pandas.read_csv("fuclass_EN.csv", sep=';')
eufdname = pandas.read_csv("eufdname_EN.csv", sep=';')
# Patch the component value data: keep only the rows whose EUFDNAME holds a string
# (this discards, for instance, rows where the name was parsed as a missing value).
has_text_name = component_value['EUFDNAME'].map(lambda name: isinstance(name, str))
component_value = component_value[has_text_name]

In [21]:
# The second big step is creating the data frame that ends up containing, for each
# food, a vector of normalized nutritional component values.
#
# Start off with two merges that combine the base info: FOODID and FOODNAME from the
# food name table, plus the food class description (DESCRIPT) looked up through the
# FUCLASS code of each food.
food_classes = pandas.merge(left=food[["FOODID", "FUCLASS"]], right=fuclass[["THSCODE", "DESCRIPT"]],
                            how='left', left_on="FUCLASS", right_on="THSCODE")[["FOODID", "DESCRIPT"]]
foodshort = foodname[["FOODID", "FOODNAME"]]
df = pandas.merge(left=foodshort, right=food_classes, how='left', on="FOODID")

# Walk over the component table one component name at a time. groupby(sort=False)
# yields the groups in order of first appearance (the same order .unique() gives),
# and splits the table in a single pass instead of re-filtering it per component.
for comp, values in component_value.groupby("EUFDNAME", sort=False):
    amounts = values["BESTLOC"]
    # Normalize this component across all foods: subtract the mean, divide by the
    # standard deviation. A brand-new frame is built rather than adding a column to
    # a slice of component_value, which avoids pandas' SettingWithCopyWarning.
    normalized = pandas.DataFrame({
        "FOODID": values["FOODID"],
        comp: (amounts - amounts.mean()) / amounts.std(),
    })
    # Add the normalized values as a new column named after the component.
    df = pandas.merge(left=df, right=normalized, how='left', on='FOODID')

# Replace all NotANumber values (e.g. foods without a value for a component) with zero.
df = df.fillna(0)


In [22]:
# Use the vector distance between the mean vector of all fruits and the mean vector
# of all vegetables as the reference distance for comparing food groups.
#
# Only the nutrient columns may take part in the means. The id/label columns are
# dropped by name (instead of slicing off "the first value"), and numeric_only=True
# is passed because DataFrame.mean() raises TypeError on text columns in pandas >= 2.0.
label_columns = ["FOODID", "FOODNAME", "DESCRIPT"]
fruit = df.loc[df['DESCRIPT'] == 'Fruits']
vegies = df.loc[df['DESCRIPT'] == 'Vegetables']
fruit_mean = fruit.drop(columns=label_columns).mean(numeric_only=True)
vegies_mean = vegies.drop(columns=label_columns).mean(numeric_only=True)
reference_vectordistance = np.linalg.norm((vegies_mean - fruit_mean).values)

# Food groups are reported when they are at most (about) twice as far from fruits
# as vegetables are; the small margin keeps a ratio of exactly 2.0 included.
max_reldistance = 2.0001

# Collect one result row per qualifying food group.
rowlist = []

# Iterate over all food groups defined in this data set, except for fruits.
for foodtype in df["DESCRIPT"].unique():
    if foodtype == 'Fruits':
        continue
    # Vector distance between the fruit mean and this group's mean, expressed
    # relative to the fruit/vegetable reference distance.
    other = df.loc[df['DESCRIPT'] == foodtype]
    other_mean = other.drop(columns=label_columns).mean(numeric_only=True)
    reldistance = np.linalg.norm((other_mean - fruit_mean).values) / reference_vectordistance
    if reldistance < max_reldistance:
        rowlist.append({"foodtype": foodtype, "reldistance": reldistance})

# Turn the constructed row list into a pandas dataframe. The columns are given
# explicitly so that sorting still works when no food group qualified.
peergroups = pandas.DataFrame(rowlist, columns=["foodtype", "reldistance"])

# Display the other food groups and their relative vector distance from fruit.
# A value of 1.0 denotes the same vector distance as between fruits and veggies.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    print(peergroups.sort_values(by=['reldistance']))
        

                                   foodtype  reldistance
71             Baby fruit and berry product     0.254584
12                                   Juices     0.324371
11                   Fruit and berry salads     0.419500
53                         Vegetable salads     0.433896
15                              Juice drink     0.499597
54                    Fruit and berry soups     0.512050
47   Fruit and berry dishes other than pies     0.572982
33                             Other drinks     0.619280
49                          Vegetable soups     0.652777
72                   Baby vegetable product     0.655973
31                    Soft drink with sugar     0.710715
9                          Vegetable juices     0.716880
60                              Pulse soups     0.729302
48                            Potato dishes     0.742103
51                        Cooked vegetables     0.744533
55                         Vegetable sauces     0.753597
52                         Vege

In [23]:
# Take a honeydew melon as the reference fruit.
reference_food = 'HONEYDEW MELON, WITHOUT SKIN'
melon = df.loc[df['FOODNAME'] == reference_food]
if melon.empty:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("reference food not found in FOODNAME: " + reference_food)

# Every column except the id/label columns holds a normalized component value.
feature_columns = [column for column in df.columns
                   if column not in ("FOODID", "FOODNAME", "DESCRIPT")]

# Replace the normalized component values in the data frame with their difference
# from the melon's values (first matching melon row). df is deliberately updated in
# place: the following cells read these melon-relative values from df.
df[feature_columns] = df[feature_columns] - melon[feature_columns].iloc[0]

# Length of each food's difference vector, i.e. its distance from the melon,
# expressed relative to the fruit/vegetable reference distance.
distances = np.linalg.norm(df[feature_columns].to_numpy(dtype=float), axis=1) / reference_vectordistance

# Only look at MCDONALD foods and foods in the Chocolate food group.
# (No loop variable named `food` is used here: that would overwrite the `food`
# data frame loaded from food.csv and break re-running the earlier cells.)
selected = (df["FOODNAME"].str.contains("MCDONALD", regex=False, na=False)
            | (df["DESCRIPT"] == "Chocolate"))

# Result frame, rows in data frame order.
peerfood = pandas.DataFrame({
    "food": df.loc[selected, "FOODNAME"].to_numpy(),
    "distance": distances[selected.to_numpy()],
})

# Display the selected foods sorted by normalized vector distance.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    print(peerfood.sort_values(by=['distance']))
    

    distance                                               food
21  0.956963                     MILKSHAKE, VANILLA, MCDONALD'S
23  1.515775                     HAMBURGER, MCFEAST, MCDONALD'S
10  1.589669         HAMBURGER, BEEF AND WHEAT ROLL, MCDONALD'S
11  1.647408               HAMBURGER, CHEESE BURGER, MCDONALD'S
13  1.647526      HAMBURGER, DOUBLE BURGER, BIG MAC, MCDONALD'S
12  1.728808              HAMBURGER, CHICKEN BURGER, MCDONALD'S
22  1.986608        HAMBURGER, DOUBLE CHEESE BURGER, MCDONALD'S
1   2.542666         CHOCOLATE CONFECTION FILLED WITH MARMALADE
14  2.915553            CHOCOLATE BAR, CARAMEL AND COOKIE, TWIX
24  2.915631         CHOCOLATE CONFECTION FILLED WITH CHOCOLATE
18  2.978043  SUFFELI CHOCOLATE BAR, WAFFLE, TOFFEE FILLING ...
7   3.174270                             CHOCOLATE BAR, LOW-FAT
6   3.224222                CHOCOLATE BAR WITH FILLING, AVERAGE
2   3.336511                             CHOCOLATE BAR, AVERAGE
15  3.350807  SUFFELI PUFFI SNACKS,PUFFE

In [24]:
# Count how many vegetables are further away from the melon than a few reference
# junk foods are. The reference distances are looked up in `peerfood` rather than
# copied by hand from its printed (rounded) output, so they stay correct when the
# underlying data changes.
def _peer_distance(food_name):
    """Return the melon distance recorded in `peerfood` for the given food name.

    Raises ValueError when the food is not present in `peerfood`.
    """
    matches = peerfood.loc[peerfood["food"] == food_name, "distance"]
    if matches.empty:
        raise ValueError("food not found in peerfood: " + food_name)
    return matches.iloc[0]

twix_distance = _peer_distance("CHOCOLATE BAR, CARAMEL AND COOKIE, TWIX")
cheeseburger_distance = _peer_distance("HAMBURGER, DOUBLE CHEESE BURGER, MCDONALD'S")
milkshake_distance = _peer_distance("MILKSHAKE, VANILLA, MCDONALD'S")

# Distance of every vegetable from the melon, relative to the reference distance.
# df is expected to hold melon-relative component values at this point.
feature_columns = [column for column in df.columns
                   if column not in ("FOODID", "FOODNAME", "DESCRIPT")]
vegetables = df.loc[df["DESCRIPT"] == "Vegetables"]
vegetable_distances = (np.linalg.norm(vegetables[feature_columns].to_numpy(dtype=float), axis=1)
                       / reference_vectordistance)

count1 = 0  # vegetables further from the melon than the milkshake
count2 = 0  # vegetables further from the melon than the double cheeseburger
count3 = 0  # vegetables further from the melon than the Twix bar
tcount = len(vegetables)
# The loop variable is not called `food`, so the `food` data frame loaded from
# food.csv is not overwritten.
for food_name, distance in zip(vegetables["FOODNAME"], vegetable_distances):
    if distance > twix_distance:
        print("+ " ,food_name,"is further removed, nutritionally, from a melon than a Twix bar is")
        count3 +=1
    if distance > cheeseburger_distance:
        print("- " ,food_name,"is further removed, nutritionally, from a melon than a double cheeseburger is")
        count2 +=1
    if distance > milkshake_distance:
        print("# " ,food_name,"is further removed, nutritionally, from a melon than a milkshake is")
        count1 +=1
print()
print("* A milkshake is nutritionally closer to a melon than", count1,"out of", tcount,"vegetables.")
print("* A double cheeseburger is nutritionally closer to a melon than", count2,"out of", tcount, "vegetables.")
print("* A Twix candy bar is nutritionally closer to a melon than", count3,"out of", tcount,"vegetables.")

#  CARROT is further removed, nutritionally, from a melon than a milkshake is
+  CARROT, DRIED is further removed, nutritionally, from a melon than a Twix bar is
-  CARROT, DRIED is further removed, nutritionally, from a melon than a double cheeseburger is
#  CARROT, DRIED is further removed, nutritionally, from a melon than a milkshake is
#  JERUSALEM ARTICHOKE is further removed, nutritionally, from a melon than a milkshake is
#  BLACK SALSIFY is further removed, nutritionally, from a melon than a milkshake is
#  CAULIFLOWER is further removed, nutritionally, from a melon than a milkshake is
#  BROCCOLI is further removed, nutritionally, from a melon than a milkshake is
+  KALE is further removed, nutritionally, from a melon than a Twix bar is
-  KALE is further removed, nutritionally, from a melon than a double cheeseburger is
#  KALE is further removed, nutritionally, from a melon than a milkshake is
-  BRUSSELS SPROUT is further removed, nutritionally, from a melon than a double c

-  SPINACH, FROZEN is further removed, nutritionally, from a melon than a double cheeseburger is
#  SPINACH, FROZEN is further removed, nutritionally, from a melon than a milkshake is
#  WILD ROCKET, SAND ROCKET is further removed, nutritionally, from a melon than a milkshake is
#  GARDEN LETTUCE is further removed, nutritionally, from a melon than a milkshake is
#  RADICCHIO is further removed, nutritionally, from a melon than a milkshake is
#  ROMAINE LETTUCE is further removed, nutritionally, from a melon than a milkshake is
#  PEA SHOOT is further removed, nutritionally, from a melon than a milkshake is
-  CORN SALAD, LAMB'S LETTUCE is further removed, nutritionally, from a melon than a double cheeseburger is
#  CORN SALAD, LAMB'S LETTUCE is further removed, nutritionally, from a melon than a milkshake is
-  CORIANDER, FRESH is further removed, nutritionally, from a melon than a double cheeseburger is
#  CORIANDER, FRESH is further removed, nutritionally, from a melon than a milksh

In [27]:
# List of specific foods to look at in detail.
compare = ['MILKSHAKE, VANILLA, MCDONALD\'S','KALE','BROCCOLI','CHOCOLATE BAR, CARAMEL AND COOKIE, TWIX']
# Subset of the data frame containing the four chosen foods.
# NOTE: if the melon cell above has been run, df holds differences from the melon's
# normalized component values rather than the normalized values themselves.
part = df.loc[df['FOODNAME'].isin(compare)]
# Food name as data frame index, drop the columns that are not component values.
part = part.set_index('FOODNAME').drop(columns=['FOODID', 'DESCRIPT'])
# Swap the rows and columns (one row per component, one column per food) and use
# shorter column names.
part = part.transpose().rename(columns={"CHOCOLATE BAR, CARAMEL AND COOKIE, TWIX": "TWIX",
                                        "MILKSHAKE, VANILLA, MCDONALD'S": "MILKSHAKE"})
# English descriptions of the components, indexed by component code (THSCODE).
# The merge below is index-on-index, so the code column needs no renaming.
names = eufdname.drop(columns=['LANG']).set_index("THSCODE")
# Attach the descriptions to the transposed food subset, use them as the index and
# round to one decimal for display.
results = pandas.merge(left=part, right=names, how='left',
                       left_index=True, right_index=True).set_index("DESCRIPT").round(1)
# Last expression of the cell, so the notebook renders the table.
results


Unnamed: 0_level_0,BROCCOLI,KALE,TWIX,MILKSHAKE
DESCRIPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"energy,calculated",-0.0,0.0,2.9,0.3
"fat, total",0.0,0.0,1.6,0.1
"carbohydrate, available",-0.3,-0.3,2.7,0.2
"protein, total",0.4,0.3,0.3,0.3
alcohol,0.0,0.0,0.0,0.0
"organic acids, total",0.1,-0.3,-0.4,-0.1
sugar alcohols,0.0,0.0,0.0,0.0
"sugars, total",-0.6,-0.6,4.2,0.4
fructose,-0.3,-0.3,0.7,-0.5
galactose,-0.1,-0.1,-0.2,2.3
