In [271]:
import pandas as pd
import numpy as np
import sys

### Reading of the train set

In [272]:
# Labels applied to the raw CSV columns: four descriptive fields, twenty nutrient
# fields, then the two trailing fields 'endline' and 'extra' that are removed
# further down in the notebook.
column_names = [
    'id', 'name', 'barcode', 'category',
    'Énergie (kJ)', 'Sel (g)', 'Protéines (g)', 'Sucres (g)',
    'Glucides (g)', 'Matières grasses (g)', 'Acides gras saturés (g)', 'Fibres alimentaires (g)',
    'Biotine (µg)', 'Vitamine B1 (Thiamine) (mg)', 'Vitamine E (Tocopherol) (mg)', 'Acide folique (µg)',
    'Vitamine A (µg)', 'Vitamine D (Cholacalciferol) (µg)', 'Vitamine C (Acide ascorbique) (mg)',
    'Vitamine B6 (Pyridoxine) (mg)', 'Vitamine B5 (acide pantothénique) (mg)', 'Vitamine B3 (Niacine) (mg)',
    'Vitamine B2 (Riboflavine) (mg)', 'Vitamine B12 (Cobalamine) (µg)',
    'endline', 'extra',
]

# The labels are supplied explicitly, so the file's own first line is read as an
# ordinary data row (it is dropped in a later cell).
train_set = pd.read_csv('../clustering/raw_cluster_train_set.csv', names=column_names)

### Dropping products that were not found in OpenFood

In [273]:
# Row 0 merely repeats the column labels, so it is discarded first.
train_set = train_set.drop(0)

# Keep only the rows that actually carry a product name.
has_name = train_set['name'].notnull()
train_set = train_set[has_name]

### Processing of product names containing ','

For the train-set rows whose product name contained a ',', the CSV parsing split the name into two fields, which shifted every following value one column to the right and filled the `extra` column. The cell below moves every value back to its proper column.

In [274]:
# Rows whose product name contained a ',' were parsed with one field too many:
# the name occupies columns 1 and 2 and every later value sits one column too
# far to the right, which is why their 'extra' column is populated.
shifted_rows = train_set['extra'].notnull()
elem_to_shift = train_set[shifted_rows].copy()

for i in range(elem_to_shift.shape[0]):
    line_values = elem_to_shift.iloc[i].copy().values
    # Re-assemble the name from its two halves, putting back the ',' that the
    # CSV parser consumed as a field separator, so the name is restored intact.
    line_values[1] = line_values[1] + ',' + line_values[2]
    # Remove the now-redundant second half: all following values move one
    # column to the left, back under their proper labels.
    line_values = np.delete(line_values, 2)
    # Pad with NaN so the row keeps its width; 'extra' becomes empty again.
    line_values = np.append(line_values, [np.nan])
    elem_to_shift.iloc[i] = line_values

# Write the realigned rows back over the shifted ones.
train_set[shifted_rows] = elem_to_shift


### Processing categories that have more than 2 levels by removing extra levels

In [275]:
# Category paths are '/'-separated. Splitting on '/' and keeping the first three
# pieces cuts the path right before its third '/', while paths with at most two
# '/' come out of the split/join round trip unchanged.
train_set['category'] = [
    '/'.join(category.split('/')[:3])
    for category in train_set['category'].values
]

### Dropping useless columns

In [276]:
# 'endline' and 'extra' are not used by any later cell, so drop them.
train_set = train_set.drop(['extra', 'endline'], axis=1)

In [277]:
# Display the remaining column labels ('extra' and 'endline' were dropped in the previous cell).
train_set.columns

Index(['id', 'name', 'barcode', 'category', 'Énergie (kJ)', 'Sel (g)',
       'Protéines (g)', 'Sucres (g)', 'Glucides (g)', 'Matières grasses (g)',
       'Acides gras saturés (g)', 'Fibres alimentaires (g)', 'Biotine (µg)',
       'Vitamine B1 (Thiamine) (mg)', 'Vitamine E (Tocopherol) (mg)',
       'Acide folique (µg)', 'Vitamine A (µg)',
       'Vitamine D (Cholacalciferol) (µg)',
       'Vitamine C (Acide ascorbique) (mg)', 'Vitamine B6 (Pyridoxine) (mg)',
       'Vitamine B5 (acide pantothénique) (mg)', 'Vitamine B3 (Niacine) (mg)',
       'Vitamine B2 (Riboflavine) (mg)', 'Vitamine B12 (Cobalamine) (µg)'],
      dtype='object')

### Dropping products that didn't have a nutrient list and those that have -1 as a nutrient value

In [278]:
# Add up the nutrient columns (everything from the fifth column onwards) for
# each product and keep only the products whose total is not (numerically) zero.
nutrient_totals = train_set.iloc[:, 4:].values.astype(float).sum(axis=1)
train_set = train_set[np.abs(nutrient_totals) > 1e-3]

In [279]:
# Relabel the rows 0 .. n-1 so that row positions in the value matrix below are
# also valid index labels and can be passed directly to DataFrame.drop.
train_set.index = range(train_set.shape[0])
train_set_values = train_set[train_set.columns[4:]].values.astype(float)

# A product is rejected as soon as any one of its nutrient values equals -1.
has_minus_one = (train_set_values == -1).any(axis=1)
indexes_to_drop = np.flatnonzero(has_minus_one).tolist()
print(indexes_to_drop)

train_set = train_set.drop(indexes_to_drop)

[10, 45, 55, 74, 78, 270, 340, 465, 593, 817, 917, 929, 1300, 1480, 1733, 1754, 1757, 1783, 1819, 2045, 2183, 2207, 2617, 2896, 2937, 2938, 3213, 3261, 3269, 3456, 3496, 3657, 3671, 3940, 3969, 4005, 4194, 4339, 4560, 4565]


In [280]:
# Display the train set as it stands after the rows containing a -1 were dropped.
train_set

Unnamed: 0,id,name,barcode,category,Énergie (kJ),Sel (g),Protéines (g),Sucres (g),Glucides (g),Matières grasses (g),...,Vitamine E (Tocopherol) (mg),Acide folique (µg),Vitamine A (µg),Vitamine D (Cholacalciferol) (µg),Vitamine C (Acide ascorbique) (mg),Vitamine B6 (Pyridoxine) (mg),Vitamine B5 (acide pantothénique) (mg),Vitamine B3 (Niacine) (mg),Vitamine B2 (Riboflavine) (mg),Vitamine B12 (Cobalamine) (µg)
0,0,V6 White Spearmint,5700626503125,/chocolat/bonbons-chewing-gum,707.0,0.18,0.4,0.0,67.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,M-Quick Boisson au cacao,7613312049297,/petit-dejeuner/cacao-chocolats-en-poudre,1640.0,0.06,5.0,80.0,82.0,3.0,...,6.0,300.0,400.0,2.5,40.0,0.7,3.0,8.0,0.7,1.25
2,6,Skai (Crunchy mint),7616500651555,/chocolat/bonbons-chewing-gum,650.0,0.02,0.2,0.1,64.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,RedBull : Energy Drink,9002490206789,/boissons-chaudes-froides/boissons-energetiques,194.0,0.1,0.0,11.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2.0,8.0,0.0,2.0
4,10,Frey Milch Extra Chocolat au lait extra fin sa...,7616500912472,/chocolat/chocolat,2040.0,0.25,8.0,11.0,51.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,13,MIGROS BIO Choco Cookies aux pépites de chocolat,7617400037784,/chocolat/sain-bio,2142.0,0.59,7.0,27.0,61.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,15,LONGOBARDI : Pomodori pelati,8002510010007,/garnitures-ingredients/tomates-en-conserve,100.0,0.0,1.5,4.5,4.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,23,Kinder Riegel,4008400221021,/chocolat/chocolat,2360.0,0.31,8.7,53.3,53.5,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,25,Namaste India PALAK PANEER Epinards et fromage...,7613312065945,/garnitures-ingredients/du-monde-entier,424.0,1.6,4.0,2.5,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,29,MINDOR petit berre,7617400032796,/chocolat/biscuits-gaufres,2150.0,0.4,6.0,34.0,64.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [281]:
# (rows, columns) of the train set after the filtering steps above.
train_set.shape

(4614, 24)

### Relabelling categories that have only one level as "TO_CLASSIFY"

In [282]:
# A category path with fewer than two '/' has no second level; such products are
# given the placeholder label instead of their original path.
train_set['category'] = [
    category if category.count('/') >= 2 else "TO_CLASSIFY"
    for category in train_set['category'].values
]

### Saving preprocessed train set before computing clusters

In [283]:
# Check the (rows, columns) of the train set right before it is written to disk.
train_set.shape

(4614, 24)

In [284]:
# Location of the preprocessed train set; the same `path` is read back in the next code cell.
path = '../clustering/preprocessed_cluster_train_set.csv'
# index = False: write only the data columns, not the DataFrame index.
train_set.to_csv(path, index = False)

### Rereading of the train_set

In [285]:
# Reload the preprocessed train set from the CSV file written in the previous code cell (`path` is defined there).
train_set = pd.read_csv(path)

### Dropping columns not useful for clustering

In [286]:
# Only the category label and the nutrient values are needed to compute the
# cluster centers, so the identifying columns are removed.
train_set = train_set.drop(['id', 'name', 'barcode'], axis=1)

### Computing cluster centers

In [287]:
# Compute, for every category, the center of gravity of its products in nutrient
# space, i.e. the per-nutrient mean over all products of that category.
unique_categories = train_set['category'].unique()

# 'category' is the first remaining column; every column after it is a nutrient.
nutrient_columns = list(train_set.columns[1:])
cluster_centers = np.zeros((unique_categories.shape[0], len(nutrient_columns)))

for index, category in enumerate(unique_categories):
    category_train_set = train_set[train_set['category'] == category]
    category_train_set_values = category_train_set[nutrient_columns].values.astype(float)
    # Vectorised column-wise mean instead of summing the rows one by one.
    cluster_centers[index] = category_train_set_values.mean(axis=0)

# Put the category label in front of its center. Mixing strings and floats makes
# the combined array (and therefore the DataFrame columns) object-typed; later
# cells cast the nutrient part back to float explicitly.
unique_categories = np.expand_dims(unique_categories, axis=1)
cluster_centers_values = np.concatenate([unique_categories, cluster_centers], axis=1)

# Take the labels from the reloaded frame itself rather than from a list defined
# in a much earlier cell, so labels and values cannot get out of step.
cluster_centers_df = pd.DataFrame(cluster_centers_values, columns=['category'] + nutrient_columns)
    

### Change cluster type names to match with enum types

In [288]:
# Rewrite every category label in enum style: '-' becomes '_' and the whole
# label is upper-cased.
cluster_centers_df['category'] = [
    name.replace('-', '_').upper()
    for name in cluster_centers_df['category'].values
]

### We drop the "TO_CLASSIFY" category from the cluster centers df

In [289]:
# 'TO_CLASSIFY' is only a placeholder label, so its center is removed.
is_real_category = cluster_centers_df['category'] != 'TO_CLASSIFY'
cluster_centers_df = cluster_centers_df[is_real_category]

### Saving non-normalized data

In [290]:
# Save the cluster centers before normalization.
# NOTE(review): unlike the other to_csv calls in this notebook, no index=False is passed here, so the
# DataFrame index is written as an extra leading column -- confirm that consumers of this file expect it.
cluster_centers_df.to_csv('../clustering/non_normalized_cluster_nutrient_centers.csv')

### Normalize the data

In [291]:
# Display the nutrient columns of the cluster centers (every column except the leading 'category').
cluster_centers_df[cluster_centers_df.columns[1:]]

Unnamed: 0,Énergie (kJ),Sel (g),Protéines (g),Sucres (g),Glucides (g),Matières grasses (g),Acides gras saturés (g),Fibres alimentaires (g),Biotine (µg),Vitamine B1 (Thiamine) (mg),Vitamine E (Tocopherol) (mg),Acide folique (µg),Vitamine A (µg),Vitamine D (Cholacalciferol) (µg),Vitamine C (Acide ascorbique) (mg),Vitamine B6 (Pyridoxine) (mg),Vitamine B5 (acide pantothénique) (mg),Vitamine B3 (Niacine) (mg),Vitamine B2 (Riboflavine) (mg),Vitamine B12 (Cobalamine) (µg)
0,1168.26,0.108552,1.58572,31.8755,77.1228,1.42966,0.18,2.92138,0.344828,0.00758621,0.0827586,0,0,0,23.2345,0.00965517,0.0413793,0.110345,0.00965517,0.0172414
1,1480.65,0.3205,8.335,55.5,64.7,4.995,0.235,6.65,1.25,0.1735,1.896,95,126.4,0.79,12.64,0.221,0.65,2.53,0.256,0.3955
2,264.792,0.159167,5.29583,6.37083,6.79583,0.6625,0.566667,1.75,0.333333,0.0583333,0.575,13.75,19.1667,0.233333,3.83333,0.875,0.9375,3.83333,0.075,1.05833
3,2250.61,0.173249,6.42809,46.8444,51.4036,33.7242,3.20614,2.91827,0.0722022,0.0031769,0.0353791,1.73285,2.31047,0.0144404,0.231047,0.00404332,0.00866426,0.0462094,0.00404332,0.00722022
4,1936.35,0.554615,8.47692,25.7808,55.8192,22.9385,2.65769,4.8,0,0,0,0,0,0,0,0,0,0,0,0
5,253.429,0.235714,2.98571,10.2286,10.6286,0.678571,0,1.75714,0,0,0,0,0,0,0,0,0,0,0,0
6,983.022,3.2534,5.82273,8.08237,30.4357,8.97412,0.425258,2.45412,0,0.00618557,0,0.670103,0,0,0.449485,0.0152577,0.0262887,0.078866,0.00721649,0.00773196
7,2066.71,0.363431,6.65392,32.3382,60.9118,24.6314,2.36275,2.71373,0,0,0,0,0,0,0,0,0,0,0,0
8,642.679,1.05464,5.94643,2.33929,16.8214,6.5,0.267857,1.87143,0,0,0,0,0,0,0,0,0,0,0,0
9,680.106,1.01013,5.55729,3.94171,15.8616,8.4253,0.0962312,2.02915,0,0,0,0,0,0,0.306533,0,0,0,0,0


In [292]:
# Standardize each nutrient column of the cluster centers: subtract the column
# mean and divide by the column's sample standard deviation (ddof=1).
n_clusters = cluster_centers_df.shape[0]
cluster_centers_values = cluster_centers_df.iloc[:, 1:].values.astype(np.float64)

# Per-nutrient statistics, plus copies stacked to one row per cluster center.
cluster_centers_mean = cluster_centers_values.mean(axis=0)
expanded_cluster_centers_mean = np.tile(cluster_centers_mean, (n_clusters, 1))

cluster_centers_std = cluster_centers_values.std(axis=0, ddof=1)
expanded_cluster_centers_std = np.tile(cluster_centers_std, (n_clusters, 1))

centered_values = cluster_centers_values - expanded_cluster_centers_mean
normalized_cluster_centers_values = centered_values / expanded_cluster_centers_std

# Put the category labels back in front of the normalized values.
final_cluster_centers_values = np.column_stack(
    [cluster_centers_df['category'].values, normalized_cluster_centers_values]
)

In [293]:
# Display the per-nutrient standard deviations as one row prefixed with the label 'std'
# (the label and the numbers share one array, so the numbers are shown as strings).
np.c_[np.expand_dims(np.array('std'), 0), np.expand_dims(cluster_centers_std, 0)]

array([['std', '656.3225781629042', '3.2417905186235414',
        '6.756990329873574', '15.09728548259974', '24.342119677076273',
        '13.571863856938599', '1.0141491791985042', '3.6912433746902957',
        '4.455665777537821', '0.10529878950598048', '1.150027452192731',
        '16.40606921395684', '42.83546968643756', '0.22922032557696864',
        '4.956335327575863', '4.1587027099051115', '0.2886633196530765',
        '1.417002844196999', '0.13338263347784496', '0.18760675392477022']],
      dtype='<U32')

In [294]:
# Rebuild the cluster-center table from the normalized values, then add two
# extra rows labelled 'MEAN' and 'STD' holding the statistics used for the
# normalization.
cluster_centers_df = pd.DataFrame(data=final_cluster_centers_values, columns=cluster_centers_df.columns)

# Each statistics row is the label followed by the per-nutrient values.
mean_row = np.c_[np.expand_dims(np.array('MEAN'), 0), np.expand_dims(cluster_centers_mean, 0)]
std_row = np.c_[np.expand_dims(np.array('STD'), 0), np.expand_dims(cluster_centers_std, 0)]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames
# in the same order. Like append, it keeps each frame's own index labels, which
# does not matter here because the table is saved with index=False.
cluster_centers_df = pd.concat([
    cluster_centers_df,
    pd.DataFrame(data=mean_row, columns=cluster_centers_df.columns),
    pd.DataFrame(data=std_row, columns=cluster_centers_df.columns),
])


### Saving normalized cluster centers

In [295]:
# Save the normalized cluster centers together with the trailing MEAN and STD rows;
# index=False leaves the DataFrame index out of the file.
cluster_centers_df.to_csv('../clustering/cluster_nutrient_centers.csv', index=False)

### /!\ Remember to update the handling of getParsedNutrients in master
### /!\ Add Acides gras monoinsaturés (g) and Acides gras poly-insaturés (g)