In [81]:
import pandas as pd
import numpy as np
import sys

### Reading of the train set

In [82]:
# Column layout used to read the raw export: product identifiers, the category
# path, the nutrient columns, and two trailing columns ('endline', 'extra')
# that are dropped again once the rows have been realigned.
column_names = [
    'id',
    'name',
    'barcode',
    'category',
    'Énergie (kJ)',
    'Sel (g)',
    'Protéines (g)',
    'Sucres (g)',
    'Glucides (g)',
    'Matières grasses (g)',
    'Acides gras saturés (g)',
    'Fibres alimentaires (g)',
    'Biotine (µg)',
    'Vitamine B1 (Thiamine) (mg)',
    'Vitamine E (Tocopherol) (mg)',
    'Acide folique (µg)',
    'Vitamine A (µg)',
    'Vitamine D (Cholacalciferol) (µg)',
    'Vitamine C (Acide ascorbique) (mg)',
    'Vitamine B6 (Pyridoxine) (mg)',
    'Vitamine B5 (acide pantothénique) (mg)',
    'Vitamine B3 (Niacine) (mg)',
    'Vitamine B2 (Riboflavine) (mg)',
    'Vitamine B12 (Cobalamine) (µg)',
    'endline',
    'extra',
]
# Explicit names are passed, so the first line of the file is read as a data row.
train_set = pd.read_csv('../cluster_train_set.csv', names=column_names)

### Dropping products that were not found in OpenFood

In [83]:
# The file's own header line was read as a data row (explicit column names were
# passed to read_csv); it sits at index label 0 and is removed here.
train_set = train_set.drop(0)
# Keep only the rows that have a product name.
# .copy() turns the filtered result into an independent frame, so the
# assignments made on it in later cells do not raise SettingWithCopyWarning.
train_set = train_set[train_set['name'].notnull()].copy()

### Processing of product names containing ','

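For the rows of the train set whose product name contains a ',', the CSV parsing split the name into two fields, shifted every following value one column to the right and pushed the last value into the 'extra' column. The cell below merges the two halves of the name and moves every value back to its proper column.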

In [84]:
# Rows with a value in 'extra' have one field too many: their name was split in
# two, so 'barcode' holds the second half of the name and every later column
# holds the value that belongs to the column on its left.
all_columns = list(train_set.columns)
shifted_mask = train_set['extra'].notnull()
elem_to_shift = train_set.loc[shifted_mask].copy()

if shifted_mask.any():
    # Glue the two halves of the name back together.
    elem_to_shift['name'] = elem_to_shift['name'] + elem_to_shift['barcode']
    # Move every value from the 4th column onwards one column to the left.
    # The source values are copied first so the write cannot read its own output.
    realigned_values = elem_to_shift[all_columns[3:]].values.copy()
    elem_to_shift[all_columns[2:-1]] = realigned_values
    # The overflow column is empty again once the row is realigned.
    elem_to_shift['extra'] = np.nan

    # Write the repaired rows back; .loc aligns them on index and columns.
    train_set.loc[shifted_mask] = elem_to_shift


### Processing categories that have more than 2 levels by removing extra levels

In [85]:
# Cut every category path right before its third '/': splitting on '/' and
# re-joining the first three pieces removes everything from the third '/'
# onwards, and returns paths with two slashes or fewer unchanged.
# The .str accessor leaves a missing category as NaN instead of failing on it.
trimmed_category = train_set['category'].str.split('/').str[:3].str.join('/')
catagory_column_value = trimmed_category.values

# assign() returns a new frame rather than writing into the existing one.
train_set = train_set.assign(category=catagory_column_value)

### Dropping useless columns

In [86]:
# The two trailing helper columns are not needed once the rows are realigned.
train_set.drop(labels=['extra', 'endline'], axis='columns', inplace=True)

In [87]:
# Display the column labels that remain after the drop.
train_set.columns

Index(['id', 'name', 'barcode', 'category', 'Énergie (kJ)', 'Sel (g)',
       'Protéines (g)', 'Sucres (g)', 'Glucides (g)', 'Matières grasses (g)',
       'Acides gras saturés (g)', 'Fibres alimentaires (g)', 'Biotine (µg)',
       'Vitamine B1 (Thiamine) (mg)', 'Vitamine E (Tocopherol) (mg)',
       'Acide folique (µg)', 'Vitamine A (µg)',
       'Vitamine D (Cholacalciferol) (µg)',
       'Vitamine C (Acide ascorbique) (mg)', 'Vitamine B6 (Pyridoxine) (mg)',
       'Vitamine B5 (acide pantothénique) (mg)', 'Vitamine B3 (Niacine) (mg)',
       'Vitamine B2 (Riboflavine) (mg)', 'Vitamine B12 (Cobalamine) (µg)'],
      dtype='object')

### Dropping products that didn't have a nutrient list

In [88]:
# Every column from the 5th one onwards is a nutrient value; convert them to floats.
train_set_values = train_set[train_set.columns[4:]].values.astype(float)
# A product is kept when the absolute value of its nutrient sum exceeds 1e-3.
# A row with a NaN nutrient has a NaN sum, fails the comparison and is dropped too.
has_nutrients = np.abs(np.sum(train_set_values, axis=1)) > 1e-3
# .copy() makes the filtered result an independent frame, so the column
# assignment done on it in a later cell does not raise SettingWithCopyWarning.
train_set = train_set[has_nutrients].copy()

In [89]:
# Number of rows and columns left after the nutrient filter.
train_set.shape

(4654, 24)

### Renaming categories with fewer than two levels to "To classify"

In [90]:
# A category path containing fewer than two '/' is replaced by the placeholder
# "To classify". A missing category has a NaN slash count; it is treated as 0
# here so it receives the placeholder as well instead of raising an error.
category_paths = train_set['category']
needs_classification = category_paths.str.count('/').fillna(0) < 2
catagory_column_value = category_paths.mask(needs_classification, "To classify").values

# assign() returns a new frame, so nothing is written into a filtered slice of
# an earlier frame (which would raise SettingWithCopyWarning).
train_set = train_set.assign(category=catagory_column_value)

### Saving preprocessed train set before computing clusters

In [97]:
# Shape of the frame that is about to be written to disk.
# NOTE(review): the recorded output below shows 21 columns, whereas the previous
# shape check shows 24 and no cell in between removes columns. The output looks
# like it comes from an out-of-order run; re-run the notebook top to bottom to confirm.
train_set.shape

(4654, 21)

In [98]:
# Location of the preprocessed train set on disk.
path = '../preprocessed_cluster_train_set.csv'
# index=False keeps the row labels out of the file.
train_set.to_csv(path, index=False)

### Rereading of the train_set

In [92]:
# Reload the preprocessed train set from the CSV file written in the previous step.
train_set = pd.read_csv(path)

### Dropping columns not useful for clustering

In [93]:
# Only the category label and the nutrient values are used from here on.
train_set.drop(labels=['id', 'name', 'barcode'], axis='columns', inplace=True)

### Computing cluster centers

In [94]:
# Every column except the 'category' label is a nutrient value.
nutrient_columns = [col for col in train_set.columns if col != 'category']

# Nutrient center of gravity of each category = mean of the nutrient values of
# its products. sort=False keeps the categories in order of first appearance,
# which is the order Series.unique() returns.
# NOTE: groupby().mean() skips NaN values, whereas a plain sum would propagate
# them; the two agree as long as the nutrient columns contain no NaN.
category_means = train_set.groupby('category', sort=False)[nutrient_columns].mean()

# Same names and shapes as before: (n_categories, n_nutrients) float centers,
# (n_categories, 1) labels, and their object-dtype concatenation.
cluster_centers = category_means.values
unique_categories = np.expand_dims(category_means.index.values, axis=1)
categories_center_values = np.concatenate([unique_categories, cluster_centers], axis=1)

# Build the result from the grouped frame directly so the nutrient columns keep
# their float dtype (going through the concatenated array makes them object).
categories_center_df = category_means.reset_index()
    

In [96]:
# Display the per-category nutrient centers.
categories_center_df

Unnamed: 0,category,Énergie (kJ),Sel (g),Protéines (g),Sucres (g),Glucides (g),Matières grasses (g),Acides gras saturés (g),Fibres alimentaires (g),Biotine (µg),...,Vitamine E (Tocopherol) (mg),Acide folique (µg),Vitamine A (µg),Vitamine D (Cholacalciferol) (µg),Vitamine C (Acide ascorbique) (mg),Vitamine B6 (Pyridoxine) (mg),Vitamine B5 (acide pantothénique) (mg),Vitamine B3 (Niacine) (mg),Vitamine B2 (Riboflavine) (mg),Vitamine B12 (Cobalamine) (µg)
0,/chocolat/bonbons-chewing-gum,1168.26,0.108552,1.58572,31.8755,77.1228,1.42966,0.18,2.92138,0.344828,...,0.0827586,0,0,0,23.2345,0.00965517,0.0413793,0.110345,0.00965517,0.0172414
1,/petit-dejeuner/cacao-chocolats-en-poudre,1480.65,0.3205,8.335,55.5,64.7,4.995,0.235,6.65,1.25,...,1.896,95,126.4,0.79,12.64,0.221,0.65,2.53,0.256,0.3955
2,/boissons-chaudes-froides/boissons-energetiques,280.759,0.097931,6.05517,5.6931,6.9,0.731034,0.468966,1.72414,0.241379,...,0.682759,16.2069,34.4828,0.313793,4.17241,0.865517,0.882759,3.86207,0.0655172,0.806897
3,/chocolat/chocolat,2250.61,0.173249,6.42809,46.8444,51.4036,33.7242,3.20614,2.91827,0.0722022,...,0.0353791,1.73285,2.31047,0.0144404,0.231047,0.00404332,0.00866426,0.0462094,0.00404332,0.00722022
4,/chocolat/sain-bio,1936.35,0.554615,8.47692,25.7808,55.8192,22.9385,2.65769,4.8,0,...,0,0,0,0,0,0,0,0,0,0
5,/garnitures-ingredients/tomates-en-conserve,253.429,0.235714,2.98571,10.2286,10.6286,0.678571,0,1.75714,0,...,0,0,0,0,0,0,0,0,0,0
6,/garnitures-ingredients/du-monde-entier,977.976,3.23159,5.78774,8.03579,30.2745,8.92297,0.417949,2.43641,0,...,0,0.666667,0,0,0.447179,0.0151795,0.0261538,0.0784615,0.00717949,0.00769231
7,/chocolat/biscuits-gaufres,2066.71,0.363431,6.65392,32.3382,60.9118,24.6314,2.36275,2.71373,0,...,0,0,0,0,0,0,0,0,0,0
8,/garnitures-ingredients/soupes-sauces-bouillon,636.444,1.53741,2.58771,3.51058,9.17099,11.2922,0.156655,0.995563,0,...,0,0,0,0,0,0,0,0,0,0
9,/produits-surgeles/plats-cuisines,642.679,1.05464,5.94643,2.33929,16.8214,6.5,0.267857,1.87143,0,...,0,0,0,0,0,0,0,0,0,0


### /!\ Remember to modify how getParsedNutrients is handled in master