<!--TABLE OF CONTENTS-->
Contents:
- [Calculation de nutriscore](#Calculation-de-nutriscore)
- [Partie 0: Import de données](#Partie-0:-Import-de-données)
- [Partie 1: Analyse de Composantes Principales](#Partie-1:-Analyse-de-Composantes-Principales)
  - [Objectif : réduire les dimensions , en trouvant les composants qui explique le plus de variance](#Objectif-:-réduire-les-dimensions-,-en-trouvant-les-composants-qui-explique-le-plus-de-variance)
    - [PCA étape 1 - selection des colonnes à prendre en compte dans l'ACP](#PCA-étape-1---selection-des-colonnes-à-prendre-en-compte-dans-l'ACP)
    - [PCA étape 2 - Préparation des données sélectionnés pour l'ACP](#PCA-étape-2---Préparation-des-données-sélectionnés-pour-l'ACP)
      - [2a: éliminer les outliers:](#2a:-éliminer-les-outliers:)
      - [2b: éliminer les NaN:](#2b:-éliminer-les-NaN:)
    - [PCA étape 3  -  standardise (centrage et réduction d'echelle)](#PCA-étape-3-----standardise-(centrage-et-réduction-d'echelle))
  - [Additives](#Additives)
    - [Scree plot](#Scree-plot)
- [Cercle des corrélations](#Cercle-des-corrélations)
- [Partie 2: Clustering via K Nearest Neighbours](#Partie-2:-Clustering-via-K-Nearest-Neighbours)
    - [KNN step 1 - Divide Data Into Features and Labels](#KNN-step-1---Divide-Data-Into-Features-and-Labels)
- [Partie 3 : ANOVA](#Partie-3-:-ANOVA)


# Calculation de nutriscore



In [1]:


import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
import seaborn as sns

palette = sns.color_palette("bright", 10)

# Partie 0: Import de données

In [2]:
# Données
RAW_DATA_FILENAME='en.openfoodfacts.org.products.csv'
RAW_DATA_DICT='raw_data_dict.csv'
SAMPLE_DATA_FILENAME='data_sample100k.csv' # 100,000 registres
CLEAN_DATA_FILENAME='openfoodfacts_cleaned.csv'
CLEAN_DATA_DICT='clean_data_dict.csv'
# Données supplémentaires
ADDITIVES_DETAIL='additives_detail.csv'
ADDITIVES_COUNT='additives_count.csv'

DATA_FOLDER = '../data/raw'
OUT_FOLDER = '../data/out'
IMAGE_FOLDER = 'images'

os_path_join = lambda folder, file: f'{folder}/{file}'

In [3]:
DATA_DICT = os_path_join(OUT_FOLDER, CLEAN_DATA_DICT)
data_dict_df=pd.read_csv(DATA_DICT,encoding='UTF-8',sep='\t')
# exclut les date_cols
date_col_filter= data_dict_df['column'].str.endswith(('_datetime', '_t'))
date_cols= data_dict_df[date_col_filter]['column'].tolist()
print(f'colonnes de dates : {date_cols}')
data_dict_df=data_dict_df[~date_col_filter]
data_dict=dict(zip(data_dict_df['column'],data_dict_df['dtype'].values))
data_dict

colonnes de dates : ['created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime']


{'Unnamed: 0': 'object',
 'code': 'object',
 'url': 'object',
 'creator': 'object',
 'product_name': 'object',
 'generic_name': 'object',
 'quantity': 'object',
 'packaging': 'object',
 'packaging_tags': 'object',
 'brands': 'object',
 'brands_tags': 'object',
 'categories': 'object',
 'categories_tags': 'object',
 'categories_en': 'object',
 'origins': 'object',
 'origins_tags': 'object',
 'origins_en': 'object',
 'manufacturing_places': 'object',
 'manufacturing_places_tags': 'object',
 'labels': 'object',
 'labels_tags': 'object',
 'labels_en': 'object',
 'emb_codes': 'object',
 'emb_codes_tags': 'object',
 'first_packaging_code_geo': 'object',
 'cities_tags': 'object',
 'purchase_places': 'object',
 'stores': 'object',
 'countries': 'object',
 'countries_tags': 'object',
 'countries_en': 'object',
 'ingredients_text': 'object',
 'allergens': 'object',
 'traces': 'object',
 'traces_tags': 'object',
 'traces_en': 'object',
 'serving_size': 'object',
 'serving_quantity': 'object',
 'a

In [4]:
CLEAN_DATA = os_path_join(OUT_FOLDER, CLEAN_DATA_FILENAME)
print(f'data file pour analyse exploratoire: {CLEAN_DATA}')

cleaned_data = pd.read_csv(CLEAN_DATA, sep='\t', header=0,
                      encoding='utf-8',
                      dtype=data_dict,
                      parse_dates=date_cols,
                      infer_datetime_format=True,
                      low_memory=True)  # Warning: dtypes

data file pour analyse exploratoire: ../data/out/openfoodfacts_cleaned.csv


# Partie 1: Analyse de Composantes Principales

- <https://www.datasklr.com/principal-component-analysis-and-factor-analysis/principal-component-analysis>
- <https://jmausolf.github.io/code/pca_in_python/>
- <https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/>

## Objectif : réduire les dimensions , en trouvant les composants qui explique le plus de variance



In [5]:
names = cleaned_data["product_name"] # ou data.index pour avoir les intitulés



### PCA étape 1 - selection des colonnes à prendre en compte dans l'ACP
Sélectionner les variables quantitatives

In [6]:
def datetime_to_float(df):
    date_cols= list(df.select_dtypes('datetime'))
    for col in date_cols:
        df[col]=df[col].astype('datetime64').view(np.int64).astype(np.float64)
    return df

def pca_select_vars(df,exclude_cols=[]):
    return (df
       .select_dtypes(include=['number'])
       .drop(columns=exclude_cols)
    )

pca_data= cleaned_data.pipe(pca_select_vars)
pca_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99996 entries, 0 to 99995
Data columns (total 25 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   additives_n                              35903 non-null  Int64  
 1   ingredients_from_palm_oil_n              35903 non-null  Int64  
 2   ingredients_that_may_be_from_palm_oil_n  35903 non-null  Int64  
 3   nutriscore_score                         35867 non-null  float64
 4   energy-kj_100g                           7129 non-null   float64
 5   energy-kcal_100g                         76671 non-null  float64
 6   energy_100g                              79271 non-null  float64
 7   fat_100g                                 78863 non-null  float64
 8   saturated-fat_100g                       76831 non-null  float64
 9   monounsaturated-fat_100g                 2476 non-null   float64
 10  polyunsaturated-fat_100g                 2484 

In [7]:
pca_data1= pca_data.drop(columns=['vitamin-a_100g','vitamin-c_100g','calcium_100g',
'iron_100g','energy-kcal_100g','energy-kj_100g','monounsaturated-fat_100g','polyunsaturated-fat_100g',
'trans-fat_100g','cholesterol_100g','additives_n','salt_100g','fiber_100g','potassium_100g','nutriscore_score',
'nutrition-score-fr_100g'])
pca_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99996 entries, 0 to 99995
Data columns (total 9 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ingredients_from_palm_oil_n              35903 non-null  Int64  
 1   ingredients_that_may_be_from_palm_oil_n  35903 non-null  Int64  
 2   energy_100g                              79271 non-null  float64
 3   fat_100g                                 78863 non-null  float64
 4   saturated-fat_100g                       76831 non-null  float64
 5   carbohydrates_100g                       78844 non-null  float64
 6   sugars_100g                              78107 non-null  float64
 7   proteins_100g                            78939 non-null  float64
 8   sodium_100g                              74653 non-null  float64
dtypes: Int64(2), float64(7)
memory usage: 7.1 MB


In [8]:
target_data=cleaned_data.select_dtypes(exclude=['number'])
target_data.shape

(99996, 58)

### PCA étape 2 - Préparation des données sélectionnés pour l'ACP

#### 2a: éliminer les outliers:
  - pour réduire le risque de trop d'influence des outliers 
  - eliminer ou abs(Z-score) > 3  (pas bon pour des colonnes avec beaucoup de skew)
  - eliminer les top 1% et bottom 1%

#### 2b: éliminer les NaN:
  - dropna() # pour ne reduire pas le variance d'un variable
  - fillna(data_pca.mean()) # Il est fréquent de remplacer les valeurs inconnues par la moyenne de la variable
  - drop(columns={colonnes moins importants qui contient des NaN})

In [9]:
def drop_high_outliers(df, subset=None, exclude=None, alpha=0.01):
    ## elimine 1% des valeurs les plus grands 
    filtre = False
    if subset is None: subset= list(df.select_dtypes('number'))
    if not exclude is None:
        subset = list(set(subset)-set(exclude))
    for column in subset:
        # min_val = df[column].quantile(alpha)
        max_val = df[column].quantile(1-alpha)
        # filtre &= df[column]>min_val
        filtre |= df[column]>max_val
    
    nb=df[filtre].shape[0]
    print(f'drop_high_outliers, nb={nb}') #', (subset={subset})')
    return df[~filtre]    

def drop_low_outliers(df, subset=None, exclude=None, alpha=0.01):
    ## elimine 1% des valeurs les plus petits 
    filtre = False
    if subset is None: subset= list(df.select_dtypes('number'))
    if not exclude is None: subset = list(set(subset)-set(exclude))
    for column in subset:
        min_val = df[column].quantile(alpha)
        # max_val = df[column].quantile(1-alpha)
        filtre |= df[column]<min_val
        # filtre &= df[column]<max_val

    nb=df[filtre].shape[0]
    print(f'drop_low_outliers, nb={nb}') #, (subset={subset})')
    return df[~filtre]

print(pca_data.shape)
print(pca_data1.shape)
pca_d1=pca_data1.pipe(drop_high_outliers)
print(pca_d1.shape)
pca_d2=pca_data1.pipe(drop_low_outliers)
print(pca_d2.shape)
pca_data = pca_d2.loc[pca_d2.index.intersection(pca_d1.index)]
print('intersection of result')
print(pca_data.shape)


(99996, 25)
(99996, 9)
drop_high_outliers, nb=4580
(33864, 9)
drop_low_outliers, nb=1
(35902, 9)
intersection of result
(33863, 9)


In [10]:
def list_colonnes_vides(df: pd.DataFrame, threshold=0):
    count = df.isna().mean()
    colonnes_vides = count[(1 - count) <= threshold].index.to_list()
    print(f'list_colonnes_vides, threshold={threshold} (nb cols = {len(colonnes_vides)}) :{colonnes_vides}')
    return df


def drop_colonnes_vides(df, threshold=0):
    count = df.isna().mean()
    colonnes_vides = count[(1 - count) <= threshold].index.to_list()
    print(f'drop_colonnes_vides, threshold = {threshold}')
    return df.drop(colonnes_vides, axis=1)
pca_data2=pca_data.pipe(drop_colonnes_vides,threshold=0.25)
print(pca_data2.shape)
print(pca_data.columns.to_list())


drop_colonnes_vides, threshold = 0.25
(33863, 9)
['ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 'energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g', 'sodium_100g']


In [11]:
pca_data3=pca_data2
# .dropna(subset=['nutriscore_score'])
pca_data3.shape

(33863, 9)

In [12]:
pca_data4=pca_data3
# .dropna(subset=['additives_n','salt_100g','fiber_100g'])
print(pca_data4.shape)
pca_data4.info()

(33863, 9)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33863 entries, 3 to 99992
Data columns (total 9 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ingredients_from_palm_oil_n              33863 non-null  Int64  
 1   ingredients_that_may_be_from_palm_oil_n  33863 non-null  Int64  
 2   energy_100g                              30529 non-null  float64
 3   fat_100g                                 30642 non-null  float64
 4   saturated-fat_100g                       28710 non-null  float64
 5   carbohydrates_100g                       30627 non-null  float64
 6   sugars_100g                              29941 non-null  float64
 7   proteins_100g                            30602 non-null  float64
 8   sodium_100g                              30274 non-null  float64
dtypes: Int64(2), float64(7)
memory usage: 2.6 MB


In [13]:
pca_data5=pca_data4.dropna()
# .drop(columns={'vitamin-a_100g','vitamin-c_100g','calcium_100g','iron_100g','energy-kcal_100g'}).dropna()
pca_data5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28020 entries, 3 to 99992
Data columns (total 9 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ingredients_from_palm_oil_n              28020 non-null  Int64  
 1   ingredients_that_may_be_from_palm_oil_n  28020 non-null  Int64  
 2   energy_100g                              28020 non-null  float64
 3   fat_100g                                 28020 non-null  float64
 4   saturated-fat_100g                       28020 non-null  float64
 5   carbohydrates_100g                       28020 non-null  float64
 6   sugars_100g                              28020 non-null  float64
 7   proteins_100g                            28020 non-null  float64
 8   sodium_100g                              28020 non-null  float64
dtypes: Int64(2), float64(7)
memory usage: 2.2 MB


In [14]:
features = pca_data5.columns
print(features)

Index(['ingredients_from_palm_oil_n',
       'ingredients_that_may_be_from_palm_oil_n', 'energy_100g', 'fat_100g',
       'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g',
       'proteins_100g', 'sodium_100g'],
      dtype='object')


In [92]:
def pts_kj_boisson(energy_100g):
    # Valeur énergétique (kJ/100g ou kJ/100ml)
    limit = [0,30,60,90,120,150,180,210,240,270]
    for i in range(10):
        if energy_100g <= limit[i]: break;return i
    return 10

def pts_kj(energy_100g,categ=None):
    # Valeur énergétique (kJ/100g ou kJ/100ml)
    if categ=='boisson': # Boissons
        return pts_kj_boisson(energy_100g)
    else: # Cas général, Fromages, Matières grasses ajoutées
        limit=[335, 670, 1005, 1340, 1675, 2010, 2345, 2680, 3015, 3350]
    for i in range(10):
        if energy_100g <= limit[i]: break;return i
    return 10


def pts_glus_boisson(sugar_100g):
    # Sucres (g/100g ou 100mL)
    limit=[0,1.5,3,4.5,6,7.5,9,10.5,12,13.5]
    for i in range(10):
        if sugar_100g <= limit[i]: break;return i
    return 10

def pts_glus(sugar_100g, categ=None):
    # Sucres (g/100g ou 100mL)
    if categ=='boisson': # Boissons
        return pts_glus_boisson(sugar_100g)
    else: # Cas général, Fromages, Matières grasses ajoutées
        limit=[4.5,9,13.5,18,22.5,27,5,31,36,40,45]
        for i in range(10):
            if sugar_100g <= limit[i]: break;return i
        return 10

def pts_ags_added_fat(saturated_fat_100g, fat_100g):
    # Acides gras saturés (g/100g ou 100mL)
    # categ Matières grasses ajoutées
    limit=[10,16,22,28,34,40,46,52,58,64]
    ags_lip_tot=saturated_fat_100g/fat_100g*100
    for i in range(10):
        if ags_lip_tot <= limit[i]: break;return i
    return 10

def pts_ags(saturated_fat_100g, categ=None, fat_100g=None):
    # Acides gras saturés (g/100g ou 100mL)
    if categ=='added_fat': # Matières grasses ajoutées
        return pts_ags_added_fat(saturated_fat_100g,fat_100g)

    else: # Cas général, Fromages, Boissons
        limit=[1,2,3,4,5,6,7,8,9,10]
        for i in range(10):
            if saturated_fat_100g <= limit[i]: break;return i
        return 10



def pts_na(sodium_100g):
    # Sodium (mg/100g ou 100mL), toutes les catégories
    limit=[90,180,270,360,450,540,630,720,810,900]
    for i in range(10):
        if sodium_100g <= limit[i]: break;return i
    return 10

def pts_prot(protein_100g):
    # Protéines (g/100g ou 100mL), toutes les catégories
    limit=[1.6,3.2,4.8,6.4,8]
    for i in range(5):
        if protein_100g <= limit[i]: break;return i
    return 5

def pts_fib(fibre_100g):
    # Fibres (g/100g ou 100mL), toutes les catégories
    limit=[0.9,1.9,2.8,3.7,4.7]
    for i in range(5):
        if fibre_100g <= limit[i]: break;return i
    return 5


def pts_FLN_boisson(FLN_100g):
    # Fruits, légumes, légumineuses, fruits à coques, huiles de colza, de noix et d'olive (%/100g ou 100mL)
    limit=[40,60,80]
    for i in range(3):
        if FLN_100g<limit[i]: break; return i*2
    return 10

def pts_FLN(FLN_100g, categ=None):
    # Fruits, légumes, légumineuses, fruits à coques, huiles de colza, de noix et d'olive (%/100g ou 100mL)
    if categ=='boisson': return pts_FLN_boisson()
    # pas boisson
    limit=[40,60,80]    
    for i in range(3):
        if FLN_100g<limit[i]: break; return i
    return 5


def find_products_by_name(df,chaine, nb=1):
    # get row from barcode:
    filtre = df['product_name'].notnull()
    filtre &= df['product_name'].str.contains(chaine, regex=True, case=True)
    filtre &= df['nutriscore_grade'].notnull()
    products = df[filtre].sort_values(by="product_name")
    if len(products)==0: return None
    return products.head(nb)

def nutriscore_from_name(df,product_name):
    return find_products_by_name(df,product_name,nb=1)

def nutriscore(series:pd.DataFrame):
    a = b = c = d = 0

    # x=find_products_by_name(df,product_name,nb=1)
    x=series.to_dict(orient='records')[0]
    print (x)

    categ=None
    # get row from name:
    
    quantity=x['quantity']
    print (quantity)
    if pd.isna(quantity):
        pass
    else:    
        if 'ml' in quantity:
            categ='boisson'
 
    # points A 
    energy_100g =x['energy_100g'] #Valeur énergétique (kJ/100g ou kJ/100ml)
    sugars_100g =x['sugars_100g'] #Valeur énergétique (kJ/100g ou kJ/100ml)
    saturated_fat_100g= x['saturated-fat_100g']
    sodium_100g=x['sodium_100g']
    if pd.isna(sodium_100g):
        salt_100g=x['salt_100g']
        sodium_100g= salt_100g/2.5*1000    
    fat_100g=x['fat_100g'] 
    if pd.isna(fat_100g) or pd.isna(saturated_fat_100g):
        pass
    elif fat_100g > 10 and saturated_fat_100g < fat_100g:
        categ='added_fat'

    # FLN_100g=x['fruits-vegetables-nuts_100g'] ## presque toujours vide
    # if pd.isna(FLN_100g.any()):
    FLN_100g=0 #sinon on ne peut rien calculé
    proteins_100g=x['proteins_100g']
    fiber_100g=x['fiber_100g']

    # for nutriscore, all but(salt and fat) must be present--> ptsA= pts_energy+pts_sugar+pts_sat_fats+ pts_sodium
    reqd=energy_100g+sugars_100g+saturated_fat_100g+sodium_100g
    + proteins_100g+fiber_100g
    if(pd.isna(reqd)):
        # ne perd pas de temps pour le calcule
        # on doit faire le filtrage avant d'appeler ce procedure
        ret={'e':energy_100g,'s':sugars_100g,'f':saturated_fat_100g,'Na':sodium_100g,'prot':proteins_100g,'fib':fiber_100g}
        return (ret)
    else:    
        a=pts_kj(energy_100g,categ)
        b=pts_glus(sugars_100g,categ)
        c=pts_ags(saturated_fat_100g,categ,fat_100g)
        d=pts_na(sodium_100g)
    
    pts_a =a+b+c+d


    V=pts_a,
    S=pts_prot(proteins_100g)
    T=pts_fib(fiber_100g)
    U=pts_FLN(FLN_100g)
    pts_b=S+T+U
    # deja testé si STU remplis
    if categ=='fromage':
        score = pts_a - pts_b
    # cas général
    elif pts_a <11:
        score= pts_a - pts_b
    elif pts_a>=11 and U==5:
        # fruits très sucrés/salés, noix etc
        # subtract proteins
        score=pts_a-pts_b
    else:
        # Pas besoin de subtraire les proteins
        score= pts_a-U-T
    return score

ap = find_products_by_name(cleaned_data,'apple pie',nb=1)
# ap['score'] = nutriscore(ap)
nutriscore(ap)
# ap['score'].values[0]


{'Unnamed: 0': '2524', 'Unnamed: 0.1': '511303', 'code': '891123002176', 'url': 'http://world-en.openfoodfacts.org/product/0891123002176/cranberry-apple-pie', 'creator': 'org-database-usda', 'created_t': Timestamp('2020-04-23 21:13:11'), 'created_datetime': Timestamp('2020-04-23 21:13:11'), 'last_modified_t': Timestamp('2020-04-23 21:13:11'), 'last_modified_datetime': Timestamp('2020-04-23 21:13:11'), 'product_name': 'Cranberry apple pie', 'generic_name': nan, 'quantity': nan, 'packaging': nan, 'packaging_tags': nan, 'brands': nan, 'brands_tags': nan, 'categories': 'Biscuits and cakes, Cakes', 'categories_tags': 'en:biscuits-and-cakes,en:cakes', 'categories_en': 'Biscuits and cakes,Cakes', 'origins': nan, 'origins_tags': nan, 'origins_en': nan, 'manufacturing_places': nan, 'manufacturing_places_tags': nan, 'labels': nan, 'labels_tags': nan, 'labels_en': nan, 'emb_codes': nan, 'emb_codes_tags': nan, 'first_packaging_code_geo': nan, 'cities_tags': nan, 'purchase_places': nan, 'stores': n

25

### PCA étape 3  -  standardise (centrage et réduction d'echelle)
Variables sur des échelles differentes sont standardisé pour contribuer egalement au analyse

In [None]:
from sklearn.preprocessing import StandardScaler

def pca_standardise(df):
    """Standardize the data"""
    scaler = StandardScaler()
    # return scaler.fit(df).transform(df) # numpy array
    return pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

X_scaled_df = pca_data5.pipe(pca_standardise)
X_scaled_df.head()



Remove outliers

In [None]:
def z_score(df):
    """
        applique le z-score en utilisant .mean() et .std()
        même résultat que utilisant StandardScaler
        StandardScaler est optimisé pour des grand matrices
    """
    df_std = df.copy()
    for column in df_std.columns:
        df_std[column] = (df_std[column] - df_std[column].mean()) / df_std[column].std(ddof=0)       
    return df_std
    
# call the z_score function
X_scaled_df2 = z_score(pca_data5)

X_scaled_df2.head()




In [None]:

target_df= cleaned_data[['nova_group']].join(X_scaled_df2,how='inner')
target_df=target_df.rename_axis('id').reset_index()

In [None]:
# choix du nombre de composantes à calculer
n_comp = 6

# Create the PCA model
from sklearn.decomposition import PCA
pca = PCA(n_components=X_scaled_df2.shape[1])
# Fit the model with the standardised data
pca.fit(X_scaled_df2)

nwD=pca.transform(X_scaled_df2)
pcs = pca.components_ 
pca.explained_variance_ratio_.cumsum()

In [None]:
from sklearn.decomposition import PCA
nb_colonnes=len(pca_data5.columns)
dpca = PCA(n_components=nb_colonnes)
pc = dpca.fit_transform(X_scaled_df2)
nom_cols= [f'PC{i}' for i in range(1,nb_colonnes+1)]
print (nom_cols)
pc_df = pd.DataFrame(data = pc , columns = nom_cols)
pc_df.index=X_scaled_df2.index
pc_df.head()

In [None]:
for_visual=target_df[['id','nova_group']].join(pc_df,how='inner')
for_visual.head()

In [None]:
for_visual=pc_df.join(cleaned_data,how='inner')
for_visual[for_visual['PC2']>10].index

## Additives

In [None]:
additives_data=pd.read_csv(os_path_join(OUT_FOLDER,ADDITIVES_DETAIL), encoding='UTF-8', sep='\t')
additives_data.info(verbose=False)

In [None]:
additives_data['efsa_evaluation_overexposure_risk'].value_counts()

In [None]:
def get_unsafe_additives(df, tags=('high','moderate')):
    if type(tags) is str: tags=[tags]
    tags = '|'.join(tags)
    filtre = df['efsa_evaluation_overexposure_risk'].fillna('').str.contains(tags, regex=True)
    return df[filtre]

additives_data.pipe(get_unsafe_additives,'high')
additives_data.pipe(get_unsafe_additives,'moderate')
unsafe_additives = additives_data.pipe(get_unsafe_additives)
print(unsafe_additives['name'].tolist())



In [None]:
def get_safe_additives(df, tags=('high','moderate')):
    tags = 'high|moderate'
    filtre = df['efsa_evaluation_overexposure_risk'].fillna('').str.contains(tags, regex=True)
    return df[~filtre]

safe_additives = additives_data.pipe(get_safe_additives)
print(len(safe_additives))
safe_additives['name']
# safe_additives_en=safe_additives['name'].tolist()


In [None]:
nutriscore_palette = {'a':'#038141', 'b':'#85bb2f', 'c':'#fecb02', 'd':'#ee8100', 'e':'#e63e11'}
ecoscore_palette = {'a':'#1e8f4e', 'b':'#2ecc71', 'c':'#f5c100', 'd':'#ef7e1a', 'e':'#d93726'}
nova_palette = {'1': '#00aa00','2': '#ffcc00','3': '#ff6600','4': '#ff0000'}
siga_palette = {'A': '#009c3c','B': '#99c336','C': '#008dd1','D': '#00629e'}

groups=list(nova_palette.keys())
colors=list(nova_palette.values())
print(groups)
print(colors)

def get_palette(feature):
    if feature=='nutriscore_grade': return nutriscore_palette 
    elif feature=='nova_score': return nova_palette 
    elif feature=='eco_score' or feature=='ecoscore_grade_fr': return ecoscore_palette 
    else : return None 

def explode_series(series:pd.Series) -> pd.Series:
    """convertir ['a,b,c','d,e','f']--> ['a','b','c','d','e','f']  """
    if series.str.contains(',').any():
        return series.str.split(',').explode()
    else:
        return series.explode()
        # return series

def get_palette_df(df,feature,nb=10,exclude_values=None):
    """return dataframe[key,color]"""
    palette=get_palette(feature)
    if palette is None:   
        # if df[feature].dtype.kind in 'biufc':return None # value_counts
        if exclude_values is None: exclude_values=[]
        print(f'{feature}, exclude nb: {len(exclude_values)}')
        series = explode_series(df[feature]).copy()
        filtre=series.isin(exclude_values)
        # for val in exclude_values:
        #     series = series[series.index!=val]
        print(f'exclude count={len(exclude_values)}')
        print(f'exclude index count={len(series[filtre])}')
        series=series[~filtre]
        groups=series.value_counts().head(nb).index
        nb_features=min(len(groups),nb)
        groups = groups[:nb_features]
        if nb_features<11:
            colors = sns.color_palette("tab10").as_hex()[:nb_features]
        else:    
            colors = sns.color_palette("tab20").as_hex()[:nb_features]
        ret=pd.DataFrame(groups,columns={'group'})
        ret['color']=colors
    else:
        ret=pd.DataFrame.from_dict(palette,orient='index',columns={'color'}).rename_axis('group').sort_index().reset_index()
    return ret


safe_additives_en=safe_additives['name'].tolist()

print(cleaned_data.pipe(get_palette_df,'nutriscore_grade'))
print(cleaned_data.pipe(get_palette_df,'ecoscore_grade_fr'))
print(cleaned_data.pipe(get_palette_df,'pnns_groups_1',exclude_values=['unknown','Alcoholic beverages']))
print(cleaned_data.pipe(get_palette_df,'additives_en',exclude_values=safe_additives['name']))


In [None]:


convert_string_to_list = lambda x: x.str.split(',') if x else x

def explode_by_series(df,col,exclude_groups=None):
    # si le colonne est numérique, pas besoin de traitement
    if df[col].dtype.kind in 'biufc':return df
    ret=df.dropna().copy()
    ret[col]= ret[col].str.split(',')
    ret=ret.explode(col)
    return ret

feature='additives_tags'
feature='nutriscore_grade'
explode_by_series(for_visual[['PC1','PC2',feature]],feature).head()

In [None]:


def plot_components(data,axis1='PC1',axis2='PC2',feature='nutriscore_grade',exclude_groups=None):
    df=explode_by_series(data[[axis1,axis2,feature]],feature,exclude_groups)
    # return df.shape
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel(axis1, fontsize = 15)
    ax.set_ylabel(axis2, fontsize = 15)
    ax.set_title(f'Principal Components ({axis1}, {axis2}) vs. {feature}', fontsize = 20)
    palette = get_palette_df(df,feature,exclude_values=exclude_groups)
    groups = palette['group']
    colors = palette['color']
    for group, color in zip(groups,colors):
        if (group!='unknown'): 
            indicesToKeep = df[feature] == group
            ax.scatter(df.loc[indicesToKeep, axis1],
                      df.loc[indicesToKeep, axis2],
                        c = color, alpha=0.5,s = 1)
    # ax.legend(groups,bbox_to_anchor=(1.2,0.5),loc="center right", fontsize=12, 
            # bbox_transform=plt.gcf().transFigure)
    ax.legend(groups,markerscale=6)
    ax.grid()
    # plt.subplots_adjust(left=0.0, bottom=0.1, right=0.4)
    plt.subplots_adjust()
    plt.show()


plot_components(for_visual, axis1='PC1',axis2='PC2',feature='nutriscore_grade')
plot_components(for_visual, axis1='PC1',axis2='PC2',feature='ecoscore_grade_fr')
plot_components(for_visual, axis1='PC1',axis2='PC2',feature='pnns_groups_1',exclude_groups=['unknown','Alcoholic beverages'])
# plot_components(for_visual, axis1='PC1',axis2='PC2',feature='additives_n')
plot_components(for_visual, axis1='PC1',axis2='PC2',feature='additives_en',exclude_groups=safe_additives['name'])
plot_components(for_visual, axis1='PC1',axis2='PC2',feature='additives_tags',exclude_groups=safe_additives['id'])
plot_components(for_visual, axis1='PC3',axis2='PC4')

In [None]:
for_visual[for_visual['PC2']>20]['product_name']

In [None]:
dpca.explained_variance_ratio_

In [None]:
df = pd.DataFrame({'var':dpca.explained_variance_ratio_,
             'PC':nom_cols})
sns.barplot(x='PC',y="var", 
           data=df, color="c");

### Scree plot
A visual approach to selecting the number of principal components to keep means the use of a scree plot. A scree plot shows the number of components on the X-axis against the proportion of the variance explained on the Y-axis. The suggested number of components to keep is where the plot forms an elbow and the curve flattens out. Unfortunately, the scree plot often presents some ambiguity.  Further, a practical approach often prompts analysts to evaluate the first few principal components, and if they are of interest, the analyst would continue considering additional principal components.  However, if the first few principal components provide little relevance, evaluation of additional principal components is likely of no use.

The Kaiser rule suggests the minimum eigenvalue rule.  In this case, the number of principal components to keep equals the number of eigenvalues greater than 1.

Finally, the number of components to keep could be determined by a minimal threshold that explains variation in the data. In this case, we would keep as many principal components as needed to explain at least 70% (or some other threshold) of the total variation in the data. 

In [None]:
from matplotlib.ticker import MaxNLocator

def display_scree_plot(pca):
    '''Display a scree plot for the pca'''
    # scree = pca.explained_variance_
    scree = pca.explained_variance_ratio_*100
    plt.bar(np.arange(len(scree))+1, scree)
    plt.plot(np.arange(len(scree))+1, scree.cumsum(),c="red",marker='o')
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    # plt.xlabel('Principal Component')
    # plt.ylabel('Eigenvalue')
    plt.axhline(y=1, linewidth=1, color='r', alpha=0.5)
    plt.title('Scree Plot of PCA: Component Eigenvalues')

    plt.xlabel("Number of principal components")
    plt.ylabel("Percentage explained variance")
    plt.title("Scree plot")
    plt.show(block=False)

display_scree_plot(pca)

In [None]:
def var_explained(pca):
    import numpy as np
    from matplotlib.pyplot import figure, show
    from matplotlib.ticker import MaxNLocator

    # ax = figure().gca()
    x=np.arange(1,len(pca.explained_variance_ratio_)+1)
    y=np.cumsum(pca.explained_variance_ratio_)
    print(f'x = {len(x)}; y = {len(y)}')

    ax=sns.lineplot(x=x,y=y,marker='+')

    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.axvline(x=8, linewidth=1, color='r', alpha=0.5)
    plt.axhline(y=0.9, linewidth=1, color='r', alpha=0.5)
    plt.title('Explained Variance of PCA by Component')
    show()

var_explained(dpca)

# Cercle des corrélations

In [None]:
def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    for d1, d2 in axis_ranks: # On affiche les 3 premiers plans factoriels, donc les 6 premières composantes
        if d2 < n_comp:

            # initialisation de la figure
            fig, ax = plt.subplots(figsize=(7,6))

            # détermination des limites du graphique
            if lims is not None :
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30 :
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else :
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])

            # affichage des flèches
            # s'il y a plus de 30 flèches, on n'affiche pas le triangle à leur extrémité
            if pcs.shape[1] < 30 :
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                   pcs[d1,:], pcs[d2,:], 
                   angles='xy', scale_units='xy', scale=1, color="grey")
                # (voir la doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            
            # affichage des noms des variables  
            if labels is not None:  
                for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax :
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center', rotation=label_rotation, color="blue", alpha=0.5)
            
            # affichage du cercle
            circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)

            # définition des limites du graphique
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
        
            # affichage des lignes horizontales et verticales
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # nom des axes, avec le pourcentage d'inertie expliqué
            plt.xlabel(f'PC{d1+1} ({round(100*pca.explained_variance_ratio_[d1],1)}%)')
            plt.ylabel(f'PC{d2+1} ({round(100*pca.explained_variance_ratio_[d2],1)}%)')

            plt.title("Cercle des corrélations (PC{} et PC{})".format(d1+1, d2+1))
            plt.show(block=False) 

In [None]:

pcs = dpca.components_
display_circles(pcs, n_comp, dpca, [(0,1),(2,3),(4,5)], labels = np.array(features))

In [None]:
def display_circles_again(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Display correlation circles, one for each factorial plane"""

    # For each factorial plane
    for d1, d2 in axis_ranks: 
        if d2 < n_comp:

            # Initialise the matplotlib figure
            fig, ax = plt.subplots(figsize=(20,20))

            # Determine the limits of the chart
            if lims is not None :
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30 :
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else :
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])

            # Add arrows
            # If there are more than 30 arrows, we do not display the triangle at the end
            if pcs.shape[1] < 30 :
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                   pcs[d1,:], pcs[d2,:], 
                   angles='xy', scale_units='xy', scale=1, color="grey")
                # (see the doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            
            # Display variable names
            if labels is not None:  
                for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax :
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center', rotation=label_rotation, color="blue", alpha=0.5)
            
            # Display circle
            circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)

            # Define the limits of the chart
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
        
            # Display grid lines
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))
            nr=d1+1
            plt.title("Correlation Circle (PC{} and PC{})".format(d1+1, d2+1))
            plt.show(block=False)
            d = {'values': pca.components_[d1], 'factors': labels}
            df1= pd.DataFrame(d)
            df1.set_index('factors')
            df2=df1.sort_values(by='values', ascending=False)
            df3=df1.sort_values(by='values', ascending=True)
            print("Principal Component" + str(nr)+ " Presenting Values")
            print(df2.head(3))
            print(df3.head(3))
            
            nr=d2+1
            
            d = {'values': pca.components_[d2], 'factors': labels}
            df1= pd.DataFrame(d)
            df1.set_index('factors')
            df2=df1.sort_values(by='values', ascending=False)
            df3=df1.sort_values(by='values', ascending=True)
            print("Principal Component" + str(nr)+ " Presenting Values")
            print(df2.head(3))
            print(df3.head(3))

In [None]:
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    '''Display a scatter plot on a factorial plane, one for each factorial plane'''

    # For each factorial plane
    for d1,d2 in axis_ranks:
        if d2 < n_comp:
 
            # Initialise the matplotlib figure      
            fig = plt.figure(figsize=(7,6))
        
            # Display the points
            if illustrative_var is None:
                plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha,s=1)
            else:
                illustrative_var = np.array(illustrative_var)
                for value in np.unique(illustrative_var):
                    selected = np.where(illustrative_var == value)
                    plt.scatter(X_projected[selected, d1], X_projected[selected, d2], alpha=alpha, label=value, s=1)
                plt.legend()

            # Display the labels on the points
            if labels is not None:
                for i,(x,y) in enumerate(X_projected[:,[d1,d2]]):
                    plt.text(x, y, labels[i],
                              fontsize='14', ha='center',va='center') 
                
            # Define the limits of the chart
            boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
            plt.xlim([-boundary,boundary])
            plt.ylim([-boundary,boundary])
        
            # Display grid lines
            plt.plot([-100, 100], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-100, 100], color='grey', ls='--')

            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Projection of points (on PC{} and PC{})".format(d1+1, d2+1))
            #plt.show(block=False)


In [None]:
# Projection des individus
X_projected = pca.transform(X_scaled_df2)
display_factorial_planes(X_projected, n_comp, dpca, [(0,1),(2,3),(4,5)], labels = None)

plt.show()


# Partie 2: Clustering via K Nearest Neighbours
K-means : unsupervised --> creates (new) classes if a new feature is needed (eg if wish to do KNN but don't have any labels yet)
KNN : supervised --> after training, can attribute unlabeled individuals to an existing label (class)

- Explanation
  - <https://towardsdatascience.com/getting-acquainted-with-k-nearest-neighbors-ba0a9ecf354f>
  - <https://www.datasciencecentral.com/profiles/blogs/k-nearest-neighbor-algorithm-using-python>
    - The kNN task can be broken down into writing 3 primary functions:
        1. Calculate the distance between any two points
        2. Find the nearest neighbours based on these pairwise distances
        3. Majority vote on a class labels based on the nearest neighbour list
  - <https://stats.stackexchange.com/questions/56500/what-are-the-main-differences-between-k-means-and-k-nearest-neighbours>

- Imputation
  - <https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/>
    - with train test verification before applying the imputation
  - <https://medium.com/@kyawsawhtoon/a-guide-to-knn-imputation-95e2dc496e>

- visualisation
  - <https://towardsdatascience.com/knn-visualization-in-just-13-lines-of-code-32820d72c6b6>
- using sparse matrices (500000 columns!!) and cosine similarity
  - <https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea>

- Movie Recommender System Using K-Means Clustering AND K-Nearest Neighbor
  - <https://ieeexplore.ieee.org/document/8776969>
  - <https://github.com/BenChristensen12/movie-recommender-system>

- Step-by-step (movie) recommendation system based on tags similarity (actors, genres, description)
  - <https://www.analyticsvidhya.com/blog/2020/08/recommendation-system-k-nearest-neighbors/>
  - <https://www.kaggle.com/heeraldedhia/movie-ratings-and-recommendation-using-knn>
  - could replace with tag column similarities (brand, categories_en, ingredients)
    input label name --> find product with closest name, then use recommender to recommend similar products

- Pipelines
  - <https://stackoverflow.com/questions/50335203/how-to-apply-knn-on-a-mixed-datasetnumerical-categorical-after-doing-one-hot>

### KNN step 1 - Divide Data Into Features and Labels


In [None]:
pca_data5.info()

In [None]:
print(list(cleaned_data.columns))
print(list(pca_data5.columns))

In [None]:
def replace_unknown_values(df, col='pnns_groups_1'):
    # series = df[[col]].copy()
    # series.replace('unknown',np.NaN)
    # df.loc[:,col]=series.loc[:,col]
    df.loc[:,col] = df.loc[:,col].replace('unknown', np.NaN)
    # df[col].map(lambda x: np.nan if x=="unknown"  else x)
    return df

sel_cols=pca_data5.columns.to_list()
label_cols=['nova_group', 'pnns_groups_1','ingredients_text','additives_n','nutriscore_score','nutriscore_grade','nutrition-score-fr_100g']
# anal_cols=sel_cols+label_cols

anal_cols= [
     'energy_100g', 'fat_100g', 'carbohydrates_100g', 'proteins_100g', 'sugars_100g','saturated-fat_100g','sodium_100g', 
     
     'nutriscore_score', 'nutriscore_grade', 'nutrition-score-fr_100g',
      'ingredients_text','ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 'additives_n',  
       'nova_group',   
      'main_category_en', 'categories_en',
      'pnns_groups_1', 'pnns_groups_2',
      'ecoscore_grade_fr', 'brands'
      ]
to_analyse=cleaned_data[anal_cols].pipe(replace_unknown_values,'pnns_groups_1').pipe(replace_unknown_values,'pnns_groups_2')

to_analyse.info()


In [None]:
import missingno as msno
to_analyse=to_analyse.sort_values(by=['energy_100g', 'fat_100g', 'carbohydrates_100g', 'proteins_100g', 'sugars_100g','sodium_100g', 
     'saturated-fat_100g',
     'nutriscore_score', 'nutriscore_grade',
      'ingredients_text','ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 'additives_n',
      'pnns_groups_1',
           'nova_group'])
to_analyse=to_analyse.sort_values(by=['nutriscore_grade','ingredients_that_may_be_from_palm_oil_n',
'pnns_groups_1','main_category_en','nova_group',
'sodium_100g','fat_100g','energy_100g',])
msno.matrix(to_analyse)

In [None]:
to_analyse=to_analyse.sort_values(by=['ingredients_that_may_be_from_palm_oil_n','nutriscore_grade',
'pnns_groups_1','main_category_en','nova_group',
'energy_100g','sodium_100g','fat_100g',])
msno.matrix(to_analyse)

In [None]:

feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = pca_data5[feature_columns].values
y = dataset['Species'].values



In [None]:

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, neighbors
# from mlxtend.plotting import plot_decision_regions

def knn_comparison(data, k):
    x = data[['X','Y']].values
    y = data['class'].astype(int).values
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(x, y)
    # Plotting decision region
    # plot_decision_regions(x, y, clf=clf, legend=2)
    # Adding axes annotations
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(f'Knn with K={ str(k)}')
    plt.show()

# data1 = pd.read_csv('ushape.csv')
# for i in [1,5,20,30,40,80]:
    # knn_comparison(data1, i)

In [None]:
def append_class(df, class_name, feature, thresholds, names):
    '''Append a new class feature named 'class_name' based on a threshold split of 'feature'.  
    Threshold values are in 'thresholds' and class names are in 'names'.'''
    
    n = pd.cut(df[feature], bins = thresholds, labels=names)
    df[class_name] = n

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(Z, names, figsize=(10,25)):
    '''Plot a dendrogram to illustrate hierarchical clustering'''

    plt.figure(figsize=figsize)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('distance')
    dendrogram(
        Z,
        labels = names,
        orientation = "left",
    )
    #plt.show()

In [None]:
from pandas.plotting import parallel_coordinates

def display_parallel_coordinates(df, num_clusters):
    '''Display a parallel coordinates plot for the clusters in df'''

    # Select data points for individual clusters
    cluster_points = []
    for i in range(num_clusters):
        cluster_points.append(df[df.cluster==i])
    
    # Create the plot
    fig = plt.figure(figsize=(12, 15))
    title = fig.suptitle("Parallel Coordinates Plot for the Clusters", fontsize=18)
    fig.subplots_adjust(top=0.95, wspace=0)

    # Display one plot for each cluster, with the lines for the main cluster appearing over the lines for the other clusters
    for i in range(num_clusters):    
        plt.subplot(num_clusters, 1, i+1)
        for j,c in enumerate(cluster_points): 
            if i!= j:
                pc = parallel_coordinates(c, 'cluster', color=[addAlpha(palette[j],0.2)])
                pc = parallel_coordinates(cluster_points[i], 'cluster', color=[addAlpha(palette[i],0.5)])

        # Stagger the axes
        ax=plt.gca()
        for tick in ax.xaxis.get_major_ticks()[1::2]:
            tick.set_pad(20)        

In [None]:
def display_parallel_coordinates_centroids(df, num_clusters):
    '''Display a parallel coordinates plot for the centroids in df'''

    # Create the plot
    fig = plt.figure(figsize=(12, 5))
    title = fig.suptitle("Parallel Coordinates plot for the Centroids", fontsize=18)
    fig.subplots_adjust(top=0.9, wspace=0)

    # Draw the chart
    parallel_coordinates(df, 'cluster', color=palette)

    # Stagger the axes
    ax=plt.gca()
    for tick in ax.xaxis.get_major_ticks()[1::2]:
        tick.set_pad(5)  

# Partie 3 : ANOVA