# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics as sm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import mpld3
import random
# mpld3.enable_notebook()

### Read the CSV obtained from part 1

In [None]:
# Location of the tab-separated OpenFoodFacts export, relative to the notebook.
PATH = "./dataset/openfoodfacts.csv"

In [None]:
# Load the tab-separated export into a DataFrame.
dataset = pd.read_csv(PATH, delimiter="\t")

### Cleaning: remove impossible entries

In [None]:
# Keep only rows whose energy is below 15000 kcal per 100 g; rows with a
# missing energy value fail the comparison and are dropped as well.
dataset = dataset.loc[dataset["energy-kcal_100g"].lt(15000)]

### Number of entries per category
Count the entries in each category to decide which ones to study

In [1]:
# Number of named products in each PNNS group (missing names are not counted).
dataset.groupby("pnns_groups_1")["product_name"].count()

NameError: name 'dataset' is not defined

## Clustering approaches

### Remove NA
We have chosen to study the most relevant and filled columns

In [None]:
# Nutrient columns that must all be present for a product to be kept.
required_columns = [
    "energy-kcal_100g",
    "energy_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
]
# Drop every row that has a missing value in at least one of those columns.
dataset = dataset.dropna(subset=required_columns)

### Define X
We choose here to first study **Cereals and potatoes**.

In [None]:
# Feature matrix for the "Cereals and potatoes" group.
# Rows and columns are selected in a single .loc call and copied explicitly:
# later cells overwrite columns of X, and writing into a chained-indexing
# slice of `dataset` raises pandas' SettingWithCopyWarning.
X = dataset.loc[
    dataset["pnns_groups_1"] == "Cereals and potatoes",
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].copy()

### K-means

Fit model

In [2]:
# K-means on the raw features; the number of clusters is left at
# scikit-learn's default. random_state pins the centroid initialisation so
# the cluster ids inspected in later cells are the same on every run.
model = KMeans(random_state=0)
label = model.fit_predict(X)
label

NameError: name 'KMeans' is not defined

Display the result as a grid of pairwise scatter plots

In [None]:
# One colour per cluster, drawn from a seeded generator so the figure looks
# the same on every run (a single row per cluster id is all cmap[label] needs).
cmap = np.random.default_rng(0).integers(0, 256, size=(label.max() + 1, 3)) / 256
plt.rcParams["figure.figsize"] = 20, 20

# Every panel plots one feature against the energy in kcal.
x_column = "energy-kcal_100g"
y_columns = [
    "energy_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
]

# sharex: all panels use the same x variable, so x ticks are only drawn on the
# bottom row. The y axes are not shared (each feature has its own scale), so
# every panel keeps its own y ticks and y label.
fig, axs = plt.subplots(3, 3, sharex=True)
for ax, y_column in zip(axs.flat, y_columns):
    ax.scatter(X[x_column], X[y_column], s=1, c=cmap[label])
    ax.set_title(f"{x_column} x {y_column}")
    ax.set_ylabel(y_column)

for ax in axs[-1]:
    ax.set_xlabel(x_column)

Let's see some elements of some clusters to try to understand the clusters made by the K-means

In [None]:
# Copy of the feature matrix with the cluster id of each row attached.
X_with_labels = X.assign(labels=label)

In [None]:
# First rows of cluster 0, restricted to the nutrient columns.
X_with_labels.loc[
    X_with_labels["labels"] == 0,
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].head()

In [None]:
# First rows of cluster 1, restricted to the nutrient columns.
X_with_labels.loc[
    X_with_labels["labels"] == 1,
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].head()

### K-means with PCA first

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the features onto their first two principal components.
# random_state is fixed because PCA may pick a randomized SVD solver on its
# own; pinning it keeps the projection (and the clusters built on it) stable.
pca_2 = PCA(n_components=2, random_state=0)
pca_2_result = pca_2.fit_transform(X)
print(f'Explained variation per principal component: {pca_2.explained_variance_ratio_}')
print(f'Cumulative variance explained by 2 principal components: {np.sum(pca_2.explained_variance_ratio_):.2%}')

In [None]:
# Five clusters on the 2-D PCA projection. random_state pins the centroid
# initialisation so cluster ids are the same on every run.
model = KMeans(n_clusters=5, random_state=0)
label = model.fit_predict(pca_2_result)

Display result

In [None]:
plt.rcParams["figure.figsize"] = 10, 10
# One seeded colour per cluster: cmap[label] only ever reads one row per
# cluster id, and the seed keeps the colours identical between runs.
cmap = np.random.default_rng(0).integers(0, 256, size=(label.max() + 1, 3)) / 256
plt.scatter(pca_2_result[:, 0], pca_2_result[:, 1], s=1, c=cmap[label])
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("K-means clusters on the PCA projection");

Let's see some elements of some clusters to try to understand the clusters made by the K-means

In [None]:
# Copy of the feature matrix with the cluster id and the product name of each
# row attached; the names are matched to the rows through the shared index.
X_with_labels = X.assign(labels=label, product_name=dataset["product_name"])

In [None]:
# First rows of cluster 0, restricted to the nutrient columns.
X_with_labels.loc[
    X_with_labels["labels"] == 0,
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].head()

Let's see a summary of the outliers of each cluster

In [None]:
# For every feature and every cluster, print the median plus the full
# `dataset` record of the product holding the minimum and the maximum.
for col in X.columns:
    print(col)
    for cluster_id in sorted(set(label)):
        print(f"\tLabel {cluster_id}")

        # Values of this feature restricted to the current cluster.
        cluster_values = X_with_labels.loc[X_with_labels["labels"] == cluster_id, col]
        print(f"\t\tMedian = {cluster_values.median()}")

        # idxmin/idxmax give the row label of the first extreme value. The
        # product is then fetched by that label instead of by its name, so a
        # missing or duplicated product_name can no longer break the lookup
        # or return another product's record.
        extremes = (("Min", cluster_values.idxmin()), ("Max", cluster_values.idxmax()))
        for stat_name, row_id in extremes:
            print(f"\t\t{stat_name} = {cluster_values.loc[row_id]}")
            product_line = dataset.loc[row_id]
            print(f"\t\t\t{product_line['product_name']}")
            for column, value in product_line.items():
                print(f"\t\t\t\t{column} : {value}")

        print("----------")

    print("-------------------------------------------")

### K-means with PCA with log on energy and energy-kcal

In [None]:
import math
# Replace both energy columns by their natural logarithm; exact zeros are
# kept at 0 because log(0) is undefined.
for energy_column in ("energy-kcal_100g", "energy_100g"):
    X[energy_column] = X[energy_column].apply(
        lambda value: 0 if value == 0 else math.log(value)
    )

In [None]:
# Project the log-transformed features onto their first two principal
# components. random_state is fixed because PCA may pick a randomized SVD
# solver on its own; pinning it keeps the projection stable between runs.
pca_2 = PCA(n_components=2, random_state=0)
pca_2_result = pca_2.fit_transform(X)
print(f'Explained variation per principal component: {pca_2.explained_variance_ratio_}')
print(f'Cumulative variance explained by 2 principal components: {np.sum(pca_2.explained_variance_ratio_):.2%}')

In [None]:
# Five clusters on the 2-D PCA projection. random_state pins the centroid
# initialisation so cluster ids are the same on every run.
model = KMeans(n_clusters=5, random_state=0)
label = model.fit_predict(pca_2_result)

Display result

In [None]:
plt.rcParams["figure.figsize"] = 10, 10
# One seeded colour per cluster: cmap[label] only ever reads one row per
# cluster id, and the seed keeps the colours identical between runs.
cmap = np.random.default_rng(0).integers(0, 256, size=(label.max() + 1, 3)) / 256
plt.scatter(pca_2_result[:, 0], pca_2_result[:, 1], s=1, c=cmap[label])
# The upper limit is set AFTER plotting: setting it on the still-empty axes
# switches off autoscaling and leaves the lower limit at the default 0, which
# hides every point whose second component is negative.
plt.ylim(top=10)
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("K-means clusters on the PCA projection (log energy)");

Let's see a summary of the outliers of each cluster

In [None]:
# `X_cap` is not defined in any visible cell of this notebook; `X` is the frame the
# current labels were fitted on, so it is used here instead.
X_with_labels = X.copy()
X_with_labels["labels"] = label
# Product names are matched to the rows through the shared index.
X_with_labels["product_name"] = dataset["product_name"]

# A bare expression in the middle of a cell is never rendered, so the preview
# of cluster 0 is displayed explicitly.
display(
    X_with_labels.loc[
        X_with_labels["labels"] == 0,
        [
            "energy-kcal_100g",
            "energy_100g",
            "fat_100g",
            "saturated-fat_100g",
            "carbohydrates_100g",
            "sugars_100g",
            "fiber_100g",
            "proteins_100g",
            "salt_100g",
            "sodium_100g",
        ],
    ].head()
)

# For every feature and every cluster, print the median plus the full
# `dataset` record of the product holding the minimum and the maximum.
for col in X.columns:
    print(col)
    for cluster_id in sorted(set(label)):
        print(f"\tLabel {cluster_id}")

        # Values of this feature restricted to the current cluster.
        cluster_values = X_with_labels.loc[X_with_labels["labels"] == cluster_id, col]
        print(f"\t\tMedian = {cluster_values.median()}")

        # idxmin/idxmax give the row label of the first extreme value. The
        # product is then fetched by that label instead of by its name, so a
        # missing or duplicated product_name can no longer break the lookup
        # or return another product's record.
        extremes = (("Min", cluster_values.idxmin()), ("Max", cluster_values.idxmax()))
        for stat_name, row_id in extremes:
            print(f"\t\t{stat_name} = {cluster_values.loc[row_id]}")
            product_line = dataset.loc[row_id]
            print(f"\t\t\t{product_line['product_name']}")
            for column, value in product_line.items():
                print(f"\t\t\t\t{column} : {value}")

        print("----------")

    print("-------------------------------------------")

### Sugary snacks
We tried the same approach on sugary snacks

In [None]:
# Feature matrix for the "Sugary snacks" group.
# Rows and columns are selected in a single .loc call and copied explicitly:
# the next cell overwrites columns of X, and writing into a chained-indexing
# slice of `dataset` raises pandas' SettingWithCopyWarning.
X = dataset.loc[
    dataset["pnns_groups_1"] == "Sugary snacks",
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].copy()

In [None]:
# Replace both energy columns by their natural logarithm; exact zeros are
# kept at 0 because log(0) is undefined.
for energy_column in ("energy-kcal_100g", "energy_100g"):
    X[energy_column] = X[energy_column].apply(
        lambda value: 0 if value == 0 else math.log(value)
    )

In [None]:
# Project the features onto their first two principal components.
# random_state is fixed because PCA may pick a randomized SVD solver on its
# own; pinning it keeps the projection (and the clusters built on it) stable.
pca_2 = PCA(n_components=2, random_state=0)
pca_2_result = pca_2.fit_transform(X)
print(f'Explained variation per principal component: {pca_2.explained_variance_ratio_}')
print(f'Cumulative variance explained by 2 principal components: {np.sum(pca_2.explained_variance_ratio_):.2%}')

In [None]:
# Five clusters on the 2-D PCA projection. random_state pins the centroid
# initialisation so cluster ids are the same on every run.
model = KMeans(n_clusters=5, random_state=0)
label = model.fit_predict(pca_2_result)

Display result

In [None]:
plt.rcParams["figure.figsize"] = 10, 10
# One seeded colour per cluster: cmap[label] only ever reads one row per
# cluster id, and the seed keeps the colours identical between runs.
cmap = np.random.default_rng(0).integers(0, 256, size=(label.max() + 1, 3)) / 256
plt.scatter(pca_2_result[:, 0], pca_2_result[:, 1], s=1, c=cmap[label])
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("K-means clusters on the PCA projection (sugary snacks)");

Let's see a summary of the outliers of each cluster

In [None]:
# `X_cap` is not defined in any visible cell of this notebook; `X` is the frame the
# current labels were fitted on, so it is used here instead.
X_with_labels = X.copy()
X_with_labels["labels"] = label
# Product names are matched to the rows through the shared index.
X_with_labels["product_name"] = dataset["product_name"]

# A bare expression in the middle of a cell is never rendered, so the preview
# of cluster 0 is displayed explicitly.
display(
    X_with_labels.loc[
        X_with_labels["labels"] == 0,
        [
            "energy-kcal_100g",
            "energy_100g",
            "fat_100g",
            "saturated-fat_100g",
            "carbohydrates_100g",
            "sugars_100g",
            "fiber_100g",
            "proteins_100g",
            "salt_100g",
            "sodium_100g",
        ],
    ].head()
)

# For every feature and every cluster, print the median plus the full
# `dataset` record of the product holding the minimum and the maximum.
for col in X.columns:
    print(col)
    for cluster_id in sorted(set(label)):
        print(f"\tLabel {cluster_id}")

        # Values of this feature restricted to the current cluster.
        cluster_values = X_with_labels.loc[X_with_labels["labels"] == cluster_id, col]
        print(f"\t\tMedian = {cluster_values.median()}")

        # idxmin/idxmax give the row label of the first extreme value. The
        # product is then fetched by that label instead of by its name, so a
        # missing or duplicated product_name can no longer break the lookup
        # or return another product's record.
        extremes = (("Min", cluster_values.idxmin()), ("Max", cluster_values.idxmax()))
        for stat_name, row_id in extremes:
            print(f"\t\t{stat_name} = {cluster_values.loc[row_id]}")
            product_line = dataset.loc[row_id]
            print(f"\t\t\t{product_line['product_name']}")
            for column, value in product_line.items():
                print(f"\t\t\t\t{column} : {value}")

        print("----------")

    print("-------------------------------------------")

## DBSCAN
We tried another clustering approach

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

In [None]:
# Feature matrix for the "Beverages" group.
# Rows and columns are selected in a single .loc call and copied explicitly:
# the next cell overwrites columns of X, and writing into a chained-indexing
# slice of `dataset` raises pandas' SettingWithCopyWarning.
X = dataset.loc[
    dataset["pnns_groups_1"] == "Beverages",
    [
        "energy-kcal_100g",
        "energy_100g",
        "fat_100g",
        "saturated-fat_100g",
        "carbohydrates_100g",
        "sugars_100g",
        "fiber_100g",
        "proteins_100g",
        "salt_100g",
        "sodium_100g",
    ],
].copy()

In [None]:
# Replace both energy columns by their natural logarithm; exact zeros are
# kept at 0 because log(0) is undefined.
for energy_column in ("energy-kcal_100g", "energy_100g"):
    X[energy_column] = X[energy_column].apply(
        lambda value: 0 if value == 0 else math.log(value)
    )

In [None]:
# Project the features onto their first two principal components.
# random_state is fixed because PCA may pick a randomized SVD solver on its
# own; pinning it keeps the projection (and the clusters built on it) stable.
pca_2 = PCA(n_components=2, random_state=0)
pca_2_result = pca_2.fit_transform(X)
print(f'Explained variation per principal component: {pca_2.explained_variance_ratio_}')
print(f'Cumulative variance explained by 2 principal components: {np.sum(pca_2.explained_variance_ratio_):.2%}')

In [None]:
# Density-based clustering of the 2-D PCA projection.
db = DBSCAN(eps=3.5, min_samples=5)
db.fit(pca_2_result)

In [None]:
labels = db.labels_

# Boolean mask with True at the positions of the core samples.
core_samples_mask = np.isin(np.arange(len(labels)), db.core_sample_indices_)

# The label -1 marks noise points and is not counted as a cluster.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

Display some results

In [None]:
plt.rcParams["figure.figsize"] = 10, 10
# One seeded colour per cluster plus one trailing row: the noise label -1
# makes cmap[labels] read the LAST row, so that row is reserved for noise
# (black) instead of being shared with the highest cluster id.
cmap = np.random.default_rng(0).integers(0, 256, size=(labels.max() + 2, 3)) / 256
cmap[-1] = 0
plt.scatter(pca_2_result[:, 0], pca_2_result[:, 1], s=10, c=cmap[labels])
# The upper limit is set AFTER plotting: setting it on the still-empty axes
# switches off autoscaling and leaves the lower limit at the default 0, which
# hides every point whose second component is negative.
plt.ylim(top=10)
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("DBSCAN clusters on the PCA projection (noise in black)");

In [None]:
plt.rcParams["figure.figsize"] = 20, 20

# Every panel plots one feature against the salt content, so titles and axis
# labels are derived from the plotted columns.
x_column = "salt_100g"
# NOTE(review): the y variables are kept as they were, so one panel plots salt
# against itself and "energy-kcal_100g" never appears - confirm this is intended.
y_columns = [
    "energy_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
]

# sharex: all panels use the same x variable, so x ticks are only drawn on the
# bottom row. The y axes are not shared (each feature has its own scale), so
# every panel keeps its own y ticks and y label.
fig, axs = plt.subplots(3, 3, sharex=True)
for ax, y_column in zip(axs.flat, y_columns):
    ax.scatter(X[x_column], X[y_column], s=1, c=cmap[labels])
    ax.set_title(f"{x_column} x {y_column}")
    ax.set_ylabel(y_column)
    # Restrict the view to salt values between 0 and 10.
    ax.set_xlim([0, 10])

for ax in axs[-1]:
    ax.set_xlabel(x_column)

Let's see a summary of the outliers of each cluster

In [None]:
# `X_cap` is not defined in any visible cell of this notebook; `X` is the frame the
# DBSCAN labels were fitted on, so it is used here instead. The DBSCAN result
# lives in `labels`; `label` still holds the K-means result of the previous
# section, which was computed on a different set of rows.
X_with_labels = X.copy()
X_with_labels["labels"] = labels
# Product names are matched to the rows through the shared index.
X_with_labels["product_name"] = dataset["product_name"]

# A bare expression in the middle of a cell is never rendered, so the preview
# of cluster 0 is displayed explicitly.
display(
    X_with_labels.loc[
        X_with_labels["labels"] == 0,
        [
            "energy-kcal_100g",
            "energy_100g",
            "fat_100g",
            "saturated-fat_100g",
            "carbohydrates_100g",
            "sugars_100g",
            "fiber_100g",
            "proteins_100g",
            "salt_100g",
            "sodium_100g",
        ],
    ].head()
)

# For every feature and every cluster (the label -1 is DBSCAN's noise group),
# print the median plus the full `dataset` record of the product holding the
# minimum and the maximum.
for col in X.columns:
    print(col)
    for cluster_id in sorted(set(labels)):
        print(f"\tLabel {cluster_id}")

        # Values of this feature restricted to the current cluster.
        cluster_values = X_with_labels.loc[X_with_labels["labels"] == cluster_id, col]
        print(f"\t\tMedian = {cluster_values.median()}")

        # idxmin/idxmax give the row label of the first extreme value. The
        # product is then fetched by that label instead of by its name, so a
        # missing or duplicated product_name can no longer break the lookup
        # or return another product's record.
        extremes = (("Min", cluster_values.idxmin()), ("Max", cluster_values.idxmax()))
        for stat_name, row_id in extremes:
            print(f"\t\t{stat_name} = {cluster_values.loc[row_id]}")
            product_line = dataset.loc[row_id]
            print(f"\t\t\t{product_line['product_name']}")
            for column, value in product_line.items():
                print(f"\t\t\t\t{column} : {value}")

        print("----------")

    print("-------------------------------------------")