## Classification
This notebook classifies all the data that has no PNNS category, by using a random forest trained on the data_nutri_pd dataframe created in the main notebook. The results of the classification will then be visually checked and stored in the PNNS_groups_1 column 

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
from mpl_toolkits.mplot3d import Axes3D
from difflib import get_close_matches

import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

import findspark
findspark.init()

from pyspark.sql import *
import pyspark.sql.functions as F

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go


In [2]:
def load_pickle(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

def save_pickle(result, file_path = 'pickle'):
    with open(file_path, 'wb') as file:
        pickle.dump(result, file)

In [3]:
DATA_FOLDER = './data/'
data = spark.read.csv(DATA_FOLDER +'en.openfoodfacts.org.products.csv',header=True,sep='\t')  

In [4]:
columns = ["energy_100g","fat_100g","sugars_100g",\
                       "proteins_100g","carbohydrates_100g",\
                       "salt_100g","product_name"]

nutrition_without_group = data.select(columns)

In [5]:
nutrition_pd = nutrition_without_group.toPandas()

In [6]:
nutrition_pd = nutrition_pd.dropna()

In [7]:
data_nutri_pd = load_pickle('data_nutri')

In [8]:
data_nutri_pd

Unnamed: 0,pnns_groups_1,energy_100g,fat_100g,sugars_100g,proteins_100g,carbohydrates_100g,salt_100g,product_name
0,Fruits And Vegetables,657.0,0.00,27.000,0.600,36.00,0.000000,Compote de poire
1,Cereals And Potatoes,669.0,2.20,0.600,9.500,25.20,0.358000,BAguette bressan
2,Fish Meat Eggs,1059.0,17.00,0.500,23.000,0.50,2.500000,Pavé de saumon fumé à la ficelle
3,Composite Foods,450.0,2.20,0.500,6.800,15.30,0.700000,Blanquette de Volaille et son Riz
4,Composite Foods,455.0,4.20,1.400,4.400,12.50,0.600000,Raviolini au Fromage de chèvre et Pesto
5,Fruits And Vegetables,1210.0,12.00,0.000,22.000,23.00,2.160000,Salade Cesar
6,Sugary Snacks,1520.0,14.40,28.100,4.790,54.10,0.922000,Danoises à la cannelle roulées
8,Sugary Snacks,1090.0,10.70,24.700,3.330,38.70,0.647000,Chaussons tressés aux pommes
10,Beverages,213.0,0.00,0.000,0.000,14.00,1.000000,Root Beer
12,Composite Foods,478.0,6.79,0.714,5.360,7.86,0.499000,Quiche Lorraine


In [9]:
X=data_nutri_pd.iloc[:,1:7]
y=data_nutri_pd.iloc[:,0]#LABELS

#standardize values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

model = RandomForestClassifier()
model.fit(X_std, y.values.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
unlabeled_X = scaler.transform(nutrition_pd.iloc[:,0:6])

prediction =  model.predict(unlabeled_X)

In [11]:
nutrition_pd

Unnamed: 0,energy_100g,fat_100g,sugars_100g,proteins_100g,carbohydrates_100g,salt_100g,product_name
0,1569,7,15,7.8,70.1,1.4,Vitória crackers
2,88,0,0.4,0.2,4.8,2.04,Sauce Sweety chili 0%
4,134,0.3,3.9,0.9,5.3,0.42,Salade de carottes râpées
5,540,4.9,16.3,4.4,16.3,0.25,Fromage blanc aux myrtilles
8,929,3.3,1.8,11.7,38.4,0.678,Baguette parisien
9,1213,9.4,2,12.5,41,0.9,&quot;Baguette Lyonnais&quot;
10,916,5.9,1.7,9.7,30.3,0.464,Solène céréales poulet
11,1594,22,21.9,4.6,27.3,0.1,Tarte noix de coco
12,418,0,23,0,24,3.2,Salade de fruits exotiques
13,657,21,20,7.8,39,1.5,Chouquettes x 30


In [12]:
nutrition_pd['pnns_groups_1'] = prediction

In [13]:
concatenated = pd.concat([nutrition_pd, data_nutri_pd], sort=True)

In [14]:
concatenated.head()

Unnamed: 0,carbohydrates_100g,energy_100g,fat_100g,pnns_groups_1,product_name,proteins_100g,salt_100g,sugars_100g
0,70.1,1569,7.0,Cereals And Potatoes,Vitória crackers,7.8,1.4,15.0
2,4.8,88,0.0,Fat And Sauces,Sauce Sweety chili 0%,0.2,2.04,0.4
4,5.3,134,0.3,Fruits And Vegetables,Salade de carottes râpées,0.9,0.42,3.9
5,16.3,540,4.9,Milk And Dairy Products,Fromage blanc aux myrtilles,4.4,0.25,16.3
8,38.4,929,3.3,Cereals And Potatoes,Baguette parisien,11.7,0.678,1.8


Let's visualize it again: performing t-sne to check if the clustering is preserved.

In [None]:
numeric_columns  = ['carbohydrates_100g','energy_100g','proteins_100g','salt_100g','sugars_100g']
X = StandardScaler().fit_transform(concatenated[numeric_columns])
%time Y_tsne_concat = TSNE().fit_transform(X)