# PNNS Groups
Last modification : 13/05/2021

Short description :

    - Clean PNNS groups 1 and 2 (lower-case labels, normalise hyphens).
    - Fill in 'unknown' PNNS groups 1 and 2 from the category tags.
    - Derive PNNS groups 3 and 4 when possible.
    - Export csv as 'df_multi_pnns_v2.csv'

In [2]:
from eml.datanavig import off_columns_dict
cols = off_columns_dict.copy()
from eml.datanavig import *
from eml.functions import *

In [3]:
import pandas as pd
import numpy as np 
import re
import os
import time 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

#robotoff
from robotoff.products import ProductDataset
from robotoff.taxonomy import get_taxonomy

from IPython.lib.deepreload import reload as dreload
from pandas import json_normalize

#Settings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
sns.set()

In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
%%time
df_raw = pd.read_csv(r'C:\Users\Antoine\Coding Bootcamp\machine learning\Open Food Facts\data\en.openfoodfacts.org.products.csv', sep='\t', low_memory=False)
df_raw.shape

Wall time: 1min 20s


(1625068, 184)

In [6]:
# Keep only the products that have category tags: find_pnns splits
# row['categories_tags'], so rows without tags cannot be processed.
df = df_raw.dropna(subset=['categories_tags'])
df.shape

(808019, 184)

In [7]:
# Load the pnns <-> taxonomy mapping used by find_pnns and preview its first rows.
taxonomy_json_path = r'C:\Users\Antoine\Coding Bootcamp\machine learning\Open Food Facts\data\taxonomy_pnns.json'
df_tax = pd.read_json(taxonomy_json_path)
df_tax.head()

Unnamed: 0,all_taxonomy_possibilities,pnns,taxonomy_suggestion
0,{'en:appetizers': 0},appetizers,en:appetizers
1,"{'en:artificially-sweetened-beverages': 2, 'en...",artificially sweetened beverages,en:artificially-sweetened-beverages
2,"{'en:biscuits': 10, 'en:biscuits-and-cakes': 2...",biscuits and cakes,en:biscuits-and-cakes
3,"{'en:bagel-breads': 7, 'en:baguette-made-with-...",bread,en:breads
4,"{'en:breakfast-cereals': 1, 'en:breakfast-cere...",breakfast cereals,en:breakfast-cereals


#Apply only on rows with 'unknown' or NaN pnns
def find_pnns(row, df_taxonomy):
    """
    Find a pnns group from a comma-separated string of category tags.

    The taxonomy information returned by ``get_taxonomy_info`` for the first
    tag is merged with the tags themselves. The merged tags are matched first
    against ``df_taxonomy.taxonomy_suggestion`` and, if nothing matches, against
    the keys of ``df_taxonomy.all_taxonomy_possibilities``. The first matching
    taxonomy row wins in both passes.

    Parameters:
    -----------
        - row: str, comma-separated category tags
        - df_taxonomy: pd.DataFrame with columns 'taxonomy_suggestion', 'pnns'
          and 'all_taxonomy_possibilities' (one dict of tags per row)

    Return:
    -------
        The pnns of the first matching taxonomy row, or np.nan when `row` is
        not a string, when get_taxonomy_info returns a float (NaN), or when
        no tag matches.
    """
    # Missing tags (NaN / None) cannot be split: nothing to search.
    if not isinstance(row, str):
        return np.nan

    tags_list = row.split(',')
    tax_list = get_taxonomy_info(tags_list[0])
    # get_taxonomy_info signals "nothing found" with a float (NaN).
    if isinstance(tax_list, float):
        return np.nan

    # A set gives O(1) membership tests and removes duplicates in one go.
    tax_set = {tax.strip(' ') for tax in tax_list + tags_list}

    # First pass: main taxonomy suggestions.
    for suggestion, pnns in zip(df_taxonomy.taxonomy_suggestion, df_taxonomy.pnns):
        if suggestion in tax_set:
            return pnns

    # Second pass: any other known taxonomy possibility. Returning here stops
    # both loops, so a later row can no longer overwrite the first match.
    for possibilities, pnns in zip(df_taxonomy.all_taxonomy_possibilities, df_taxonomy.pnns):
        if any(possibility in tax_set for possibility in possibilities):
            return pnns

    return np.nan

In [8]:
# Lower-case both pnns columns; hyphens are replaced by spaces in pnns_groups_1 only.
df['pnns_groups_2'] = df['pnns_groups_2'].str.lower()
df['pnns_groups_1'] = df['pnns_groups_1'].str.lower().str.replace('-',' ')
# Show how many products fall in each pnns_groups_1 label.
df['pnns_groups_1'].value_counts()

unknown                    154910
sugary snacks              140653
milk and dairy products     82904
fish meat eggs              82817
cereals and potatoes        74820
fat and sauces              63479
beverages                   61758
fruits and vegetables       50780
composite foods             50224
salty snacks                30263
Name: pnns_groups_1, dtype: int64

In [9]:
def find_pnns(row=None, df_taxonomy=None, pnns_n=1, nb_groups=4, search_duplicates=True):
    """
    Find a pnns group based on categories tags and taxonomy suggestions.

    The product tags and their taxonomy parents are matched against
    `df_taxonomy` in two passes: first the `taxonomy_suggestion` column, then
    every key of `all_taxonomy_possibilities`. Each match appends the row's
    pnns to the candidate list, so the same pnns may appear several times.

    Parameters:
    -----------
        - row: df row to apply; must provide 'categories_tags' (comma-separated
          tags) and, when `search_duplicates` is True, the columns
          'pnns_groups_1' .. f'pnns_groups_{nb_groups - 1}'
        - df_taxonomy: pd.DataFrame with pnns suggestions & matching tags
          (columns 'taxonomy_suggestion', 'pnns', 'all_taxonomy_possibilities')
        - pnns_n: 1-based rank of the candidate to return; when fewer
          candidates exist, the last available one is returned
        - nb_groups: exclusive upper bound of the pnns_groups_<i> columns whose
          values count as "already used" for the duplicate search
        - search_duplicates: when True and the selected pnns is already used
          in one of those columns, return the first candidate that is not
          (or the last candidate if all of them are used)

    Return:
    -------
        The selected pnns, or np.nan when 'categories_tags' is not a string
        or no candidate is found.

    Example:
    --------
        df['pnns_groups_3'] = df.apply(
            lambda row: find_pnns(row=row, df_taxonomy=df_tax, pnns_n=2, nb_groups=2),
            axis=1)
    """
    #---- Setup variables ----

    #setup output
    output = np.nan

    #---- Search Categories Tags ----

    #missing tags (NaN) cannot be split: nothing to search
    raw_tags = row['categories_tags']
    if not isinstance(raw_tags, str):
        return output
    #convert row to a list of tags
    tags_list = raw_tags.split(',')
    #search parents in taxonomy (a float result means "not found")
    parents = [get_taxonomy_info(tag, info_type='parents') for tag in tags_list]
    #flatten parents, add original tags, clean strings, remove duplicates;
    #a set also makes the membership tests below O(1)
    tax_set = {
        tax.strip(' ')
        for sublist in parents
        if not isinstance(sublist, float)
        for tax in sublist
    }
    tax_set.update(tag.strip(' ') for tag in tags_list)

    #---- Search pnns candidates ----

    #find candidates in main suggestions
    pnns_candidates = [
        pnns
        for suggestion, pnns in zip(df_taxonomy.taxonomy_suggestion, df_taxonomy.pnns)
        if suggestion in tax_set
    ]
    #find candidates in others possibilities (one entry per matching key)
    for possibilities, pnns in zip(df_taxonomy.all_taxonomy_possibilities, df_taxonomy.pnns):
        for possibility in possibilities:
            if possibility in tax_set:
                pnns_candidates.append(pnns)

    #---- Setup pnns output ----

    #return nan if no pnns found
    if not pnns_candidates:
        return output
    #clamp the requested rank to the available candidates: avoids an
    #IndexError when pnns_n is larger than the number of candidates
    output = pnns_candidates[min(pnns_n - 1, len(pnns_candidates) - 1)]

    #---- Search duplicates option ----

    #If pnns already exist in another group
    if search_duplicates:
        existing_values = [row[f'pnns_groups_{i}'] for i in range(1, nb_groups)]
        if output in existing_values:
            #try to find another pnns
            for candidate in pnns_candidates:
                output = candidate
                if output not in existing_values:
                    break
    return output

In [10]:
%time df['pnns_groups_2'].loc[df['pnns_groups_2']== 'unknown'] = df.apply(lambda row: find_pnns(row=row, df_taxonomy=df_tax, pnns_n=1, nb_groups=2, search_duplicates=False), axis=1)

Wall time: 6min 14s


In [11]:
# Rename the 'pizza pies and quiches' label in pnns_groups_2.
# The previous df.loc[mask]['pnns_groups_2'] = ... form assigned to a temporary
# copy and left df unchanged; a single .loc[rows, column] assignment updates df.
# NOTE(review): the taxonomy-derived label shown in the outputs is
# 'pizza pies and quiche' — confirm which spelling is the intended target.
df.loc[df['pnns_groups_2'] == 'pizza pies and quiches', 'pnns_groups_2'] = 'pizza pie and quiches'

In [12]:
# Derive two extra pnns columns by running find_pnns on every row with pnns_n=2.
# The two calls differ only in nb_groups, i.e. in which existing columns the
# duplicate search compares against:
#   nb_groups=2 -> pnns_groups_1 only
#   nb_groups=3 -> pnns_groups_1 and pnns_groups_2
# NOTE(review): pnns_groups_3 is not among the columns checked when computing
# pnns_groups_4, so both columns can hold the same value — confirm this is intended.
%time df['pnns_groups_3'] = df.apply(lambda row: find_pnns(row=row, df_taxonomy=df_tax, pnns_n=2, nb_groups=2), axis=1)
%time df['pnns_groups_4'] = df.apply(lambda row: find_pnns(row=row, df_taxonomy=df_tax, pnns_n=2, nb_groups=3), axis=1)

Wall time: 6min 6s
Wall time: 6min 35s


In [13]:
def find_pnns_groups_1(df):
    """
    Fill 'unknown' pnns_groups_1 values from the pnns_groups_2 column.

    For each known pnns_groups_1 value, the pnns_groups_2 values observed with
    it are collected; rows whose pnns_groups_1 is still 'unknown' and whose
    pnns_groups_2 is one of them receive that pnns_groups_1 value. When a
    pnns_groups_2 value is seen under several groups, the first group
    (in order of appearance in the data) wins.

    Parameters:
    -----------
        - df: pd.DataFrame with 'pnns_groups_1' and 'pnns_groups_2' columns

    Return:
    -------
        A modified copy of `df`; the input dataframe is left untouched.
    """
    data = df.copy()
    # Filtering (instead of list.remove) does not fail when no row is 'unknown'.
    vals_to_find = [val for val in data['pnns_groups_1'].unique() if val != 'unknown']
    for val in vals_to_find:
        group_2_vals = data.loc[data['pnns_groups_1'] == val, 'pnns_groups_2'].unique()
        # The mask is rebuilt on every pass so that rows already filled by an
        # earlier group are not reassigned.
        to_fill = (data['pnns_groups_1'] == 'unknown') & data['pnns_groups_2'].isin(group_2_vals)
        # Single .loc[rows, column] assignment: the chained data[col].loc[mask] = val
        # form does not update `data` when pandas Copy-on-Write is enabled.
        data.loc[to_fill, 'pnns_groups_1'] = val
    return data

In [14]:
# Names of the four pnns columns: pnns_groups_1 .. pnns_groups_4.
pnns = ['pnns_groups_' + str(group_n) for group_n in range(1, 5)]
pnns

['pnns_groups_1', 'pnns_groups_2', 'pnns_groups_3', 'pnns_groups_4']

In [15]:
# Drop, in place, every row that has a missing value in any column listed in `pnns`.
df.dropna(subset=pnns, inplace=True)

In [16]:
df_pnns = find_pnns_groups_1(df)

In [17]:
# Remove the row labelled 690514 from df_pnns, in place.
# NOTE(review): the reason for dropping this row is not recorded in the notebook,
# and drop() raises KeyError when the label is absent (e.g. with another data
# dump) — confirm the label is still valid for the file being processed.
df_pnns.drop(index=690514, inplace=True)

In [18]:
# Replace the two hyphenated pnns_groups_2 labels with their space-separated form.
# One Series.replace with a mapping does exact-value substitution and avoids the
# chained df[col].loc[mask] = ... assignment, which is not guaranteed to update
# df_pnns (it is a no-op under pandas Copy-on-Write).
df_pnns['pnns_groups_2'] = df_pnns['pnns_groups_2'].replace({
    'plant-based milk substitutes': 'plant based milk substitutes',
    'one-dish meals': 'one dish meals',
})

In [19]:
df_pnns.shape

(674517, 186)

In [20]:
cols['ingredients']

['labels',
 'labels_tags',
 'labels_en',
 'ingredients_text',
 'allergens',
 'traces',
 'traces_tags',
 'traces_en',
 'additives_n',
 'additives_tags',
 'additives_en',
 'ingredients_from_palm_oil_n',
 'ingredients_from_palm_oil_tags',
 'ingredients_that_may_be_from_palm_oil_n',
 'ingredients_that_may_be_from_palm_oil_tags',
 'nutriscore_score',
 'nutriscore_grade',
 'nova_group']

In [21]:
df_pnns[cols['infopdt']].isnull().sum()

product_name                  5430
abbreviated_product_name    674193
generic_name                580076
quantity                    403268
packaging                   474399
packaging_tags              474407
packaging_text              672762
brands                      184560
brands_tags                 184582
stores                      519438
serving_size                350036
serving_quantity            349785
nutriscore_score            128924
nutriscore_grade            128924
nova_group                  237771
brand_owner                 456145
dtype: int64

In [22]:
df_pnns[cols['ingredients']].isnull().sum()

labels                                        467195
labels_tags                                   467175
labels_en                                     467175
ingredients_text                              207157
allergens                                     536139
traces                                        600091
traces_tags                                   589122
traces_en                                     589122
additives_n                                   207156
additives_tags                                397124
additives_en                                  397124
ingredients_from_palm_oil_n                   207156
ingredients_from_palm_oil_tags                665276
ingredients_that_may_be_from_palm_oil_n       207156
ingredients_that_may_be_from_palm_oil_tags    646225
nutriscore_score                              128924
nutriscore_grade                              128924
nova_group                                    237771
dtype: int64

In [23]:
df_pnns

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,abbreviated_product_name,generic_name,...,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,pnns_groups_3,pnns_groups_4
3,0000000000100,http://world-en.openfoodfacts.org/product/0000...,del51,1444572561,2015-10-11T14:09:21Z,1444659212,2015-10-12T14:13:32Z,moutarde au moût de raisin,,,...,,,,,,,,,dressings and sauces,dressings and sauces
14,0000000000949,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1523440813,2018-04-11T10:00:13Z,1565268412,2019-08-08T12:46:52Z,Salade de carottes râpées,,,...,,,,,,,,,legumes,legumes
22,0000000001281,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1517830801,2018-02-05T11:40:01Z,1527070794,2018-05-23T10:19:54Z,Tarte noix de coco,,,...,,,,,,,,,pizza pies and quiche,pizza pies and quiche
32,0000000001885,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1511180337,2017-11-20T12:18:57Z,1518126491,2018-02-08T21:48:11Z,Compote de poire,,,...,,,,,,,,,fruits,dairy desserts
34,0000000002103,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1539629524,2018-10-15T18:52:04Z,1549964455,2019-02-12T09:40:55Z,Aiguillettes de poulet,,,...,,,,,,,,,meat,meat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1625055,9999992756068,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1587209487,2020-04-18T11:31:27Z,1591133605,2020-06-02T21:33:25Z,Steak haché,,,...,,,,,,,,,meat,meat
1625058,9999999004360,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1548086277,2019-01-21T15:57:57Z,1558357406,2019-05-20T13:03:26Z,Minis beignets,,Beignets natures sucrés,...,,,,,,,,,sweets,sweets
1625060,9999999175305,http://world-en.openfoodfacts.org/product/9999...,sil,1577002381,2019-12-22T08:13:01Z,1596533045,2020-08-04T09:24:05Z,Erdbeerkuchen 1019g tiefgefroren,,,...,,,,,,,,,biscuits and cakes,biscuits and cakes
1625061,99999995,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1538818432,2018-10-06T09:33:52Z,1607508961,2020-12-09T10:16:01Z,Steak haché pur boeuf,,,...,,,,,,,,,meat,meat


In [24]:
# Add the identifier columns kept in the export to the list of pnns columns
# (the list is extended in place).
pnns.extend(['code', 'url', 'product_name'])
pnns

['pnns_groups_1',
 'pnns_groups_2',
 'pnns_groups_3',
 'pnns_groups_4',
 'code',
 'url',
 'product_name']

In [25]:
data_export = df_pnns[pnns]
data_export.shape

(674517, 7)

In [26]:
data_export

Unnamed: 0,pnns_groups_1,pnns_groups_2,pnns_groups_3,pnns_groups_4,code,url,product_name
3,fat and sauces,dressings and sauces,dressings and sauces,dressings and sauces,0000000000100,http://world-en.openfoodfacts.org/product/0000...,moutarde au moût de raisin
14,composite foods,one dish meals,legumes,legumes,0000000000949,http://world-en.openfoodfacts.org/product/0000...,Salade de carottes râpées
22,sugary snacks,biscuits and cakes,pizza pies and quiche,pizza pies and quiche,0000000001281,http://world-en.openfoodfacts.org/product/0000...,Tarte noix de coco
32,fruits and vegetables,fruits,fruits,dairy desserts,0000000001885,http://world-en.openfoodfacts.org/product/0000...,Compote de poire
34,fish meat eggs,meat,meat,meat,0000000002103,http://world-en.openfoodfacts.org/product/0000...,Aiguillettes de poulet
...,...,...,...,...,...,...,...
1625055,fish meat eggs,meat,meat,meat,9999992756068,http://world-en.openfoodfacts.org/product/9999...,Steak haché
1625058,sugary snacks,sweets,sweets,sweets,9999999004360,http://world-en.openfoodfacts.org/product/9999...,Minis beignets
1625060,sugary snacks,biscuits and cakes,biscuits and cakes,biscuits and cakes,9999999175305,http://world-en.openfoodfacts.org/product/9999...,Erdbeerkuchen 1019g tiefgefroren
1625061,fish meat eggs,meat,meat,meat,99999995,http://world-en.openfoodfacts.org/product/9999...,Steak haché pur boeuf


In [27]:
data_export.isnull().sum()

pnns_groups_1       0
pnns_groups_2       0
pnns_groups_3       0
pnns_groups_4       0
code                0
url                 0
product_name     5430
dtype: int64

In [29]:
# Write the selected columns to CSV, with a header row and without the index.
export_path = r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts\df_multi_pnns_v2.csv'
data_export.to_csv(export_path, index=False, header=True)