# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts

### Cleaning dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import libraries

import pandas as pd
import re
import time

In [3]:
def load_sample(PATH='./datasets/', start_range=1, end_range=5):
    start = time.time()

    splitted_datasets = []

    for sample in range(start_range, end_range+1):
        start_load = time.time()

        dataset = pd.read_csv(
            PATH + 'openfoodfacts_part' + str(sample)+'.csv',
            sep='\t')

        end_load = time.time()

        print(f'Sample {sample} : {end_load - start_load} sec.')

        splitted_datasets.append(dataset)

    end = time.time()

    print('-'*20)
    print(f'Load {end_range - start_range + 1} samples : {end - start} sec.')

    return pd.concat(splitted_datasets)

In [4]:
def delete_empty_columns(dataset, rate=0.8):
    columns_to_drop = ['Unnamed: 0', 'url', 'code', 'creator', 'created_t', 'created_datetime', 'last_modified_t',
                       'last_modified_datetime', 'abbreviated_product_name', 'generic_name', 'packaging', 
                       'packaging_tags', 'packaging_text', 'brands', 'brands_tags', 'brand_owner', 'categories', 'categories_en', 'origins',
                       'origins_en', 'manufacturing_places', 'labels', 'labels_en', 'emb_codes', 'emb_codes_tags', 'countries', 'countries_tags', 'countries_en',
                       'first_packaging_code_geo', 'cities', 'purchase_places', 'stores', 'countries', 'countries_en', 
                       'traces', 'traces_en', 'allergens_en', 'serving_size', 'serving_quantity', 'additives', 
                       'additives_en', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil', 'ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 
                       'states', 'states_tags', 'states_en', 'main_category_en', 'image_small_url', 'image_url', 
                       'image_ingredients_url', 'image_ingredients_small_url', 'image_nutrition_url', 
                       'image_nutrition_small_url']

    for col in dataset.columns:
        if dataset[col].isna().sum() / len(dataset) > rate:
            columns_to_drop.append(col)

    return delete_specific_columns(dataset, columns_to_drop=columns_to_drop)

In [16]:
def delete_specific_columns(dataset, columns_to_drop=[]):
    columns_to_keep = ['product_name', 'categories_tags', 'ingredients_text', 'additives_tags',
                       'nutriscore_score', 'nutriscore_grade', 'nova_group', 'pnns_groups_1',
                       'pnns_groups_2', 'main_category', 'energy-kcal_100g', 'energy_100g',
                       'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
                       'salt_100g']
    cols = []
    for column in columns_to_drop:
        if column in dataset.columns and column not in columns_to_keep:
            cols.append(column)

    print(f'Delete {len(cols)} columns.')

    return dataset.drop(columns=cols)

In [37]:
def correct_enconding_characters(x):
    x = x.replace('\_', '')
    x = x.replace('\%', '')
    x = x.replace('\*', '')
    
    x = clean_ingredients_list(x)

    x = x.lower()
    x = x.strip()

    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('à¨', 'ê')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')
    x = x.replace('à¯', 'ï')
    x = x.replace('à®', 'î')
    x = x.replace('å', 'oe')
    x = x.replace('â', '\'')

    return x

In [26]:
def clean_ingredients_list(x):
    # Delete additives as there is already an 'additive' column
    # Delete vitamins as we are not going to use them
    x = re.sub('(b|e){1}\d*\w', '', x)

    # Delete quantities
    x = re.sub('(\d)+([a-zA-Z])+', '', x)

    return x

In [33]:
dataset = load_sample()
df = delete_empty_columns(dataset)

Sample 1 : 1.4254016876220703 sec.
Sample 2 : 1.2751152515411377 sec.
Sample 3 : 1.2862586975097656 sec.
Sample 4 : 1.3162758350372314 sec.
Sample 5 : 1.2782299518585205 sec.
--------------------
Load 5 samples : 6.582278728485107 sec.
Delete 184 columns.


In [35]:
df = df.dropna(subset = ['ingredients_text'])

In [38]:
df['ingredients_text'] = df['ingredients_text'].apply(correct_enconding_characters)

1        lhe sidnatada, azucar 6.9 lhe dnatada  polva, ...
6        u grain de tégumts de moutarde vinaigre de vin...
25       antioxydant : érythorte de sodium, colorant : ...
33       lait ti, sucre, amidon de maïs, cacao, agar agar.
41       gute poite vin pain gute 50,6: farine de é, u,...
                               ...                        
49995    syrup d (high fructose corn syrup, corn syrup,...
49996    sugar, corn syrup, pnuts, sodium carnate, salt...
49997    sugar, corn syrup, pans, palm oil, salt, sorto...
49998    sugar, rich wht flour (wht starch, niacin, ruc...
49999    sugar, rich wht flour (wht flour, niacin, ruc ...
Name: ingredients_text, Length: 186346, dtype: object

In [12]:
dataset.isna().sum()

Unnamed: 0                 0
code                       0
url                        0
creator                    0
created_t                  0
                       ...  
choline_100g          249999
phylloquinone_100g    249310
beta-glucan_100g      250000
inositol_100g         249992
carnitine_100g        250000
Length: 187, dtype: int64

In [13]:
# TODO : pnns_groups_1  repartition

df['pnns_groups_1'].unique()

array(['unknown', 'Fat and sauces', 'Composite foods', 'Sugary snacks',
       'Fruits and vegetables', 'Fish Meat Eggs', 'Beverages',
       'Milk and dairy products', 'Cereals and potatoes', 'Salty snacks',
       'Alcoholic beverages', nan], dtype=object)

In [14]:
# TODO : pnns_groups_2 repartition

df['pnns_groups_2'].unique()

array(['unknown', 'Dressings and sauces', 'One-dish meals',
       'Biscuits and cakes', 'Fruits', 'Meat', 'Sweetened beverages',
       'Cheese', 'Bread', 'Fish and seafood', 'Sweets',
       'Salty and fatty products', 'Fruit juices', 'Dried fruits',
       'Vegetables', 'Fats', 'Dairy desserts', 'Milk and yogurt',
       'Pastries', 'Pizza pies and quiches', 'Legumes',
       'Unsweetened beverages', 'Nuts', 'Cereals', 'Alcoholic beverages',
       'Breakfast cereals', 'Appetizers', 'Processed meat',
       'Chocolate products', 'Eggs', 'Plant-based milk substitutes',
       'Sandwiches', 'Ice cream', 'Soups',
       'Teas and herbal teas and coffees', 'Potatoes',
       'Artificially sweetened beverages', 'Waters and flavored waters',
       nan, 'Offals', 'Fruit nectars'], dtype=object)

In [15]:
# TODO : nutriscore repartition

df['nutriscore_grade'].unique()

array([nan, 'd', 'b', 'a', 'c', 'e'], dtype=object)