# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts


### Part 1 : Define and clean the vocabulary of ingredients

In [1]:
# Install packages

# !pip install chardet
# !pip install python-magic
# !pip install pyenchant
# !pip install hunspell 
# !pip install tensorflow
# !pip install nltk
# !pip install langdetect
# !pip install pyspellchecker
# !pip install google_trans_new

In [2]:
# import librairies

# import enchant
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from spellchecker import SpellChecker
import tensorflow as tf
import re
import time
from google_trans_new import google_translator

nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Load clean and export dataset

In [3]:
count = 0

In [4]:
# Detect language used in ingredients and translate in english
gtranslator = google_translator()

def translate(x): 
    global count 
    count += 1

    x = x.lower()
    x = x.strip()
    
    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')

    # Remove special characters : _ - @ :// * ( ) [ ] % . · + # { }
    x = re.sub('_|-|@|:\/\/|\*|\(|\)|\[|\]|\%|\.|\·|\+|\#|\{|\}', '', x)
    # Remove digits
    x = re.sub('\d', '', x)
    
    # Detect language and translate into english
    try: 
        lang = gtranslator.detect(x)
        print(f"Row n°{count} ==> {lang} - {x}", flush = True)

        if lang != 'en': 
            return gtranslator.translate(x, lang_src = lang, lang_tgt = 'en')
        else: 
            return x
    except : 
        print(f"Invalid character at row {count} : {x}", flush = True)
        return "INVALID"
    

In [5]:
def clean_dataset(PATH): 
    dataset = pd.read_csv(PATH, sep = '\t') 
    start_time = time.time()

    columns_to_drop = [
        'url', 
        'code',
        'creator',
        'created_t',
        'created_datetime',
        'last_modified_t',
        'last_modified_datetime',
        'abbreviated_product_name',
        'generic_name',
        'packaging',
        'packaging_tags',
        'packaging_text',
        'brands',
        'categories',
        'categories_en',
        'origins',
        'origins_en',
        'manufacturing_places',
        'labels',
        'labels_en',
        'emb_codes',
        'emb_codes_tags',
        'first_packaging_code_geo',
        'cities',
        'purchase_places',
        'stores',
        'countries',
        'countries_en',
        'traces',
        'traces_en',
        'allergens_en',
        'serving_size',
        'serving_quantity',
        'additives',
        'additives_en',
        'ingredients_from_palm_oil',
        'ingredients_that_may_be_from_palm_oil',
        'states',
        'states_tags',
        'states_en',
        'main_category_en',
        'image_small_url',
        'image_ingredients_url',
        'image_ingredients_small_url',
        'image_nutrition_url',
        'image_nutrition_small_url'
    ]

    # Drop unnecessary columns
    df = dataset.drop(columns = columns_to_drop)

    # Drop rows where product_name, categories_tags or ingredients_text are empty
    df = df.dropna(subset = ['product_name', 'categories_tags', 'ingredients_text'])

    # Detect language and translate in english
    df["ingredients_en"] = df["ingredients_text"].apply(translate) 

    end_time = time.time()
    
    print(f"PATH : {PATH} -- Execution Time : {end_time - start_time}")
    
    return df
    

In [6]:
splitted_datasets = []

for i in range(1,41):
    PATH = './datasets/openfoodfacts_part'+ str(i) +'.csv'
    splitted_datasets.append(clean_dataset(PATH))

dataset = pd.concat(splitted_datasets)
dataset.to_csv('./datasets/openfoodfacts_google_translated.csv', sep = '\t')

  splitted_datasets.append(clean_dataset(PATH))


Invalid character at row 1 : eau graines de téguments de moutarde vinaigre de vin rouge sel vin rouge sucre   moût de raisin  oignons colorants extraits de carotte et extrait de paprika huile de tournesol son de moutarde sel cette moutarde uniquement disponible chez courte paille
Invalid character at row 2 : jus et purée d'abricots  minimun, eau, sucre
Invalid character at row 3 : bâguette bressan pain baguette ,,' farine de blé, eau, sel, levure, gluten, farine de ble malté, levure désactivée, acide ascorbique, garniture ,: filet de poulet braisé , filet de poulet , eau, acidifiant : lactate de potassium et acétate de sodium, amidon modifié de manioc, sel, dextrose glucose, arômes, gélifiants eayam$xydant : érythèfbate de sodium, colorant : grafnel • origine ue, tomatenx, oeuf dur ,/ , 'salade ,/ o
Invalid character at row 4 : glycérine d'origine naturelle stabilisant, spiruline spirulina maxima d'origine naturelle, extrait de fruits de camu camu myrciaria dubia, arôme naturel de citr

In [None]:
dataset.head()

### Tokenize ingredients

In [None]:
dataset = openfoodfacts

In [None]:
tokenizer = RegexpTokenizer("[A-Za-z'%-]+")
dataset["ingredients"] = dataset["ingredients_text"].apply(
    lambda x : tokenizer.tokenize(x))

### Handle mistakes

#### First method : using NLTK's corpus vocabulary

In [None]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
set_ingredients = set([_.lower() for list in dataset["ingredients"].to_list() for _ in list])

for word in list(set_ingredients):
    list_distance = list()
    for _ in english_vocab:
        list_distance.append(edit_distance(_, word))
    print(f"{word} ==> {list(english_vocab)[list_distance.index(min(list_distance))]}", flush = True)

#### Second method : using SpellChecker

In [None]:
spell = SpellChecker()
start_time = time.time()
for _ in set_ingredients:
    misspelled = spell.unknown([_])
    if len(misspelled):
        print(f"{_} ==> {spell.correction(list(misspelled)[0])}")
end_time = time.time()

print(f"Execution Time : {end_time - start_time}")