# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts


### Part 1 : Define and clean the vocabulary of ingredients

In [1]:
# Install packages

# !pip install chardet
# !pip install python-magic
# !pip install pyenchant
# !pip install hunspell 
# !pip install tensorflow
# !pip install nltk
# !pip install langdetect
# !pip install pyspellchecker

In [2]:
# Import libraries

# import enchant
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from spellchecker import SpellChecker
import tensorflow as tf
import re
import time

nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Load, clean and export the dataset

In [3]:
# Detect the language of an ingredients string; only English text is kept
def define_language(x):
    """Return ``"en"`` when ``x`` is detected as English, otherwise ``None``.

    Parameters
    ----------
    x : str
        Raw ingredients text. Any non-string value (``None``, ``NaN``,
        numbers) is treated as "not English".

    Returns
    -------
    str or None
        ``"en"`` if the text starts with an ASCII letter and ``detect``
        classifies it as English; ``None`` in every other case, including
        when ``detect`` raises.

    Notes
    -----
    NOTE(review): per the langdetect README the detector is
    non-deterministic unless ``DetectorFactory.seed`` is set; consider
    seeding it in the configuration cell for reproducible filtering.
    """
    # Missing or numeric cells cannot be lower-cased and are never English text.
    if not isinstance(x, str):
        return None

    x = x.lower()

    # Cheap pre-filter: only run the detector on text that starts with an
    # ASCII letter.
    if not re.match("[a-z]", x):
        return None

    try:
        lang = detect(x)
    except Exception:
        # `Exception` (not a bare `except`) so that Ctrl-C / kernel interrupt
        # still stops a long-running `apply` instead of being swallowed.
        print("Invalid character : " + x)
        return None

    return "en" if lang == "en" else None
    

In [4]:
def clean_dataset(PATH, encoding='latin1', language_detector=None):
    """Load an OpenFoodFacts tab-separated export and keep usable English rows.

    Steps:
      1. Read the file, skipping the metadata columns listed in
         ``columns_to_drop`` (they are never materialised in memory).
      2. Drop rows where ``product_name``, ``categories_tags`` or
         ``ingredients_text`` is missing.
      3. Keep only rows whose ``ingredients_text`` the language detector
         labels as non-null (i.e. English for ``define_language``).

    Parameters
    ----------
    PATH : str or path-like
        Location of the tab-separated file.
    encoding : str, default 'latin1'
        Encoding passed to ``pandas.read_csv``.
        NOTE(review): OpenFoodFacts documents its export as UTF-8; 'latin1'
        is kept as the default for backward compatibility -- confirm and pass
        ``encoding='utf-8'`` if accented characters look garbled.
    language_detector : callable, optional
        Function mapping an ingredients string to a language code or ``None``.
        Defaults to the notebook's ``define_language``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the filtered rows, original row index preserved.

    Raises
    ------
    KeyError
        If one of the three required columns is absent from the file.
    """
    # Looked up at call time so the default follows any later redefinition of
    # define_language in the notebook.
    detector = define_language if language_detector is None else language_detector

    # A set gives O(1) membership tests in the `usecols` filter below.
    columns_to_drop = {
        'url',
        'code',
        'creator',
        'created_t',
        'created_datetime',
        'last_modified_t',
        'last_modified_datetime',
        'abbreviated_product_name',
        'generic_name',
        'packaging',
        'packaging_tags',
        'packaging_text',
        'brands',
        'categories',
        'categories_en',
        'origins',
        'origins_en',
        'manufacturing_places',
        'labels',
        'labels_en',
        'emb_codes',
        'emb_codes_tags',
        'first_packaging_code_geo',
        'cities',
        'purchase_places',
        'stores',
        'countries',
        'countries_en',
        'traces',
        'traces_en',
        'allergens_en',
        'serving_size',
        'serving_quantity',
        'additives',
        'additives_en',
        'ingredients_from_palm_oil',
        'ingredients_that_may_be_from_palm_oil',
        'states',
        'states_tags',
        'states_en',
        'main_category_en',
        'image_small_url',
        'image_ingredients_url',
        'image_ingredients_small_url',
        'image_nutrition_url',
        'image_nutrition_small_url',
    }

    # Filtering at read time (instead of read-then-drop) tolerates exports
    # that lack some of the listed columns and avoids loading them at all.
    dataset = pd.read_csv(
        PATH,
        sep='\t',
        encoding=encoding,
        usecols=lambda name: name not in columns_to_drop,
    )
    # Timing deliberately starts after the file has been read.
    start_time = time.time()

    # Drop rows where product_name, categories_tags or ingredients_text are empty
    df = dataset.dropna(subset=['product_name', 'categories_tags', 'ingredients_text'])

    # Detect language and keep only rows the detector accepts; a boolean mask
    # avoids adding and then removing a temporary column.
    is_english = df['ingredients_text'].apply(detector).notna()

    # .copy() hands the caller an independent frame, so adding columns later
    # cannot trigger SettingWithCopyWarning.
    df = df.loc[is_english].copy()

    end_time = time.time()

    print(f"PATH : {PATH} -- Execution Time : {end_time - start_time}")
    print("="*10)

    return df
    

In [5]:
# Clean the raw export once and persist the result so later sessions can
# start from the cleaned file instead of repeating the slow language filter.
PATH = './datasets/openfoodfacts.csv'
openfoodfacts = clean_dataset(PATH)

# index = False: after filtering, the index is only the leftover raw row
# positions; writing it would add an unnamed first column to the file.
openfoodfacts.to_csv('./datasets/openfoodfacts_clean.csv', sep = "\t", index = False)

  openfoodfacts = clean_dataset(PATH)


Invalid character : https://vm.tiktok.com/zme7qpxg5/
Invalid character : https://hubpak.com/ https://www.eshop.hubpak.com/
Invalid character : https://static.openfoodfacts.org/images/products/325/039/128/5556/4.100.jpg
PATH : ./datasets/openfoodfacts.csv -- Execution Time : 6131.624509572983


In [None]:
# Display (n_rows, n_columns) of the frame returned by clean_dataset.
openfoodfacts.shape

### Tokenize ingredients

In [None]:
# A token is a run of ASCII letters, apostrophes, percent signs or hyphens;
# everything else (digits, commas, parentheses, spaces, ...) separates tokens.
tokenizer = RegexpTokenizer("[A-Za-z'%-]+")

# Build `dataset` from the cleaned frame: no other cell defines `dataset` at
# notebook scope (inside clean_dataset it is only a local variable).
# .assign returns a new frame, so `openfoodfacts` is left untouched and this
# cell can be re-run safely.
dataset = openfoodfacts.assign(
    ingredients=openfoodfacts["ingredients_text"].apply(tokenizer.tokenize)
)

### Handle spelling mistakes

#### First method : using NLTK's corpus vocabulary

In [None]:
# Reference vocabulary: every entry of NLTK's `words` corpus, lower-cased.
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

# Distinct lower-cased tokens found across all ingredient lists.
set_ingredients = set(
    token.lower()
    for tokens in dataset["ingredients"].to_list()
    for token in tokens
)

# Sorted once, outside the loop: set iteration order changes between runs, so
# a fixed order makes ties between equally distant candidates reproducible.
vocab_candidates = sorted(english_vocab)

# For each ingredient token, print the vocabulary word with the smallest
# edit distance. This is a brute-force scan of the whole vocabulary per
# unknown token, so it is slow on large inputs.
for word in sorted(set_ingredients):
    if word in english_vocab:
        # Distance 0 is only reachable by the identical word; skip the scan.
        closest = word
    else:
        closest = min(
            vocab_candidates,
            key=lambda candidate: edit_distance(candidate, word),
        )
    print(f"{word} ==> {closest}", flush = True)

#### Second method : using SpellChecker

In [None]:
# Spell-check every distinct ingredient token with pyspellchecker and print
# the suggested correction for each token it does not recognise.
spell = SpellChecker()
start_time = time.time()

for ingredient in set_ingredients:
    # `unknown` is called with a one-element list, so the returned set is
    # either empty (token recognised) or holds that single token.
    unknown_forms = spell.unknown([ingredient])
    if not unknown_forms:
        continue
    first_unknown = next(iter(unknown_forms))
    print(f"{ingredient} ==> {spell.correction(first_unknown)}")

end_time = time.time()

print(f"Execution Time : {end_time - start_time}")