# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts


### Part 1 : Define and clean the vocabulary of ingredients

In [None]:
# Install packages

# !pip install pyenchant
# !pip install hunspell 
!pip install tensorflow
!pip install nltk
!pip install langdetect
!pip install pyspellchecker

In [None]:
# Import libraries

# import enchant
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from spellchecker import SpellChecker
import tensorflow as tf

# Fetch the NLTK 'words' corpus; it is read later through nltk.corpus.words
# to build the reference English vocabulary.
nltk.download('words')

### Load dataset

In [None]:
# Location of the raw OpenFoodFacts export (read below as tab-separated text)
PATH = './datasets/openfoodfacts.csv'

# Read the whole export into a DataFrame
dataset = pd.read_csv(filepath_or_buffer = PATH, delimiter = '\t')

### Clean dataset

In [None]:
# Columns removed from the raw export before any further processing
columns_to_drop = [
    'url', 
    'code',
    'creator',
    'created_t',
    'created_datetime',
    'last_modified_t',
    'last_modified_datetime',
    'abbreviated_product_name',
    'generic_name',
    'packaging',
    'packaging_tags',
    'packaging_text',
    'brands',
    'categories',
    'categories_en',
    'origins',
    'origins_en',
    'manufacturing_places',
    'labels',
    'labels_en',
    'emb_codes',
    'emb_codes_tags',
    'first_packaging_code_geo',
    'cities',
    'purchase_places',
    'stores',
    'countries',
    'countries_en',
    'traces',
    'traces_en',
    'allergens_en',
    'serving_size',
    'serving_quantity',
    'additives',
    'additives_en',
    'ingredients_from_palm_oil',
    'ingredients_that_may_be_from_palm_oil',
    'states',
    'states_tags',
    'states_en',
    'main_category_en',
    'image_small_url',
    'image_ingredients_url',
    'image_ingredients_small_url',
    'image_nutrition_url',
    'image_nutrition_small_url'
]

# langdetect is randomised by default; fixing the seed before the first
# detection makes the language filter keep the same rows on every run.
DetectorFactory.seed = 0


def _is_english(text):
    """Return True when langdetect classifies `text` as English.

    `detect` raises LangDetectException when it cannot extract any language
    feature from the text (for example digits or punctuation only). Such
    text is reported as not English instead of aborting the whole cell.
    """
    try:
        return detect(str(text).lower()) == "en"
    except LangDetectException:
        return False


# Drop unnecessary columns
df = dataset.drop(columns = columns_to_drop)

# Drop rows where product_name, categories_tags or ingredients_text are empty
df = df.dropna(subset = ['product_name', 'categories_tags', 'ingredients_text'])

# Detect the language used in the ingredients and keep only the English rows.
# A boolean mask is used instead of a temporary column, so nothing has to be
# dropped afterwards (the previous version dropped the helper column from the
# raw `dataset` rather than from `df`, which discarded the cleaning).
# astype(bool) keeps the mask boolean even when no row is left after dropna.
english_mask = df["ingredients_text"].apply(_is_english).astype(bool)

# copy() gives later cells an independent frame they can add columns to
df = df.loc[english_mask].copy()

In [None]:
# Quick sanity check: (number of rows, number of columns) of `df`
df.shape

In [None]:
# Write the cleaned frame to disk as tab-separated text so that the next steps
# can start from it
clean_path = './datasets/clean_openfoodfacts.csv'
df.to_csv(clean_path, sep = "\t")

## Tokenize ingredients

In [None]:
# A token is a run of ASCII letters, apostrophes, percent signs and hyphens
tokenizer = RegexpTokenizer("[A-Za-z'%-]+")

# Store the token list of every ingredients text in a new "ingredients" column
df["ingredients"] = df["ingredients_text"].apply(tokenizer.tokenize)

### Handle mistakes

#### First method : using NLTK's corpus vocabulary

In [None]:
# Lower-cased reference vocabulary taken from the NLTK "words" corpus
english_vocab = {entry.lower() for entry in nltk.corpus.words.words()}

# Distinct lower-cased tokens found across all tokenized ingredient lists
set_ingredients = {
    token.lower()
    for tokens in df["ingredients"].to_list()
    for token in tokens
}

In [None]:
# For every distinct ingredient token, print the vocabulary word that has the
# smallest edit distance to it.
for word in set_ingredients:
    if word in english_vocab:
        # An exact match has distance 0 and is therefore the closest word,
        # so the scan over the whole vocabulary can be skipped.
        closest = word
    else:
        # min() returns the first minimal element it meets, which is the same
        # choice as taking the first index of the smallest distance. No
        # per-word distance list or list copy of the vocabulary is needed.
        closest = min(
            english_vocab,
            key = lambda candidate: edit_distance(candidate, word),
        )
    print(f"{word} ==> {closest}", flush = True)

#### Second method : using SpellChecker

In [None]:
# Spell checker built with its default settings
spell = SpellChecker()

# Print each ingredient token the spell checker does not know, together with
# the correction it proposes
for ingredient in set_ingredients:
    unknown_tokens = spell.unknown([ingredient])
    if not unknown_tokens:
        continue
    first_unknown = next(iter(unknown_tokens))
    print(f"{ingredient} ==> {spell.correction(first_unknown)}")
