# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts

### Cleaning dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import libraries

from collections import Counter
from deep_translator import GoogleTranslator
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
from spellchecker import SpellChecker
import time

In [3]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
def load_sample(start_range=1, end_range=5, PATH='./datasets/'):
    start = time.time()

    splitted_datasets = []

    for sample in range(start_range, end_range+1):
        start_load = time.time()

        dataset = pd.read_csv(
            PATH + 'openfoodfacts_part' + str(sample)+'.csv',
            sep='\t')
        
        end_load = time.time()

        print(f'Sample {sample} : {end_load - start_load} sec.')

        splitted_datasets.append(dataset)

    end = time.time()

    print('-'*20)
    print(f'Load {end_range - start_range + 1} samples : {end - start} sec.')
    
    df = pd.concat(splitted_datasets)
    print(f'Dataset before removing duplicates : {df.shape[0]} entries')
    
    df = df.drop_duplicates()
    print(f'Dataset after removing duplicates : {df.shape[0]} entries')
    return df

In [5]:
def delete_empty_columns(dataset, rate=0.8):
    columns_to_drop = ['Unnamed: 0', 'url', 'code', 'creator', 'created_t', 'created_datetime', 'last_modified_t',
                       'last_modified_datetime', 'abbreviated_product_name', 'generic_name', 'packaging', 
                       'packaging_tags', 'packaging_text', 'brands', 'brands_tags', 'brand_owner', 'categories', 'categories_en', 'origins',
                       'origins_en', 'manufacturing_places', 'labels', 'labels_en', 'emb_codes', 'emb_codes_tags', 'countries', 'countries_tags', 'countries_en',
                       'first_packaging_code_geo', 'cities', 'purchase_places', 'stores', 'countries', 'countries_en', 
                       'traces', 'traces_en', 'allergens_en', 'serving_size', 'serving_quantity', 'additives', 
                       'additives_en', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil', 'ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 
                       'states', 'states_tags', 'states_en', 'main_category_en', 'image_small_url', 'image_url', 
                       'image_ingredients_url', 'image_ingredients_small_url', 'image_nutrition_url', 
                       'image_nutrition_small_url']

    for col in dataset.columns:
        if dataset[col].isna().sum() / len(dataset) > rate:
            columns_to_drop.append(col)

    return delete_specific_columns(dataset, columns_to_drop=columns_to_drop)

In [6]:
def delete_specific_columns(dataset, columns_to_drop=[]):
    columns_to_keep = ['product_name', 'categories_tags', 'ingredients_text', 'additives_tags',
                       'nutriscore_score', 'nutriscore_grade', 'nova_group', 'pnns_groups_1',
                       'pnns_groups_2', 'main_category', 'energy-kcal_100g', 'energy_100g',
                       'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
                       'salt_100g']
    cols = []
    for column in columns_to_drop:
        if column in dataset.columns and column not in columns_to_keep:
            cols.append(column)

    print(f'Delete {len(cols)} columns.')

    return dataset.drop(columns=cols)

In [7]:
def correct_enconding_characters(x):
    x = x.replace('\_', '')
    x = x.replace('\%', '')
    x = x.replace('\*', '')
    
    x = clean_ingredients_list(x)

    x = x.lower()
    x = x.strip()

    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('à¨', 'ê')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')
    x = x.replace('à¯', 'ï')
    x = x.replace('à®', 'î')
    x = x.replace('å', 'oe')
    x = x.replace('â', '\'')

    return x

In [8]:
def clean_ingredients_list(x):
    # Delete additives as there is already an 'additive' column
    # Delete vitamins as we are not going to use them
    x = re.sub('(b|e){1}\d*\w', '', x)

    # Delete quantities
    x = re.sub('(\d)+([a-zA-Z])+', '', x)
    # Delete percentages
    x = re.sub('\d+\%', '', x)

    return x

In [15]:
def clean_nutrition_facts_for_100g(x):
    if pd.isna(x):
        x = 0
    elif x > 100: 
        x = 100
    elif x < 0: 
        x = 0
    else: 
        x = x 
    return x

In [10]:
dataset = load_sample(start_range=1, end_range=1)

Sample 1 : 3.7555084228515625 sec.
--------------------
Load 1 samples : 3.7565152645111084 sec.
Dataset before removing duplicates : 50000 entries
Dataset after removing duplicates : 50000 entries


In [11]:
df = delete_empty_columns(dataset)

Delete 184 columns.


In [12]:
df.columns

Index(['product_name', 'categories_tags', 'ingredients_text', 'additives_n',
       'additives_tags', 'nutriscore_score', 'nutriscore_grade', 'nova_group',
       'pnns_groups_1', 'pnns_groups_2', 'main_category', 'energy-kcal_100g',
       'energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
       'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
       'vitamin-c_100g', 'potassium_100g', 'calcium_100g', 'iron_100g',
       'nutrition-score-fr_100g'],
      dtype='object')

In [13]:
df = df.dropna(subset = ['ingredients_text'])
df['ingredients_text'] = df['ingredients_text'].apply(correct_enconding_characters)

In [16]:
# Cleaning nutrition facts
df['fat_100g'] = df['fat_100g'].apply(clean_nutrition_facts_for_100g)
df['saturated-fat_100g'] = df['saturated-fat_100g'].apply(clean_nutrition_facts_for_100g)
df['carbohydrates_100g'] = df['carbohydrates_100g'].apply(clean_nutrition_facts_for_100g)
df['sugars_100g'] = df['sugars_100g'].apply(clean_nutrition_facts_for_100g)
df['fiber_100g'] = df['fiber_100g'].apply(clean_nutrition_facts_for_100g)
df['proteins_100g'] = df['proteins_100g'].apply(clean_nutrition_facts_for_100g)
df['salt_100g'] = df['salt_100g'].apply(clean_nutrition_facts_for_100g)

### Detect language

In [17]:
def detect_language(x): 
    try: 
        return detect(x)
    except : 
        return "unknown"

In [18]:
start = time.time()
df['language'] = df["ingredients_text"].apply(detect_language)
end = time.time()

print(f'Detect language : {end - start} seconds...')

Detect language : 980.684273481369 seconds...


In [19]:
df['language'].unique()

array(['es', 'fr', 'ca', 'it', 'en', 'ro', 'pl', 'id', 'cy', 'sv', 'tl',
       'sq', 'sw', 'hr', 'nl', 'pt', 'th', 'et', 'unknown', 'da', 'de',
       'fi', 'sk', 'no', 'lv', 'cs', 'so', 'sl', 'tr', 'ru', 'lt', 'af',
       'vi', 'zh-tw', 'ar', 'hu', 'bg', 'zh-cn'], dtype=object)

In [20]:
df.shape[0] - len(df[df['language'] == 'en'])

8971

In [21]:
df_ingredients_en = df[df['language'] == 'en']

In [22]:
print(f"LOSS : {(1 - (df_ingredients_en.shape[0] / df.shape[0])) * 100} %")

LOSS : 23.177883994315984 %


### Translate ingredients into English

In [23]:
# translator = GoogleTranslator(source='auto', target='en')

In [24]:
def translate(x): 
    try:
        return translator.translate(x)
    except: 
        return "Cannot translate"

In [25]:
# start = time.time()
# for i, lang in enumerate(df['language']): 
#     if lang == 'en': 
#         df.at[i, 'ingredients_en'] = df['ingredients_text'].iloc[i]
#     else : 
#         df.at[i, 'ingredients_en'] = translate(df['ingredients_text'].iloc[i])
# end = time.time()

# print(f'Translate ingredients : {end - start} seconds...')

### Tokenize ingredients

In [26]:
tokenizer = RegexpTokenizer("[a-z\'\-]+")
df_ingredients_en["ingredients_token"] = df_ingredients_en["ingredients_text"].apply(lambda x : tokenizer.tokenize(x))

### Handle mistakes
#### First method : using NLTK's corpus vocabulary

In [27]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
ingredient_list = [_ for list in df_ingredients_en["ingredients_token"].to_list() for _ in list]
# Sort ingredient list
ingredient_list.sort()
set_ingredients = set(ingredient_list)

In [28]:
df_spelling_ingredients = pd.DataFrame(set_ingredients, columns=['Initial'])
df_spelling_ingredients

Unnamed: 0,Initial
0,contains
1,nono
2,soup
3,mononitr
4,vannam
...,...
10602,di-glcid
10603,hvp
10604,rophosphate
10605,sanitatpiol


In [None]:
corrects = []
start_time = time.time()
for word in list(set_ingredients):
    list_distance = list()
    if word not in english_vocab: 
        for _ in english_vocab:
            list_distance.append(edit_distance(_, word))
        correct = list(english_vocab)[list_distance.index(min(list_distance))]
        corrects.append(correct)
        print(f"{word} ==> {correct}", flush = True)
end_time = time.time()

In [None]:
df_spelling_ingredients['NLTK'] = corrects       

In [None]:
print(f"Spelling mistakes - Method 1 : {end_time - start_time} seconds.")

#### Second method : using SpellChecker

In [None]:
spell = SpellChecker()

In [None]:
def spell_check_levenshtein(x): 
    misspelled = spell.unknown([x])
    if len(misspelled):
        x = spell.correction(list(misspelled)[0])
    return x

In [None]:
start_time = time.time()
df_spelling_ingredients['levenshtein'] = df_spelling_ingredients['Initial'].apply(spell_check_levenshtein)       
end_time = time.time()

In [None]:
print(f"Spelling mistakes - Method 2 : {end_time - start_time} seconds.")

#### Third method : comparing words in the columns

In [None]:
# Count occurences
occ = Counter(ingredient_list)

In [None]:
occ

In [None]:
len(occ)

#### Third method with stemming

In [None]:
ps = PorterStemmer()

In [None]:
ingredients_stemmed = []
for ingredient in ingredient_list: 
    try: 
        ingredients_stemmed.append(ps.stem(ingredient))
    except: 
        ingredients_stemmed.append("*"+ingredient+"*")

In [None]:
occ_stemmed = Counter(ingredients_stemmed)

In [None]:
occ_stemmed

In [None]:
len(occ_stemmed)