# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts

### Cleaning dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import libraries

from deep_translator import GoogleTranslator
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
from spellchecker import SpellChecker
import time

In [3]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
def load_sample(start_range=1, end_range=5, PATH='./datasets/'):
    start = time.time()

    splitted_datasets = []

    for sample in range(start_range, end_range+1):
        start_load = time.time()

        dataset = pd.read_csv(
            PATH + 'openfoodfacts_part' + str(sample)+'.csv',
            sep='\t')

        end_load = time.time()

        print(f'Sample {sample} : {end_load - start_load} sec.')

        splitted_datasets.append(dataset)

    end = time.time()

    print('-'*20)
    print(f'Load {end_range - start_range + 1} samples : {end - start} sec.')

    return pd.concat(splitted_datasets)

In [5]:
def delete_empty_columns(dataset, rate=0.8):
    columns_to_drop = ['Unnamed: 0', 'url', 'code', 'creator', 'created_t', 'created_datetime', 'last_modified_t',
                       'last_modified_datetime', 'abbreviated_product_name', 'generic_name', 'packaging', 
                       'packaging_tags', 'packaging_text', 'brands', 'brands_tags', 'brand_owner', 'categories', 'categories_en', 'origins',
                       'origins_en', 'manufacturing_places', 'labels', 'labels_en', 'emb_codes', 'emb_codes_tags', 'countries', 'countries_tags', 'countries_en',
                       'first_packaging_code_geo', 'cities', 'purchase_places', 'stores', 'countries', 'countries_en', 
                       'traces', 'traces_en', 'allergens_en', 'serving_size', 'serving_quantity', 'additives', 
                       'additives_en', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil', 'ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 
                       'states', 'states_tags', 'states_en', 'main_category_en', 'image_small_url', 'image_url', 
                       'image_ingredients_url', 'image_ingredients_small_url', 'image_nutrition_url', 
                       'image_nutrition_small_url']

    for col in dataset.columns:
        if dataset[col].isna().sum() / len(dataset) > rate:
            columns_to_drop.append(col)

    return delete_specific_columns(dataset, columns_to_drop=columns_to_drop)

In [6]:
def delete_specific_columns(dataset, columns_to_drop=[]):
    columns_to_keep = ['product_name', 'categories_tags', 'ingredients_text', 'additives_tags',
                       'nutriscore_score', 'nutriscore_grade', 'nova_group', 'pnns_groups_1',
                       'pnns_groups_2', 'main_category', 'energy-kcal_100g', 'energy_100g',
                       'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
                       'salt_100g']
    cols = []
    for column in columns_to_drop:
        if column in dataset.columns and column not in columns_to_keep:
            cols.append(column)

    print(f'Delete {len(cols)} columns.')

    return dataset.drop(columns=cols)

In [7]:
def correct_enconding_characters(x):
    x = x.replace('\_', '')
    x = x.replace('\%', '')
    x = x.replace('\*', '')
    
    x = clean_ingredients_list(x)

    x = x.lower()
    x = x.strip()

    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('à¨', 'ê')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')
    x = x.replace('à¯', 'ï')
    x = x.replace('à®', 'î')
    x = x.replace('å', 'oe')
    x = x.replace('â', '\'')

    return x

In [8]:
def clean_ingredients_list(x):
    # Delete additives as there is already an 'additive' column
    # Delete vitamins as we are not going to use them
    x = re.sub('(b|e){1}\d*\w', '', x)

    # Delete quantities
    x = re.sub('(\d)+([a-zA-Z])+', '', x)

    return x

In [9]:
dataset = load_sample(start_range=1, end_range=1)
df = delete_empty_columns(dataset)

Sample 1 : 1.380401372909546 sec.
--------------------
Load 1 samples : 1.380401372909546 sec.
Delete 184 columns.


In [10]:
df = df.dropna(subset = ['ingredients_text'])

In [11]:
df['ingredients_text'] = df['ingredients_text'].apply(correct_enconding_characters)

### Detect language

In [12]:
def detect_language(x): 
    try: 
        return detect(x)
    except : 
        return "unknown"

In [13]:
start = time.time()
df['language'] = df["ingredients_text"].apply(detect_language)
end = time.time()

print(f'Detect language : {end - start} seconds...')

Detect language : 571.7479057312012 seconds...


In [14]:
df['language'].unique()

array(['es', 'fr', 'ca', 'it', 'en', 'ro', 'pl', 'cy', 'id', 'tl', 'sv',
       'sq', 'sw', 'hr', 'nl', 'pt', 'th', 'et', 'unknown', 'hu', 'da',
       'de', 'sk', 'no', 'fi', 'lv', 'so', 'sl', 'tr', 'ru', 'lt', 'af',
       'cs', 'vi', 'zh-tw', 'ar', 'bg', 'zh-cn'], dtype=object)

In [15]:
df.shape[0] - len(df[df['language'] == 'en'])

8938

In [16]:
df_ingredients_en = df[df['language'] == 'en']

In [17]:
(1 - (df_ingredients_en.shape[0] / df.shape[0])) * 100

23.092623692029456

### Translate ingredients into English

In [18]:
# translator = GoogleTranslator(source='auto', target='en')

In [19]:
def translate(x): 
    try:
        return translator.translate(x)
    except: 
        return "Cannot translate"

In [20]:
# start = time.time()
# for i, lang in enumerate(df['language']): 
#     if lang == 'en': 
#         df.at[i, 'ingredients_en'] = df['ingredients_text'].iloc[i]
#     else : 
#         df.at[i, 'ingredients_en'] = translate(df['ingredients_text'].iloc[i])
# end = time.time()

# print(f'Translate ingredients : {end - start} seconds...')

In [21]:
# df.head(5)

### Tokenize ingredients

In [22]:
tokenizer = RegexpTokenizer("[A-Za-z'%-]+")
df_ingredients_en["ingredients"] = df_ingredients_en["ingredients_text"].apply(lambda x : tokenizer.tokenize(x))

In [23]:
df_ingredients_en[['ingredients_text', 'ingredients', 'ingredients_en']]

KeyError: "['ingredients_en'] not in index"