# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts


### Part 1 : Define and clean the vocabulary of ingredients

In [1]:
# Install packages

# !pip install chardet
# !pip install python-magic
# !pip install pyenchant
# !pip install hunspell 
# !pip install tensorflow
# !pip install nltk
# !pip install langdetect
# !pip install pyspellchecker
# !pip install google_trans_new

In [2]:
# import librairies

# import enchant
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from spellchecker import SpellChecker
import tensorflow as tf
import re
import time
from google_trans_new import google_translator

nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Load clean and export dataset

In [3]:
# Detect language used in ingredients and translate in english
gtranslator = google_translator()
count = 0

def translate(x):
    global count 
    count += 1

    x = x.lower()
    x = x.strip()
    
    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')

    # Remove special characters : _ - @ :// * ( ) [ ] % . · + # { }
    x = re.sub('_|-|@|:\/\/|\*|\(|\)|\[|\]|%|\.|\·|\+|\#|\{|\}', '', x)
    # Remove digits
    x = re.sub('\d', '', x)
    
    # Detect language and translate into english
    try: 
        lang = gtranslator.detect(x)
        print(f"Row n°{count} ==> {lang} - {x}", flush = True)

        if lang != 'en': 
            return gtranslator.translate(x, lang_src = lang, lang_tgt = 'en')
        else: 
            return x
    except : 
        print(f"Invalid character at row {count} : {x}", flush = True)
        return "INVALID"
    

In [4]:
def clean_dataset(PATH): 
    dataset = pd.read_csv(PATH, sep = '\t', encoding='latin1') 
    start_time = time.time()

    columns_to_drop = [
        'url', 
        'code',
        'creator',
        'created_t',
        'created_datetime',
        'last_modified_t',
        'last_modified_datetime',
        'abbreviated_product_name',
        'generic_name',
        'packaging',
        'packaging_tags',
        'packaging_text',
        'brands',
        'categories',
        'categories_en',
        'origins',
        'origins_en',
        'manufacturing_places',
        'labels',
        'labels_en',
        'emb_codes',
        'emb_codes_tags',
        'first_packaging_code_geo',
        'cities',
        'purchase_places',
        'stores',
        'countries',
        'countries_en',
        'traces',
        'traces_en',
        'allergens_en',
        'serving_size',
        'serving_quantity',
        'additives',
        'additives_en',
        'ingredients_from_palm_oil',
        'ingredients_that_may_be_from_palm_oil',
        'states',
        'states_tags',
        'states_en',
        'main_category_en',
        'image_small_url',
        'image_ingredients_url',
        'image_ingredients_small_url',
        'image_nutrition_url',
        'image_nutrition_small_url'
    ]

    # Drop unnecessary columns
    df = dataset.drop(columns = columns_to_drop)

    # Drop rows where product_name, categories_tags or ingredients_text are empty
    df = df.dropna(subset = ['product_name', 'categories_tags', 'ingredients_text'])

    # Detect language and translate in english
    df["ingredients_en"] = df["ingredients_text"].apply(translate) 

    end_time = time.time()
    
    print(f"PATH : {PATH} -- Execution Time : {end_time - start_time}")
    
    return df
    

In [None]:
PATH = './datasets/openfoodfacts.csv'
openfoodfacts = clean_dataset(PATH)
openfoodfacts.shape
openfoodfacts.to_csv('./datasets/openfoodfacts_translated.csv', sep = "\t")

  openfoodfacts = clean_dataset(PATH)


Invalid character at row 1 : eau graines de téguments de moutarde vinaigre de vin rouge sel vin rouge sucre   moà»t de raisin  oignons colorants extraits de carotte et extrait de paprika huile de tournesol son de moutarde sel cette moutarde uniquement disponible chez courte paille
Invalid character at row 2 : jus et purée d'abricots  minimun, eau, sucre
Invalid character at row 3 : bâguette bressan pain baguette ,,' farine de blà, eau, sel, levure, gluten, farine de ble malté, levure désactivée, acide ascorbique, garniture ,: filet de poulet braisé , filet de poulet , eau, acidifiant : lactate de potassium et acétate de sodium, amidon modifié de manioc, sel, dextrose glucose, arômes, gélifiants eayam$xydant : érythà¨fbate de sodium, colorant : grafnel â¢ origine ue, tomatenx, oeuf dur ,/ , 'salade ,/ o
Invalid character at row 4 : glycérine d'origine naturelle stabilisant, spiruline spirulina maxima d'origine naturelle, extrait de fruits de camu camu myrciaria dubia, arôme naturel de

Invalid character at row 35 : sucre  , sirop de glucose, huiles et graisses végétales palme, tournesol, illipé, sirop de sucre inverti, stabilisant : eii, eau, humectant : e, épaississant : e, émulsifiant : e, acidifiant : e, arôme, conservateur
Invalid character at row 36 : ingràdients ; brioche farine, eau, raisins secs, beurre, sucre, levure, fàcule de pomme de terre modifiàe, åuf entier liquide, sel, poudre de lactosàrum, gluten de blà, lait àcràmà dessàchà , amidon de blà, acàtate de calcium, phisphate de sodium, làcithine de colza, acide ascorbique, amylase, xylatase, bcarotàne, aràme artificiel  nappage :â eau, sucre, sirop de maàspectine, acide citrique, carraghànine, gomme de xanthane, sorbate de potassium, aràme artificiel, citrate de sodium, chlorure de calcium, mono et diglycàrides, huile de canola, phosphate de sodium, coloranti
Invalid character at row 37 : à¸à¸µà¸à¹à¸à¹
Invalid character at row 38 : pâtes à  l'épeautre  ràªve de neptune  spelt 

Invalid character at row 63 : garniture sauce béchamel  eau, jam  bon cuit supérieur  jambon de porc, sel, sirop de glucose, dextrose, antioxydant ascorbate de sodium, conservateur nitrite de sodium, farine de blé, lait écrémé en poudre, huile de tournesol, emmental  lait, sel, ferments, coagulant, crà¨me fraà®che, comté , lait, ferments, sel, présure, sel, poivre blanc, muscade, pâte  eau, farine de blé, oeuf, lait écrémé en poudre, huile de tournesol, dextrose, sel, sucre viande de porc : origine : france traces de céleri
Invalid character at row 64 : pommes de terre , åufs , eau, beurre ,, farine de blé, huile de tournesol, flocons de pommes de terre pomme de terre déshydratée, émulsifiant mono  et diglycérides d'acides gras, extrait d'épice curcuma, poudre de lait entier, poudres à  lever pyrophosphate de soude, bicarbonate de soude, sel, amidon de blé, poivre gris, muscade traces de mollusques, poisson, crustacés, fruits à  coque, céleri, soja, moutarde
Invalid character at row 6

Invalid character at row 97 : sugar, glucose syrup, cocoa butter, skimmed milk powder, cocoa mass, milk fat, peanuts lactose, palm fat, desiccated coconut wheat flour, whey powder from milk, sunflower oil, full cream milk powder skimmed condensed milk, emulsifiers soya lecithin, e, barley malt extract, fat reduced cocoa, salt, humectant glycerol, egg white powder, natural vanilla extract, raising agents e, e, e, coconut oil hydrolysed milk protein, may contain hazelnut, milk chocolate contains milk solids  minimum and cocoa solids  minimum
Invalid character at row 98 : sugar, cocoa butter, skimmed milk powder, cocoa mass, whey powder from milk, lactose, milk fat, emulsifier soya lecithin, natural vanilla extract milk chocolate contains milk solids  minimum and cocoa solids  minimum
Invalid character at row 99 : sugar, cocoa butter, skimmed milk powder, cocoa mass, whey powder from milk, lactose, milk fat, emulsifier soya lecithin, natural vanilla extract
Invalid character at row 100 : 

Invalid character at row 124 : sugar, cocoa butter, cream powder , corn , cocoa mass, lactose, emulsifier: lecithin soy, salt barley malt, natural flavor
Invalid character at row 125 : sugar, almonds, water, sorbitol, invert sugar, food coloring, turmeric, carmine, fd&c blue : egg white
Invalid character at row 126 : only the best: unbleached flour, organic steel cut whole oats, organic california raisins, creamery butter, whole egg, unrefined sugar, madagascar vanilla
Invalid character at row 127 : only the best: unbleached flour,  organic peanut butter, peanuts, creamery butter, unrefined sugar, whole egg
Invalid character at row 128 : bart & judy's proprietary rice flour blend, butter, sweet potato, cranberries, brown cane sugar, whole egg, baking soda, salt
Invalid character at row 129 : bart & judy's proprietary rice flour blend, butter, fresh ground ginger, brown cane sugar, molasses, whole egg, baking soda, salt
Invalid character at row 130 : isolat de protéines de soja, fructos

Invalid character at row 165 : milk chocolate sugar, milk, cocoa butter, chocolate, soy lecithinan emulsifier, vanillianan artificial flavor
Invalid character at row 166 : semisweet chocolate: sugar, chocolate processed with alkali, cocoa butter, milk fat, soy lecithinan emulsifier, vanillianan artificial flavor, natural flavors
Invalid character at row 167 : milk chocolate sugar, milk, cocoa butter, chocolate, soy lecithinan emulsifier, vanillianan artificial flavor
Invalid character at row 168 : milk chocolate sugar, milk, cocoa butter, chocolate, soy lecithin  an emulsifier, vanillianan artificial flavor
Invalid character at row 169 : milk chocolate sugar, milk, cocoa butter, chocolate, soy lecithinan emulsifier, vanillianan artificial flavor
Invalid character at row 170 : semisweet chocolate: sugar, chocolate processed with alkali, cocoa butter, milk fat, soy lecithinan emulsifier, vanillianan artificial flavor, natural flavors
Invalid character at row 171 : semisweet chocolate: su

Invalid character at row 215 : sugar, corn syrup, enriched wheat flour wheat, iron, niacin, thiamine, riboflavin & folic acid, water, cane syrup, artificial flavor, modified food starch corn, caramel color, palm oil, a&w root beer concentrate caramel color, water, natural & artificial flavors, citric acid, soy mono & diglycerides, potassium sorbate, titanium dioxide, salt
Invalid character at row 216 : sugar, corn syrup, enriched wheat flour wheat, iron, niacin, thiamin, riboflavin & folic acid, water, golden cane syrup, artificial flavor, modified food starch corn, palm oil, hawaiian punch concentrate concentrated juices apple clarified pineapple, passionfruit orange, water, citric acid & less than  of fruit purees apricot, papaya, guava, natural and artificial flavors, pectin, acacia gum, ester gum, red , blue , sodium benzoate, citric acid, malic acid, soy mono & diglycerides, potassium sorbate, titanium dioxide, salt, fd&c red 
Invalid character at row 217 : corn syrup, sugar, wate

Invalid character at row 247 : corn syrup, sugar, water, apple juice concentrate, modified food starch corn, gelatin, dextrose, apple puree, citric acid, artificial flavor, ascorbic acid, mineral oil, carnuba wax, artificial colors fd&c red , yellow , yellow , blue 
Invalid character at row 248 : corn syrup, sugar, water, apple juice concentrate, modified food starch corn, gelatin, dextrose, apple puree, citric acid, artificial flavor, ascorbic acid, mineral oil, carnuba wax, artificial colors fd&c red , yellow , blue 
Invalid character at row 249 : fuji apples
Invalid character at row 250 : bananas, strawberries
Invalid character at row 251 : mangos
Invalid character at row 252 : grapes
Invalid character at row 253 : fuji apples, ascorbic acid, citric acid
Invalid character at row 254 : bananas, strawberries
Invalid character at row 255 : mangos
Invalid character at row 256 : caranberries, sugar, natural flavors, sunflower oil
Invalid character at row 257 : cranberries, sugar, natural

Invalid character at row 301 : chicken soup base corn syrup solids, salt, corn starch, hydrolyzed corn, soy protein, rendered chicken fat, sugar, onion powder, disodium inosinate, disodium guanylate spices, turmeric, dehydrated parsley, silicon dioxide, salt, sugar, yeast extract, sweet whey, turmeric, garlic powder, caramel color, dehydrated onion
Invalid character at row 302 : cheese powder whey, buttermilk solids, cheeses granular and cheddar pasteurized milk, cheese culture, salt, enzymes, whey protein concentrate, salt, sodium phosphate, citric acid, fd&c yellow , fd&c yellow , lactic acid, enzymes, modified corn starch, creamer maltodextrin, palm oil, silicon dioxide
Invalid character at row 303 : iodized salt
Invalid character at row 304 : bananas, coconut/vegetable oil, sugar/honey, natural flavoring
Invalid character at row 305 : sliced freezedried strawberries
Invalid character at row 306 : dehydrated potatoes
Invalid character at row 307 : potatoes, monoglycerides, sodium ac

Invalid character at row 335 : pain burger au pavot , : farine de ble , eau, levure, dextrose, huile de olza, sucre, arôme, graines de pavot ,, sarrasin ,, sel, gluten de ble, farine de fà¨ves, conservateur : propionate de calcium, émulsifiant : mono  et diglycerides gras, antioxydant : acide ascorbique garniture : ,  aiguillettes de poulet pané , aiguillettes poulet , farine de ble, eau, huile de tournesol, sel, amidon modifié arôme lactose, gluten, stabilisants : ei  elili  ei, levure, épaississant : ea, fromage cheddar  fromages dont cheddar , lait écrémé réhydraté, beurre, sels de fonte : citrates de sodium, acide citrique protéines de lait, sel colorant : extrait de paprika, antiagglomérant : lécithine de tourneso!, guacamole ,  avocat hass ,, poivron rouge sel, piment jalapeno  antioxygà¨ne : acide ascorbique, oignon, épaississant : gomme de xanthane, acidifiant : ascorbate de sodium, ail, acidifiant : acide citrique, coriandre, salade batavia , tomate , 
Invalid character at row

Invalid character at row 358 : jus de pomme  purée de fraise   purée de banane   â¢ purée de pomme â¢ purée de baie de sureau â¢ correcteur d'acidité : acide citrique â¢ : antioxydant : acide ascorbique,
Invalid character at row 359 : carottes , sauce type vinaigrette eau, huile de colza, vinaigre d'alcool, moutarde eau, graines de moutarde, vinaigre d'alcool, sel, sel, épaississants: ee, arômes, conservateur : e
Invalid character at row 360 : fruit de la passion
Invalid character at row 361 : salade verte ,, concombre ,, taboule , semoule de ble dur réhydratée , légumes  poivrons rouges et verts, tomates, tomates déshydratées, huile de colza, raisins secs  enrobage huile de colza, vinaigre d'alcool, huile d'olive vierge extra, sel, mentbejus de citron concentré, sucre, conservateur : e arôme, mais ,, tomate cerise ,, åuf dur , carottes râpées ,
Invalid character at row 362 : pâte farine de blé tendre, eau, huile d'olive extra vierge, sel, levure, sauce blanche eau, crà¨me de lait

Invalid character at row 374 : patate douce en poudre
Invalid character at row 375 : ingredients roasted peanuts , vegetable oil palm, peanut, sugar, salt, allergy advice for allergens, seeingredens in bold, not suitable for other nut allergy sufferers,
Invalid character at row 376 : salade verte ,, dà¨s de jambon , jambon de porc  origine ue, eau, sel, sirop de giucose, stabilisants : e, e, arômes naturels conservateurs : e, e, dà¨s d'emmental , lait pasteurisé de vache, sel, ferments lactiques, coagulant, chlorure de calcium, croutons à  l'ail  farine de ble ,, huile de toueneseol, arôme naturel d'ail, gluten de ble, sel, sucre, levure, tomate cerà­se , ceuf dur 
Invalid character at row 377 : cheddar , eau, fromage , lait écrémé en poudre, beurre, sels de fonte e, e, protéines de lait, arômes naturels, sel, colorants beta carotà¨ne, extrait de paprika, antiagglomérant lécithine de tournesol
Invalid character at row 378 : celeri , sauce rémoulade  eau, huile de colza, moutarde eau, g

Invalid character at row 400 : pain ciabatta , : farine ble, orge malté torréfié, ble malté, eau, huiles et graisses végétales huile d'olive vierge extra, gluten de ble, sel, levure, levain de seigle désactivé, ferments, agent de traitement de la farine : acide ascorbique garniture , : filet de poulet rá»ti , filet de poulet  origine france eau, dextrose, huile de tournesol, sel, protéines de pois, arômes, stabilisant e, fibre de blé sans gluten, gélifiant ea, concombre ,, carottes rapées , salade batavia ,, coriandre fraiche ,, sauce teriaki , sauce soja eau, graines de soja, ble, sel, alcool, sucre, eau, amidon modifié de maà¯s, purée d'oignon, vinaigre d'alcool, jus de pomme concentré, poudre d'ail, jus de citron concentré coriandre fraà®che ,  exprimés sur la garniture
Invalid character at row 401 : salade verte ,, tomate cerise ,
Invalid character at row 402 : monocalcium phosphate, sodium bicarbonate, cornstarch
Invalid character at row 403 : pain suedois , : farine de ble, eau,

Invalid character at row 427 : enriched wheat flour wheat flour, malted barley flour, niacin, iron, thiamin mononitrate, riboflavin, folic acid, water, yeast, high fructose corn syrup, soybean oil, contains  or less of the following: wheat gluten, sesame seeds, salt, cultured wheat flour, dough conditioners sodium stearoyl lactylate, monoglycerides, ascorbic acid, yeast nutrients monocalcium phosphate, calcium sulfate, ammonium sulfate, calcium propionate preservative, vinegar, soy lecithin
Invalid character at row 428 : pain de mie complet farines de blé, de fà¨ve, d'orge maltée, eau, huile de colza, son de blé, levure boulangà¨re, sucre, sel, vinaigre d'alcool, gluten de blé, viande de poulet, salade, huiles végétales colza, tournesol, eau, jaune d'oeuf, vinaigre d'alcool, dextrose, fécule de manioc, amidon de , arôme naturel, sel, arôme, poivre, graines de moutarde
Invalid character at row 429 : whole wheat flour, water, high fructose corn syrup, yeast, wheat gluten, soybean oil, co

Invalid character at row 453 : wheatflour contains gluten with nuti wheatflour, calcium carbonate, iron, niacin, thiamin â¢ butter milk  â¢ sugar â¢ oat flakes contain gluten â¢ belgian dark chocolate chunks  sugar â¢ cocoa mass â¢ cocoa butter â¢ emulsifier: soya lecithin â¢ vanilla flavouring â¢ crystallised stem ginger  stem ginger â¢ sugar â¢ hazelnuts  â¢ belgian milk chocolate chunks sugar â¢ dried whole milk â¢ cocoa butter â¢ cocoa mass â¢ emulsifier: soya lecithin â¢ vanilla flavouring â¢ diced pecan nuts â¢ currants â¢ golden syrup invert sugar syrup sunflower seeds â¢ dried skimmed milk â¢ pumpkin seeds â¢ raising agent: sodium bicarbonate, e, e â¢ invert sugar syrup â¢ salt â¢ ground ginger â¢ flavouring belgian dark chocolate chunks contain cocoa solids  minimum belgian milk chocolate chunks contain cocoa solids  minimum and milk solids  minimum for allergens see ingredients in bold suitable for vegetarians
Invalid character at row 454 : pates to

Invalid character at row 477 : sugar, wheat starch, whole soya flour, glucose syrup, cocoa butter, palm fat, caramel sugar syrup, whole milk powder, lactose, cocoa mass dextrose, peanuts, hazelnuts, glucosefructose syrup, sunflower lecithin, invertase, soya bean oil, rice flour, artificial flavor, orange peel, citric acid, salt
Invalid character at row 478 : sugar, partly hydrogenated vegetable fats palm, shea, rapeseed, soy, sunflower, coconut; in varying proportions, cocoa mass, whole milk powder, cocoa butter, skimmed milk powder, fat reduced cocoa powder, whey powder from milk, milk fat, emulsifiers soy lecithins, polyglycerol polyricinoleate, flavorings
Invalid character at row 479 : complement alimentair ingrédients pour  comprimes poudre de passiflore parties aériennes  mg, carbonate de magnesium, talc poids net :  g produit issu de l'agriculture biologique frbio agriculture ue/non u voir au fond
Invalid character at row 480 : wheatflour gluten with wheatflour, calcium carbonate

Invalid character at row 505 : noix de coco , sucre, farine de ble, creme fraà®che, eau, oeuf frais entier, margarines huiles et graisses végétales en l'état, raffinées et fractionnées palme, colza, eau, sel, émulsifiants : e, correcteur d'acidité : acide citrique, colorant : bétacarotà¨ne, conservateur : e, arôme, beurre, sucre glace fécule de pomme de terre, poudre de lait écrémé, amidons transtormés de maà¯s, fibre de ble, e  e protéines de lait, agents de fermentation désactivés, dextrose, poudre levante poudre à  lever eiiie, colorants : extrait de paprika  betacarotene, arôme, sel, gélifiants : agar agar  pectines dextrose, acidifiant: acide citrique
Invalid character at row 506 : framboises , farine de froment agent de traitement de la farine : e, sucre, sirop de glucosefructose, eau, beurre, huile de palme, graisse de palme fractionnée, huile de colza, oeuf entier, poudre de petit lait enrichie en protéines, gélifiant : e, acidifiant: e, correcteurs d'acidité e, e, e, stabilisa

Invalid character at row 545 : farine de blé contient gluten avec farine de blé, carbonate de calcium, fer, niacine, thiamine, sucre, raisins secs   raisins de smyrne, raisins de corinthe vostizza, raisins secs, eau, beurre lait  , pommes, huile de palme, huile de colza, graisse végétale huile de palme, huile de tournesol, farine de riz, sucre roux, àcorces d'orange et de citron confites sirop de glucosefructose, àcorce d'orange, sucre, àcorce de citron, correcteur d'acidité : acide citrique, dextrose, gluten de blé déshydraté, jus de citron, àpices moulues cannelle, noix de muscade, clous de girofle, zeste de citron, poudre à  lever : e, bicarbonate de soude, sirop de sucre noir, sel, acidifiant : acide acétique ,correcteur d'acidité : acide citrique, gélifiant : pectine issue de fruits, antioxydant : acide ascorbique
Invalid character at row 546 : sugar, glucose syrup, pgi red orange juice of sicily, citric acid, natural flavor of pgi red orange of sicily
Invalid character at row

Invalid character at row 582 : cashewnuts , sesame , pistachios , sugar, glucose syrup, aromalemon ,
Invalid character at row 583 : fat free milk, vitamin a palmitate, vitamin d
Invalid character at row 584 : milk, cream
Invalid character at row 585 : cultured pasteurized nonfat milk, pasteurized milk, pasteurized cream, sea salt, stabilizer nonfat milk, locust bean gum, agar, carbon dioxide to maintain freshness
Invalid character at row 586 : cooked basmati rice  waterâ basmati rice â chicken breast 'â onà­onsâ tomatoes  â tomato paste, rapeseed oil  flame seared red peppers  â butter milk â single cream milk â gingerâ gà¡rlic puréà¨ â low fat yogurt milk â salt â cider vinegar, corianderâ red chilli purée â lemon juice â cornflour â ground coriander â green chillies â ground cumin â balti garam masala sweet paprika â roasted cardamom, roasted coriander â roasted cumin â crushed red chillies â roasted black pepper â roasted cassia â star anise â nutmeg â ajwain seeds â chilli powder â

Invalid character at row 608 : sugar, dried whole milk, cocoa butter, cocoa mass, emulsifier: soya lecithinâ tapioca starch, dried skimmed milk, vanilla  flavouring, butter oil milk, colour: anthocyanins from black carrots, paprika extract, plant and vegetable extracts safflower, carrot, spirulina, invert sugar syrup  milk chocolate contains cocoa solids  minimum, milk solids  minimum
Invalid character at row 609 : waterâ broccoli  â spinach  â peas , onions, edamame soybeans  â spring cabbage  â leeks  â low fat natural yogurt milk â ginger purée â kale  â rapeseed oil  spring onions â coriander, cornflour, salt â sodium bicarbonate, mint
Invalid character at row 610 : jus d'orange    purée de carotte    purée de pomme  jus de pomme  purée de banane  purée de mangue   purée de citrouille  jus de citron concentré  antioxydant: acide ascorbique  régulateur d'acidité: acide citrique
Invalid character at row 611 : water, cranberry juice from concentrate , raspberry juice from concentrate 

Invalid character at row 635 : nonfat milk, partially hydrogenated soybean oil, contains  or less of: dipotassium phosphate, carrageenan, vitamin a palmitate, vitamin d
Invalid character at row 636 : calcium carbonate, microcrystalline cellulose, stearic acid, vitamin b methylcobalamin
Invalid character at row 637 : ingredienti magnesio bisglicinato in polvere, idrossido di calcio, involucro della capsula idrossipropilmetilcellulosa, acido lascorbico vitamina c, cloruro di potassio, fumarato ferroso, polvere di calendula calendula officinalis, fiore luteina, tocoferolo acetato vitamina e, niacina vitamina b, gluconato di manganese, ossido di zinco, estratto di tagete francese tagetes erecrta l, fiore zeaxantina, gluconato di rame, acido pantotenico vitamina b, retinolo acetato vitamina a, colecalciferolo vitamina d, riboflavina vitamina b, tiamina hci vitamina b, piridossina hcl vitamina b, cromo picolinato, loduro di potassio, metilcobalamina vitamina b, acido folico vitamina b, bioti

Invalid character at row 658 : filling: ricotta cheese milk, whey, cream, vinegar, salt, natural stabilizers xanthan gum, locust bean gum, guar gum, mozzarella cheese pasteurized whole milk, culture, salt, enzymes, parmesan cheese part skimmed pasteurized cow's milk, cheese cultures, salt, rennet, milk, egg whites, natural parmesan cheese flavor parmesan cheese milk, culture, salt, enzymes, salt, yeast extract, unsalted butter, corn oil, olive oil, creme powder, sugar and natural flavoring, salt, black pepper, nutmeg pasta: durum flour, natural wheat flour, water, eggs, turmeric oil
Invalid character at row 659 : organic durum semolina flour, water, organic eggs, organic turmeric oil
Invalid character at row 660 : filling: organic cheese blend ricotta cheese, mozzarella cheese, cheddar cheese milk, vinegar, salt, cheese cultures, enzymes, organic breadcrumbs wheat flour, evaporated cane syrup, yeast, sea salt, organic spinach, organic spinach powder, organic black pepper, sea salt past

Invalid character at row 677 : filling: chicken sausage chicken, water, salt, spices, ricotta cheese milk, whey, cream, vinegar, salt, natural stabilizers xanthan gum, locust bean gum, guar gum, kale, cheddar cheese milk, enzymes, salt, mozzarella cheese cultured pasteurized milk, salt, enzymes, milk, parmesan base parmesan cheese milk, culture, salt, enzymes, salt, yeast extract, unsalted butter, corn oil, olive oil, creme powder, sugar and natural flavoring, eggs, garlic, salt, black pepper, blended oil canola, olive, crushed red pepper pasta: durum flour, water, eggs, betacarotene
Invalid character at row 678 : filling: ricotta cheese milk whey, cream, vinegar, salt, natural stabilizers xanthan gum, locust bean gum, guar gum, mozzarella cheese pasteurized whole milk culture, salt, enzymes, tomatoes, parmesan cheese milk, salt, enzymes, parmesan flavor parmesan cheese milk, culture, salt, enzymes salt, yeast extract, unsalted butter, corn oil olive oil, cream powder, sugar and natura

Invalid character at row 690 : eau, riz, levure de riz, sucre, alcool , par volume, sel
Invalid character at row 691 : mackerel scomber scombrus fish , water, tomato purée , salt
Invalid character at row 692 : wheat flour, partially hydrogenated palm and canola oils, sugar, whole milk powder, nonfat milk powder, chocolate liquor, cocoa processed with alkali, cheese, salt, soylecithin, sodium bicarbonate leavening, yeast powder, artificial flavors, natural coloring beta carotene and caramel
Invalid character at row 693 : wheat flour, partially hydrogenated palm and canola oils, sugar, whey, lactose, whole milk powder, cheese, salt, soylecithin, strawberry powder, natural coloring beet red, beta carotene and caramel, sodium bicarbonate leavening, yeast powder, artificial flavor, citric acid
Invalid character at row 694 : cracker: wheat flour, shortening, vegetable oil, sugar, butter, cocoa mass, cocoa powder, full cream milk powder, amino acid seasoning, sugar ester, emulsifier soya leci

In [None]:
openfoodfacts.shape

### Tokenize ingredients

In [None]:
tokenizer = RegexpTokenizer("[A-Za-z'%-]+")
dataset["ingredients"] = dataset["ingredients_text"].apply(
    lambda x : tokenizer.tokenize(x))

### Handle mistakes

#### First method : using NLTK's corpus vocabulary

In [None]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
set_ingredients = set([_.lower() for list in dataset["ingredients"].to_list() for _ in list])

for word in list(set_ingredients):
    list_distance = list()
    for _ in english_vocab:
        list_distance.append(edit_distance(_, word))
    print(f"{word} ==> {list(english_vocab)[list_distance.index(min(list_distance))]}", flush = True)

#### Second method : using SpellChecker

In [None]:
spell = SpellChecker()
start_time = time.time()
for _ in set_ingredients:
    misspelled = spell.unknown([_])
    if len(misspelled):
        print(f"{_} ==> {spell.correction(list(misspelled)[0])}")
end_time = time.time()

print(f"Execution Time : {end_time - start_time}")