# Advanced Machine Learning for NLP and Text Processing
## Project 1 : OpenFoodFacts

### Cleaning dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Import libraries

from collections import Counter
from deep_translator import GoogleTranslator
from langdetect import detect
import nltk
from nltk.metrics import *
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
from spellchecker import SpellChecker
import time

In [3]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cheic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
def load_sample(start_range=1, end_range=5, PATH='./datasets/'):
    start = time.time()

    splitted_datasets = []

    for sample in range(start_range, end_range+1):
        start_load = time.time()

        dataset = pd.read_csv(
            PATH + 'openfoodfacts_part' + str(sample)+'.csv',
            sep='\t')

        end_load = time.time()

        print(f'Sample {sample} : {end_load - start_load} sec.')

        splitted_datasets.append(dataset)

    end = time.time()

    print('-'*20)
    print(f'Load {end_range - start_range + 1} samples : {end - start} sec.')

    return pd.concat(splitted_datasets)

In [5]:
def delete_empty_columns(dataset, rate=0.8):
    columns_to_drop = ['Unnamed: 0', 'url', 'code', 'creator', 'created_t', 'created_datetime', 'last_modified_t',
                       'last_modified_datetime', 'abbreviated_product_name', 'generic_name', 'packaging', 
                       'packaging_tags', 'packaging_text', 'brands', 'brands_tags', 'brand_owner', 'categories', 'categories_en', 'origins',
                       'origins_en', 'manufacturing_places', 'labels', 'labels_en', 'emb_codes', 'emb_codes_tags', 'countries', 'countries_tags', 'countries_en',
                       'first_packaging_code_geo', 'cities', 'purchase_places', 'stores', 'countries', 'countries_en', 
                       'traces', 'traces_en', 'allergens_en', 'serving_size', 'serving_quantity', 'additives', 
                       'additives_en', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil', 'ingredients_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil_n', 
                       'states', 'states_tags', 'states_en', 'main_category_en', 'image_small_url', 'image_url', 
                       'image_ingredients_url', 'image_ingredients_small_url', 'image_nutrition_url', 
                       'image_nutrition_small_url']

    for col in dataset.columns:
        if dataset[col].isna().sum() / len(dataset) > rate:
            columns_to_drop.append(col)

    return delete_specific_columns(dataset, columns_to_drop=columns_to_drop)

In [38]:
def delete_specific_columns(dataset, columns_to_drop=[]):
    columns_to_keep = ['product_name', 'categories_tags', 'ingredients_text', 'additives_tags',
                       'nutriscore_score', 'nutriscore_grade', 'nova_group', 'pnns_groups_1',
                       'pnns_groups_2', 'main_category', 'energy-kcal_100g', 'energy_100g',
                       'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
                       'salt_100g']
    cols = []
    for column in columns_to_drop:
        if column in dataset.columns and column not in columns_to_keep:
            cols.append(column)

    print(f'Delete {len(cols)} columns.')

    return dataset.drop(columns=cols)

In [7]:
def correct_enconding_characters(x):
    x = x.replace('\_', '')
    x = x.replace('\%', '')
    x = x.replace('\*', '')
    
    x = clean_ingredients_list(x)

    x = x.lower()
    x = x.strip()

    x = x.replace('ã©', 'é')
    x = x.replace('&quot;', '')
    x = x.replace('cã¨', 'è')
    x = x.replace('à¨', 'ê')
    x = x.replace('ã', 'à')
    x = x.replace('ã´', 'ô')
    x = x.replace('à´', 'ô')
    x = x.replace('à¢', 'â')
    x = x.replace('à¯', 'ï')
    x = x.replace('à®', 'î')
    x = x.replace('å', 'oe')
    x = x.replace('â', '\'')

    return x

In [8]:
def clean_ingredients_list(x):
    # Delete additives as there is already an 'additive' column
    # Delete vitamins as we are not going to use them
    x = re.sub('(b|e){1}\d*\w', '', x)

    # Delete quantities
    x = re.sub('(\d)+([a-zA-Z])+', '', x)
    # Delete percentages
    x = re.sub('\d+\%', '', x)

    return x

In [39]:
dataset = load_sample(start_range=1, end_range=1)
dataset.shape

Sample 1 : 1.3599600791931152 sec.
--------------------
Load 1 samples : 1.3599600791931152 sec.


(50000, 187)

In [40]:
df = delete_empty_columns(dataset)

Delete 184 columns.


In [37]:
df.columns

Index(['product_name', 'categories_tags', 'ingredients_text', 'additives_n',
       'additives_tags', 'nutriscore_score', 'nutriscore_grade', 'nova_group',
       'pnns_groups_1', 'pnns_groups_2', 'main_category', 'energy-kcal_100g',
       'energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
       'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
       'vitamin-c_100g', 'potassium_100g', 'calcium_100g', 'iron_100g',
       'nutrition-score-fr_100g'],
      dtype='object')

In [10]:
df = df.dropna(subset = ['ingredients_text'])

In [11]:
df['ingredients_text'] = df['ingredients_text'].apply(correct_enconding_characters)

### Detect language

In [12]:
def detect_language(x): 
    try: 
        return detect(x)
    except : 
        return "unknown"

In [13]:
start = time.time()
df['language'] = df["ingredients_text"].apply(detect_language)
end = time.time()

print(f'Detect language : {end - start} seconds...')

Detect language : 455.1784086227417 seconds...


In [14]:
df['language'].unique()

array(['es', 'fr', 'ca', 'it', 'en', 'ro', 'pl', 'id', 'sv', 'tl', 'cy',
       'sq', 'sw', 'hr', 'lt', 'pt', 'th', 'et', 'unknown', 'nl', 'da',
       'de', 'fi', 'af', 'lv', 'no', 'tr', 'so', 'sl', 'ru', 'sk', 'cs',
       'vi', 'zh-tw', 'ar', 'he', 'hu', 'zh-cn'], dtype=object)

In [15]:
df.shape[0] - len(df[df['language'] == 'en'])

8961

In [16]:
df_ingredients_en = df[df['language'] == 'en']

In [17]:
(1 - (df_ingredients_en.shape[0] / df.shape[0])) * 100

23.152047539077635

### Translate ingredients into English

In [18]:
# translator = GoogleTranslator(source='auto', target='en')

In [19]:
def translate(x): 
    try:
        return translator.translate(x)
    except: 
        return "Cannot translate"

In [20]:
# start = time.time()
# for i, lang in enumerate(df['language']): 
#     if lang == 'en': 
#         df.at[i, 'ingredients_en'] = df['ingredients_text'].iloc[i]
#     else : 
#         df.at[i, 'ingredients_en'] = translate(df['ingredients_text'].iloc[i])
# end = time.time()

# print(f'Translate ingredients : {end - start} seconds...')

In [21]:
# df.head(5)

### Tokenize ingredients

In [22]:
tokenizer = RegexpTokenizer("[a-z'-]+")
df_ingredients_en["ingredients_token"] = df_ingredients_en["ingredients_text"].apply(lambda x : tokenizer.tokenize(x))

### Handle mistakes
#### First method : using NLTK's corpus vocabulary

In [30]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
ingredient_list = [_ for list in df_ingredients_en["ingredients_token"].to_list() for _ in list]
set_ingredients = set(ingredient_list)

In [27]:
row_list = []
start_time = time.time()
for word in list(set_ingredients):
    list_distance = list()
    if word not in english_vocab: 
        for _ in english_vocab:
            list_distance.append(edit_distance(_, word))
            
        print(f"{word} ==> {list(english_vocab)[list_distance.index(min(list_distance))]}", flush = True)
end_time = time.time()

print(f"Spelling mistakes - Method 1 : {end_time - start_time} seconds.")

vanillan ==> vanilla
monfat ==> nonfat
lychs ==> lycus
trate ==> irate
carrangnan ==> caranna
lutr ==> lut
varying ==> warding
exct ==> exact
diglcid ==> diacid
farine ==> parine
un-ch ==> nunch
hucklry ==> hackery
aglycid ==> glycid
ard ==> dard
salisry ==> salish
dipotass ==> potass
knal ==> knap
goats' ==> goaty
butyloctyl ==> tylostyle
ocesses ==> acestes
hydroylz ==> hydrol
aflavors ==> flavory
dipot ==> divot
mst ==> tst
northn ==> north
containslessthanof ==> curtainless
vingar ==> dingar
aminos ==> minos


KeyboardInterrupt: 

#### Second method : using SpellChecker

In [28]:
spell = SpellChecker()
start_time = time.time()

for _ in set_ingredients:
    misspelled = spell.unknown([_])
    if len(misspelled):
        print(f"{_} ==> {spell.correction(list(misspelled)[0])}")
end_time = time.time()

print(f"Spelling mistakes - Method 1 : {end_time - start_time} seconds.")

vanillan ==> vanilla
monfat ==> nonfat
lychs ==> lochs
trate ==> trade
carrangnan ==> carrangnan
lutr ==> lute
exct ==> exact
diglcid ==> diploid
farine ==> marine
un-ch ==> bunch
hucklry ==> hickory
aglycid ==> aglycid
salisry ==> satisfy
dipotass ==> dipotass
knal ==> anal
goats' ==> goats
butyloctyl ==> butyloctyl
ocesses ==> dresses
hydroylz ==> hydroxyl
aflavors ==> flavors
dipot ==> depot
mst ==> must
northn ==> north
containslessthanof ==> containslessthanof
vingar ==> vinegar
aminos ==> amino
lgian ==> lian
spaning ==> sparing
comtry ==> country
swai ==> swap
frk ==> fry
uals ==> pals
stl-cut ==> stl-cut
zn ==> in
onctrate ==> nitrate
granulat ==> granular
arul ==> aru
acontains ==> contains
ngre ==> ogre
amountisving ==> amountisving
jarlsrg ==> jarlsrg
isotyrate ==> isotyrate
lithin-anulsifi ==> lithin-anulsifi
sorae ==> sore
cocao ==> cocoa
cholula ==> cholla
lactlate ==> lactate
rirose ==> hirose
xanthai ==> bantha
sodiumphosphat ==> sodiumphosphat
alumina ==> alumna
anch =

fruitrim ==> fruitrim
anchony ==> anthony
apo-carotal ==> apo-carotal
jalap ==> jalal
nrtrition ==> nutrition
cihcihhati ==> cihcihhati
-avag ==> naval
capicola ==> capitol
cow'a ==> cow's
almord ==> almond
hd ==> he
cti ==> chi
tuxo ==> tux
chry-laur ==> chry-laur
watmor ==> water
-ue ==> due
yvalue ==> value
buttscotch ==> butterscotch
lingonrry ==> lingonberry
nwn ==> own
ulture ==> culture
marits ==> merits
tacarote ==> nagarote
araffinum ==> araffinum
trisodnphosphate ==> trisodnphosphate
nitrit ==> nitric
nonfai ==> nonfat
consvativ ==> consvativ
fudgf ==> fudge
chocolare ==> chocolate
kr ==> or
almonis ==> almonds
cookcon ==> cocoon
mzaluna ==> malena
acesulfame ==> acesulfame
walle ==> wall
nyl ==> ny
aufw ==> auf
rlavor ==> flavor
ingrie ==> ingrid
boysrri ==> boys're
wani ==> want
aspartame-aculfame ==> aspartame-aculfame
lp ==> up
fatmilk ==> family
rhianna ==> rihanna
acryloyldimhyltaurate ==> acryloyldimhyltaurate
carronat ==> carrot
chv ==> chi
mt-ss ==> miss
harina ==> m

mtasulfite ==> mtasulfite
calgium ==> calcium
choilt ==> child
kp ==> up
rits ==> its
glyco ==> glycol
syrp ==> syrup
roquorti ==> roquorti
ptn ==> pin
putassium ==> potassium
distririt ==> district
hydrog ==> hydro
flaxmeal ==> flaxman
propiary ==> topiary
acidy ==> acid
chdr ==> char
solue ==> solve
dicalcium ==> calcium
asido ==> aside
lusifi ==> luigi
viamin ==> vitamin
poxide ==> oxide
psvativ ==> psvativ
isaturatsugars ==> isaturatsugars
askhsh ==> asks
uuiiquulgo ==> uuiiquulgo
ttswt ==> test
hylhylglycin ==> hylhylglycin
jomanco ==> romance
aup ==> up
wht-fr ==> wht-fr
kalamata-style ==> kalamata-style
strc ==> stri
colar ==> color
jic ==> vic
trit ==> trip
fth ==> ith
sun-rip ==> subrip
partum ==> parfum
tapioga ==> tapioca
nutits ==> nuits
unfilt ==> unfit
litihin ==> within
contrte ==> contrite
whtflour ==> whtflour
hsh's ==> ash's
iame ==> came
tiglc ==> till
roaast ==> roast
gallmills ==> gallmills
ultra-filt ==> ultra-filt
anti-cake ==> anti-cake
glutamic ==> glutamine
mi

seven-upcd ==> seven-upcd
moofied ==> roofied
trigo ==> trio
savony ==> savory
pponi ==> phone
diglycids ==> diglycids
tinulin ==> insulin
garli ==> garlic
minal-s ==> finals
pattie ==> patties
dulse ==> pulse
cottsd ==> costs
famili ==> family
carhy ==> carry
polyoxyhyle ==> polyoxyhyle
applo ==> apply
chips-sugar ==> chips-sugar
auftrag ==> austral
calchum ==> calcium
ssonumg ==> ssonumg
prottive ==> protective
vanin ==> vain
facook ==> cook
iatin ==> latin
lithi ==> lithe
semi-sweet ==> semi-sweet
contam ==> contact
containss ==> contains
hexametaphosphate ==> hexametaphosphate
potatue ==> potatoe
ingri ==> ingrid
a-t ==> at
ptone-calcium ==> ptone-calcium
gurgi ==> urge
sdium ==> sodium
srte ==> site
rinyl ==> vinyl
tlured ==> lured
dha ==> ha
satidaction ==> satisfaction
carxy-mhylclulose ==> carxy-mhylclulose
palmitat ==> palmitic
fua ==> fun
lecithin ==> leithian
salti ==> salt
-moisture ==> moisture
calori ==> calorie
tripolyphoshate ==> tripolyphoshate
ladyfings ==> ladyfinger

phylalanin ==> phylalanin
annatto-vae ==> annatto-vae
lilk ==> milk
chik'n ==> chicken
chonds ==> chords
jlyans ==> plans
lactylate ==> lactate
organlc ==> organic
applood ==> applied
niacianamide ==> niacianamide
ftty ==> fatty
modifitapioca ==> modifitapioca
rthpaste ==> rthpaste
crm ==> cry
pinpple ==> pineapple
soy-div ==> soy-div
watmon ==> watson
natamyacin ==> natamyacin
sulphite ==> sulphide
uar ==> car
vitan ==> vital
valunalrs ==> valunalrs
parkiway ==> parkway
mitrowaving ==> microwaving
l-arginine ==> l-arginine
taoloca ==> tapioca
molydate ==> molydate
ncn ==> non
vanille ==> vanilla
casinate ==> castrate
j ==> i
rhns ==> runs
kraftcirsings ==> kraftcirsings
odifi ==> modify
acylat ==> ayla
ingts ==> ingots
schizochytrium ==> schizochytrium
fidge ==> fridge
nutti ==> nutty
ccm ==> com
liita ==> lista
monintrate ==> monintrate
otrt ==> tort
vhich ==> which
thiam ==> thia
molcaje ==> molcaje
ethanolamine ==> ethanolamine
roduct ==> product
guajillo ==> guajillo
solulis ==> s

anti-caking ==> anti-caking
carott ==> carpet
cold- ==> cold
malz ==> male
powdi ==> power
gtle ==> gale
ajwain ==> again
factionat ==> factional
advahced ==> advanced
proje ==> prove
hestnuy ==> chestnut
olipton ==> lipton
travse ==> traves
-flavouring ==> flavouring
oilj ==> oil
dtromhorphan ==> dtromhorphan
rainw ==> rain
hanc ==> hand
sulphte ==> sulphate
tmt ==> tit
fudgion ==> fusion
clusts ==> crusts
slimfast ==> slimmest
pj ==> pa
anti- ==> anti
tds ==> ads
pgi ==> pig
thophosphate ==> thophosphate
'-inosinate ==> '-inosinate
morss ==> moss
vall ==> all
shap ==> ship
dilc ==> disc
lvings ==> livings
nutraswt ==> nutraswt
ugar ==> sugar
maulphaite ==> maulphaite
diglyci ==> diglyci
adjustmt ==> adjust
mfg ==> mug
tripolyphosphat ==> tripolyphosphat
riviana ==> vivian
omog ==> smog
washrn ==> wash
casiate ==> caste
ngal ==> gal
diphosphate ==> phosphate
freshiness ==> freshness
ltthin ==> within
fgt ==> fit
phyalanine ==> phyalanine
coopative ==> cooperative
wrigl ==> will
filiti

pyprophosphate ==> pyprophosphate
raviolti ==> ravioli
ico ==> ice
susin ==> susan
sifi ==> sift
swtrays ==> strays
pign ==> sign
oxychloride ==> oxychloride
drogat ==> drought
potassm ==> potassium
dygcid ==> dygcid
piquin ==> piquing
lk ==> la
whtgrass ==> wheatgrass
frkl ==> from
chd ==> chi
thamin ==> thatin
saude ==> sauce
polasalum ==> polasalum
induding ==> inducing
paprikal ==> paprika
vaiuc ==> value
chioide ==> choice
chri ==> chris
suu ==> sun
contts ==> contos
nutrie ==> nurse
ancit ==> andit
ttoms ==> atoms
bruksanvis ==> bruksanvis
pks ==> pas
framis ==> frames
dary ==> day
trolyt ==> truly
monochloride ==> monochloride
pantothic ==> pantothic
monocalchum ==> monocalchum
intnatiuval ==> intnatiuval
hicago ==> chicago
sc ==> so
ocoa ==> cocoa
tellow ==> fellow
barl ==> ball
anhydrouus ==> anhydrous
sainsrys ==> sainsrys
carthamus ==> carthamus
tcornutarca ==> tcornutarca
rraladkaof ==> rraladkaof
cld ==> old
clars ==> class
un-sulfur ==> un-sulfur
nutri ==> nuts
onganig ==

trasodium ==> trasodium
mls ==> mes
poppysds ==> poppy's
vanillin-an-artificial ==> vanillin-an-artificial
clr ==> car
expeller ==> expelled
mononi ==> monani
bicarnate ==> incarnate
radiatore ==> radiator
anic ==> panic
autolyz ==> autolyz
wural ==> rural
grse ==> arse
eau-orga ==> eau-orga
mustgard ==> mustard
ractive ==> active
lacfic ==> lactic
dicat ==> ducat
nvt ==> not
iami ==> miami
pork-chick ==> pork-chick
fitin ==> itin
cocoyl ==> cool
cnate ==> crate
garnzo ==> garner
pastriz ==> pastry
rction ==> action
ronstitut ==> constitute
vine-ri ==> vine-ri
natur ==> nature
oistat ==> distant
contains-shorting ==> contains-shorting
choltal ==> choral
gmo ==> go
hydrolyzate ==> hydrolyzate
zylanase ==> zylanase
millssales ==> millssales
worchtshire ==> worchtshire
conc ==> con
tahorat ==> throat
nm ==> no
tulsifi ==> tulsi
macrochalus ==> macrochalus
lauryl ==> laurel
hard-cook ==> hard-cook
phossphate ==> phosphate
calcuim ==> calcium
rightus ==> rights
monounsaturat ==> monounsatur

lactyle ==> lactose
frht ==> frat
suntiow ==> suction
fordook ==> forsook
glucona ==> glucose
umoshi ==> moshi
lactosidase ==> lactosidase
fidus ==> ficus
amountisi ==> amounts
gligie ==> glide
sustainae ==> sustained
fatls ==> falls
carhyorat ==> carhyorat
casnat ==> can't
gaplic ==> garlic
ythorte ==> thorne
ralin ==> rain
fidolactis ==> fidolactis
humtant ==> human
floz ==> flow
prootic ==> erotic
pmit ==> pit
yumberry ==> mulberry
urad ==> grad
citricaycle ==> citricaycle
tt-type ==> tintype
matitol ==> marital
footll ==> fool
kfast ==> fast
anrs-sch ==> anrs-sch
tf ==> to
conds ==> bonds
stfding ==> studing
nf ==> of
ngrits ==> grits
suuntautuv ==> suuntautuv
rlic ==> relic
drt ==> art
tomoto ==> tomato
garni ==> marni
trisodia ==> trisodia
stroyl- ==> stroll
stk ==> sta
tricalcum ==> tricalcum
aquus ==> equus
natrium-saccharin ==> natrium-saccharin
folioacid ==> folioacid
prussiate ==> prussian
aragic ==> tragic
microal ==> micro
practition ==> partition
assurano ==> assurance
wo

fructo-oligosaccharid ==> fructo-oligosaccharid
mitamin ==> vitamin
ctains ==> chains
icts ==> its
hydroomide ==> hydroxide
falavor ==> flavor
chry ==> cry
owr ==> or
trolytic ==> prolytic
st-fding ==> standing
kn ==> in
frhly ==> fly
almnm ==> along
clulase ==> clause
candidum ==> candid
wholome ==> whole
cartohydrate ==> carbohydrate
daffnat ==> daffnat
caru ==> care
ladura ==> laura
glac ==> glad
sls ==> sly
smuth ==> south
biscoff ==> scoff
intior ==> into
slin ==> skin
pte ==> ate
cyl ==> cal
rgy ==> ray
suagrs ==> sugars
angino ==> angio
raysnosugaradd ==> raysnosugaradd
low-fat ==> lowfat
aicd ==> aid
nflow ==> flow
glutarate ==> glutamate
ta-carote ==> ta-carote
ptra ==> para
shortings ==> shootings
lorannoils ==> lorannoils
fts ==> its
chse-type ==> chse-type
lizardfish ==> lizardfish
occurn ==> occur
mamaphosphate ==> mamaphosphate
tital ==> total
cocpa ==> cocoa
pattasium ==> pattasium
psiste ==> piste
oma- ==> omar
hwood ==> wood
nitirite ==> nitrite
prumus ==> primus
fldur

chick-sugar ==> chick-sugar
apo-carotol ==> apo-carotol
snachs ==> snacks
trose ==> those
orthophosphate ==> orthophosphate
nutz ==> nuts
catu ==> cat
potass ==> potash
rount ==> round
community-l ==> community
chr ==> car
artif ==> artie
wl ==> we
solt ==> sort
phytonadione ==> phytonadione
bzy ==> by
optimizs ==> optimize
sinflower ==> sunflower
rrigsrate ==> rrigsrate
incl ==> inch
kprog ==> prog
kolasahm ==> kolasahm
synp ==> sync
prar ==> pray
cmg ==> cog
stratham ==> strata
'chine' ==> chine
flh ==> fly
alpina ==> alpine
rulator ==> curator
glol ==> glow
tractions ==> traction
rifalvin ==> ritalin
judias ==> judas
cocoa-procs ==> cocoa-procs
azodi ==> avoid
acesulfamek ==> acesulfamek
hational ==> national
dyrat ==> drat
caramine ==> carmine
m ==> i
cinnamom ==> cinnamon
azilian ==> brazilian
srp ==> sip
grt ==> get
hoir ==> hour
contain-cocoa ==> contain-cocoa
smartlal ==> martial
rult ==> rule
orgnic ==> organic
wcome ==> come
alpo ==> also
ictalurus ==> ictalurus
d-xylose ==> 

krafthinz ==> krafthinz
pinp ==> pink
dtromhoptan ==> dtromhoptan
tomako ==> tomato
hydroliz ==> hydroliz
ptuccini ==> ptuccini
slphte ==> spite
pasilla ==> padilla
ythorc ==> thorn
acic ==> acid
pholufite ==> pholufite
ruciron ==> rubicon
paprikia ==> paprika
chocoal ==> cocoa
snlac ==> sneak
tapiocastarch ==> tapiocastarch
utle ==> tle
uncur ==> uncut
doloriz ==> dolores
not-fat ==> nonfat
jtab ==> stab
mixe ==> mine
ysrri ==> sari
mnt ==> met
insuia ==> insula
rocesses ==> processes
hydrowde ==> hydroxide
df ==> of
rhurb ==> hurt
suins-rice ==> suins-rice
tracto ==> tractor
bld ==> old
ttu ==> tu
buckwht ==> bucket
dts ==> its
avourings ==> favouring
allantoin ==> allentown
hydroya ==> hydra
uice ==> nice
waler ==> water
nsfnongmo ==> nsfnongmo
conctratc ==> contract
cutl ==> cut
niloticus ==> niloticus
crispi ==> crisp
sdurce ==> source
sunfow ==> snow
rtrict ==> strict
buttmilk ==> buttermilk
nodica ==> notice
comtryl ==> control
uillon ==> dillon
prinssn ==> prison
guittard ==> g

KeyboardInterrupt: 

#### Third method : comparing words in the columns

In [33]:
# Sort ingredient list
ingredient_list.sort()

# Count occurences
occ = Counter(ingredient_list)

In [34]:
occ

Counter({"'": 22,
         "'-": 3,
         "'-carotal": 12,
         "'-guanylate": 1,
         "'-inosinate": 2,
         "'-monophosphate": 4,
         "'-monophosphte": 1,
         "'adds": 1,
         "'carotal": 1,
         "'certified": 1,
         "'chine'": 1,
         "'f": 2,
         "'for": 1,
         "'i": 1,
         "'itamins": 1,
         "'s": 6,
         "'sugar": 1,
         "'tane": 1,
         "'tau": 1,
         '-': 2852,
         '-apo-': 1,
         '-avag': 1,
         '-carnate': 3,
         '-carotal': 2,
         '-co': 1,
         '-cystn': 1,
         '-cystne': 2,
         '-d': 1,
         '-d-': 1,
         '-dat': 1,
         '-dimhyl-': 1,
         '-diol': 2,
         '-do': 1,
         '-dove': 2,
         '-flavouring': 1,
         '-fri': 1,
         '-hour': 1,
         '-hydroxy-': 1,
         '-hyl-': 1,
         '-inch': 1,
         '-iron': 1,
         '-l': 2,
         '-lactylate': 57,
         '-lo': 1,
         '-mhylpyrazine': 2,
   