In [19]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import re
import string
import warnings
warnings.filterwarnings('ignore')	


### Load Ingredients

In [20]:
relative_path = '../Data/_Product__RAW_Data.xlsx'
org_products = pd.read_excel(relative_path)
products = org_products.copy()

display(products)

Unnamed: 0,Barcode,Unnamed: 1,Ingredient List,Name
0,3600542399326,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...
1,8606029266766,8606029266766,Silicone,Velnea Fingering Silik.Lila
2,4049639429550,4049639429550,"aqua (water),stearic acid,copernicia cerifera ...","Magic Finish 5-In-1 Hybrid Mascara\nVolume, Le..."
3,4011700740291,4011700740291,"butane,alcohol,propane,parfum (fragrance),dipr...",Echt Kï¿½Lnisch Wasser Aerosol Deodorant Spray...
4,4011700740475,4011700740475,"sodium palmate,sodium palm kernelate,aqua (wat...",Echt Kï¿½Lnisch Wasser Cream Soap\nMildly Clea...
...,...,...,...,...
13670,6001051004959,6001051004959,"Aqua, Alcohol Denat., Butyl Methoxydibenzoylme...",Sun Babies & Kids Kids' Sun Spray Spf 50+
13671,5060447940494,5060447940494,"Aqua (Water), Octocrylene, Alcohol Denat., Gly...",Sun Cream Spf 30
13672,4005900870407,4005900870407,"ALPHA-ISOMETHYL IONONE, *******, BUTYL METHOXY...",Sun Oil Spray Tropical Bronze Spf 6
13673,8436575091013,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream


### Clean DataSet

In [21]:
def clean_text(s):
    if isinstance(s, str):
        s = unidecode(s)    # è -> e (unicode transliteration)
        s = s.lower()
        return s

In [22]:
def unidecode_text(s):      # function for product_name only
    if isinstance(s, str):
        s = unidecode(s)    # è -> e (unicode transliteration)
        return s

In [23]:
products = products.drop(columns='Unnamed: 1')
products.rename(columns={'Ingredient List': 'ingredient_list_original'}, inplace=True)
products.rename(columns={'Name': 'product_name'}, inplace=True)
products['ingredient_list'] = products['ingredient_list_original'].apply(clean_text)

ingredients_pattern = r'^(?i)ingredients[^\w\s]*:$'     # brisenje na stringot ingredients: sto se srekjava na pocetok na nekoj od listite so sostojki
products['ingredient_list'] = products['ingredient_list'].str.replace(ingredients_pattern, '')
products['ingredient_list'] = products['ingredient_list'].str.replace('f,i,l,', 'fil')
products.reset_index(drop=True, inplace=True)

In [24]:
products['product_name'] = products['product_name'].apply(unidecode_text)
search_string = re.escape(', *******')                                                                      # brisenje na sting so '******' koj se srekjava
products['ingredient_list'] = products['ingredient_list'].str.replace(search_string, '')
#products['ingredient_list'] = products['ingredient_list'].str.replace('.', '')                             # mislam ne treba oti ima brojki so tocka odvoeni kako procenti
products['ingredient_list'] = products['ingredient_list'].str.split(", ")

products = products.explode('ingredient_list')
products.reset_index(drop=True, inplace=True)

In [25]:
patterns_to_replace = patterns_to_replace = ['[+/-]:', '[+/-:', '[+ -]', '[+/-', '(+ - ', '(+ / -)', '(+/-)', '(+/', '[ + / -', '[ +(', '[ +/-', '[+  -', '[+ -', '[+ / -', '[+ -', '[+(', '[+/-]', '[+/-', '[+/- ', '+/- ['] 
for pattern in patterns_to_replace: 
    products['ingredient_list'] = products['ingredient_list'].str.replace(re.escape(pattern), '') # brisenje na tekstot na sostojki sto pocnuvaat so: '+/-' ili '[+ -' ili '[+/- ' ime na sostojka

products = products.dropna(subset=['ingredient_list']).loc[products['ingredient_list'] != '']

In [26]:
products = products[~(products['ingredient_list'].astype(str).str.contains('may contain', case=False, regex=False) | (products['ingredient_list'].isna()))]      # brisenje na redot na sostojki sto pocnuvaat sto sodrzat: may contain

products = products[~products['ingredient_list'].astype(str).str.match(r'^[0-9.,-]+$')]                     # brisenje na red koj sodrzi samo broj + punktuacija
products = products[~products['ingredient_list'].astype(str).str.match(r'^[^\w\s]+[0-9]+$')]                # brisenje na red koj sodrzi samo punktuacija + broj
products['ingredient_list'] = products['ingredient_list'].str.lstrip(',')                                   # brisenje na ',' ako stringot pocnuva so ','
products = products[~products['ingredient_list'].str.strip(string.punctuation).eq('')]                      # brisenje na red koj sodrzi samo punktuacija
products = products[~products['ingredient_list'].apply(lambda x: pd.to_numeric(x, errors='coerce')).notna() | (products['ingredient_list'] == '')]  # brisenje na red koj sodrzi samo broj
products['ingredient_list'] = products['ingredient_list'].astype(str).str.replace('*', '')                  # brisenje na dzvezdicki
products['ingredient_list'] = products['ingredient_list'].str.strip()                                       # trim

In [27]:
mask_to_drop = (products['ingredient_list'].str.len() < 3) | (products['ingredient_list'].str.len() > 70)   # brisenje na red koj sodrzi pomalku od 3 ili povekje od 70 karakteri
products = products[~mask_to_drop]

In [28]:
nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())
products = products.dropna(subset=['ingredient_list'])

nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())

0
0


In [29]:
products

Unnamed: 0,Barcode,ingredient_list_original,product_name,ingredient_list
0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aqua / water
1,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aluminum chlorohydrate
2,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,cetearyl alcohol
3,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,ceteareth-33
4,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,parfum / fragrance
...,...,...,...,...
236715,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,10-dimethylundeca-1,5,9-trien-4-ol"
236716,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-amino-m-cresol
236717,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,7-dihydrolinalool"
236718,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-hydroxyindole


In [30]:
products.to_excel('products.xlsx')

In [31]:
org_products = pd.read_excel('products.xlsx')
products = org_products.copy()

display(products)

Unnamed: 0.1,Unnamed: 0,Barcode,ingredient_list_original,product_name,ingredient_list
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aqua / water
1,1,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aluminum chlorohydrate
2,2,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,cetearyl alcohol
3,3,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,ceteareth-33
4,4,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,parfum / fragrance
...,...,...,...,...,...
228234,236715,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,10-dimethylundeca-1,5,9-trien-4-ol"
228235,236716,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-amino-m-cresol
228236,236717,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,7-dihydrolinalool"
228237,236718,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-hydroxyindole


In [33]:
nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())
products = products.dropna(subset=['ingredient_list'])

nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())

9
0


In [34]:
products.to_excel('products.xlsx')

In [35]:
org_products = pd.read_excel('products.xlsx')
products = org_products.copy()

display(products)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Barcode,ingredient_list_original,product_name,ingredient_list
0,0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aqua / water
1,1,1,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aluminum chlorohydrate
2,2,2,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,cetearyl alcohol
3,3,3,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,ceteareth-33
4,4,4,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,parfum / fragrance
...,...,...,...,...,...,...
228225,228234,236715,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,10-dimethylundeca-1,5,9-trien-4-ol"
228226,228235,236716,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-amino-m-cresol
228227,228236,236717,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,"6,7-dihydrolinalool"
228228,228237,236718,8436575091013,"2-Ethyl-3,(5 Or 6)-Dimethylpyrazine, 2-Trans, ...",,6-hydroxyindole


In [36]:
nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())
products = products.dropna(subset=['ingredient_list'])

nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())

0
0
