In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import re
import string
import warnings
warnings.filterwarnings('ignore')


### Load Ingredients

In [2]:
relative_path = '../Data/raw/_Product__RAW_Data.xlsx'
org_products = pd.read_excel(relative_path)
products = org_products.copy()

products = products.dropna(subset=['Ingredient List', 'Name'])
display(products)

Unnamed: 0,Barcode,Unnamed: 1,Ingredient List,Name
0,3600542399326,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...
1,8606029266766,8606029266766,Silicone,Velnea Fingering Silik.Lila
2,4049639429550,4049639429550,"aqua (water),stearic acid,copernicia cerifera ...","Magic Finish 5-In-1 Hybrid Mascara\nVolume, Le..."
3,4011700740291,4011700740291,"butane,alcohol,propane,parfum (fragrance),dipr...",Echt Kï¿½Lnisch Wasser Aerosol Deodorant Spray...
4,4011700740475,4011700740475,"sodium palmate,sodium palm kernelate,aqua (wat...",Echt Kï¿½Lnisch Wasser Cream Soap\nMildly Clea...
...,...,...,...,...
13669,8034063521822,8034063521822,"Aqua [Water], Sodium coco-sulfate, Cocamidopro...",Shampoo Leaves Strengthening Anti-Age With Gin...
13670,6001051004959,6001051004959,"Aqua, Alcohol Denat., Butyl Methoxydibenzoylme...",Sun Babies & Kids Kids' Sun Spray Spf 50+
13671,5060447940494,5060447940494,"Aqua (Water), Octocrylene, Alcohol Denat., Gly...",Sun Cream Spf 30
13672,4005900870407,4005900870407,"ALPHA-ISOMETHYL IONONE, *******, BUTYL METHOXY...",Sun Oil Spray Tropical Bronze Spf 6


### Cleaning the DataSet

In [3]:
# function for unicode transliteration and lower case for ingredient_list
def clean_text(s):               
    if isinstance(s, str):
        s = unidecode(s)    
        s = s.lower()
        return s

In [4]:
# function for unicode transliteration for product_name 
def unidecode_text(s):          
    if isinstance(s, str):
        s = unidecode(s)    
        return s

In [5]:
# Renaming the columns and adding index column
products = products.drop(columns='Unnamed: 1')
products.rename(columns={'Ingredient List': 'full_ingredient_list'}, inplace=True)
products.rename(columns={'Name': 'product_name'}, inplace=True)
products.reset_index(drop=False, inplace=True)
products.rename(columns={'index': 'productID'}, inplace=True)
products

Unnamed: 0,productID,Barcode,full_ingredient_list,product_name
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...
1,1,8606029266766,Silicone,Velnea Fingering Silik.Lila
2,2,4049639429550,"aqua (water),stearic acid,copernicia cerifera ...","Magic Finish 5-In-1 Hybrid Mascara\nVolume, Le..."
3,3,4011700740291,"butane,alcohol,propane,parfum (fragrance),dipr...",Echt Kï¿½Lnisch Wasser Aerosol Deodorant Spray...
4,4,4011700740475,"sodium palmate,sodium palm kernelate,aqua (wat...",Echt Kï¿½Lnisch Wasser Cream Soap\nMildly Clea...
...,...,...,...,...
13669,13669,8034063521822,"Aqua [Water], Sodium coco-sulfate, Cocamidopro...",Shampoo Leaves Strengthening Anti-Age With Gin...
13670,13670,6001051004959,"Aqua, Alcohol Denat., Butyl Methoxydibenzoylme...",Sun Babies & Kids Kids' Sun Spray Spf 50+
13671,13671,5060447940494,"Aqua (Water), Octocrylene, Alcohol Denat., Gly...",Sun Cream Spf 30
13672,13672,4005900870407,"ALPHA-ISOMETHYL IONONE, *******, BUTYL METHOXY...",Sun Oil Spray Tropical Bronze Spf 6


In [6]:
# Making unicode transliteration, lower case and tokenization 
products['product_name'] = products['product_name'].apply(unidecode_text)
products['ingredient_list'] = products['full_ingredient_list'].apply(clean_text)
products['ingredient_list'] = products['ingredient_list'].str.split(", ")
products.reset_index(drop=True, inplace=True)
products

Unnamed: 0,productID,Barcode,full_ingredient_list,product_name,ingredient_list
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,"[aqua / water, aluminum chlorohydrate, ceteary..."
1,1,8606029266766,Silicone,Velnea Fingering Silik.Lila,[silicone]
2,2,4049639429550,"aqua (water),stearic acid,copernicia cerifera ...","Magic Finish 5-In-1 Hybrid Mascara\nVolume, Le...","[aqua (water),stearic acid,copernicia cerifera..."
3,3,4011700740291,"butane,alcohol,propane,parfum (fragrance),dipr...",Echt Ki? 1/2 Lnisch Wasser Aerosol Deodorant S...,"[butane,alcohol,propane,parfum (fragrance),dip..."
4,4,4011700740475,"sodium palmate,sodium palm kernelate,aqua (wat...",Echt Ki? 1/2 Lnisch Wasser Cream Soap\nMildly ...,"[sodium palmate,sodium palm kernelate,aqua (wa..."
...,...,...,...,...,...
13669,13669,8034063521822,"Aqua [Water], Sodium coco-sulfate, Cocamidopro...",Shampoo Leaves Strengthening Anti-Age With Gin...,"[aqua [water], sodium coco-sulfate, cocamidopr..."
13670,13670,6001051004959,"Aqua, Alcohol Denat., Butyl Methoxydibenzoylme...",Sun Babies & Kids Kids' Sun Spray Spf 50+,"[aqua, alcohol denat., butyl methoxydibenzoylm..."
13671,13671,5060447940494,"Aqua (Water), Octocrylene, Alcohol Denat., Gly...",Sun Cream Spf 30,"[aqua (water), octocrylene, alcohol denat., gl..."
13672,13672,4005900870407,"ALPHA-ISOMETHYL IONONE, *******, BUTYL METHOXY...",Sun Oil Spray Tropical Bronze Spf 6,"[alpha-isomethyl ionone, *******, butyl methox..."


In [7]:
# Exploding the dataframe by ingredients
products = products.explode('ingredient_list')
products

Unnamed: 0,productID,Barcode,full_ingredient_list,product_name,ingredient_list
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aqua / water
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aluminum chlorohydrate
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,cetearyl alcohol
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,ceteareth-33
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,parfum / fragrance
...,...,...,...,...,...
13673,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,charcoal powder
13673,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,parfum (fragrance)
13673,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,limonene
13673,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,linalool


In [8]:
# Remove unclosed brackets
def remove_unclosed_brackets(input_str):
    if input_str is None or not isinstance(input_str, str):
        return input_str

    stack = []
    result = list(input_str)

    for i, char in enumerate(input_str):
        if char in ['(', '[']:
            stack.append(i)
        elif char in [')', ']']:
            if stack:
                stack.pop()
            else:
                result[i] = ' '

    # Replace unclosed open brackets with an empty space
    for index in stack:
        result[index] = ' '

    return ''.join(result)

products['ingredient_list'] = products['ingredient_list'].apply(remove_unclosed_brackets)

In [9]:
ingredients_pattern = r'^(?i)ingredients[^\w\s]*:$'                                                         # brisenje na stringot ingredients: sto se srekjava na pocetok na nekoj od listite so sostojki
products['ingredient_list'] = products['ingredient_list'].str.replace(ingredients_pattern, '')              # brisenje na text posle statement: may contain
products['ingredient_list'] = products['ingredient_list'].str.replace(r'may contain.*$', '', case=False, regex=True).str.strip()

In [10]:
products['ingredient_list'] = products['ingredient_list'].str.replace('[', '(').str.replace(']', ')')       # zamena na aglesti zagradi so obicni
products['ingredient_list'] = products['ingredient_list'].str.replace('\\', '/')                            # zamena na \ so /
products['ingredient_list'] = products['ingredient_list'].str.lstrip(',')                                   # brisenje na ',' ako stringot pocnuva so ','
products['ingredient_list'] = products['ingredient_list'].str.replace(' ,', ', ')                           # zamena na ' ,' so ', '
products['ingredient_list'] = products['ingredient_list'].str.rstrip(string.punctuation.replace(')', ''))   # brisenje punktuacija na kraj na sting, no bez zagradite
products['ingredient_list'] = products['ingredient_list'].str.lstrip('/')                                   # brisenje / na pocetok na string

special_characters = ['*', '$', '?', '!', '@', '}', '{', '--', '>', '<', '~', '&', '=', '"']                # brisenje na specijalnite znaci bilo kade vo stringot 
for char in special_characters:
    products['ingredient_list'] = products['ingredient_list'].astype(str).str.replace(char, ' ')

products['ingredient_list'] = products['ingredient_list'].str.replace('f,i,l,', ' ')
products['ingredient_list'] = products['ingredient_list'].str.replace('f,i,l', ' ')
products['ingredient_list'] = products['ingredient_list'].str.replace('f.i.l ', ' ')
products['ingredient_list'] = products['ingredient_list'].str.replace('f.i.l.', ' ')
products['ingredient_list'] = products['ingredient_list'].str.replace(r'cl (\d{5})', r'ci \1')                # zamena na cl XXXXX so ci XXXXX
products['ingredient_list'] = products['ingredient_list'].replace(r',+', ',', regex=True)                   # brisenje na povekje od edna posledovatelni zapirki
products['ingredient_list'] = products['ingredient_list'].str.strip().str.replace(r'\s+', ' ')              # brisenje na povekje od 1 posledovatelni prazni mesta
products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*/\s*', '/')                      # brisenje na prazni mesta pred i posle '/'
products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*-\s*', '-')                      # brisenje na prazni mesta pred i posle '-'
products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+\.', '.')                        # brisenje na prazno mesto pred '.'
products['ingredient_list'] = products['ingredient_list'].str.replace(r'\(\s*', '(').str.replace(r'\s*\)', ')') # brisenje na prazni mesta posle otvorena zagrada i pred zatvorena zagrada
products['ingredient_list'] = products['ingredient_list'].str.strip()                                       # trim
products = products[~products['ingredient_list'].astype(str).str.match(r'^[\d\W]+$')]                       # brisenje na red koj sodrzi samo broj i punktuacija
products = products[~products['ingredient_list'].str.strip(string.punctuation).eq('')]                      # brisenje na red koj sodrzi samo punktuacija
products = products[~products['ingredient_list'].apply(lambda x: pd.to_numeric(x, errors='coerce')).notna() | (products['ingredient_list'] == '')]  # brisenje na red koj sodrzi samo broj

In [11]:
# Brisenje na mnogu cesti paterni vo tekstot
patterns_to_replace = ['(+/-):', '(+/-)', '(+-)', '(+/)', '+/-:', '+/-', '+-', '+/']

combined_pattern = '|'.join(map(re.escape, patterns_to_replace))
products['ingredient_list'] = products['ingredient_list'].apply(lambda x: re.sub(combined_pattern, '', str(x)))
products = products.dropna(subset=['ingredient_list']).loc[products['ingredient_list'] != '']
products['ingredient_list'] = products['ingredient_list'].str.strip()                                       # trim

In [12]:
# Brisenje na red koj sodrzi pomalku od 3 ili povekje od 70 karakteri
mask_to_drop = (products['ingredient_list'].str.len() < 3) | (products['ingredient_list'].str.len() > 70)   
products = products[~mask_to_drop]
products.reset_index(drop=True, inplace=True)

In [13]:
# Brisenje na prazni stringovi i NaN
products['ingredient_list'] = products['ingredient_list'].str.strip()   
products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+', ' ', regex=True)
products = products.dropna(subset=['ingredient_list'])

In [14]:
# Brisenje na redovi kade ima ingredient sto se vika 'nan', bidejki excel fajlot posle vcituvanje go interpretira kako NaN
products = products[~products['ingredient_list'].eq('nan')]
nan_rows = products['ingredient_list'].isna()
print(nan_rows.sum())

0


In [15]:
products

Unnamed: 0,productID,Barcode,full_ingredient_list,product_name,ingredient_list
0,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aqua/water
1,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,aluminum chlorohydrate
2,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,cetearyl alcohol
3,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,ceteareth-33
4,0,3600542399326,"Aqua / Water, Aluminum Chlorohydrate, Cetearyl...",Garnier Mineral Hyaluronic Care 72H Deodorant ...,parfum/fragrance
...,...,...,...,...,...
229205,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,charcoal powder
229206,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,parfum (fragrance)
229207,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,limonene
229208,13673,8436575091013,"AQUA (WATER), CUCUMIS SATIVUS (CUCUMBER）FRUIT ...",Retinol Cream,linalool


In [16]:
products.to_excel('../Data/products.xlsx')

In [17]:
value_counts = products['ingredient_list'].value_counts()

# Print the first 20 most frequent values and their frequencies
for value, frequency in value_counts.head(60).iteritems():
    print(f"Value: {value}, Frequency: {frequency}")

Value: glycerin, Frequency: 4226
Value: aqua, Frequency: 3232
Value: phenoxyethanol, Frequency: 3204
Value: linalool, Frequency: 2809
Value: citric acid, Frequency: 2781
Value: tocopherol, Frequency: 2753
Value: limonene, Frequency: 2417
Value: silica, Frequency: 2076
Value: dimethicone, Frequency: 2027
Value: parfum, Frequency: 1970
Value: ci 77492, Frequency: 1899
Value: ethylhexylglycerin, Frequency: 1866
Value: sodium benzoate, Frequency: 1857
Value: sodium chloride, Frequency: 1729
Value: mica, Frequency: 1662
Value: caprylyl glycol, Frequency: 1590
Value: benzyl alcohol, Frequency: 1559
Value: geraniol, Frequency: 1424
Value: xanthan gum, Frequency: 1405
Value: butylene glycol, Frequency: 1372
Value: citronellol, Frequency: 1367
Value: panthenol, Frequency: 1361
Value: tocopheryl acetate, Frequency: 1314
Value: ci 77491, Frequency: 1314
Value: sodium hydroxide, Frequency: 1261
Value: alcohol denat, Frequency: 1258
Value: propylene glycol, Frequency: 1250
Value: cocamidopropyl bet