In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')	

### Load Ingredients dataset

In [2]:
# Location of the raw ingredient list; the file is ';'-separated.
FILE_PATH = '../Data/raw/ingredient_w_synonyms.csv'

# read_csv already returns a DataFrame, so no extra wrapping is needed.
ingredients = pd.read_csv(FILE_PATH, sep=';')
ingredients

Unnamed: 0,name,synonym
0,Niacinamide,"Vitamin B3, Nicotinamide, and 3-Pyridinecarbox..."
1,Hyaluronic Acid,Hyaluronan
2,Salicylic Acid,
3,Glycerin,"Vegetable Glycerin, Glycerine, and Glycerol"
4,Retinol,Vitamin A
...,...,...
27638,Dibutyldecyl Ipdi,
27639,Capsella Bursa-Pastoris Sprout Water,
27640,Carboxyethyl Acrylate,
27641,Candelilla Wax Hydrocarbons,


### Cleaning the dataset

In [3]:
# Unicode transliteration and lower-casing for a single synonym value
def clean_text(s):
    """Return ``s`` transliterated with ``unidecode`` and lower-cased.

    Anything that is not a ``str`` (for example NaN or None) is not
    converted; ``None`` is returned for such values.
    """
    if not isinstance(s, str):
        return None
    return unidecode(s).lower()

In [4]:
# Add frequently used ingredients that are missing from the raw database.
missing_ingredients = pd.DataFrame({
    'name': ['Capric/Caprylic Triglycerides'],
    'synonym': ['caprylic capric triglyceride'],
})
ingredients = pd.concat([ingredients, missing_ingredients], ignore_index=True)


In [5]:
# Drop the rows whose name starts with '(' (35 rows) or '[' (1 row); these were
# fragments of a long list in the raw file.
ingredients = ingredients[~ingredients['name'].str.startswith('(')]
# .copy() makes the filtered result an independent frame, so the column
# assignment below is not a write into a slice of the previous frame.
ingredients = ingredients[~ingredients['name'].str.startswith('[')].copy()

# Turn the free-text synonym list into a ';'-separated string:
# ', ' -> ';' and ' and ' -> ';' so that one split tokenises everything.
synonym_text = ingredients["synonym"].replace(to_replace=r", ", value=';', regex=True)
synonym_text = synonym_text.replace(to_replace=r" and ", value=';', regex=True)

# Prefix the generic name so it is always one of the tokens.
synonym_text = ingredients['name'] + ';' + synonym_text

# Rows without a synonym are NaN after the concatenation; fall back to the name.
# Assigned explicitly instead of `ingredients["synonym"].fillna(..., inplace=True)`:
# an in-place fill on a selected column is a chained update that does not
# reach the frame once pandas Copy-on-Write is active.
synonym_text = synonym_text.fillna(ingredients["name"])

# Tokenise: every cell becomes a list of synonym strings.
ingredients["synonym"] = synonym_text.str.split(";")

In [6]:
# Sort alphabetically by name, expose the resulting row position as the
# ingredient identifier, and give the name column its final label.
ingredients = (
    ingredients
    .sort_values(by='name', ignore_index=True)
    .reset_index()
    .rename(columns={'index': 'ingredientID', 'name': 'generic_name'})
)

In [7]:
# One row per (ingredient, synonym) pair: unpack the synonym lists into rows.
ingredients = ingredients.explode(column="synonym")

In [8]:
# Remove a leftover leading "and " (from lists such as "A, B, and C").
# The pattern is anchored to the start of the token: an unanchored "and "
# would also eat the end of words such as "sand " or "iceland ".
synonym = ingredients["synonym"].replace(to_replace=r"^\s*and\s+", value='', regex=True).str.strip()

# Unicode transliteration and lower case.
synonym = synonym.apply(clean_text)

# Replace square brackets with round ones; regex=False keeps '[' and ']' literal
# regardless of the pandas version's default.
synonym = synonym.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)
ingredients["synonym"] = synonym

# Drop rows whose synonym is only a number or an empty string.
# The whole condition is negated: `~a | b` would negate only `a` and keep
# every empty string.
is_numeric = pd.to_numeric(ingredients["synonym"], errors='coerce').notna()
is_empty = ingredients["synonym"] == ''
ingredients = ingredients[~(is_numeric | is_empty)]

In [9]:
# Add rows for the most frequent combined spellings of an ingredient's synonyms

def create_new_rows(generic_name, synonym_values):
    """Build extra synonym rows for an ingredient that is already in the table.

    The ``ingredientID`` is looked up in the module-level ``ingredients``
    frame by an exact match on ``generic_name``; the first match is used.

    Args:
        generic_name: Value to match against ``ingredients['generic_name']``.
        synonym_values: Iterable of synonym strings to attach to the ingredient.

    Returns:
        dict with the keys 'ingredientID', 'generic_name' and 'synonym', each
        holding a list with one entry per synonym (ready for ``pd.DataFrame``).

    Raises:
        IndexError: If ``generic_name`` does not occur in ``ingredients``.
    """
    synonym_values = list(synonym_values)

    # Find the ingredientID based on the generic_name
    matching_ids = ingredients.loc[ingredients['generic_name'] == generic_name, 'ingredientID']
    if matching_ids.empty:
        # Same exception type as the bare .iloc[0] would raise, but it says which name failed.
        raise IndexError(f"generic_name {generic_name!r} not found in ingredients")
    ingredient_id = matching_ids.iloc[0]

    # Create new rows with the found ingredientID
    row_count = len(synonym_values)
    return {
        'ingredientID': [ingredient_id] * row_count,
        'generic_name': [generic_name] * row_count,
        'synonym': synonym_values,
    }

# Frequently seen combined spellings, keyed by the generic name they belong to.
# Keeping them in one table avoids a numbered variable per ingredient.
COMBINED_SYNONYMS = {
    'Water': ['water/aqua/eau', 'water/eau', 'water/aqua', 'water aqua', 'water eau', 'water/eau (aqua)', 'aqua water', 'aqua (water eau)',
              'water (aqua/eau)', 'water (aqua)', 'aqua (water)', 'aqua/water/eau', 'aqua/water', 'aqua water(eau)', 'aqua water water', 'eau)'],
    'Parfum': ['fragrance (fragrance)', 'fragrance (parfum)', 'fragrance/parfum', 'fragrance(parfum)', 'fragrance parfum',
               'parfum fragrance', 'perfum', 'perfume', 'parfum/fragrance', 'perfume fragrance', 'parfum (fragrance)', 'perfum fragrance', 'perfum (fragrance)'],
    'Aroma': ['aroma (flavor)', 'aroma/flavor', 'aromatics'],
    'Ci 77492': ['iron oxides ci 77492', 'iron oxides (ci 77492)', 'ci 77492 iron oxides', 'ci 77492 (iron oxides)'],
    'Ci 77491': ['iron oxides ci 77491', 'iron oxides (ci 77491)', 'red iron oxide ci 77491'],
    'Ci 77499': ['iron oxides ci 77499', 'iron oxides (ci 77499)', 'ci 77499 (iron oxides)'],
    'Ci 77742': ['manganese violet ci 77742', 'manganese violet (ci 77742)', 'ci 77742 (manganese violet)', 'ci 77742 manganese violet', 'ci 77742/manganese violet'],
    'Ci 77891': ['titanium dioxide ci 77891', 'titanium dioxide/ci 77891', 'titanium dioxides ci 77891', 'ci 77891 (titanium dioxide)', 'ci 77891 (titanium dioxides)', 'ci 77891 titanium dioxide', 'ci 77891/titanium dioxide'],
    'Ci 77510': ['ferric ferrocyanide ci 77510', 'ferric ferrocyanide (ci 77510)', 'ferric ammonium ferrocyanide (ci 77510)', 'ci 77510/ferric ammonium ferrocyanide'],
    'Ci 77007': ['ultramarines ci 77007', 'ultramarines (ci 77007)'],
    'Ci 19140': ['yellow 5 ci 19140', 'yellow 5 (ci 19140)', 'yellow 5 lake ci 19140', 'yellow no. 5 ci 19140', 'fd&c yellow no. 5 aluminum lake (ci 19140)'],
    'Ci 75470': ['ci 75470 carmine', 'carmine (ci 75470)', 'ci 75470/carmine'],
    'Ci 15880': ['d&c red no. 34 calcium lake (ci 15880)', 'red 34 lake (ci 15880)', 'ci 15880/red 34 lake'],
    'Ci 45410': ['ci 45410 (red 28 lake)'],
    'Ci 77000': ['aluminum (ci 77000)', 'aluminum powder (ci 77000)'],
    'Ci 15850': ['ci 15850 (red 7 lake)', 'ci 15850 (red 7)', 'ci 15850 (red 6)'],
    'Ci 42090': ['42090 (blue 1)', '42090 (blue 1 lake)'],
    'Carbon Black': ['ci 77266', 'ci 77266 (nano) black 2', 'ci 77266 nano (black 2)', 'ci 77266 (nano)', 'ci 77266 (black 2) (nano)', 'ci 77266 (nano)/black 2', 'ci 77266 black 2', 'ci 77266 (black 2)'],
    'Mica': ['ci 77019', 'mica (ci 77019)'],
    'Beeswax': ['beeswax (cera alba)', 'cera alba beeswax', 'beeswax/cera alba', 'cera alba/beeswax', 'cera alba/beeswax/cire d"abeille'],
    'Limonene': ['lemonene'],
}

# Resolve every ingredientID first (all lookups run against the table as it is
# before any row is appended), in the order the entries are listed above.
new_rows_list = [
    create_new_rows(generic_name, synonym_values)
    for generic_name, synonym_values in COMBINED_SYNONYMS.items()
]

# Append everything with a single concat; concatenating inside a loop copies
# the whole table once per entry.
ingredients = pd.concat(
    [ingredients] + [pd.DataFrame(new_rows) for new_rows in new_rows_list],
    ignore_index=True,
)

In [10]:
# # Adding new rows for most common misspellings
# new_rows21 = create_new_rows('Alcohol Denat.', ['alchol dent', 'alcoholdenat', 'alcoholdent', 'alchol', 'alkohol'])
# new_rows22 = create_new_rows('Alpha-Isomethyl Ionone', ['alpha, isomethyl ionon', 'alpha, isomethyl ionone', 'alpha isomethyl ionone', 'alpha,isomethyl ionon', 'alpha-isomethylionone'])

# new_rows_list = [new_rows21, new_rows22]
# for new_rows in new_rows_list:
#     ingredients = ingredients.append(pd.DataFrame(new_rows), ignore_index=True)

In [11]:
# Remove empty strings and NaN
# Trim the synonyms and collapse runs of whitespace into a single space first,
# so that whitespace-only values become empty strings.
ingredients["synonym"] = (
    ingredients["synonym"]
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Deleting NaN (the number of NaN synonyms is printed before they are dropped)
nan_rows = ingredients['synonym'].isna()
print(nan_rows.sum())
ingredients = ingredients.dropna(subset=['synonym'])

# Deleting empty strings: dropna() does not treat '' as missing, so these
# rows have to be filtered out explicitly.
ingredients = ingredients[ingredients['synonym'] != '']

0


In [12]:
# Show the rows whose synonym repeats an earlier row, i.e. the rows about to be removed.
repeated_synonym = ingredients['synonym'].duplicated()
print(ingredients.loc[repeated_synonym])

# Keep only the first occurrence of every synonym.
ingredients = ingredients.loc[~repeated_synonym]

# Check: this selection is empty when no duplicated synonym is left.
ingredients.loc[ingredients['synonym'].duplicated()]

       ingredientID                       generic_name  \
2237           2144                   Arachidonic Acid   
5302           5040          Centella Asiatica Extract   
5325           5060                          Cera Alba   
5349           5075                        Ceramide NP   
5354           5076                        Ceramide Ng   
6638           6250                               Clay   
6741           6345                          Cochineal   
6915           6509                  Colloidal Oatmeal   
9553           9045             Ethyl 2-Methylbutyrate   
9594           9086                    Ethyl Cinnamate   
9976           9444     Euterpe Oleracea Fruit Extract   
10447          9869           Gelidium Amansii Extract   
11674         11057           Hippophae Rhamnoides Oil   
14470         13768           Lawsonia Inermis Extract   
15025         14286                Magnesium Gluconate   
15060         14321                 Magnesium Silicate   
15143         

Unnamed: 0,ingredientID,generic_name,synonym


In [13]:
# pd.options.display.max_rows=1000   
# display(ingredients.tail(1000))

In [14]:
# Order the rows by ingredient id and renumber the index from 0.
ingredients = ingredients.sort_values(by='ingredientID').reset_index(drop=True)

In [15]:
# Reference ingredient table; the file is read with Latin-1 encoding.
ORIGINAL_PATH = '../Data/raw/_Ingredient__13.10.23.csv'
original = pd.read_csv(ORIGINAL_PATH, encoding='latin1')

#original

In [16]:
# Copy the hazard flags from the reference table onto the synonym rows by
# matching each synonym against the lower-cased reference name.
original['Name'] = original['Name'].str.lower()

columns_to_update = ['Carcinogens', 'EndocrineDisruptors', 'Allergen', 'SkinIrritant']

# Keep one reference row per name (the first one). A name that occurs twice
# would make the left merge emit two rows for the matching synonym, and the
# flags copied back below would then be shifted onto the wrong ingredients.
flag_lookup = original.drop_duplicates(subset='Name')[['Name'] + columns_to_update]

# Flag columns left over from an earlier run of this cell are dropped first so
# the merge does not produce suffixed duplicates of them.
merged_df = pd.merge(
    ingredients.drop(columns=columns_to_update, errors='ignore'),
    flag_lookup,
    left_on='synonym',
    right_on='Name',
    how='left',
    validate='many_to_one',
)
assert len(merged_df) == len(ingredients), "left merge changed the number of rows"

# The merge keeps the row order of `ingredients`, so the values are assigned
# by position rather than relying on the two indexes happening to match.
for column in columns_to_update:
    ingredients[column] = merged_df[column].to_numpy()

#ingredients

In [17]:
#display(ingredients[ingredients['Allergen'] == True])

In [18]:
#display(ingredients[ingredients['Carcinogens'] == True])

In [19]:
#display(ingredients[ingredients['EndocrineDisruptors'] == True])

In [20]:
#display(ingredients[ingredients['SkinIrritant'] == True])

In [21]:
# A missing value in a flag column is treated as "not flagged".
for flag_column in ('Allergen', 'Carcinogens', 'EndocrineDisruptors', 'SkinIrritant'):
    ingredients[flag_column] = ingredients[flag_column].fillna(False)


In [22]:
# Names that mark a row as forbidden during pregnancy; they are compared
# against the lower-cased synonym and must match it exactly.
forbidden_ingredients = [
    'retin-a', 'retinol', 'retinyl palmitate', 'tretinoin', 'benzoyl peroxide',
    'salicylic acid', 'hydroquinone', 'aluminum chloride', 'formaldehyde',
    'tetracycline', 'dihydroxyacetone',
]

# NOTE(review): only the row whose synonym equals a listed name is flagged;
# other synonym rows of the same ingredientID stay False - confirm this is intended.
is_forbidden = ingredients['synonym'].str.lower().isin(forbidden_ingredients)
ingredients['Forbidden during pregnancy'] = is_forbidden

In [23]:
# Show the rows flagged as forbidden during pregnancy.
forbidden_mask = ingredients['Forbidden during pregnancy'] == True
display(ingredients.loc[forbidden_mask])

Unnamed: 0,ingredientID,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant,Forbidden during pregnancy
1642,1582,Aluminum Chloride,aluminum chloride,False,False,False,False,True
3231,3065,Benzoyl Peroxide,benzoyl peroxide,False,False,False,False,True
8322,7806,Dihydroxyacetone,dihydroxyacetone,False,False,False,False,True
10216,9619,Formaldehyde,formaldehyde,False,False,False,False,True
12606,11922,Hydroquinone,hydroquinone,False,False,False,False,True
22980,21956,Retinol,retinol,False,False,False,False,True
23000,21975,Retinyl Palmitate,retinyl palmitate,False,False,False,False,True
23864,22786,Salicylic Acid,salicylic acid,False,False,False,False,True
27428,26248,Tretinoin,tretinoin,False,False,False,False,True


In [24]:
# Display the finished synonym table with its flag columns.
ingredients

Unnamed: 0,ingredientID,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant,Forbidden during pregnancy
0,0,"1,1-Dimethyl-2-Phenylethyl Isobutyrate","1,1-dimethyl-2-phenylethyl isobutyrate",False,False,False,False,False
1,1,"1,1-Dimethyl-3-Phenylpropyl Isobutyrate","1,1-dimethyl-3-phenylpropyl isobutyrate",False,False,False,False,False
2,2,"1,10-Decanediol","1,10-decanediol",False,False,False,False,False
3,3,"1,2,3,4,4a,5,6,7-Octahydro-2,5,5-Trimethyl-2-N...","1,2,3,4,4a,5,6,7-octahydro-2,5,5-trimethyl-2-n...",False,False,False,False,False
4,4,"1,2,3,4,4a,7,8,8a-Octahydro-2,4a,5,8a-Tetramet...","1,2,3,4,4a,7,8,8a-octahydro-2,4a,5,8a-tetramet...",False,False,False,False,False
...,...,...,...,...,...,...,...,...
28881,27603,Zygophyllum Qatarense Leaf/Stem Extract,zygophyllum qatarense leaf/stem extract,False,False,False,False,False
28882,27604,Zygosaccharomyces Microellipsoides Ferment,zygosaccharomyces microellipsoides ferment,False,False,False,False,False
28883,27605,Zygosaccharomyces/Apple Fruit/Papaya Fruit/Pin...,zygosaccharomyces/apple fruit/papaya fruit/pin...,False,False,False,False,False
28884,27606,Zymomonas Ferment Extract,zymomonas ferment extract,False,False,False,False,False


In [25]:
# Write the finished table to disk.
# NOTE(review): with to_csv defaults the row index is written as an unnamed
# first column - confirm that the consumers of this file expect it.
OUTPUT_PATH = '../Data/ingredients.csv'
ingredients.to_csv(OUTPUT_PATH)