In [36]:
import pandas as pd
from unidecode import unidecode

### Load Ingredients

In [37]:
FILE_PATH = '../Data/raw/ingredient_w_synonyms.csv'
df_ingredients = pd.DataFrame(pd.read_csv(FILE_PATH, sep=';'))
df_ingredients.drop(labels=["page_number"], axis=1, inplace=True) #drop unnecessary columns

FILE_PATH = '../Data/raw/additional_ingredients.csv'
df_additional_ingredients = pd.DataFrame(pd.read_csv(FILE_PATH, sep=';'))

df_ingredients = pd.concat([df_ingredients, df_additional_ingredients])

df_ingredients.reset_index(drop=True,inplace=True)
df_ingredients.rename(columns={'name': 'generic_name'}, inplace=True)

df_ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,Vitamin B3|Nicotinamide|3-Pyridinecarboxamide
1,Hyaluronic Acid,Hyaluronic Acid|Hyaluronan
2,Salicylic Acid,Salicylic Acid
3,Glycerin,Glycerin|Vegetable Glycerin|Glycerine|Glycerol
4,Retinol,Retinol|Vitamin A
...,...,...
27631,Beeswax,beeswax (cera alba)|cera alba beeswax|beeswax/...
27632,Limonene,lemonene
27633,Capric/Caprylic Triglycerides,caprylic capric triglyceride
27634,Alcohol Denat.,alchol dent|alcoholdenat|alcoholdent|alchol|al...


### Clean DataSet

In [38]:
def clean_text(s):
    # Check if s is a string
    if isinstance(s, str):
        s = unidecode(s)  # è -> e (unicode transliteration)
        s = s.lower()
        return s

In [39]:
# MISLAM NEMA DA TREBA, EVENTUALNO AKO IZLEZAT MN SOEDINENIJA STO NI FALAT, DA SE NAVRATIME I DA GI IZDVOIME SOSTOJ. OD REDICITE SO MN.PODATOCI
 
# def process_ingredient_list(text):
#     if pd.notna(text) and (len(text) > 30 or '/' in text):
#         return text.replace('/', '; ')
#     else:
#         return text

# ingredients['synonym'] = ingredients['synonym'].apply(process_ingredient_list)

In [40]:
df_ingredients = df_ingredients.dropna() #drop empty rows

df_ingredients = df_ingredients[~(df_ingredients["generic_name"].str.startswith("(") | df_ingredients["generic_name"].str.startswith("["))] #delete unnecessary rows

df_ingredients["synonym"].replace(to_replace=r"(?<=\d),\s+(?=\d)", regex=True, value=',', inplace=True) #example 1, 3, 4-Octadecanetrio to 1,3,4-Octadecanetrio
df_ingredients["synonym"].replace(to_replace=", ", value='|', inplace=True)
df_ingredients["synonym"].replace(to_replace=" and ", value='|', inplace=True)
df_ingredients["synonym"].replace(to_replace="and ", value='', inplace=True)
df_ingredients["synonym"].fillna(df_ingredients["generic_name"], inplace=True)                                            # ako nema synoym, da go stavi generickoto ime
df_ingredients["synonym"] = df_ingredients["synonym"].str.split("|") #split synonyms from string to array of strings
df_ingredients['synonym'] = df_ingredients.apply(lambda x: x["synonym"] if x["generic_name"] in x["synonym"] else [x["generic_name"]] + x["synonym"], axis=1)

df_ingredients = df_ingredients.explode("synonym")

df_ingredients["generic_name"] = df_ingredients["generic_name"].str.strip()
df_ingredients["synonym"] = df_ingredients["synonym"].str.strip()

df_ingredients["synonym"] = df_ingredients["synonym"].str.replace(r'\s+', ' ', regex=True)

df_ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,Niacinamide
0,Niacinamide,Vitamin B3
0,Niacinamide,Nicotinamide
0,Niacinamide,3-Pyridinecarboxamide
1,Hyaluronic Acid,Hyaluronic Acid
...,...,...
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionone"
27635,Alpha-Isomethyl Ionone,alpha isomethyl ionone
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"


In [41]:
df_ingredients["synonym"] = df_ingredients["synonym"].apply(clean_text)

df_ingredients.drop_duplicates(subset='synonym', inplace=True)

print(df_ingredients[df_ingredients['synonym'].duplicated()])

df_ingredients

Empty DataFrame
Columns: [generic_name, synonym]
Index: []


Unnamed: 0,generic_name,synonym
0,Niacinamide,niacinamide
0,Niacinamide,vitamin b3
0,Niacinamide,nicotinamide
0,Niacinamide,3-pyridinecarboxamide
1,Hyaluronic Acid,hyaluronic acid
...,...,...
27634,Alcohol Denat.,alkohol
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionone"
27635,Alpha-Isomethyl Ionone,alpha isomethyl ionone


In [42]:
their = pd.read_csv('../Data/raw/_Ingredient__13.10.23.csv', encoding='latin1')

their

Unnamed: 0,IngredientIdentifier,Name,Description,CASCODE,ECHA_LINK,EntityId,CategoryId,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant,Id,CreatedBy,CreatedOn,LastModifiedBy,LastModifiedOn,IsActive,column3
0,G00001,Saccharomyces/Leuconostoc/Apple Fruit/Carrot R...,,,,,1.0,False,False,False,False,1.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
1,G00002,Lactobacillus/Centella Asiatica/Gleditsia Sine...,,,,,1.0,False,False,False,False,2.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
2,G00003,Bacillus/Cordyceps Sinensis/Ganoderma Lucidum/...,,,,,1.0,False,False,False,False,3.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
3,G00004,Ziziphus Spina-Christi Leaf,,Jujube leaves,,,1.0,False,False,False,False,4.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
4,G00005,Zingiber Officinale Water,,84696-15-1 - Ginger water,,,1.0,False,False,False,False,5.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14695,G09915,Lysine Carboxymethyl Cysteinate,,,,,1.0,False,False,False,False,14696.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
14696,G09916,Lysine Thiazolidine Carboxylate,,,,,1.0,False,False,False,False,14697.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
14697,G09917,Palmitoyl Myristyl Serinate,,,,,1.0,False,False,False,False,14698.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,
14698,G09918,Piperonyl Glucoside,,,,,1.0,False,False,False,False,14699.0,system_user,2023-02-12 21:07:00.000 +0100,system_user,2023-02-12 21:07:00.000 +0100,True,


In [43]:
their['Name'] = their['Name'].str.lower()
merged_df = pd.merge(df_ingredients, their, left_on='synonym', right_on='Name', how='left')

columns_to_update = ['Carcinogens', 'EndocrineDisruptors', 'Allergen', 'SkinIrritant']
for column in columns_to_update:
    df_ingredients[column] = merged_df[column]

df_ingredients


Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
0,Niacinamide,niacinamide,False,False,False,False
0,Niacinamide,vitamin b3,False,False,False,False
0,Niacinamide,nicotinamide,False,False,False,False
0,Niacinamide,3-pyridinecarboxamide,False,False,False,False
1,Hyaluronic Acid,hyaluronic acid,,,,
...,...,...,...,...,...,...
27634,Alcohol Denat.,alkohol,,,,
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon",,,,
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionone",,,,
27635,Alpha-Isomethyl Ionone,alpha isomethyl ionone,,,,


In [44]:
display(df_ingredients[df_ingredients['Allergen'] == True])

Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
349,Hexylresorcinol,hexylresorcinol,False,False,True,False
779,Alteromonas Ferment Extract,alteromonas ferment extract,False,False,True,False
790,Sodium Cocoyl Glutamate,sodium cocoyl glutamate,False,False,True,False
930,Crambe Abyssinica Seed Oil,crambe abyssinica seed oil,False,False,True,True
1300,Borago Officinalis Extract,borago officinalis extract,False,False,True,False
1423,Origanum Vulgare Oil,origanum vulgare oil,False,False,True,False
1423,Origanum Vulgare Oil,oregano select,False,False,True,False
1423,Origanum Vulgare Oil,oregano oil,False,False,True,False
1853,Hydrogenated Olive Oil,hydrogenated olive oil,False,False,True,False
1856,Potassium Cocoate,potassium cocoate,True,False,True,False


In [45]:
display(df_ingredients[df_ingredients['Carcinogens'] == True])

Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
154,Honey Extract,honey extract,True,True,False,False
180,Centella Asiatica Meristem Cell Culture Extract,centella asiatica meristem cell culture extract,True,True,False,False
181,Sea Water,sea water,True,True,False,False
187,Ectoin,ectoin,True,False,False,False
205,Saccharomyces/Rice Ferment Filtrate,saccharomyces/rice ferment filtrate,True,False,False,False
290,Acetyl Tetrapeptide-5,acetyl tetrapeptide-5,True,False,False,False
810,Coco-Caprylate/Caprate,coco-caprylate/caprate,True,False,False,False
1856,Potassium Cocoate,potassium cocoate,True,False,True,False
1959,Bacillus/Turmeric Root/Soybean Ferment Filtrate,bacillus/turmeric root/soybean ferment filtrate,True,True,False,False
3121,Butyloctyl Palmitate,butyloctyl palmitate,True,True,False,False


In [46]:
display(df_ingredients[df_ingredients['EndocrineDisruptors'] == True])

Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
154,Honey Extract,honey extract,True,True,False,False
180,Centella Asiatica Meristem Cell Culture Extract,centella asiatica meristem cell culture extract,True,True,False,False
181,Sea Water,sea water,True,True,False,False
202,Glyceryl Stearate,glyceryl stearate,False,True,False,False
202,Glyceryl Stearate,glyceryl stearate nse,False,True,False,False
202,Glyceryl Stearate,glyceryl stearate gms-nse,False,True,False,False
727,Hibiscus Sabdariffa Flower Extract,hibiscus sabdariffa flower extract,False,True,False,False
913,Hexylglycerin,hexylglycerin,False,True,False,False
1113,Nasturtium Officinale Extract,nasturtium officinale extract,False,True,False,False
1206,Hydroxypropyl Cyclodextrin,hydroxypropyl cyclodextrin,False,True,False,False


In [47]:
display(df_ingredients[df_ingredients['SkinIrritant'] == True])

Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant
325,Betula Alba Juice,betula alba juice,False,False,False,True
325,Betula Alba Juice,birch juice,False,False,False,True
446,Azulene,azulene,False,False,False,True
631,Aspergillus Ferment,aspergillus ferment,False,False,False,True
890,Peg-200 Hydrogenated Glyceryl Palmate,peg-200 hydrogenated glyceryl palmate,False,False,False,True
...,...,...,...,...,...,...
24015,Ppg-4 Laureth-7,ppg-4 laureth-7,False,False,False,True
24140,Polyurethane-42,polyurethane-42,False,False,False,True
24698,Peg-14 Laurate,peg-14 laurate,False,False,False,True
26554,Choleth-30,choleth-30,False,False,False,True


In [48]:
df_ingredients.to_excel('../Data/ingredients.xlsx', index=False)

In [49]:
df_ingredients['Allergen'] = df_ingredients['Allergen'].fillna(False)


In [50]:
df_ingredients['Carcinogens'] = df_ingredients['Carcinogens'].fillna(False)

In [51]:
df_ingredients['EndocrineDisruptors'] = df_ingredients['EndocrineDisruptors'].fillna(False)

In [52]:
df_ingredients['SkinIrritant'] = df_ingredients['SkinIrritant'].fillna(False)

In [53]:
forbidden_ingredients = ['retin-a', 'retinol', 'retinyl palmitate', 'tretinoin', 'benzoyl peroxide',
                          'salicylic acid', 'hydroquinone', 'aluminum chloride', 'formaldehyde', 
                          'tetracycline', 'dihydroxyacetone']

df_ingredients['Forbidden during pregnancy'] = False

df_ingredients.loc[df_ingredients['synonym'].str.lower().isin(forbidden_ingredients), 'Forbidden during pregnancy'] = True


In [54]:
display(df_ingredients[df_ingredients['Forbidden during pregnancy'] == True])

Unnamed: 0,generic_name,synonym,Carcinogens,EndocrineDisruptors,Allergen,SkinIrritant,Forbidden during pregnancy
2,Salicylic Acid,salicylic acid,False,False,False,False,True
4,Retinol,retinol,False,False,False,False,True
14,Tretinoin,tretinoin,False,False,False,False,True
15,Benzoyl Peroxide,benzoyl peroxide,False,False,False,False,True
91,Hydroquinone,hydroquinone,False,False,False,False,True
131,Retinyl Palmitate,retinyl palmitate,False,False,False,False,True
3016,Formaldehyde,formaldehyde,False,False,False,False,True
4274,Dihydroxyacetone,dihydroxyacetone,False,False,False,False,True
7453,Aluminum Chloride,aluminum chloride,False,False,False,False,True


In [55]:
#df_ingredients.to_excel('../Data/ingredients.xlsx', index=False)