In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode

### Load Ingredients

In [2]:
# Path to the raw, semicolon-separated ingredients CSV.
FILE_PATH = '../Data/raw/skinsort_ingredients_20231202.csv'

# Read the file, order rows by the `page_number` column, then discard that
# column and rename `name` to `generic_name`.
df_ingredients = (
    pd.read_csv(FILE_PATH, sep=';', index_col=False)
    .sort_values(by=["page_number"])
    .drop(columns=["page_number"])
    .rename(columns={'name': 'generic_name'})
)

df_ingredients

Unnamed: 0,generic_name,synonym
4608,Niacinamide,Niacinamide|Vitamin B3|Nicotinamide|3-Pyridine...
4631,Alpha-Arbutin,Alpha-Arbutin
4630,Camellia Sinensis Leaf Extract,Camellia Sinensis Leaf Extract|Green Tea|Oolon...
4629,Tocopherol,Tocopherol|Vitamin E
4628,Citric Acid,Citric Acid
...,...,...
27625,Butylethylpropanediol Dimer Dilinoleate,Butylethylpropanediol Dimer Dilinoleate
27624,Butyl Isovalerate,Butyl Isovalerate
27641,Capsella Bursa-Pastoris Sprout Water,Capsella Bursa-Pastoris Sprout Water
27632,C9-11 Pareth-8,C9-11 Pareth-8


### Clean DataSet

In [3]:
def clean_text(s):
    """Normalise a text value for case- and accent-insensitive matching.

    Parameters
    ----------
    s : object
        Value to normalise. Only ``str`` instances are processed.

    Returns
    -------
    str or None
        The ASCII-transliterated (via ``unidecode``), lower-cased string when
        ``s`` is a ``str``; ``None`` for any other type.
    """
    # Anything that is not a string (NaN, numbers, None, ...) yields None.
    if not isinstance(s, str):
        return None
    # e.g. "è" -> "e", then fold to lower case.
    return unidecode(s).lower()

In [4]:
# I don't think this will be needed. If it turns out that many compounds are missing, we can come back and extract the ingredients from the rows that hold a lot of data.
 
# def process_ingredient_list(text):
#     if pd.notna(text) and (len(text) > 30 or '/' in text):
#         return text.replace('/', '; ')
#     else:
#         return text

# ingredients['synonym'] = ingredients['synonym'].apply(process_ingredient_list)

In [5]:
# Drop rows that have a missing value in any column.
df_ingredients = df_ingredients.dropna()

# Discard rows whose generic name starts with "(" or "[".
df_ingredients = df_ingredients[~(df_ingredients["generic_name"].str.startswith("(") | df_ingredients["generic_name"].str.startswith("["))]

# Normalise the separators inside each synonym string to "|".
# NOTE: Series.replace only substitutes *inside* strings when regex=True;
# without it, only cells whose whole value equals the pattern are replaced.
# The result is assigned back instead of using a chained
# `df[col].replace(..., inplace=True)`, which acts on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
synonym_lists = (
    df_ingredients["synonym"]
    # "1, 3, 4-Octadecanetrio" -> "1,3,4-Octadecanetrio": keep numeric locants together
    .replace(to_replace=r"(?<=\d),\s+(?=\d)", value=",", regex=True)
    .replace(to_replace=", ", value="|", regex=True)
    .replace(to_replace=" and ", value="|", regex=True)
    .str.split("|")  # string -> list of individual synonyms
)

# One row per (generic_name, synonym) pair.
df_ingredients = df_ingredients.assign(synonym=synonym_lists).explode("synonym")

# Trim surrounding whitespace and drop a leading "and " left over from lists
# such as "a, b, and c". The pattern is anchored to the start so that words
# merely ending in "and" are not damaged.
df_ingredients["synonym"] = (
    df_ingredients["synonym"]
    .str.strip()
    .replace(to_replace=r"^and\s+", value="", regex=True)
)

# ingredients = ingredients[~ingredients['name'].str.startswith('(')]  # delete the 36 rows that contained a long list
# ingredients = ingredients[~ingredients['name'].str.startswith('[')]
# ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r", ", value=';', regex=True)
# ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r" and ", value=';', regex=True)
# ingredients["synonym"] = ingredients['name'] + ';' + ingredients['synonym']
# ingredients["synonym"].fillna(ingredients["name"], inplace=True)
# ingredients["synonym"] = ingredients["synonym"].str.split(";")
# ingredients = ingredients.explode("synonym")
# ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r"and ", value='', regex=True).str.strip()
# ingredients["synonym"] = ingredients["synonym"].apply(clean_text)
# ingredients.rename(columns={'name': 'generic_name'}, inplace=True)
# ingredients

In [6]:
def _alias_rows(generic_name, synonyms):
    """Build a frame mapping every entry of `synonyms` to `generic_name`."""
    return pd.DataFrame({'generic_name': [generic_name] * len(synonyms), 'synonym': synonyms})


# Hand-written spelling variants that should resolve to a single generic name.
# Entries containing backslashes are raw strings: in a normal literal "\a" is
# the BEL control character and "\ " / "\e" are invalid escape sequences, so
# 'water\aqua\eau' did not hold the text it appears to.
new_rows1 = _alias_rows('Water', [
    'water/aqua/eau', 'water/eau', 'water/aqua', r'water \ aqua \ eau', 'water aqua', 'water eau',
    'water/eau (aqua)', 'aqua(water', r'water\aqua\eau', 'water (aqua / eau)', 'aqua (water)',
    'aqua/water/eau', 'aqua/water', 'aqua(water(eau', 'aqua(water', 'aqua(water(water', 'eau)',
])

new_rows2 = _alias_rows('Parfum', [
    'fragrance (fragrance)', 'fragrance (parfum)', 'fragrance / parfum', 'fragrance(parfum)',
    'fragrance/parfum', 'parfum (fragrance)', 'parfum / fragrance', 'parfum fragrance',
    'parfum/ fragrance', 'perfum', 'perfum   fragrance', 'perfum (fragrance)', 'perfum fragrance',
    'perfume',
])

new_rows3 = _alias_rows('Aroma', ['aroma (flavor)', 'aroma/flavor', 'aromatics'])

# Append all manual rows in one pass; the index is rebuilt from 0.
df_ingredients = pd.concat([df_ingredients, new_rows1, new_rows2, new_rows3], ignore_index=True)

In [7]:
# Normalise every synonym with clean_text (ASCII transliteration + lower case).
df_ingredients["synonym"] = df_ingredients["synonym"].map(clean_text)

# Keep only the first row for each distinct synonym and renumber the index from 0.
df_ingredients = (
    df_ingredients
    .drop_duplicates(subset='synonym')
    .reset_index(drop=True)
)

In [8]:
# Write the cleaned table as a semicolon-separated CSV, without the index column.
OUTPUT_PATH = '../Data/ingredients.csv'
df_ingredients.to_csv(OUTPUT_PATH, sep=";", index=False)