In [292]:
import pandas as pd
from unidecode import unidecode

### Load Ingredients

In [293]:
# Main ingredient list; the page_number column is not used downstream.
FILE_PATH = '../Data/raw/ingredient_w_synonyms.csv'
df_ingredients = pd.read_csv(FILE_PATH, sep=';').drop(columns=["page_number"])

# Additional ingredients kept in a separate file.
FILE_PATH = '../Data/raw/additional_ingredients.csv'
df_additional_ingredients = pd.read_csv(FILE_PATH, sep=';')

# Stack both sources, renumber the rows from 0 and rename "name" to "generic_name".
df_ingredients = (
    pd.concat([df_ingredients, df_additional_ingredients])
    .reset_index(drop=True)
    .rename(columns={'name': 'generic_name'})
)

df_ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,Vitamin B3|Nicotinamide|3-Pyridinecarboxamide
1,Hyaluronic Acid,Hyaluronic Acid|Hyaluronan
2,Salicylic Acid,Salicylic Acid
3,Glycerin,Glycerin|Vegetable Glycerin|Glycerine|Glycerol
4,Retinol,Retinol|Vitamin A
...,...,...
27631,Beeswax,beeswax (cera alba)|cera alba beeswax|beeswax/...
27632,Limonene,lemonene
27633,Capric/Caprylic Triglycerides,caprylic capric triglyceride
27634,Alcohol Denat.,alchol dent|alcoholdenat|alcoholdent|alchol|al...


### Clean DataSet

In [294]:
def clean_text(s):
    """Normalise a synonym for accent- and case-insensitive matching.

    Strings are transliterated to ASCII with ``unidecode`` (e.g. "è" -> "e")
    and then lower-cased. Any non-string value (None, NaN, numbers, ...) is
    returned unchanged instead of silently being replaced by ``None``.

    Parameters
    ----------
    s : object
        Value to clean.

    Returns
    -------
    object
        The cleaned string, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        # Pass non-strings through so missing values stay recognisable and
        # other values are not destroyed by an implicit ``return None``.
        return s
    return unidecode(s).lower()

In [295]:
# NOTE: Probably not needed. If it turns out that many compounds are missing, come back to this and split out the individual ingredients from the rows that contain a lot of data.
 
# def process_ingredient_list(text):
#     if pd.notna(text) and (len(text) > 30 or '/' in text):
#         return text.replace('/', '; ')
#     else:
#         return text

# ingredients['synonym'] = ingredients['synonym'].apply(process_ingredient_list)

In [296]:
# Keep every row that has a generic name. Rows without a synonym are kept on
# purpose: their generic name becomes their only synonym further down.
# (A plain dropna() here would remove them and make the fillna below dead code.)
df_ingredients = df_ingredients.dropna(subset=["generic_name"])

# Delete unnecessary rows: those whose generic name starts with "(" or "[".
starts_with_bracket = df_ingredients["generic_name"].str[:1].isin(["(", "["])
df_ingredients = df_ingredients.loc[~starts_with_bracket].copy()

df_ingredients["generic_name"] = df_ingredients["generic_name"].str.strip()

# Turn the free-text synonym field into a "|"-separated list.
# Series.str.replace is used because Series.replace without regex=True only
# matches whole cell values, so substring separators would never be replaced.
# Working on a new Series also avoids inplace edits on a column selection.
synonym_text = (
    df_ingredients["synonym"]
    # Example: "1, 3, 4-Octadecanetriol" -> "1,3,4-Octadecanetriol", so the
    # commas inside chemical locants are not treated as separators below.
    .str.replace(r"(?<=\d),\s+(?=\d)", ",", regex=True)
    .str.replace(", ", "|", regex=False)
    .str.replace(" and ", "|", regex=False)
    # Drop a leading "and " left over from lists such as "a, b, and c".
    # Anchored to the start of a synonym so words like "sand " stay intact.
    .str.replace(r"(^|\|)and ", r"\1", regex=True)
    # If there is no synonym, use the generic name.
    .fillna(df_ingredients["generic_name"])
)

# Split the synonyms from one string into a list of strings. Each synonym is
# stripped and has runs of whitespace collapsed to a single space, empty
# entries are dropped, and the generic name is added when it is not listed.
synonym_lists = []
for generic_name, text in zip(df_ingredients["generic_name"], synonym_text):
    synonyms = [" ".join(part.split()) for part in text.split("|")]
    synonyms = [synonym for synonym in synonyms if synonym]
    if generic_name not in synonyms:
        synonyms = [generic_name, *synonyms]
    synonym_lists.append(synonyms)

# One row per (generic_name, synonym) pair.
df_ingredients = df_ingredients.assign(
    synonym=pd.Series(synonym_lists, index=df_ingredients.index, dtype=object)
).explode("synonym")

df_ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,Niacinamide
0,Niacinamide,Vitamin B3
0,Niacinamide,Nicotinamide
0,Niacinamide,3-Pyridinecarboxamide
1,Hyaluronic Acid,Hyaluronic Acid
...,...,...
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionone"
27635,Alpha-Isomethyl Ionone,alpha isomethyl ionone
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"


In [297]:
# Normalise every synonym with clean_text, then keep only the first row seen
# for each synonym.
df_ingredients = (
    df_ingredients
    .assign(synonym=df_ingredients["synonym"].map(clean_text))
    .drop_duplicates(subset="synonym")
)

# Sanity check: this prints an empty frame when no duplicate synonyms remain.
still_duplicated = df_ingredients["synonym"].duplicated()
print(df_ingredients[still_duplicated])

df_ingredients

Empty DataFrame
Columns: [generic_name, synonym]
Index: []


Unnamed: 0,generic_name,synonym
0,Niacinamide,niacinamide
0,Niacinamide,vitamin b3
0,Niacinamide,nicotinamide
0,Niacinamide,3-pyridinecarboxamide
1,Hyaluronic Acid,hyaluronic acid
...,...,...
27634,Alcohol Denat.,alkohol
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionon"
27635,Alpha-Isomethyl Ionone,"alpha,isomethyl ionone"
27635,Alpha-Isomethyl Ionone,alpha isomethyl ionone


In [298]:
# Write the cleaned (generic_name, synonym) table as a ";"-separated CSV,
# leaving out the index column.
df_ingredients.to_csv(
    '../Data/ingredients.csv',
    index=False,
    sep=";",
)