In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode

### Load Ingredients

In [2]:
# Semicolon-separated export: one row per ingredient with its display name and
# an optional free-text list of synonyms.
FILE_PATH = '../Data/ingredient_w_synonyms.csv'

# read_csv already returns a DataFrame, so no extra wrapping is needed.
df_ingredients = pd.read_csv(FILE_PATH, sep=';')
df_ingredients

Unnamed: 0,name,synonym
0,Niacinamide,"Vitamin B3, Nicotinamide, and 3-Pyridinecarbox..."
1,Hyaluronic Acid,Hyaluronan
2,Salicylic Acid,
3,Glycerin,"Vegetable Glycerin, Glycerine, and Glycerol"
4,Retinol,Vitamin A
...,...,...
27638,Dibutyldecyl Ipdi,
27639,Capsella Bursa-Pastoris Sprout Water,
27640,Carboxyethyl Acrylate,
27641,Candelilla Wax Hydrocarbons,


### Clean Dataset

In [3]:
def clean_text(s):
    """Normalise a string for matching: transliterate to ASCII, then lower-case.

    Parameters
    ----------
    s : object
        Value to normalise. Only ``str`` instances are processed.

    Returns
    -------
    str or None
        The transliterated, lower-cased text, or ``None`` when ``s`` is not a
        string (for example a NaN coming from a missing cell).
    """
    if not isinstance(s, str):
        return None
    # unidecode performs unicode transliteration, e.g. "è" -> "e"
    ascii_text = unidecode(s)
    return ascii_text.lower()

In [4]:

# Drop the rows whose "name" is a long bracketed list rather than a single
# ingredient (36 rows start with '('; the rest start with '[').
df_ingredients = df_ingredients[~df_ingredients['name'].str.startswith('(')]
df_ingredients = df_ingredients[~df_ingredients['name'].str.startswith('[')]
# Work on an independent copy so the column assignments below never write into
# a view of the filtered frame (avoids SettingWithCopyWarning).
df_ingredients = df_ingredients.copy()

# Normalise the separators inside the free-text synonym list to ';':
#   "A, B, and C" -> "A;B;and C"      "A and B" -> "A;B"
df_ingredients["synonym"] = (
    df_ingredients["synonym"]
    .replace(to_replace=r", ", value=';', regex=True)
    .replace(to_replace=r" and ", value=';', regex=True)
)

# The generic name is itself a valid lookup key, so it goes first in the list.
# name + ';' + NaN is NaN, hence the fallback to the bare name for ingredients
# without synonyms. The result is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and
# does not update the frame under pandas Copy-on-Write.
df_ingredients["synonym"] = (
    df_ingredients['name'] + ';' + df_ingredients['synonym']
).fillna(df_ingredients["name"])

# One row per (generic name, synonym) pair; explode keeps the original index.
df_ingredients["synonym"] = df_ingredients["synonym"].str.split(";")
df_ingredients = df_ingredients.explode("synonym")

# Remove only the *leading* "and " left over from "..., and C" lists. The
# pattern is anchored so that words merely ending in "and" are not mangled
# (unanchored, "Sand Extract" would become "SExtract").
df_ingredients["synonym"] = (
    df_ingredients["synonym"]
    .replace(to_replace=r"^\s*and\s+", value='', regex=True)
    .str.strip()
    .apply(clean_text)
)
df_ingredients = df_ingredients.rename(columns={'name': 'generic_name'})

df_ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,niacinamide
0,Niacinamide,vitamin b3
0,Niacinamide,nicotinamide
0,Niacinamide,3-pyridinecarboxamide
1,Hyaluronic Acid,hyaluronic acid
...,...,...
27638,Dibutyldecyl Ipdi,dibutyldecyl ipdi
27639,Capsella Bursa-Pastoris Sprout Water,capsella bursa-pastoris sprout water
27640,Carboxyethyl Acrylate,carboxyethyl acrylate
27641,Candelilla Wax Hydrocarbons,candelilla wax hydrocarbons


In [5]:
# Persist the (generic_name, synonym) lookup table next to the notebook.
# The index is written too (to_csv default), so labels repeat once per synonym.
df_ingredients.to_csv(path_or_buf='ingredients.csv')