In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode

### Load Ingredients

In [2]:
FILE_PATH = '../Data/raw/ingredient_w_synonyms.csv'
ingredients = pd.DataFrame(pd.read_csv(FILE_PATH, sep=';'))
ingredients

Unnamed: 0,name,synonym
0,Niacinamide,"Vitamin B3, Nicotinamide, and 3-Pyridinecarbox..."
1,Hyaluronic Acid,Hyaluronan
2,Salicylic Acid,
3,Glycerin,"Vegetable Glycerin, Glycerine, and Glycerol"
4,Retinol,Vitamin A
...,...,...
27638,Dibutyldecyl Ipdi,
27639,Capsella Bursa-Pastoris Sprout Water,
27640,Carboxyethyl Acrylate,
27641,Candelilla Wax Hydrocarbons,


### Clean DataSet

In [3]:
def clean_text(s):
    # Check if s is a string
    if isinstance(s, str):
        s = unidecode(s)  # è -> e (unicode transliteration)
        s = s.lower()
        return s

In [4]:
# MISLAM NEMA DA TREBA, EVENTUALNO AKO IZLEZAT MN SOEDINENIJA STO NI FALAT, DA SE NAVRATIME I DA GI IZDVOIME SOSTOJ. OD REDICITE SO MN.PODATOCI
 
# def process_ingredient_list(text):
#     if pd.notna(text) and (len(text) > 30 or '/' in text):
#         return text.replace('/', '; ')
#     else:
#         return text

# ingredients['synonym'] = ingredients['synonym'].apply(process_ingredient_list)

In [5]:

ingredients = ingredients[~ingredients['name'].str.startswith('(')]  # brisenje na 36 redovi sto bea so dolga lista
ingredients = ingredients[~ingredients['name'].str.startswith('[')]
ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r", ", value=';', regex=True)
ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r" and ", value=';', regex=True)
ingredients["synonym"] = ingredients['name'] + ';' + ingredients['synonym']
ingredients["synonym"].fillna(ingredients["name"], inplace=True)
ingredients["synonym"] = ingredients["synonym"].str.split(";")
ingredients = ingredients.explode("synonym")
ingredients["synonym"] = ingredients["synonym"].replace(to_replace=r"and ", value='', regex=True).str.strip()
ingredients["synonym"] = ingredients["synonym"].apply(clean_text)
ingredients.rename(columns={'name': 'generic_name'}, inplace=True)
ingredients

Unnamed: 0,generic_name,synonym
0,Niacinamide,niacinamide
0,Niacinamide,vitamin b3
0,Niacinamide,nicotinamide
0,Niacinamide,3-pyridinecarboxamide
1,Hyaluronic Acid,hyaluronic acid
...,...,...
27638,Dibutyldecyl Ipdi,dibutyldecyl ipdi
27639,Capsella Bursa-Pastoris Sprout Water,capsella bursa-pastoris sprout water
27640,Carboxyethyl Acrylate,carboxyethyl acrylate
27641,Candelilla Wax Hydrocarbons,candelilla wax hydrocarbons


In [6]:
# Adding new rows as most frequent COMBINATIONS of synonyms of ingredients

new_rows1 = {'generic_name': ['Water', 'Water', 'Water', 'Water', 'Water', 'Water', 'Water', 'Water', 'Water', 'Water'],
            'synonym': ['water/aqua/eau', 'water/eau', 'water/aqua', 'water \ aqua \ eau', 'water aqua', 'water eau', 'water/eau (aqua)', 
                        'water\aqua\eau', 'water (aqua / eau)', 'aqua (water)']}

new_rows2 = {'generic_name': ['Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum', 'Parfum'],
            'synonym': ['fragrance (fragrance)', 'fragrance (parfum)', 'fragrance / parfum', 'fragrance(parfum)', 'fragrance/parfum', 'parfum (fragrance)', 
                        'parfum / fragrance', 'parfum fragrance', 'parfum/ fragrance', 'perfum', 'perfum   fragrance', 'perfum (fragrance)', 'perfum fragrance', 'perfume']}

new_rows3 = {'generic_name': ['Aroma', 'Aroma', 'Aroma'], 'synonym': ['aroma (flavor)', 'aroma/flavor', 'aromatics']}


ingredients = ingredients.append(pd.DataFrame(new_rows1), ignore_index=True)
ingredients = ingredients.append(pd.DataFrame(new_rows2), ignore_index=True)
ingredients = ingredients.append(pd.DataFrame(new_rows3), ignore_index=True)

  ingredients = ingredients.append(pd.DataFrame(new_rows1), ignore_index=True)
  ingredients = ingredients.append(pd.DataFrame(new_rows2), ignore_index=True)
  ingredients = ingredients.append(pd.DataFrame(new_rows3), ignore_index=True)


In [7]:
nan_rows = ingredients['synonym'].isna()
print(nan_rows.sum())
ingredients = ingredients.dropna(subset=['synonym'])

0


In [8]:
ingredients[ingredients['synonym'].duplicated()]

Unnamed: 0,generic_name,synonym
37,Sodium Hyaluronate,hyaluronic acid
49,Snail Secretion Filtrate,snail secretion filtrate
54,Ceramide NP,ceramide np
151,Avena Sativa Kernel Extract,colloidal oatmeal
186,Ceramide 2,ceramide 2
193,Titanium Dioxide,titanium dioxide
335,Asiatic Acid,asiatic acid
541,Clay,clay
561,Palmitoyl Oligopeptide,palmitoyl oligopeptide
653,Methylpropanediol,methylpropanediol


In [9]:
ingredients = ingredients.drop_duplicates(subset='synonym')
ingredients.reset_index(drop=True, inplace=True)

ingredients[ingredients['synonym'].duplicated()]

Unnamed: 0,generic_name,synonym


In [10]:
# pd.options.display.max_rows=1000   

# display(ingredients.tail(1000))


In [11]:
ingredients.to_csv('../Data/ingredients.csv')