## 1. Introduction

#### Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Read the toxic_beauty_ingredients file

In [2]:
toxic_beauty_ingredients = pd.read_csv('./Data/toxic beauty ingredients.csv', sep=";", encoding='latin1')
#The list of ingredients comes out as the name of the column, we will save it in a list
toxic_ingredients_list = str(toxic_beauty_ingredients.columns[0]).split(";")

In [3]:
toxic_ingredients_list 

['Alkylphenols',
 'nonylphenol',
 'nonoxynol',
 'octylphenol',
 'O-phenylphenol',
 'propylphenol',
 'amylphenol',
 'heptylphenol',
 'dodecylphenol',
 'methylphenol',
 'cresol',
 'ethylpenol',
 'xylenol',
 '4-tert-octylphenol',
 'BHA / BHT',
 'E320',
 'Butylated hydroxyanisole',
 'Butylhydroxyanisole',
 'BHA',
 'BHT',
 'butylphenyl methylpropional',
 'lilial',
 'BMHCA',
 'Dioxyde de silicium',
 'E551',
 'Silica',
 'silice',
 'Gel bleu de silice',
 'Gel 60 de silice',
 'Silicon dioxide',
 'terre diatomee',
 'terre de diatomee',
 'CI 7811',
 'Cab-o-sil',
 'Diatomaceous earth calcined',
 'Diatomaceous silica',
 'Dioxosilane',
 'Pigment White 27',
 'Silanox 101',
 'Siliceous earth',
 'Solum diatomeae',
 'White carbon',
 'CAS 7631-86-9',
 'Dioxyde de titane',
 'Titanium dioxide (nano)',
 'Oxyde de titane',
 'bioxyde de titane',
 'E171',
 'TiO2',
 'Cl 77891',
 'cl-77891',
 'ci-77891',
 'CI 77891',
 'TITANIUM DIOXIDE',
 'E133 - Bleu brillant FCF',
 'Bleu brillant FCF',
 'C.I. Acid Blue 9',
 'C

Some strings begin with space, let's use rstrip() to remove them and put every string in lowercase.

In [4]:
toxic_ingredients_list = [string.rstrip().lower() for string in toxic_ingredients_list]

The last string in the list is empty, we'll remove it.

In [5]:
toxic_ingredients_list.remove('')

#### Read the cosmetic file containing our dataset

In [6]:
data = pd.read_csv('./Data/cosmetics-final.csv', sep=";")

#### Let's take a look at our dataframe 

In [7]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1452 non-null   object 
 1   Brand        1452 non-null   object 
 2   Name         1452 non-null   object 
 3   Price        1452 non-null   int64  
 4   Rank         1452 non-null   float64
 5   Ingredients  1452 non-null   object 
 6   Combination  1452 non-null   int64  
 7   Dry          1452 non-null   int64  
 8   Normal       1452 non-null   int64  
 9   Oily         1452 non-null   int64  
 10  Sensitive    1452 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 124.9+ KB


## 2. Combining the toxic_ingredients_list into the dataset

#### Let's combine the toxic_ingredients_list with the dataframe 

In [9]:
#We're going to define a function which will return the number of harmful substences.
def toxic(ingredients):
    #creating a list with the ingredients of the product
    ingredients_list = str(ingredients).split(", ")
    #converting them to lowercases 
    ingredients_list = [string.lower() for string in ingredients_list]
    #iniating the number of harmful substances to be counted
    harmful_substances = 0
    #creating a list with the harmful substances (which we won't return, but can be if we need it in the future)
    list_harm=[]
    for ingredient in ingredients_list:
        #All susbtances with 'PEG-' are harmful
        if 'peg-' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif 'peg/' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif ingredient in toxic_ingredients_list:
            harmful_substances += 1
            list_harm.append(ingredient)
    return harmful_substances

In [10]:
#Creating the column with the number of harmful susbtences per product
data['Harmful_Substances'] = data.Ingredients.apply(toxic)

In [11]:
data[data.Harmful_Substances == 0]

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
11,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,Caprylic/Capric Triglyceride Dicaprylyl Carbon...,1,1,1,1,1,0
13,Moisturizer,SUNDAY RILEY,Luna Sleeping Night Oil,105,4.1,"Persea Gratissima (Extra Virgin, Cold Pressed ...",1,1,1,1,1,0
14,Moisturizer,FARMACY,Honeymoon Glow AHA Resurfacing Night Serum wit...,58,4.6,"Water, Lactic Acid, Propanediol, Jojoba Esters...",1,1,1,1,1,0
26,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil Mini,40,4.5,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1438,Sun protect,COOLA,Sport Continuous Spray SPF 30 - Unscented,32,5.0,"Alcohol (Organic), Algae Extract (Organic), Al...",1,1,1,1,1,0
1440,Sun protect,SUPERGOOP!,Perfect Day 2-in-1 Everywear Lotion Broad Spec...,19,4.8,"-Homosalate 10%, Octinoxate 7.5%, Octisalate 5...",1,1,1,1,0,0
1442,Sun protect,COOLA,Summer Duo,36,4.8,"Avobenzone 2.8%, Octisalate 4.9%, Octocrylene ...",0,0,0,0,0,0
1445,Sun protect,URBAN DECAY,Naked Skin Bronzing Beauty Balm Broad Spectrum...,34,4.1,-Pepha® (derived from watermelon extract): Pro...,0,0,0,0,0,0


We are left with 226 "safe" products.

## 3. Data cleaning

#### Evaluer les données catégorielles

In [12]:
data.Ingredients.value_counts()

No Info                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         23
No info                                                                                                                                                                              

#### Supprimer les produits sans ingrédient "No Info"

In [13]:
list_index_NoInfo = data[data.Ingredients == "No Info"].index

In [14]:
list_index_Noinfo = data[data.Ingredients == "No info"].index

In [15]:
# Suppression de 31 produits sans ingrédient
data.drop(list_index_NoInfo, axis=0, inplace=True)
data.drop(list_index_Noinfo, axis=0, inplace=True)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1421 non-null   object 
 1   Brand               1421 non-null   object 
 2   Name                1421 non-null   object 
 3   Price               1421 non-null   int64  
 4   Rank                1421 non-null   float64
 5   Ingredients         1421 non-null   object 
 6   Combination         1421 non-null   int64  
 7   Dry                 1421 non-null   int64  
 8   Normal              1421 non-null   int64  
 9   Oily                1421 non-null   int64  
 10  Sensitive           1421 non-null   int64  
 11  Harmful_Substances  1421 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 144.3+ KB


#### Cleaner les ingrédients avant le bag of words

In [17]:
def clean_up(s):
    #putting everything in lowercase
    s_lower = s.lower()
    #removing numbers
    s_sans_chiffre = re.sub("[0-9]","", s_lower)
    #removing everything that is in between parenthesis
    s_sans_par = re.sub(r'\([^)]*\)', '', s_sans_chiffre)
    return s_sans_par

In [18]:
data['ing_processed'] = data['Ingredients'].apply(clean_up)

In [19]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"algae extract, mineral oil, petrolatum, glyce..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"galactomyces ferment filtrate , butylene glyco..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"water, dicaprylyl carbonate, glycerin, ceteary..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"algae extract, cyclopentasiloxane, petrolatum..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"water, snail secretion filtrate, phenyl trimet..."


#### Tokeniser les ingrédients

In [20]:
#Création d'une fonction qui split les ingrédients à chaque virgule et ne comptabilise pas les doublons

def ingredients_tokenizer(v):
    return list(set([w.strip() for w in v.split(",")]))

In [21]:
data['ing_processed'][0]

'algae  extract, mineral oil, petrolatum, glycerin, isohexadecane, microcrystalline wax, lanolin alcohol, citrus aurantifolia  extract, sesamum indicum  seed oil, eucalyptus globulus  leaf oil, sesamum indicum  seed powder, medicago sativa  seed powder, helianthus annuus  seedcake, prunus amygdalus dulcis  seed meal, sodium gluconate, copper gluconate, calcium gluconate, magnesium gluconate, zinc gluconate, magnesium sulfate, paraffin, tocopheryl succinate, niacin, water, beta-carotene, decyl oleate, aluminum distearate, octyldodecanol, citric acid, cyanocobalamin, magnesium stearate, panthenol, limonene, geraniol, linalool, hydroxycitronellal, citronellol, benzyl salicylate, citral, sodium benzoate, alcohol denat., fragrance.'

In [22]:
#Création d'une colonne avec ingrédients tokénisés :

data['ing_processed'] = data['ing_processed'].apply(ingredients_tokenizer)

In [23]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"[algae extract, decyl oleate, beta-carotene, ..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"[galactomyces ferment filtrate, water, methylp..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"[, polysorbate, -hexanediol, histidine, heptap..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"[phenyl trimethicone, hydroxycitronellal, glyc..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"[, phenyl trimethicone, calcium stearate, sorb..."


## 4. Création d'un bag of words

In [24]:
# Extraire ces ingrédients mots de chaque ligne dans une liste commune
liste_extraite = [ing for ing in data['ing_processed']]

In [25]:
#We want to keep only the first part of the string containing "/" as after it's only another name of the chemical substance
def first_part_before_slash(string):
    return string.partition('/')[0]

In [26]:
#We want to keep only the first part of the string containing "*" as after it's only precision about the ingredient
def first_part_before_star(string):
    return string.partition('*')[0]

In [27]:
#We want to add a comma after oil and extract
def add_comma(string):
    if "oil" in string or "extract" in string:
        return string + ","
    else:
        return string

In [28]:
#flatten the list and applying the function first_part_before_slash and first part of the string containing
ingredients_processed =  [first_part_before_star(first_part_before_slash(ing)) for ligne in liste_extraite for ing in ligne]

#applying add_comma
ingredients_processed = [add_comma(ing) for ing in ingredients_processed]

In [29]:
#Removing the double (or more) spaces
def remove_additional_spaces(string):
    return " ".join(string.split())

In [30]:
ingredients_processed = [remove_additional_spaces(string) for string in ingredients_processed]

In [31]:
#Removing the coma, point and hyphen
def remove_coma_point_hyphen(string):
    final_string = string.replace('-','')
    final_string = final_string.replace('.','')
    final_string= final_string.replace(',','')
    return final_string

In [32]:
#Applying the function remove_coma_point_hyphen
ingredients_processed = [remove_coma_point_hyphen(string) for string in ingredients_processed]

In [33]:
#Filtering the empty string
filter_object = filter(lambda x: x != "", ingredients_processed)
ingredients_processed = list(filter_object)

In [34]:
from nltk.probability import FreqDist

In [35]:
frequence_ingredients = FreqDist(ingredients_processed)
Top_1000 = frequence_ingredients.most_common(1000)

In [36]:
frequence_ingredients

FreqDist({'water': 1057, 'glycerin': 1055, 'phenoxyethanol': 943, 'butylene glycol': 816, 'disodium edta': 582, 'dimethicone': 582, 'sodium hyaluronate': 512, 'caprylyl glycol': 478, 'tocopheryl acetate': 457, 'xanthan gum': 446, ...})

In [37]:
Top_1000

[('water', 1057),
 ('glycerin', 1055),
 ('phenoxyethanol', 943),
 ('butylene glycol', 816),
 ('disodium edta', 582),
 ('dimethicone', 582),
 ('sodium hyaluronate', 512),
 ('caprylyl glycol', 478),
 ('tocopheryl acetate', 457),
 ('xanthan gum', 446),
 ('ethylhexylglycerin', 435),
 ('fragrance', 412),
 ('tocopherol', 401),
 ('citric acid', 400),
 ('caprylic', 388),
 ('polysorbate', 357),
 ('potassium sorbate', 336),
 ('carbomer', 314),
 ('linalool', 308),
 ('limonene', 304),
 ('sodium hydroxide', 295),
 ('sodium benzoate', 294),
 ('propanediol', 260),
 ('glyceryl stearate', 254),
 ('silica', 253),
 ('peg', 246),
 ('acrylates', 244),
 ('pentylene glycol', 240),
 ('squalane', 237),
 ('camellia sinensis leaf extract', 225),
 ('caffeine', 223),
 ('peg stearate', 217),
 ('cetearyl alcohol', 207),
 ('titanium dioxide', 206),
 ('bht', 203),
 ('lecithin', 201),
 ('cyclopentasiloxane', 193),
 ('stearic acid', 193),
 ('hexanediol', 192),
 ('panthenol', 183),
 ('alcohol', 182),
 ('mica', 177),
 ('h

## 5. Création du One Hot Encoding

In [38]:
data_onehotencoding = pd.get_dummies(Top_1000, drop_first=True)
data_onehotencoding

Unnamed: 0,"(acacia senegal gum, 9)","(acer saccharum extract, 27)","(acetic acid, 8)","(acetyl dipeptide cetyl ester, 15)","(acetyl glucosamine, 68)","(acetyl hexapeptide, 81)","(acetyl octapeptide, 7)","(acetyl tetrapeptide, 19)","(achillea millefolium extract, 17)","(acrylamide, 27)",...,"(yellow , 13)","(zea mays oil, 7)","(zea mays starch, 15)","(zinc gluconate, 76)","(zinc oxide, 21)","(zinc oxide %water, 9)","(zinc pca, 19)","(zinc sulfate, 16)","(zingiber aromaticus extract, 7)","(zingiber officinale root extract, 40)"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
#Concaténation du One Hot Encoding avec le df initial
data_ML = pd.concat([data, data_onehotencoding], axis=1)

In [40]:
data_ML.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,...,"(yellow , 13)","(zea mays oil, 7)","(zea mays starch, 15)","(zinc gluconate, 76)","(zinc oxide, 21)","(zinc oxide %water, 9)","(zinc pca, 19)","(zinc sulfate, 16)","(zingiber aromaticus extract, 7)","(zingiber officinale root extract, 40)"
0,Moisturizer,LA MER,Crème de la Mer,175.0,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Moisturizer,SK-II,Facial Treatment Essence,179.0,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68.0,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175.0,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38.0,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
#Notre dataset comprend désormais 1012 colonnes
len(data_ML.columns)

1012

Let's save the data into a new file

In [42]:
data_ML.to_csv('./Data/cosmetics_cleaned.csv', index=False)