## 1. Introduction

#### Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Read the toxic_beauty_ingredients file

In [2]:
toxic_beauty_ingredients = pd.read_csv('./Data/toxic beauty ingredients.csv', sep=";", encoding='latin1')
#The list of ingredients comes out as the name of the column, we will save it in a list
toxic_ingredients_list = str(toxic_beauty_ingredients.columns[0]).split(";")

Some strings begin with space, let's use rstrip() to remove them and put every string in lowercase.

In [3]:
toxic_ingredients_list = [string.rstrip().lower() for string in toxic_ingredients_list]

The last string in the list is empty, we'll remove it.

In [4]:
toxic_ingredients_list.remove('')

#### Read the cosmetic file containing our dataset

In [5]:
data = pd.read_csv('./Data/cosmetics-final.csv', sep=";")

#### Let's take a look at our dataframe 

In [6]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1452 non-null   object 
 1   Brand        1452 non-null   object 
 2   Name         1452 non-null   object 
 3   Price        1452 non-null   int64  
 4   Rank         1452 non-null   float64
 5   Ingredients  1452 non-null   object 
 6   Combination  1452 non-null   int64  
 7   Dry          1452 non-null   int64  
 8   Normal       1452 non-null   int64  
 9   Oily         1452 non-null   int64  
 10  Sensitive    1452 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 124.9+ KB


## 2. Combining the toxic_ingredients_list into the dataset

#### Let's combine the toxic_ingredients_list with the dataframe 

In [8]:
#We're going to define a function which will return the number of harmful substences.
def toxic(ingredients):
    #creating a list with the ingredients of the product
    ingredients_list = str(ingredients).split(", ")
    #converting them to lowercases 
    ingredients_list = [string.lower() for string in ingredients_list]
    #iniating the number of harmful substances to be counted
    harmful_substances = 0
    #creating a list with the harmful substances (which we won't return, but can be if we need it in the future)
    list_harm=[]
    for ingredient in ingredients_list:
        #All susbtances with 'PEG-' are harmful
        if 'peg-' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif 'peg/' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif ingredient in toxic_ingredients_list:
            harmful_substances += 1
            list_harm.append(ingredient)
    return harmful_substances

In [9]:
#Creating the column with the number of harmful susbtences per product
data['Harmful_Substances'] = data.Ingredients.apply(toxic)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1452 non-null   object 
 1   Brand               1452 non-null   object 
 2   Name                1452 non-null   object 
 3   Price               1452 non-null   int64  
 4   Rank                1452 non-null   float64
 5   Ingredients         1452 non-null   object 
 6   Combination         1452 non-null   int64  
 7   Dry                 1452 non-null   int64  
 8   Normal              1452 non-null   int64  
 9   Oily                1452 non-null   int64  
 10  Sensitive           1452 non-null   int64  
 11  Harmful_Substances  1452 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 136.2+ KB


In [11]:
data[data.Harmful_Substances == 0]

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
11,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,Caprylic/Capric Triglyceride Dicaprylyl Carbon...,1,1,1,1,1,0
13,Moisturizer,SUNDAY RILEY,Luna Sleeping Night Oil,105,4.1,"Persea Gratissima (Extra Virgin, Cold Pressed ...",1,1,1,1,1,0
14,Moisturizer,FARMACY,Honeymoon Glow AHA Resurfacing Night Serum wit...,58,4.6,"Water, Lactic Acid, Propanediol, Jojoba Esters...",1,1,1,1,1,0
26,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil Mini,40,4.5,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1438,Sun protect,COOLA,Sport Continuous Spray SPF 30 - Unscented,32,5.0,"Alcohol (Organic), Algae Extract (Organic), Al...",1,1,1,1,1,0
1440,Sun protect,SUPERGOOP!,Perfect Day 2-in-1 Everywear Lotion Broad Spec...,19,4.8,"-Homosalate 10%, Octinoxate 7.5%, Octisalate 5...",1,1,1,1,0,0
1442,Sun protect,COOLA,Summer Duo,36,4.8,"Avobenzone 2.8%, Octisalate 4.9%, Octocrylene ...",0,0,0,0,0,0
1445,Sun protect,URBAN DECAY,Naked Skin Bronzing Beauty Balm Broad Spectrum...,34,4.1,-Pepha® (derived from watermelon extract): Pro...,0,0,0,0,0,0


We are left with 257 "safe" products.

## 3. Data cleaning

#### Cleaning the categorial values

Let's take a look at the frequency of each ingredients

In [12]:
data.Ingredients.value_counts()

No Info                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

We can see that some products don't have any ingredients ("No info"), and most of the ingredients are presented like a list of string separated by a comma

#### Let's drop the product with "No info"

In [13]:
list_index_NoInfo = data[data.Ingredients == "No Info"].index

In [14]:
list_index_Noinfo = data[data.Ingredients == "No info"].index

In [15]:
# We are dropping the 31 rows with no ingredients
data.drop(list_index_NoInfo, axis=0, inplace=True)
data.drop(list_index_Noinfo, axis=0, inplace=True)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1421 non-null   object 
 1   Brand               1421 non-null   object 
 2   Name                1421 non-null   object 
 3   Price               1421 non-null   int64  
 4   Rank                1421 non-null   float64
 5   Ingredients         1421 non-null   object 
 6   Combination         1421 non-null   int64  
 7   Dry                 1421 non-null   int64  
 8   Normal              1421 non-null   int64  
 9   Oily                1421 non-null   int64  
 10  Sensitive           1421 non-null   int64  
 11  Harmful_Substances  1421 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 144.3+ KB


#### Let's continue to clean the ingredients column before doing the bag of words

In [17]:
def clean_up(s):
    #putting everything in lowercase
    s_lower = s.lower()
    
    #removing numbers (as we have to many ingredients, we decided to regroup them by removing the numbers)
    s_without_nb = re.sub("[0-9]","", s_lower)
    
    #removing everything that is in between parenthesis
    s_without_par = re.sub(r'\([^)]*\)', '', s_without_nb)
    
    #Removing the double (or more) spaces
    s_only_one_space = " ".join(s_without_par.split())
    
    #Keeping only the first part of the string containing "/" as after it's only another name of the chemical substance
    s_first_part_before_slash = s_only_one_space.partition('/')[0]
    
    #Keeping only the first part of the string containing "" as after it's only precision about the ingredient
    s_first_part_before_star = s_first_part_before_slash.partition('*')[0]
    
    #Keeping only the first part of the string containing ":" as after it's only precision about the ingredient
    s_first_part_before_colon = s_first_part_before_star.partition(':')[0]
    
    #Removing the "%" character
    s_without_percent = s_first_part_before_colon.replace('%', '')
    
    #We want to add a comma after oil and extract
    if "oil" in s_without_percent or "extract" in s_without_percent:
        s_add_comma = s_without_percent + ","
    else:
        s_add_comma = s_without_percent
    
    #We want to delete the double commas
    s_single_comma = s_add_comma.replace(',,', ',')
    
    #Removing the point
    s_without_point = s_single_comma.replace('.', '')
    
    #Removing commas if they are at the beginning of the string or at the end
    if s_without_point[:2] == ', ':
        s_remove_first_comma = s_without_point[2:]
    else:
        s_remove_first_comma = s_without_point
        
    if s_remove_first_comma != '' and s_remove_first_comma[-1] == ',':
        s_remove_last_comma = s_remove_first_comma[:-1]
    else:
        s_remove_last_comma = s_remove_first_comma           
 
    return s_remove_last_comma

In [18]:
data['ing_processed'] = data['Ingredients'].apply(clean_up)

In [19]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"algae extract, mineral oil, petrolatum, glycer..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"galactomyces ferment filtrate , butylene glyco..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"water, dicaprylyl carbonate, glycerin, ceteary..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"algae extract, cyclopentasiloxane, petrolatum,..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"water, snail secretion filtrate, phenyl trimet..."


#### Tokenise the ingredients

In [20]:
#Let's define a function to create a list of ingredients which won't keep duplicates

def ingredients_tokenizer(v):
    return list(set([w.strip() for w in v.split(",")]))

In [21]:
data['ing_processed'][0]

'algae extract, mineral oil, petrolatum, glycerin, isohexadecane, microcrystalline wax, lanolin alcohol, citrus aurantifolia extract, sesamum indicum seed oil, eucalyptus globulus leaf oil, sesamum indicum seed powder, medicago sativa seed powder, helianthus annuus seedcake, prunus amygdalus dulcis seed meal, sodium gluconate, copper gluconate, calcium gluconate, magnesium gluconate, zinc gluconate, magnesium sulfate, paraffin, tocopheryl succinate, niacin, water, beta-carotene, decyl oleate, aluminum distearate, octyldodecanol, citric acid, cyanocobalamin, magnesium stearate, panthenol, limonene, geraniol, linalool, hydroxycitronellal, citronellol, benzyl salicylate, citral, sodium benzoate, alcohol denat, fragrance'

In [22]:
#Creation of the column with tokenised ingredients

data['ing_processed'] = data['ing_processed'].apply(ingredients_tokenizer)

In [23]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"[zinc gluconate, sodium gluconate, aluminum di..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"[sorbic acid, butylene glycol, galactomyces fe..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"[glycerin, cetearyl olivate, sclerocarya birre..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"[cholesterol, polysilicone-, butyrospermum par..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"[butylene glycol, dimethicone, phenyl trimethi..."


## Let's save the data into a new file

In [41]:
data = pd.read_csv('./Data/cosmetics_cleaned.csv')