## 1. Introduction

#### Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Read the toxic_beauty_ingredients file

In [2]:
toxic_beauty_ingredients = pd.read_csv('./Data/toxic beauty ingredients.csv', sep=";", encoding='latin1')
#The list of ingredients comes out as the name of the column, we will save it in a list
toxic_ingredients_list = str(toxic_beauty_ingredients.columns[0]).split(";")

Some strings begin with space, let's use rstrip() to remove them and put every string in lowercase.

In [3]:
toxic_ingredients_list = [string.rstrip().lower() for string in toxic_ingredients_list]

The last string in the list is empty, we'll remove it.

In [4]:
toxic_ingredients_list.remove('')

#### Read the cosmetic file containing our dataset

In [5]:
data = pd.read_csv('./Data/cosmetics-final.csv', sep=";")

#### Let's take a look at our dataframe 

In [6]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1452 non-null   object 
 1   Brand        1452 non-null   object 
 2   Name         1452 non-null   object 
 3   Price        1452 non-null   int64  
 4   Rank         1452 non-null   float64
 5   Ingredients  1452 non-null   object 
 6   Combination  1452 non-null   int64  
 7   Dry          1452 non-null   int64  
 8   Normal       1452 non-null   int64  
 9   Oily         1452 non-null   int64  
 10  Sensitive    1452 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 124.9+ KB


## 2. Combining the toxic_ingredients_list into the dataset

#### Let's combine the toxic_ingredients_list with the dataframe 

In [8]:
#We're going to define a function which will return the number of harmful substences.
def toxic(ingredients):
    #creating a list with the ingredients of the product
    ingredients_list = str(ingredients).split(", ")
    #converting them to lowercases 
    ingredients_list = [string.lower() for string in ingredients_list]
    #iniating the number of harmful substances to be counted
    harmful_substances = 0
    #creating a list with the harmful substances (which we won't return, but can be if we need it in the future)
    list_harm=[]
    for ingredient in ingredients_list:
        #All susbtances with 'PEG-' are harmful
        if 'peg-' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif 'peg/' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif ingredient in toxic_ingredients_list:
            harmful_substances += 1
            list_harm.append(ingredient)
    return harmful_substances

In [9]:
#Creating the column with the number of harmful susbtences per product
data['Harmful_Substances'] = data.Ingredients.apply(toxic)

In [10]:
data[data.Harmful_Substances == 0]

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
11,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,Caprylic/Capric Triglyceride Dicaprylyl Carbon...,1,1,1,1,1,0
13,Moisturizer,SUNDAY RILEY,Luna Sleeping Night Oil,105,4.1,"Persea Gratissima (Extra Virgin, Cold Pressed ...",1,1,1,1,1,0
14,Moisturizer,FARMACY,Honeymoon Glow AHA Resurfacing Night Serum wit...,58,4.6,"Water, Lactic Acid, Propanediol, Jojoba Esters...",1,1,1,1,1,0
26,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil Mini,40,4.5,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1438,Sun protect,COOLA,Sport Continuous Spray SPF 30 - Unscented,32,5.0,"Alcohol (Organic), Algae Extract (Organic), Al...",1,1,1,1,1,0
1440,Sun protect,SUPERGOOP!,Perfect Day 2-in-1 Everywear Lotion Broad Spec...,19,4.8,"-Homosalate 10%, Octinoxate 7.5%, Octisalate 5...",1,1,1,1,0,0
1442,Sun protect,COOLA,Summer Duo,36,4.8,"Avobenzone 2.8%, Octisalate 4.9%, Octocrylene ...",0,0,0,0,0,0
1445,Sun protect,URBAN DECAY,Naked Skin Bronzing Beauty Balm Broad Spectrum...,34,4.1,-Pepha® (derived from watermelon extract): Pro...,0,0,0,0,0,0


We are left with 226 "safe" products.

## 3. Data cleaning

#### Evaluer les données catégorielles

In [11]:
data.Ingredients.value_counts()

No Info                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

#### Supprimer les produits sans ingrédient "No Info"

In [12]:
list_index_NoInfo = data[data.Ingredients == "No Info"].index

In [13]:
list_index_Noinfo = data[data.Ingredients == "No info"].index

In [14]:
# Suppression de 31 produits sans ingrédient
data.drop(list_index_NoInfo, axis=0, inplace=True)
data.drop(list_index_Noinfo, axis=0, inplace=True)

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1421 non-null   object 
 1   Brand               1421 non-null   object 
 2   Name                1421 non-null   object 
 3   Price               1421 non-null   int64  
 4   Rank                1421 non-null   float64
 5   Ingredients         1421 non-null   object 
 6   Combination         1421 non-null   int64  
 7   Dry                 1421 non-null   int64  
 8   Normal              1421 non-null   int64  
 9   Oily                1421 non-null   int64  
 10  Sensitive           1421 non-null   int64  
 11  Harmful_Substances  1421 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 144.3+ KB


#### Cleaner les ingrédients avant le bag of words

In [16]:
def clean_up(s):
    
    s_lower = s.lower()
    s_sans_chiffre = re.sub("[0-9]","", s_lower)
    s_sans_par = re.sub(r'\([^)]*\)', '', s_sans_chiffre)
    s_sans_carac = re.sub("\W"," ", s_sans_par)
    return s_sans_carac

In [17]:
data['ing_processed'] = data['Ingredients'].apply(clean_up)

In [18]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,algae extract mineral oil petrolatum glyce...
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,galactomyces ferment filtrate butylene glyco...
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,water dicaprylyl carbonate glycerin ceteary...
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,algae extract cyclopentasiloxane petrolatum...
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,water snail secretion filtrate phenyl trimet...


#### Tokeniser les ingrédients