## 1. Introduction

#### Libraries

In [50]:
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amandine.gauberville\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amandine.gauberville\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!


#### Read the toxic_beauty_ingredients file

In [51]:
toxic_beauty_ingredients = pd.read_csv('./Data/toxic beauty ingredients.csv', sep=";", encoding='latin1')
#The list of ingredients comes out as the name of the column, we will save it in a list
toxic_ingredients_list = str(toxic_beauty_ingredients.columns[0]).split(";")

Some strings begin with space, let's use rstrip() to remove them and put every string in lowercase.

In [52]:
toxic_ingredients_list = [string.rstrip().lower() for string in toxic_ingredients_list]

The last string in the list is empty, we'll remove it.

In [53]:
toxic_ingredients_list.remove('')

#### Read the cosmetic file containing our dataset

In [54]:
data = pd.read_csv('./Data/cosmetics-final.csv', sep=";")

#### Let's take a look at our dataframe 

In [55]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1452 non-null   object 
 1   Brand        1452 non-null   object 
 2   Name         1452 non-null   object 
 3   Price        1452 non-null   int64  
 4   Rank         1452 non-null   float64
 5   Ingredients  1452 non-null   object 
 6   Combination  1452 non-null   int64  
 7   Dry          1452 non-null   int64  
 8   Normal       1452 non-null   int64  
 9   Oily         1452 non-null   int64  
 10  Sensitive    1452 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 124.9+ KB


## 2. Combining the toxic_ingredients_list into the dataset

#### Let's combine the toxic_ingredients_list with the dataframe 

In [57]:
#We're going to define a function which will return the number of harmful substences.
def toxic(ingredients):
    #creating a list with the ingredients of the product
    ingredients_list = str(ingredients).split(", ")
    #converting them to lowercases 
    ingredients_list = [string.lower() for string in ingredients_list]
    #iniating the number of harmful substances to be counted
    harmful_substances = 0
    #creating a list with the harmful substances (which we won't return, but can be if we need it in the future)
    list_harm=[]
    for ingredient in ingredients_list:
        #All susbtances with 'PEG-' are harmful
        if 'peg-' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif 'peg/' in ingredient:
            harmful_substances += 1
            list_harm.append(ingredient)
        elif ingredient in toxic_ingredients_list:
            harmful_substances += 1
            list_harm.append(ingredient)
    return harmful_substances

In [58]:
#Creating the column with the number of harmful susbtences per product
data['Harmful_Substances'] = data.Ingredients.apply(toxic)

In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1452 non-null   object 
 1   Brand               1452 non-null   object 
 2   Name                1452 non-null   object 
 3   Price               1452 non-null   int64  
 4   Rank                1452 non-null   float64
 5   Ingredients         1452 non-null   object 
 6   Combination         1452 non-null   int64  
 7   Dry                 1452 non-null   int64  
 8   Normal              1452 non-null   int64  
 9   Oily                1452 non-null   int64  
 10  Sensitive           1452 non-null   int64  
 11  Harmful_Substances  1452 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 136.2+ KB


In [60]:
data[data.Harmful_Substances == 0]

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
11,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,Caprylic/Capric Triglyceride Dicaprylyl Carbon...,1,1,1,1,1,0
13,Moisturizer,SUNDAY RILEY,Luna Sleeping Night Oil,105,4.1,"Persea Gratissima (Extra Virgin, Cold Pressed ...",1,1,1,1,1,0
14,Moisturizer,FARMACY,Honeymoon Glow AHA Resurfacing Night Serum wit...,58,4.6,"Water, Lactic Acid, Propanediol, Jojoba Esters...",1,1,1,1,1,0
26,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil Mini,40,4.5,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1438,Sun protect,COOLA,Sport Continuous Spray SPF 30 - Unscented,32,5.0,"Alcohol (Organic), Algae Extract (Organic), Al...",1,1,1,1,1,0
1440,Sun protect,SUPERGOOP!,Perfect Day 2-in-1 Everywear Lotion Broad Spec...,19,4.8,"-Homosalate 10%, Octinoxate 7.5%, Octisalate 5...",1,1,1,1,0,0
1442,Sun protect,COOLA,Summer Duo,36,4.8,"Avobenzone 2.8%, Octisalate 4.9%, Octocrylene ...",0,0,0,0,0,0
1445,Sun protect,URBAN DECAY,Naked Skin Bronzing Beauty Balm Broad Spectrum...,34,4.1,-Pepha® (derived from watermelon extract): Pro...,0,0,0,0,0,0


We are left with 226 "safe" products.

## 3. Data cleaning

#### Evaluer les données catégorielles

In [11]:
data.Ingredients.value_counts()

No Info                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

#### Supprimer les produits sans ingrédient "No Info"

In [61]:
list_index_NoInfo = data[data.Ingredients == "No Info"].index

In [62]:
list_index_Noinfo = data[data.Ingredients == "No info"].index

In [63]:
# Suppression de 31 produits sans ingrédient
data.drop(list_index_NoInfo, axis=0, inplace=True)
data.drop(list_index_Noinfo, axis=0, inplace=True)

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1451
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Label               1421 non-null   object 
 1   Brand               1421 non-null   object 
 2   Name                1421 non-null   object 
 3   Price               1421 non-null   int64  
 4   Rank                1421 non-null   float64
 5   Ingredients         1421 non-null   object 
 6   Combination         1421 non-null   int64  
 7   Dry                 1421 non-null   int64  
 8   Normal              1421 non-null   int64  
 9   Oily                1421 non-null   int64  
 10  Sensitive           1421 non-null   int64  
 11  Harmful_Substances  1421 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 144.3+ KB


#### Cleaner les ingrédients avant le bag of words

In [65]:
def clean_up(s):
    #putting everything in lowercase
    s_lower = s.lower()
    #removing numbers
    s_sans_chiffre = re.sub("[0-9]","", s_lower)
    #removing everything that is in between parenthesis
    s_sans_par = re.sub(r'\([^)]*\)', '', s_sans_chiffre)
    return s_sans_par

In [66]:
data['ing_processed'] = data['Ingredients'].apply(clean_up)

In [67]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"algae extract, mineral oil, petrolatum, glyce..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"galactomyces ferment filtrate , butylene glyco..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"water, dicaprylyl carbonate, glycerin, ceteary..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"algae extract, cyclopentasiloxane, petrolatum..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"water, snail secretion filtrate, phenyl trimet..."


#### Tokeniser les ingrédients

In [68]:
#Création d'une fonction qui split les ingrédients à chaque virgule et ne comptabilise pas les doublons

def ingredients_tokenizer(v):
    return list(set([w.strip() for w in v.split(",")]))

In [69]:
data['ing_processed'][0]

'algae  extract, mineral oil, petrolatum, glycerin, isohexadecane, microcrystalline wax, lanolin alcohol, citrus aurantifolia  extract, sesamum indicum  seed oil, eucalyptus globulus  leaf oil, sesamum indicum  seed powder, medicago sativa  seed powder, helianthus annuus  seedcake, prunus amygdalus dulcis  seed meal, sodium gluconate, copper gluconate, calcium gluconate, magnesium gluconate, zinc gluconate, magnesium sulfate, paraffin, tocopheryl succinate, niacin, water, beta-carotene, decyl oleate, aluminum distearate, octyldodecanol, citric acid, cyanocobalamin, magnesium stearate, panthenol, limonene, geraniol, linalool, hydroxycitronellal, citronellol, benzyl salicylate, citral, sodium benzoate, alcohol denat., fragrance.'

In [70]:
#Création d'une colonne avec ingrédients tokénisés :

data['ing_processed'] = data['ing_processed'].apply(ingredients_tokenizer)

In [71]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Harmful_Substances,ing_processed
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,10,"[calcium gluconate, copper gluconate, prunus a..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,1,"[sorbic acid., pentylene glycol, butylene glyc..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,1,"[, sorbitan olivate, sclerocarya birrea seed o..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,12,"[polyquaternium-, polysilicone-, plankton extr..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,5,"[, folic acid, citrus grandis peel oil, -hexa..."


## 4. Création d'un bag of words

In [72]:
# Extraire ces ingrédients mots de chaque ligne dans une liste commune
liste_extraite = [ing for ing in data['ing_processed']]

In [73]:
#We want to keep only the first part of the string containing "/" as after it's only another name of the chemical substance
def first_part_before_slash(string):
    return string.partition('/')[0]

In [74]:
#We want to keep only the first part of the string containing "" as after it's only precision about the ingredient
def first_part_before_star(string):
    return string.partition('*')[0]

In [75]:
#We want to add a comma after oil and extract
def add_comma(string):
    if "oil" in string or "extract" in string:
        return string + ","
    else:
        return string

In [76]:
#flatten the list and applying the function first_part_before_slash and first part of the string containing and add comma
ingredients_processed =  [first_part_before_star(first_part_before_slash(ing)) for ligne in liste_extraite for ing in ligne]
ingredients_processed_withcomma = [add_comma(ing) for ing in ingredients_processed]

len(ingredients_processed_withcomma)
#ingredients_processed_withcomma

47678

In [77]:
#Removing the double (or more) spaces
def remove_additional_spaces(string):
    return " ".join(string.split())

In [78]:
ingredients_processed = [remove_additional_spaces(string) for string in ingredients_processed]

In [80]:
#Removing the coma, point and hyphen
def remove_coma_point_hyphen(string):
    final_string = string.replace('-','')
    final_string = final_string.replace('.','')
    final_string= final_string.replace(',','')
    return final_string

In [81]:
#Applying the function remove_coma_point_hyphen
ingredients_processed = [remove_coma_point_hyphen(string) for string in ingredients_processed]

In [82]:
#Filtering the empty string
filter_object = filter(lambda x: x != "", ingredients_processed)
ingredients_processed = list(filter_object)

In [83]:
from nltk.probability import FreqDist

In [84]:
frequence_ingredients = FreqDist(ingredients_processed)
Top_1000 = frequence_ingredients.most_common(1000)

In [85]:
frequence_ingredients

FreqDist({'water': 1057, 'glycerin': 1055, 'phenoxyethanol': 943, 'butylene glycol': 816, 'disodium edta': 582, 'dimethicone': 582, 'sodium hyaluronate': 512, 'caprylyl glycol': 478, 'tocopheryl acetate': 457, 'xanthan gum': 446, ...})

In [90]:
plusde100 = set([x[0] for x in Top_1000 if x[1] > 100])
len(plusde100)

79

## 5. Création du One Hot Encoding

In [92]:
text_col = [[remove_coma_point_hyphen(
                remove_additional_spaces(
                    first_part_before_star(first_part_before_slash(w)))) for w in row] for row in data['ing_processed']]

In [103]:
text_col = [[w for w in row if w in plusde100] for row in text_col]

In [104]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(tokenizer=lambda x:x,
                     preprocessor=lambda x:x)

In [105]:
bow = cv.fit_transform(text_col)

In [106]:
bow.toarray()

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 1, 0, 1]], dtype=int64)

In [107]:
bow_df = pd.DataFrame(bow.toarray(), columns=cv.get_feature_names())

In [108]:
bow_df.iloc[:, 4].sum()

151

In [110]:
bow_df

Unnamed: 0,acrylates,adenosine,alcohol,alcohol denat,algae extract,allantoin,aloe barbadensis leaf juice,ammonium acryloyldimethyltaurate,betaine,bht,...,stearic acid,sucrose,tetrahexyldecyl ascorbate,titanium dioxide,tocopherol,tocopheryl acetate,trehalose,tromethamine,water,xanthan gum
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
3,0,0,0,0,1,0,0,0,0,1,...,0,1,1,0,0,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1416,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,0
1417,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,0,1,0
1418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [114]:
data = data.reset_index().drop('index', axis=1)

In [115]:
#Concaténation du One Hot Encoding avec le df initial
data_ML = pd.concat([data, bow_df], axis=1)

In [118]:
data.isnull().sum().sum()

0

In [119]:
data_ML.isnull().sum().sum()

0

In [None]:
#Notre dataset comprend désormais 1012 colonnes
len(data_ML.columns)

Let's save the data into a new file

In [120]:
data_ML.to_csv('./Data/cosmetics_cleaned.csv', index=False)

Merge

In [121]:
data = pd.read_csv('./Data/cosmetics_cleaned.csv')

In [122]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 92 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Label                             1421 non-null   object 
 1   Brand                             1421 non-null   object 
 2   Name                              1421 non-null   object 
 3   Price                             1421 non-null   int64  
 4   Rank                              1421 non-null   float64
 5   Ingredients                       1421 non-null   object 
 6   Combination                       1421 non-null   int64  
 7   Dry                               1421 non-null   int64  
 8   Normal                            1421 non-null   int64  
 9   Oily                              1421 non-null   int64  
 10  Sensitive                         1421 non-null   int64  
 11  Harmful_Substances                1421 non-null   int64  
 12  ing_pr

In [123]:
data.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,...,stearic acid,sucrose,tetrahexyldecyl ascorbate,titanium dioxide,tocopherol,tocopheryl acetate,trehalose,tromethamine,water,xanthan gum
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,...,0,0,0,0,1,0,0,0,1,1
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,...,0,1,1,0,0,1,1,0,1,0
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,...,1,0,0,0,1,1,0,0,1,0


In [124]:
data.Label.value_counts()

Moisturizer    290
Face Mask      262
Cleanser       260
Treatment      247
Eye cream      199
Sun protect    163
Name: Label, dtype: int64

In [125]:
data.columns[:20]

Index(['Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients', 'Combination',
       'Dry', 'Normal', 'Oily', 'Sensitive', 'Harmful_Substances',
       'ing_processed', 'acrylates', 'adenosine', 'alcohol', 'alcohol denat',
       'algae extract', 'allantoin', 'aloe barbadensis leaf juice'],
      dtype='object')

In [126]:
X = data.iloc[:,13:].values

In [127]:
X

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 1, 0, 1]], dtype=int64)

In [129]:
X = X.astype(float)

In [130]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(X, k = 10)

In [132]:
U.shape

(1421, 10)

In [135]:
data['Name']

0                                         Crème de la Mer
1                                Facial Treatment Essence
2                              Protini™ Polypeptide Cream
3                             The Moisturizing Soft Cream
4           Your Skin But Better™ CC+™ Cream with SPF 50+
                              ...                        
1416    Yoghurt Nourishing Fluid Veil Face Sunscreen B...
1417    Daily Deflector™ Waterlight Broad Spectrum SPF...
1418                              Self Tan Dry Oil SPF 50
1419                     Pro Light Self Tan Bronzing Mist
1420    DERMAPROTECT Daily Defense Broad Spectrum SPF 50+
Name: Name, Length: 1421, dtype: object

In [136]:
U[0]

array([ 0.01466501, -0.01544355, -0.025304  , -0.00156107, -0.00526544,
       -0.02678802, -0.00322233, -0.06774709,  0.04241079,  0.01770433])

In [139]:
from sklearn.metrics.pairwise import cosine_similarity

In [141]:
cosine_similarity([U[0]], [U[0]])

array([[1.]])

In [143]:
sim = cosine_similarity([U[0]], U)

In [151]:
for idx in np.argsort(sim)[0][::-1][:10]:
    print(data.iloc[idx]['Name'], np.trunc(sim[0][idx]*100)/100) 

Crème de la Mer 1.0
Little Miss Miracle Limited-Edition Crème de la Mer 1.0
Crème de la Mer Mini 1.0
Tonique Douceur Softening Hydrating Toner with Rose Water 0.92
Vitamin Nectar Antioxidant Face Mist 0.89
Clarifying Mask 0.89
Resurfacing Mask 0.88
Max Complexion Correction Pads 0.87
Photo Finish Primer Water 0.87
Bienfait Teinté Beauty Balm Sunscreen Broad Spectrum SPF 30 0.85
