#### Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phola\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## MACHINE LEARNING : What are the similar product ?

In [2]:
data = pd.read_csv('./Data/cosmetics_cleaned3.csv')

In [3]:
data.Label.value_counts()

Moisturizer    290
Face Mask      262
Cleanser       260
Treatment      247
Eye cream      199
Sun protect    163
Name: Label, dtype: int64

In [4]:
data.columns[:115]

Index(['Unnamed: 0', 'Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients',
       'Combination', 'Dry', 'Normal',
       ...
       'sorbitol', 'squalane', 'stearic acid', 'sucrose', 'titanium dioxide',
       'tocopherol', 'tocopheryl acetate', 'trehalose', 'water',
       'xanthan gum'],
      dtype='object', length=115)

In [5]:
# One Hot Encoding on product types
text_data = np.array(data['Label'].values)
text_data

array(['Moisturizer', 'Moisturizer', 'Moisturizer', ..., 'Sun protect',
       'Sun protect', 'Sun protect'], dtype=object)

In [6]:
# Create the bag of words feature matrix
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bow = count.fit_transform(text_data)

# Show feature matrix
bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [7]:
feature_names = count.get_feature_names()

In [8]:
bow_df = pd.DataFrame(bow.toarray(), columns=feature_names)
bow_df

Unnamed: 0,cleanser,cream,eye,face,mask,moisturizer,protect,sun,treatment
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
1416,0,0,0,0,0,0,1,1,0
1417,0,0,0,0,0,0,1,1,0
1418,0,0,0,0,0,0,1,1,0
1419,0,0,0,0,0,0,1,1,0


In [9]:
#Concatenate the One Hot Encoding with the dataframe
data_ML = pd.concat([data, bow_df], axis=1)

In [10]:
data_ML

Unnamed: 0.1,Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,...,xanthan gum,cleanser,cream,eye,face,mask,moisturizer,protect,sun,treatment
0,0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,...,0,0,0,0,0,0,1,0,0,0
1,1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,...,0,0,0,0,0,0,1,0,0,0
2,2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,...,0,0,0,0,0,0,1,0,0,0
3,3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,...,0,0,0,0,0,0,1,0,0,0
4,4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1416,1416,Sun protect,KORRES,Yoghurt Nourishing Fluid Veil Face Sunscreen B...,35,3.9,"Water, Alcohol Denat., Potassium Cetyl Phospha...",1,1,1,...,0,0,0,0,0,0,0,1,1,0
1417,1417,Sun protect,KATE SOMERVILLE,Daily Deflector™ Waterlight Broad Spectrum SPF...,48,3.6,"Water, Isododecane, Dimethicone, Butyloctyl Sa...",0,0,0,...,0,0,0,0,0,0,0,1,1,0
1418,1418,Sun protect,VITA LIBERATA,Self Tan Dry Oil SPF 50,54,3.5,"Water, Dihydroxyacetone, Glycerin, Sclerocarya...",0,0,0,...,0,0,0,0,0,0,0,1,1,0
1419,1419,Sun protect,ST. TROPEZ TANNING ESSENTIALS,Pro Light Self Tan Bronzing Mist,20,1.0,"Water, Dihydroxyacetone, Propylene Glycol, PPG...",0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [11]:
#Add a column to calculate the part of harmful ingredients per product
data_ML["Harmful_part"] = data.Harmful_Substances / data.Ingredients.str.len() * 100

In [12]:
data_ML.columns[109:]

Index(['titanium dioxide', 'tocopherol', 'tocopheryl acetate', 'trehalose',
       'water', 'xanthan gum', 'cleanser', 'cream', 'eye', 'face', 'mask',
       'moisturizer', 'protect', 'sun', 'treatment', 'Harmful_part'],
      dtype='object')

In [13]:
#Drop the columns water, price, rank
#'cream','eye','face','mask','moisturizer','sun','treatment'

list_columns_float = ['cleanser', 'cream', 'eye', 'face', 'mask', 'moisturizer', 'protect',
       'sun', 'treatment', '-hexanediol', 'acrylates', 'acrylates copolymer', 'adenosine',
       'alcohol', 'alcohol denat', 'algae extract', 'allantoin',
       'aloe barbadensis leaf juice', 'ammonium acryloyldimethyltaurate',
       'aqua', 'arginine', 'ascorbic acid', 'ascorbyl glucoside',
       'ascorbyl palmitate', 'behenyl alcohol', 'betaine', 'bht',
       'butylene glycol', 'butyloctyl salicylate',
       'butyrospermum parkii butter', 'c- alkyl benzoate', 'caffeine',
       'camellia sinensis leaf extract', 'caprylic', 'caprylyl glycol',
       'carbomer', 'cetearyl alcohol', 'cetyl alcohol', 'chlorphenesin', 'ci',
       'citric acid', 'citronellol', 'cocamidopropyl betaine',
       'cucumis sativus fruit extract', 'cyclohexasiloxane',
       'cyclopentasiloxane', 'dimethicone', 'dipotassium glycyrrhizate',
       'dipropylene glycol', 'disodium edta', 'ethylhexylglycerin',
       'fragrance', 'geraniol', 'glycereth-', 'glycerin', 'glyceryl stearate',
       'glycolic acid', 'glycyrrhiza glabra root extract',
       'helianthus annuus seed oil', 'hexylene glycol',
       'hydroxyethyl acrylate', 'iron oxides', 'isododecane',
       'isononyl isononanoate', 'kaolin', 'lactic acid', 'lecithin',
       'limonene', 'linalool', 'magnesium aluminum silicate',
       'methyl gluceth-', 'methylparaben', 'mica', 'niacinamide', 'panthenol',
       'parfum', 'peg', 'peg-', 'peg- dimethicone',
       'peg- hydrogenated castor oil', 'peg- stearate', 'pentylene glycol',
       'phenoxyethanol', 'phenyl trimethicone', 'polymethylsilsesquioxane',
       'polysilicone-', 'polysorbate', 'potassium hydroxide',
       'potassium sorbate', 'propanediol', 'propylene glycol',
       'retinyl palmitate', 'salicylic acid', 'silica', 'sodium benzoate',
       'sodium chloride', 'sodium citrate', 'sodium hyaluronate',
       'sodium hydroxide', 'sodium pca', 'sorbitol', 'squalane',
       'stearic acid', 'sucrose', 'titanium dioxide', 'tocopherol', 'tocopheryl acetate', 'trehalose', 'xanthan gum']

In [14]:
X = data_ML.loc[:, list_columns_float].values

In [15]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
X = X.astype(float)

In [17]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(X, k = 10)

In [18]:
U.shape

(1421, 10)

In [19]:
data['Name']

0                                         Crème de la Mer
1                                Facial Treatment Essence
2                              Protini™ Polypeptide Cream
3                             The Moisturizing Soft Cream
4           Your Skin But Better™ CC+™ Cream with SPF 50+
                              ...                        
1416    Yoghurt Nourishing Fluid Veil Face Sunscreen B...
1417    Daily Deflector™ Waterlight Broad Spectrum SPF...
1418                              Self Tan Dry Oil SPF 50
1419                     Pro Light Self Tan Bronzing Mist
1420    DERMAPROTECT Daily Defense Broad Spectrum SPF 50+
Name: Name, Length: 1421, dtype: object

In [20]:
U[0]

array([ 0.05290772, -0.04555552, -0.0070136 ,  0.05144729,  0.01084423,
        0.04266688,  0.01189289, -0.02836686, -0.00335185,  0.02833186])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
cosine_similarity([U[0]], [U[0]])

array([[1.]])

In [23]:
sim = cosine_similarity([U[2]], U)

In [24]:
for idx in np.argsort(sim)[0][::-1][:10]:
    print(data.iloc[idx]['Name'], np.trunc(sim[0][idx]*100)/100) 

Protini™ Polypeptide Cream 1.0
Super Multi-Corrective Cream 0.97
Wild Rose + Vitamin C Advanced Brightening Sleeping Facial 0.97
R.N.A. POWER Face Cream 0.96
Vinosource Intense Moisture Rescue Cream 0.96
Lotus Youth Preserve Moisturizer 0.96
Resveratrol Lift Night Infusion Cream 0.96
Water Drop Hydrating Moisturizer 0.96
Ultra Repair® Hydrating Serum 0.94
Watermelon Pink Juice Moisturizer 0.93


In [25]:
#Find similar product with an input and sort by price
def similar_product_byprice(x):
    sim = cosine_similarity([U[x]], U)
    
    similar_product = [(data.iloc[idx][['Name','Price']], np.trunc(sim[0][idx]*100)/100) for idx in np.argsort(sim)[0][::-1][:10]]
    return similar_product

In [26]:
similar_product_byprice(4)

[(Name     Your Skin But Better™ CC+™ Cream with SPF 50+
  Price                                               38
  Name: 4, dtype: object,
  1.0),
 (Name     Your Skin But Better™ CC+Illumination™ Cream w...
  Price                                                   38
  Name: 70, dtype: object,
  0.99),
 (Name     Plantscription™ SPF 25 Power Anti-Aging Cream
  Price                                               60
  Name: 160, dtype: object,
  0.99),
 (Name     Dr. Andrew Weil For Origins™ Mega-Bright SPF 3...
  Price                                                   56
  Name: 154, dtype: object,
  0.99),
 (Name     Cicapair ™ Tiger Grass Color Correcting Treatm...
  Price                                                   52
  Name: 63, dtype: object,
  0.98),
 (Name     Black Label Detox BB Beauty Balm
  Price                                  36
  Name: 212, dtype: object,
  0.98),
 (Name     Color Control Cushion Compact Broad Spectrum S...
  Price                                 

In [27]:
data["Harmful_part"] = data.Harmful_Substances / data.Ingredients.str.len() * 100

In [28]:
#Find similar product with an input and sort by part of harmful ingredients
def similar_product_byharmfulpart(x):
    sim = cosine_similarity([U[x]], U)
    
    similar_product = [(data.iloc[idx][['Name','Harmful_part']], np.trunc(sim[0][idx]*100)/100) for idx in np.argsort(sim)[0][::-1][:10]]
    return similar_product

In [29]:
similar_product_byharmfulpart(2)

[(Name            Protini™ Polypeptide Cream
  Harmful_part                      0.093633
  Name: 2, dtype: object,
  1.0),
 (Name            Super Multi-Corrective Cream
  Harmful_part                        0.469484
  Name: 117, dtype: object,
  0.97),
 (Name            Wild Rose + Vitamin C Advanced Brightening Sle...
  Harmful_part                                            0.0944287
  Name: 127, dtype: object,
  0.97),
 (Name            R.N.A. POWER Face Cream
  Harmful_part                    1.10803
  Name: 19, dtype: object,
  0.96),
 (Name            Vinosource Intense Moisture Rescue Cream
  Harmful_part                                    0.865801
  Name: 224, dtype: object,
  0.96),
 (Name            Lotus Youth Preserve Moisturizer
  Harmful_part                            0.396825
  Name: 10, dtype: object,
  0.96),
 (Name            Resveratrol Lift Night Infusion Cream
  Harmful_part                                 0.496278
  Name: 164, dtype: object,
  0.96),
 (Name    