# Data cleaning

#### The content of the product pages has been scraped; it now needs to be cleaned.

Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Load the scraped pages: iterating the file handle yields the same list of
# lines (trailing newlines included) that readlines() would return.
with open('Pages.txt', encoding="utf8") as pages_file:
    lines = list(pages_file)

In [3]:
len(lines)

16862

In [4]:
facial_care_df = pd.read_csv('./Data/beautetest_facial_care_products.csv')

In [5]:
# Same count as len(lines): the per-page lists built further down are assigned
# to facial_care_df as whole columns, which only works if the lengths match.
# NOTE(review): presumably row i of the CSV corresponds to lines[i] — confirm
# that the scraper wrote both files in the same order.
len(facial_care_df)

16862

Let's take a look at our data

In [6]:
lines[10611]

"Beauté Test et ses partenaires souhaitent utiliser des cookies ou traceurs pour stocker et accéder à des données personnelles, comme votre visite sur ce site, pour :\t- Mesure d'audience,\t- Fonctionnalités liées aux réseaux sociaux,\t- Contenu personnalisé et mesure de performance du contenu,\t- Publicités personnalisées, mesure de performance des publicités et données d'audience,\t- Développer et améliorer les produits,\t- Données de géolocalisation précises et identification par analyse du terminal,\tVous pouvez autoriser ou refuser tout ou partie de ces traitements de données qui sont basés sur votre consentement ou sur l'intérêt légitime de nos partenaires, à l'exception des cookies et/ou traceurs nécessaires au fonctionnement de ce site. Vous pouvez modifier vos choix à tout moment. Pour en savoir plus, consultez notre politique de protection des données personnelles.\tParamétrer\tJe n'accepte rien\tJ'accepte tout\tMENU\tAccueil Soins du visage Baumes lèvres\tLip Sleeping Mask -

We want to extract :
- Texture
- Types of skin (Type de peau)
- Price per liter (Prix au litre)
- Launch date (Sortie en)
- Product replacement for (Ce produit remplace)
- Ingredients

In [7]:
# Keep, for every page, only the span that runs from the 'Catégorie' label up
# to (not including) the 'Contenance' label. A page missing either label
# raises ValueError, exactly as str.index does.
products_basic_info = []
for page in lines:
    span_start = page.index('Catégorie')
    span_stop = page.index('Contenance')
    products_basic_info.append(page[span_start:span_stop])

In [8]:
# Extract the texture: the tab-delimited value that follows the 'Texture' label
txt_lenght = len('Texture\t')
products_texture = []
for info in products_basic_info:
    # The value starts right after 'Texture\t' and ends at the next tab.
    value_start = info.index('Texture') + txt_lenght
    value_stop = info.index('\t', value_start)
    products_texture.append(info[value_start:value_stop])

In [9]:
#We are putting it in the dataframe
facial_care_df['Textures'] = products_texture

In [10]:
# 'Type de peau' shows up as a texture for 2 products: no texture was provided
# for them, so the extraction picked up the label of the next field instead.
# These entries are relabelled in the following cell.
facial_care_df['Textures'].value_counts()

Crème           6370
Fluide          3185
Gel             1936
Liquide         1466
Autre           1028
Huile            688
Baton            621
Tissu            352
Mousse           286
Baume            285
Eau              237
Patch            189
Lingette         147
Savon             54
Stick              9
Lingettes          7
Type de peau       2
Name: Textures, dtype: int64

In [11]:
# Relabel the bogus 'Type de peau' texture (extracted when a page has no
# texture value) as missing. The replacement is restricted to the 'Textures'
# column so that an identical string in any other column is left untouched,
# and it is written as an assignment rather than an in-place mutation so the
# cell can be re-run safely.
facial_care_df['Textures'] = facial_care_df['Textures'].replace('Type de peau', 'Non renseigné')

In [12]:
facial_care_df['Textures'].value_counts()

Crème            6370
Fluide           3185
Gel              1936
Liquide          1466
Autre            1028
Huile             688
Baton             621
Tissu             352
Mousse            286
Baume             285
Eau               237
Patch             189
Lingette          147
Savon              54
Stick               9
Lingettes           7
Non renseigné       2
Name: Textures, dtype: int64

Texture\tCrème\tType de peau\tToutes\tHydratant\tOui\tAnti-âge\tOui\tAnti-rides\tOui\tPrix indicatif\t17.90 €\tContenance\t50 ml\tPrix au litre :\t358 €/l\tSortie en\tJanvier 2021\tCe produit remplace :\tSoin Revolumisant Anti-âge Jour - Revitalift Filler [H.A]\t414 avis\

In [13]:
# Extract the skin type: the tab-delimited value that follows 'Type de peau'.
# Pages without that label get the placeholder 'Non applicable/Non renseigné'.
txt_lenght = len('Type de peau\t')
products_types_of_skin = []
for info in products_basic_info:
    if 'Type de peau' not in info:
        products_types_of_skin.append('Non applicable/Non renseigné')
        continue
    value_start = info.index('Type de peau') + txt_lenght
    value_stop = info.index('\t', value_start)
    products_types_of_skin.append(info[value_start:value_stop])

In [14]:
#We are putting it in the dataframe
facial_care_df['Types of skin'] = products_types_of_skin

In [15]:
facial_care_df['Types of skin'].value_counts()

Toutes                          10022
Non applicable/Non renseigné     2845
Sèche                            1190
Sensible                          795
Normale                           720
Mixte                             591
Grasse                            409
Déshydratée                       290
Name: Types of skin, dtype: int64

In [16]:
# Extract the launch date of the product (this information isn't present on every product page).
def _extract_launch_date(page, label='Sortie en\t', default='No lauching date provided'):
    """Return the tab-delimited value that follows `label` in `page`.

    The presence test and the lookup use the same label (including its
    trailing tab). Previously the test looked for 'Sortie en' while the lookup
    searched for 'Sortie en\\t', so a page containing the phrase without a tab
    raised ValueError instead of falling back to `default`.

    Raises ValueError if the label is present but no tab follows its value.
    """
    if label not in page:
        return default
    value_start = page.index(label) + len(label)
    return page[value_start:page.index('\t', value_start)]


txt_lenght = len('Sortie en\t')
# Values are kept verbatim as found on the page; nothing is normalised here.
product_launch_date = [_extract_launch_date(line) for line in lines]

In [17]:
facial_care_df['Launch dates'] = product_launch_date

In [18]:
facial_care_df['Launch dates'].value_counts()

No lauching date provided    15797
Septembre 2020                  42
Juin 2016                       31
Octobre 2014                    30
Janvier 2017                    27
                             ...  
Mai 2017                         1
Mars 2009                        1
Octobre 2017                     1
Août 2109                        1
Octobre 2018                     1
Name: Launch dates, Length: 140, dtype: int64

In [19]:
# Extract the name of the product this one replaces, if any.
# Pages without the 'Ce produit remplace' label get an empty string.
txt_lenght = len('Ce produit remplace :\t')
product_replacement_for = []
for page in lines:
    if 'Ce produit remplace' in page:
        # Skip the full label (' :' and tab included), then cut at the next tab.
        tail = page[page.index('Ce produit remplace') + txt_lenght:]
        product_replacement_for.append(tail[:tail.index('\t')])
    else:
        product_replacement_for.append('')

In [20]:
facial_care_df['Replacement for'] = product_replacement_for

In [21]:
facial_care_df['Replacement for'].value_counts()

                                                  14667
Lait Démaquillant                                     4
Eau Micellaire                                        4
Lait Démaquillant Douceur                             4
Crème-Green Lift - SpiruLines                         3
                                                  ...  
Hydra Écume Eau Démaquillante Oligo-Micellaire        1
Cleanser                                              1
Crème de Coton - Idéal Douceur                        1
Soin Gommant à la Boue                                1
Merveillance Expert Enrichie                          1
Name: Replacement for, Length: 2137, dtype: int64

In [22]:
# Extract the price per liter, per kilogram or per unit.
txt_lenght_l = len('Prix au litre :\t')
txt_lenght_k = len('Prix au kg :\t')
txt_lenght_u = len('Prix unitaire :\t')

# (label, offset to the value, number of characters kept from '€/' onwards).
# The widths 3 / 4 / 7 are the ones the notebook has always used; '€/l' is
# 3 characters — presumably the others are '€/kg' and '€/unité' (TODO confirm).
_PRICE_FIELDS = (
    ('Prix au litre :', txt_lenght_l, 3),
    ('Prix au kg :', txt_lenght_k, 4),
    ('Prix unitaire :', txt_lenght_u, 7),
)


def _extract_unit_price(page):
    """Return the unit price found on `page`, or 'Non renseigné'.

    Labels are tried in the order liter, kilogram, unit. For each label found,
    '€/' is searched *after* the label and must occur before the next tab, i.e.
    inside the labelled field. Previously the first '€/' anywhere on the page
    was used, which gives a wrong or empty slice when '€/' also appears earlier
    in the page, and a page containing '€/' but none of the three labels
    raised ValueError.
    """
    for label, value_offset, unit_width in _PRICE_FIELDS:
        label_pos = page.find(label)
        if label_pos == -1:
            continue
        value_start = label_pos + value_offset
        euro_pos = page.find('€/', value_start)
        field_end = page.find('\t', value_start)
        if euro_pos != -1 and (field_end == -1 or euro_pos < field_end):
            return page[value_start:euro_pos + unit_width]
    return 'Non renseigné'


product_price_per_l_k_u = [_extract_unit_price(line) for line in lines]

In [23]:
facial_care_df['Price per liter/kilogram/unit'] = product_price_per_l_k_u

In [24]:
# For every page, take the tab-delimited field that immediately follows the
# first 'Formule\t' label; pages without that label get 'Non renseigné'.
ingredients_uncleaned = []
for page in lines:
    if 'Formule\t' in page:
        after_label = page.split('Formule\t', 1)[1]
        ingredients_uncleaned.append(after_label.split('\t', 1)[0])
    else:
        ingredients_uncleaned.append('Non renseigné')

In [25]:
# Load './Data/incibeauty_ingredients.csv' and keep its 'Names' column as a plain Python list.
# NOTE(review): inci_list_of_ing is not used in the cells visible here —
# presumably a reference list for a later cleaning step; confirm or remove.
inci_df = pd.read_csv('./Data/incibeauty_ingredients.csv')
inci_list_of_ing = inci_df['Names'].tolist()

In [26]:
ingredients_uncleaned

['782974 33 - Ingredients : Aqua / water • glycerin • dimethicone • isohexadecane • silica • hydroxyethylpiperazine ethane sulfonic acid • alcohol denat. • dipropylene glycol • synthetic wax • adenosine • disodium edta •sodium acetylated hyaluronate • sodium hyaluronate • disteardimonium hectorite • caprylic/capric triglyceride • caprylyl glycol • ceteareth-6 • ci 77163 / bismuth oxychloride • ci 77891 / titanium dioxide • ethylhexyl hydroxystearate • nylon-12 • peg-30 dipolyhydroxystearate • sodium acrylates copolymer • sodium citrate • sorbitan oleate • stearyl alcohol • synthetic fluorphlogopite • tocopherol • dimethicone/peg-10/15 crosspolymer • dimethicone/polyglycerin-3 crosspolymer • peg-10 dimethicone • citronellol • linalool • phenoxyethanol • parfum / fragrance. (F.I.L. B258055/1)',
 'Aqua, Dibutyl Adipate, Glycerin, Ethylhexyl Salicylate, Titanium Dioxide (Nano), Cetearyl Alcohol, Diethylamino Hydroxybenzoyl Hexyl Benzoate, Glyceryl, Stearate Citrate, Propylheptyl Caprylate,

In [27]:
# Print every extracted entry shorter than 60 characters (other than the
# 'Non renseigné' placeholder) followed by its position in the list, so that
# entries that are not real ingredient lists can be reviewed by hand.
for position, ings in enumerate(ingredients_uncleaned):
    if ings != 'Non renseigné' and len(ings) < 60:
        print(ings)
        print(position)

Extrait d'avoine Hydratant + Rose Apaisante
11
Principaux actifs:
41
Aqua* : eau dé-ionisée par osmose inverse.
70
Composition :
86
Sans paraben
100
ACTIVE INGREDIENTS: Zinc Oxide 20.0%
101
Ginseng : Stimulant, tonifiant et énergisant
125
Ingredients :
140
98,8 % des ingrédients sont d'origine naturelle
143
Sans paraben
157
Testé sous contrôle dermatologique. Non comédogène.
161
Composition :
169
Ingrédients actifs :
193
Aqua* : eau dé-ionisée par osmose inverse.
198
Aqua* : eau dé-ionisée par osmose inverse.
226
Actifs :
231
Active Ingredients : Zinc Oxide = 18.90 %
232
Lavendula Angustifolia Flower -
236
SANS PARABEN
273
Actifs :
278
Le Glycoléol à l'action nutritive et hautement relipidante
288
Algue Tisseuse : restructure le réservoir en eau de la peau
296
Nouvelle formule fluide et sans paraben :
297
Testé sous contrôle dermatologique.
340
Contient notamment : Extrait d'essence de « Tea Tree ».
357
Composition et principes actifs
381
Actif Végétal : 30 Huiles Précieuses
382
Princi

Extrait de Bourgeon de Hêtre
5465
Le 1er soin anti-âge garanti et certifié :
5466
αHA - Acide Glycolique > dépigmentant et anti-âge
5467
Test¨¦ sous contrôle ophtalmologique.
5469
Testé sous contrôle ophtalmologique.
5475
98.8 % du total des ingrédients sont d'origine naturelle.
5478
Principes actifs :
5479
Complexe breveté ISO 3-R
5484
C.L.C
5494
Soumis à des test d'allergie
5507
Soumis à des tests d'allergie
5508
Actifs :
5514
Dérivé de silicium:4,65%
5522
Principaux composants :
5534
Principes actifs :
5539
50% d'Aloe Vera, extrait de magnolia.
5548
Actifs principaux :
5555
Carbomère 980 : 2 mg pour 1 g de solution.
5562
Sans parabène, sulfate, silicone et colorant
5574
Les actifs :
5591
100 % des ingrédients sont d'origine naturelle,
5634
Ingredients par flacon
5635
Sans Paraben
5636
Composition :
5639
Principes actifs :
5665
Ingrédients Clés :
5708
Testé sous contrôle dermatologique
5728
Principaux ingrédients actifs :
5729
Principes actifs
5736
Pour une dose (0,4ml) :
5738
Le con

Contient de l'allantoïne qui exfolie la peau.
14166
Ingrédients : Reine des Prés, Edelweiss, Helichryse
14206
Ingrédients clés
14208
Huile d'olive BIO hydratante
14241
Sans paraben
14247
Actifs
14255
Ingrédients actifs :
14259
98% du total des ingrédients sont d'origine naturelle
14264
Aloe Vera + Glycérine : hydratant
14272
Ingrédient vedette :
14273
Aloe Vera + Glycérine : hydratant
14274
Aloe Vera + Glycérine : hydratant
14275
Sans huile - Acide salicylique à 2 %
14320
Acide glycolique 15%
14330
Acide glycolique 6%
14343
Extrait de myrtilles : protège
14347
Dunaliella Salina de Camargue - Brevet déposé
14365
Organically Produced*
14370
Sans conservateurs
14384
98,9% du total des ingrédients sont d'origine naturelle.
14407
Sans alcool
14411
> pH physiologique
14416
Testé sous contrôle dermatologique
14427
Actifs : Myrtille*, Géranium*
14456
Composants clés
14467
Composition :
14468
Huile de noix de coco* *Issue de l'agriculture biologique
14476
Ingrédients-clés: Bio-Schisandra
14493


In [28]:
# Manual overrides for entries whose extracted 'Formule' text was a short blurb
# rather than an ingredient list (see the indices printed by the previous cell):
# each one is replaced either by a hand-entered ingredient list or by 'Non renseigné'.
# NOTE(review): the origin of the hand-entered lists is not recorded in this notebook.
ingredients_uncleaned[11] ='AQUA / WATER,GLYCERIN,CYCLOHEXASILOXANE,ALCOHOL DENAT.,HYDROGENATED POLYISOBUTENE,AMMONIUM POLYACRYLDIMETHYLTAURAMIDE / AMMONIUM POLYACRYLOYLDIMETHYL TAURATE,SILICA,MYRISTYL MYRISTATE,ALLANTOIN,ALOE BARBADENSIS / ALOE BARBADENSIS LEAF JUICE,AVENA SATIVA FLOUR / OAT KERNEL FLOUR,CITRIC ACID,COPPER PCA,GLYCERYL STEARATE SE,PHENOXYETHANOL,POTASSIUM CETYL PHOSPHATE,SODIUM BENZOATE,SODIUM HYDROXIDE,STEARIC ACID,TOCOPHERYL ACETATE,PARFUM / FRAGRANCE (F.I.L. B34569/1)'
ingredients_uncleaned[41] = 'Non renseigné'
ingredients_uncleaned[70] = 'Aqua,Caprylic/Capric triglycerides,Cetearyl alcohol,Glycerin ,Isoamyl laurate,Linoleic acid,Glyceryl stearate citrate,Cetearyl glucoside,Tamarindus Indica Seed Polysaccharide,Oleic acid,Lysolecithin,Palmitic acid,Tocopherol,Heliantus annuus (Sunflower) seed oil,Pullulan,Glucose,Linolenic acid,Silica,Parfum (fragrance),Sclerotium gum,Sodium stearoyl glutamate,Xanthan gum,Stearic acid,Tetrasodium glutamate diacetate,Citric acid,Benzyl alcohol,Déhydroacetic acid,Linalool,Limonene,Potassium sorbate,'
ingredients_uncleaned[86] = 'Water (Aqua), Carbonate C12-15 Alkyl Benzoate, Caprylic/Capric Triglyceride Diethylhexyl Butamido Triazone, Tribehenin Peg-20 Esters, Glycerin, Betaine Alluminum Starch Octenylsuccinate, Bis-Ethylhexyloxyphenol, Methoxyphenyl Triazine, Methylene Bis-Benzotriazolyl Tetramethyl Butyphenol, Propylene Glycol, Butyl Methoxy Dibenzoylmethane, (Butyrospermum Parkii (Shea Butter)(Butyrospermum Parkii Butter), Avena Sativa (Oat)Leaf/Stem Extract (Avena Sativa Leaf/Stem Extract)*, Benzoic Acid, Caprylyl Glycol, Carbomer, Decyl Glucoside, Disodium Edta, Fragrance (Parfum), Sodium Hydroxide, Tocopherol, Tocopheryl Acetate, Xanthan Gum.'
ingredients_uncleaned[100] ='Non renseigné'
# NOTE(review): this literal looks like two ingredient lists spliced together — a second list
# ('Aloe Barbadensis Juice, ... Oils') sits in the middle of the word 'Déhydroacetic'
# ('...Benzyl alcohol,DAloe ... Oilséhydroacetic acid,...'). Likely a paste slip; confirm the
# intended ingredient list for index 101 against the product page before relying on it.
ingredients_uncleaned[101] ='Aqua,Caprylic/Capric triglycerides,Cetearyl alcohol,Glycerin ,Isoamyl laurate,Linoleic acid,Glyceryl stearate citrate,Cetearyl glucoside,Tamarindus Indica Seed Polysaccharide,Oleic acid,Lysolecithin,Palmitic acid,Tocopherol,Heliantus annuus (Sunflower) seed oil,Pullulan,Glucose,Linolenic acid,Silica,Parfum (fragrance),Sclerotium gum,Sodium stearoyl glutamate,Xanthan gum,Stearic acid,Tetrasodium glutamate diacetate,Citric acid,Benzyl alcohol,DAloe Barbadensis Juice, Simmondsia Chinensis (Jojoba) and Helianthus Annuus (Sunflower) Oils, Sorbitan Stearate, Glyceryl Stearate, Fruit Stem Cells (Malus Domestsica, Solar Vitis) and BioActive 8 Berry Complex, Resveratrol, Ubiquinone (CoQ10), Caprylic/Capric Triglycerides, Cocos Nucifera (Coconut), Borago Officinalis (Borage) and Rosa Canina (Rosehip) Oils, Algae and Cucumis Melo (Melon) Extracts, Magnesium Sulfate, Zea Mays (Corn) Starch, Camellia Sinensis (White Tea), Aspalathus Linearis (Rooibos) and Hibiscus Sabdariffa Extracts^, Tocopherol (Vitamin E), Iron Oxide, Phenethyl Alcohol, Ethylhexylglycerin, Citrus Aurantium Dulcis (Orange) and Citrus Tangerina (Tangerine) Oilséhydroacetic acid,Linalool,Limonene,Potassium sorbate'
ingredients_uncleaned[125] ='AQUA/WATER,CYCLOMETHICONE,GLYCERIN,TITANIUM DIOXIDE,HEXYL LAURATE,DIPROPYLENE GLYCOL,PEG-10 DIMETHICONE,ETHYLHEXYL METHOXYCINNAMATE,TALC,DIMETHICONE,ZINC OXIDE-[nano],TITANIUM DIOXIDE-[nano],ISOEICOSANE,DISTEARDIMONIUM HECTORITE,BETAINE,CENTELLA ASIATICA EXTRACT,POLYGONUM CUSPIDATUM ROOT EXTRACT,SCUTELLARIA BAICALENSIS ROOT EXTRACT,CAMELLIA SINENSIS LEAF EXTRACT,GLYCYRRHIZA GLABRA (LICORICE) ROOT EXTRACT,CHAMOMILLA RECUTITA (MATRICARIA) FLOWER EXTRACT,PORTULACA OLERACEA EXTRACT,ROSMARINUS OFFICINALIS (ROSEMARY) LEAF EXTRACT,HYDROGENATED STARCH HYDROLYSATE,ETHYLHEXYLGLYCERIN,TOCOPHERYL ACETATE,MAGNESIUM SULFATE,SILICA,GLYCOSYL TREHALOSE,ALUMINUM HYDROXIDE,PALMITIC ACID,STEARIC ACID,BUTYLENE GLYCOL,PHENOXYETHANOL,VINYL DIMETHICONE/METHICONE SILSESQUIOXANE CROSSPOLYMER,ACRYLATES/DIMETHICONE COPOLYMER,DIMETHICONE/VINYL DIMETHICONE CROSSPOLYMER,METHICONE,CI 77492/IRON OXIDES,CI 77499/IRON OXIDES,CI 77491/IRON OXIDES'
ingredients_uncleaned[140] = 'Aqua, cyclopentasiloxane, dicaprylyl carbonate, mangifera indica seed butter, cyclohexasiloxane, pentaerythrityl tetraisostearate, hydroxyetrhyl acrylate/sodium acryloyldimethyl taurate copolymer, butylene glycol, imperata cylindrica root extract, glycerin, disteardimonium hectorite, chlorphenesin, tocopheryl acetate, propylene carbonate, parfum, polysorbate 60, sorbitan isostearate, citric acid, laureth-3, hydroxyethylcellulose, sodium hyaluronate, caprylyl glycol, carbomer, acetyl dipeptide-1 cetyl ester, oxothiazolidine, methylisothiazolinone, potassium sorbate, tocopherol, acrylates/c10-30 alkyyl acrylate crosspolymer, sodium benzoate.'
ingredients_uncleaned[143] = 'ALOE BARBADENSIS LEAF JUICE*, AQUA (WATER), GLYCERIN, DICAPRYLYL ETHER, COCO-CAPRYLATE/CAPRATE, CELLULOSE, CETEARYL OLIVATE, SORBITAN OLIVATE, MAGNESIUM ALUMINUM SILICATE (MONTMORILLONITE)(BENTONITE), C10-18 TRIGLYCERIDES, CETEARYL ALCOHOL, ORBIGNYA OLEIFERA SEED OIL*, RIBOSE, PARFUM (FRAGRANCE), BENZYL ALCOHOL, LEVULINIC ACID, BUTYROSPERMUM PARKII (SHEA) BUTTER*, SODIUM STEAROYL GLUTAMATE, XANTHAN GUM, TOCOPHEROL, SODIUM BENZOATE, ARGANIA SPINOSA KERNEL OIL*, BORAGO OFFICINALIS SEED OIL*, OENOTHERA BIENNIS OIL*, P-ANISIC ACID, PRUNUS ARMENIACA (ARPICOT) KERNEL OIL*, SIMMONDSIA CHINENSIS (JOJOBA) SEED OIL*, SODIUM GLUCONATE, VITIS VINIFERA (GRAPE) SEED OIL*, SODIUM LEVULINATE, CITRIC ACID, HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL, SODIUM HYDROXIDE, POLYGONUM AVICULARE EXTRACT, MALTODEXTRIN, MORINGA OLEIFERA SEED EXTRACT, POTASSIUM SORBATE, GLYCINE SOJA (SOYBEAN) OIL, CITRONELLOL, COUMARIN, GERANIOL, LINALOOL.'
ingredients_uncleaned[157] ='Non renseigné'
ingredients_uncleaned[161] ='Non renseigné'
ingredients_uncleaned[169] ='AQUA (WATER), GLYCERIN, GLYCERYL STEARATE, PEG-100 STEARATE, PROPYLENE GLYCOL DICAPRYLATE / DICAPRATE, PEG-8, BETAINE SALICYLATE, OCTYLDODECANOL, POTASSIUM CETYL PHOSPHATE, CETEARYL ALCOHOL, NYLON-12, DICAPRYLYL CARBONATE, BUTYLENE GLYCOL, PENTYLENE GLYCOL, PENTAERYTHRITYL DISTEARATE, HYDROXYPROPYL STARCH PHOSPHATE, NIACINAMIDE, CYCLOPENTASILOXANE, METHYL METHACRYLATE CROSSPOLYMER, NELUMBIUM SPECIOSUM FLOWER EXTRACT (NELUMBO NUCIFERA FLOWER WATER), ARGANIA SPINOSA KERNEL OIL, SERENOA SERRULATA FRUIT EXTRACT, SESAMUM INDICUM (SESAME) SEED EXTRACT, SODIUM C8-16 ISOALKYLSUCCINYL LACTOGLOBULIN SULFONATE, PARFUM (FRAGRANCE), PEG-60 ALMOND GLYCERIDES, TOCOPHEROL, CARBOMER, NORDIHYDROGUAIARETIC ACID, SODIUM METABISULFITE, OLEANOLIC ACID, CHLORPHENESIN, DISODIUM EDTA, PHENOXYETHANOL, GLYCERYL HYDROXYSTEARATE, SODIUM HYDROXIDE, CAPRYLYL GLYCOL, BETA SITOSTEROL'

In [29]:
ingredients_uncleaned[193]= 'AQUA (WATER), CAPRYLIC/CAPRIC TRIGLYCERIDE, PENTYLENE GLYCOL, CETEARYL ALCOHOL, COCO CAPRYLATE/CAPRATE, GLYCERIN, C12-13 ALKYL LACTATE, OENOTHERA BIENNIS (EVENINIG PRIMROSE) OIL, GLYCERYL STEARATE, PEG-100 STEARATE, PLUKENETIA VOLUBILIS SEED OIL, OPUNTIA FICUS-INDICA FRUIT EXTRACT, PUNICA GRANATUM FRUIT EXTRACT, EVENING PRIMROSE OIL / PALM OIL AMINOPROPANEDIOL ESTERS, TOCOPHERYL ACETATE, CHLORPHENESIN, CARBOMER, PARFUM (FRAGRANCE), DISODIUM EDTA, SODIUM HYDROXIDE, CITRIC ACID, SODIUM BENZOATE, POTASSIUM SORBATE, TOCOPHEROL, LIMONENE, GERANIOL'
ingredients_uncleaned[198]='Aqua,Isoamyl laurate,Caprylic/Capric triglycerides,Cetearyl alcohol,Glycerin ,Glyceryl stearate citrate,Cetearyl glucoside,Crocus sativus flower extract,Heliantus annuus (Sunflower) seed oil,Tocopherol,Pullulan,Parfum (fragrance),Lysolecithin,Sclerotium gum,Sodium stearoyl glutamate,Xanthan gum,Silica,Tetrasodium glutamate diacetate,Citric acid,Benzyl alcohol,Déhydroacetic acid,Potassium sorbate,Sodium benzoate'
ingredients_uncleaned[226]='Aqua,Isoamyl laurate,Caprylic/Capric triglycerides,Cetearyl alcohol,Glycerin ,Glyceryl stearate citrate,Cetearyl glucoside,Zinc gluconate,Humulus Lupulus (Hops) Cone Extract,Arctium Majus Root Extract,Calendula Officinalis Flower Extract,Citrus limon (Lemon) Fruit Extract,Hypericum Perforatum Extract,Salvia Officinalis (Sage) Leaf Extract,Saponaria Officinalis Extract,Heliantus annuus (Sunflower) seed oil,Tocopherol,Silica,Parfum (fragrance),Lysolecithin,Sclerotium gum,Sodium stearoyl glutamate,Pullulan,Xanthan gum,Tetrasodium glutamate diacetate,Citric acid,Déhydroacetic acid,Potassium sorbate,Sodium benzoate,Benzyl alcohol'
ingredients_uncleaned[231]='Aqua (water), Cyclopentasiloxane, Dimethicone, Cyclohexasiloxane, Pentylene glycol, CI 77891 (titanium dioxide), Sodium stearoyl glutamate, Butyrospermum parkii (shea) butter, Glycerin, Caprylic/capric triglyceride, Cetearyl alcohol, Xylitylglucoside, Hectorite, Acrylamide/sodium acryloyldimethyltaurate copolymer, Anhydroxylitol, CI 77492 (iron oxides), Isohexadecane, Sodium benzoate, Xylitol, Acrylates/ammonium methacrylate copolymer, Dimethicone crosspolymer, Chlorphenesin, Parfum (fragrance), CI 77491 (iron dioxide), Polysorbate 80, Xanthan gum, Ci 77499 (iron oxides), o-Cymen-5-ol, Tocopheryl acetate, Citric acid, Sorbitan oleate, Boron nitride, CI 77491 (iron oxides), Menyanthes trifoliata leaf extract, Undaria pinnatifida extract, Actinidia chinensis (kiwi) fruit extract, Hydroxycitronellal, Linalool, Disodium stearoyl glutamate, Aluminum hydroxide'
ingredients_uncleaned[232]='Zinc Oxide, Aloe Barbadensis Leaf Juice, Arachidyl Alcohol, Arachidyl Glucoside, Avena Sativa (Oat) Bran Extract, Behenyl Alcohol, Bisabolol, Butyloctyl Salicylate, Caprylhydroxamic Acid, Capryloyl Glycerin/Sebacic Acid Copolymer, Caprylyl Glycol, Citric Acid, Citrus Aurantium Dulcis (Orange) Peel Extract, Copernicia Cerifera (Carnauba) Wax, Diheptyl Succinate, Ethyl Ferulate, Ethyl Macadamiate, Ethylhexyl Olivate, Erythritol, Glycerin, Glyceryl Stearate, Hydrogenated Castor Oil, Jasminum Officinale (Jasmine) Oil, Morinda Citrifolia Fruit Extract, Plantago Major (Plantain) Extract, Polyhydroxystearic Acid, Propanediol, Ricinus Communis (Castor) Seed Oil, Sodium Gluconate, Sodium Hyaluronate, Squalane, Tocopherol, Water, Xanthan Gum'
ingredients_uncleaned[236]='Lavendula Angustifolia Flower Distilate (Lavender Flower Water),Helianthus Annuus Hypericum Perforatum (Sunflower Oil),Simmondsia Chinensis (Jojoba oil),Sesamum Indicum Daucus Carota (Sesame with Carrot Oil),Cera Alba (Beeswax),Emulsifying Wax (Vegetable based-used to combine Flower Water and oil),Theobroma Cacao (Cocoa Butter),Citrus Grandis (Grapefruit Seed Extract),Tocopherol (Vitamin E),Essential Oils of Lavendula,Angustifolia (Lavender),Pelargonium Graveolens (Geranium),Cymbopogon Martinii (Palmerosa)'
ingredients_uncleaned[273]='Non renseigné'
ingredients_uncleaned[278]='WATER (AQUA),ETHYLHEXYL PALMITATE,PROPYLENE GLYCOL,GLYCERYL BORAGO OFFICINALIS (BORAGE) SEED OIL,ZEA MAYS (CORN) OIL,GLYCERYL STEARATE,PEG 100 STEARATE,CETEARETH-25,STEARIC ACID,CETEARYL ALCOHOL,HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL,GLYCINE SOJA (SOYBEAN) OIL,HYDROLYZED SOY FLOUR,LACTOFERRIN,THOIXANTINE,URIC ACID,SORBITOL,CARBOXYMETHYL CHITIN,ALGAE,PANTHENOL,SODIUM HYALURONATE,PHENOXYETHANOL,METHYLPARABEN,BUTYLPARABEN,ETHYLPARABEN,PROPYLPARABEN,ISOBUTYLPARABEN,BUTYLENE GLYCOL,TOCOPHERYL ACETATE,RETINYL PALMITATE,CARBOMER,MAGNESIUM ASCORBYL PHOSPHATE,GLYCERYL DISTEARATE,ATELOCOLLAGEN,SODIUM CHONDROITIN SULFATE,FRAGRANCE (PARFUM),HYDROLYZED  WHEAT PROTEIN (TRITICUM VULGARE),CHONDRUS CRISPUS (CARRAGEENAN,MANNITOL,DEXTRIN,DEXTRAN,UREA,ETHYLHEXYL DIMETHYL PABA,SODIUM BORATE,BHT,ASCORBYL PALMITATE,CITRIC ACID,ALPHA-METHYL IONOME,BENZYL BENZOATE,BENZYL SALICYLATE,BUTYLPHENYL METHYLPROPIONAL,CINNAMYL ALCOHOL,CITRONELLOL,EUGENOL,GERANIOL,HEXYL CINNAMAL,HYDROXYCITRONELLAL,HYDROXYISOHEXYL 3-CYCLOHEXENE CARBOXALDEHYDE,LIMONENE,LINALOOL,[MAY CONTAIN +/- TITANIUM DIOXIDE (CI 77891),IRON OXIDES (CI 77491,CI 77492,CI 7499),FD&C YELLOW 5 (CI 19140 : 1)]'
ingredients_uncleaned[288]='Avene thermal spring water (avene aqua),caprylic/capric triglyceride,diethylhexyl butamido triazone,coco-caprylate,dimethicone,tribehenin peg-20 esters,polymethyl methacrylate,bis-ethylhexyloxyphenol methoxyphenyl triazine,ascorbyl glucoside,ppg-15 stearyl ether,glycerin,glyceryl linoleate,cetearyl alcohol,methylene bis-benzotriazolyl tetramethylbutylphenol [nano],water (aqua),1,2-hexanediol,acrylates/c10-30 alkyl acrylate crosspolymer,benzoic acid,bht,caprylyl glycol,citric acid,decyl glucoside,dimethiconol,fragrance (parfum),glyceryl linolenate,glycine soja (soybean) oil (glycine soja oil),helianthus annuus (sunflower) seed oil (helianthus annuus seed oil),propylene glycol,sodium citrate,sodium hyaluronate,sodium hydroxide,tocopherol,tocopheryl glucoside,xanthan gum'
ingredients_uncleaned[296]='AQUA (WATER) / EAU, GLYCERIN, TRIDECYL TRIMELLITATE, DIPROPYLENE GLYCOL, CETYL ALCOHOL, PRUNUS AMYGDALUS DULCIS (SWEET ALMOND) OIL, DIMETHICONE, ISOPROPYL PALMITATE, PHENOXYETHANOL, ZEA MAYS (CORN) OIL, STEARIC ACID, PALMITIC ACID, SODIUM STEAROYL GLUTAMATE, CARBOMER, CHLORPHENESIN, PARFUM (FRAGRANCE), CAPRYLYL GLYCOL, GLYCINE SOJA (SOYBEAN) OIL, HYDROXYETHYL ACRYLATE / SODIUM ACRYLOYLDIMETHYLTAURATE COPOLYMER, PANTHENOL, SQUALANE, SODIUM HYDROXIDE, HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL, MARIS SAL (SEA SALT) / SEL MARIN, XANTHAN GUM, CAPRYLIC/CAPRIC TRIGLYCERIDE, SODIUM PHYTATE, HIPPOPHAE RHAMNOIDES SEED OIL, HYDROLYZED ALGIN, POLYSORBATE 60, UNDARIA PINNATIFIDA EXTRACT, MARIS AQUA (SEA WATER) / EAU DE MER, CHLORELLA VULGARIS EXTRACT, BHT, ALCOHOL, CITRIC ACID, PROPYL GALLATE, TOCOPHEROL, ROSMARINUS OFFICINALIS (ROSEMARY) LEAF EXTRACT'
ingredients_uncleaned[297]='AQUA/WATER/EAU,GLYCERIN,ISODODECANE,CYCLOPENTASILOXANE,DIPROPYLENE GLYCOL,NIACINAMIDE,SQUALANE,POLYMETHYLSILSESQUIOXANE,C14-22 ALCOHOLS,HDI/TRIMETHYLOL HEXYLLACTONE CROSSPOLYMER,AMMONIUM ACRYLOYLDIMETHYLTAURATE/VP COPOLYMER,C30-45 ALKYL CETEARYL DIMETHICONE CROSSPOLYMER,CARBOMER,PENTYLENE GLYCOL,TOCOPHERYL ACETATE,C12-20 ALKYL GLUCOSIDE,STEARETH-21,DISODIUM EDTA,SALICYLIC ACID,SODIUM HYDROXIDE,MANNITOL,XYLITOL,HEXYLDECANOL,PEG/PPG-18/18 DIMETHICONE,RHAMNOSE,MALACHITE EXTRACT,PYRUS MALUS (APPLE) SEED EXTRACT,BRASSICA CAMPESTRIS (RAPESEED) STEROLS,TOCOPHEROL,FRAGRANCE (PARFUM). [BI 714],'
ingredients_uncleaned[340]='Aqua (Eau), Glycerin, Polyacrylamide, Butylene Glycol, C13-14 Isoparaffin, Polysorbate 60, PVP, Pyrus Malus (Apple) Fruit Extract, Rumex Occidentalis Extract, Aesculus Hippocastanum (Horse Chestnut) Seed Extract, Sodium Hyaluronate, Avena Sativa (Oat) Kernel Extract, Tocophersolan, Niacinamide, Panthenol, Caffeine, Biotin, Parfum, Ethylhexylglycerin, Laureth-7, Faex (Extrait de levure), Ammonium Glycyrrhizate, Propylene Glycol, Zinc Gluconate, Citric Acid, Phenoxyethanol, Methylparaben, Propylparaben, Butylparaben, Ethylparaben, Isobutylparaben, BHT, CI 19140 (Yellow 5), CI 42090 (Blue 1), Benzyl Alcohol'
ingredients_uncleaned[357]='Non renseigné'
ingredients_uncleaned[381]='Non renseigné'
# Manual override for index 382. The literal must stay on a single physical line:
# a line break inside a single-quoted string (previously between 'Huile De Chardon '
# and 'Marie)') is a SyntaxError.
ingredients_uncleaned[382]='AQUA/WATER/EAU,DIMETHICONE,GLYCERIN,METHYLPROPANEDIOL,STEARYL ALCOHOL,GLYCERYL STEARATE,ETHYLHEXYL STEARATE,OLUS OIL/VEGETABLE OIL/HUILE VEGETALE (Huile Vegetale),STEARETH-2,COCOS NUCIFERA (COCONUT) OIL (Huile De Noix De Coco),STEARETH-21,BRASSICA CAMPESTRIS (RAPESEED) SEED OIL (Huile De Colza),HYDROGENATED COCONUT OIL (Huile De Coprah Hydrogenee),HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL (Huile De Tournesol),MACADAMIA TERNIFOLIA SEED OIL (Huile De Macadamia),BUTYROSPERMUM PARKII (SHEA) BUTTER (Beurre De Karite),CENTAUREA CYANUS FLOWER WATER (Eau De Fleurs De Bleuet),SIMMONDSIA CHINENSIS (JOJOBA) SEED OIL (Huile De Jojoba),APHLOIA THEIFORMIS LEAF EXTRACT (Extrait De Feuilles D\'Aphloia),LECITHIN,HYDROXYACETOPHENONE,PARFUM/FRAGRANCE,SESAMUM INDICUM (SESAME) SEED OIL,HYDROGENATED VEGETABLE OIL (Huile Vegetale Hydrogene),ETHYL LINOLEATE,XANTHAN GUM (Gomme De Xanthane),PANTHENOL,OLEA EUROPAEA (OLIVE) FRUIT OIL (Huile D\'Olive),ZEA MAYS (CORN) GERM OIL (Huile De Germe De Maïs),RETINYL PALMITATE,GLYCINE SOJA (SOYBEAN) STEROLS (Sterols De Soja),PRUNUS AMYGDALUS DULCIS (SWEET ALMOND) OIL (Huile D\'Amande Douce),CARTHAMUS TINCTORIUS (SAFFLOWER) SEED OIL (Huile De Carthame),CORYLUS AVELLANA (HAZELNUT) SEED OIL,TOCOPHERYL ACETATE,SORBIC ACID,RICINUS COMMUNIS (CASTOR) SEED OIL (Huile De Ricin),CANDELILLA CERA/EUPHORBIA CERIFERA (CANDELILLA) WAX/CIRE DE CANDELILLA,PRUNUS PERSICA (PEACH) KERNEL OIL (Huile De Noyau De Peche),PRUNUS ARMENIACA (APRICOT) KERNEL OIL (Huile De Noyau D\'Abricot),PISTACIA VERA SEED OIL (Huile De Pistache),PERSEA GRATISSIMA (AVOCADO) OIL (Huile D\'Avocat),ORYZA SATIVA (RICE) GERM OIL (Huile De Riz),ORBIGNYA OLEIFERA SEED OIL (Huile De Babassu),MANGIFERA INDICA (MANGO) SEED OIL (Huile De Noyau De Mangue),JUGLANS REGIA (WALNUT) SEED OIL (Huile De Noix),GOSSYPIUM HERBACEUM (COTTON) SEED OIL (Huile De Coton),CAMELLIA OLEIFERA SEED OIL (Huile De Camelia),TRISODIUM ETHYLENEDIAMINE DISUCCINATE,SILYBUM MARIANUM SEED OIL (Huile De Chardon Marie),ROSA CANINA FRUIT OIL (Huile De Rose),OENOTHERA BIENNIS (EVENING PRIMROSE) SEED EXTRACT (Huile D\'Onagre),LIMNANTHES ALBA (MEADOWFOAM) SEED OIL (Huile De Limnanthe),CARAPA GUAIANENSIS SEED OIL (Huile D\'Andiroba),CAMELINA SATIVA SEED OIL (Huile De Cameline),ARGANIA SPINOSA KERNEL OIL (Huile D\'Argan),SODIUM HYDROXIDE,ROSA DAMASCENA EXTRACT (Absolue De Rose De Damas),SODIUM BENZOATE,TOCOPHEROL,POTASSIUM SORBATE,CITRIC ACID,ALCOHOL,PROPYL GALLATE'
ingredients_uncleaned[394]='Aqua (water), propylene glycol, glycerin, alcohol denat.,aloe barbadensis leaf extract* (aloe barbadensis leaf juice), sodium hyaluronate, retinyl palmitate, hydrolyzed collagen, glycine soja (soybean) seed extract, dipeptide diaminobutyroyl benzylamide diacetate, ascorbyl palmitate, tocopheryl acetate, caviar extract, allantoin, sodium chondroitin sulfate, disodium edta, carbomer, triethanolamine, PEG-40 hydrogenated castor oil, ceratonia siliqua gum (carob bean gum), hydroxyethylcellulose, tropolone, sodium benzoate, potassium sorbate, imidazolidinyl urea, caprylyl glycol, 1,2-hexanediol, parfum (fragrance), butylphenyl methylpropional, linalool, geraniol, citronellol'
ingredients_uncleaned[407]='Water (Aqua), Coco-Caprylate/Caprate, Glyceryl Stearate SE, Dicaprylyl Carbonate, Ethyl Macadamiate, Ethylhexyl Methoxycinnamate, Butyl Methoxydibenzoylmethane, Dipalmitoyl Hydroxyproline, Lauryl Laurate, Sesamum Indicum (Sesame) Seed Oil, Glycerin, Phenoxyethanol, Sodium Polyacrylate, Ethylhexylglycerin, Fragrance (Parfum), Benzophenone-3, Cetyl Alcohol, Potassium Cetyl Phosphate, Sclerotium Gum, Stearyl Alcohol, Persea Gratissima (Avocado) Oil, Caprylyl Glycol, Disodium EDTA, Phytosterols, Smithsonite Extract, Sodium Salicylate, Helianthus Annuus (Sunflower) Seed Oil, Olea Europaea (Olive) Fruit Oil, Padina Pavonica Thallus Extract, Tocopherol, Pinus Pinaster Bark Extract, Sorbitan Oleate, Malic Acid, BHT, Caviar Extract, Ascorbyl Palmitate, Tocopheryl Acetate, Glycine Soja (Soybean) Oil'
ingredients_uncleaned[410]='Non renseigné'
ingredients_uncleaned[414]='AQUA (WATER),ALOE BARBADENSIS LEAF WATER,GLYCERIN,HELIANTHUS ANNUUS (SUNFLOWER) SEED OIL,CETEARYL ALCOHOL,HYDROGENATED OLIVE OIL STEARYL ESTERS,PRUNUS DOMESTICA (PLUM) SEED OIL,SILICA,PARFUM (FRAGRANCE),CETEARYL GLUCOSIDE,BETULA ALBA JUICE,FAGUS SYLVATICA BUD EXTRACT,CAMELINA SATIVA SEED OIL,VITIS VINIFERA (GRAPE) SEED OIL,CANNABIS SATIVA (HEMP) SEED OIL,CHAMOMILLA RECUTITA (MATRICARIA) FLOWER/LEAF/STEM WATER,CANDELILLA CERA,BORAGO OFFICINALIS SEED OIL,OLEA EUROPAEA (OLIVE) FRUIT OIL,PRUNUS AMYGDALUS DULCIS (SWEET ALMOND) OIL,LINUM USITATISSIMUM (LINSEED) SEED OIL,ACMELLA OLERACEA EXTRACT,UNDARIA PINNATIFIDA EXTRACT,SODIUM HYALURONATE,TOCOPHEROL,BETA-SITOSTEROL,SQUALENE,XYLITYLGLUCOSIDE,ANHYDROXYLITOL,SODIUM LEVULINATE,XANTHAN GUM,XYLITOL,SODIUM ANISATE,SALICYLIC ACID,MICA,TITANIUM DIOXIDE,ALCOHOL'

In [30]:
ingredients_uncleaned[1083] = 'Aqua, Ethylhexyl Methoxycinnamate, Dimethicone, Caprylic/Capric Triglyceride, Titanium Dioxide/CI 77891, Glycerin, Octocrylene, Isohexadecane, Dicaprylyl Carbonate, Zinc Oxide, Cetearyl Alcohol, Glyceryl Stearate, PEG-100 Stearate, Methylene Bis-Benzotriazolyl Tetramethylbutylphenol, Hibiscus Sabdariffa Flower Extract, Hoya Lacunosa Flower Extract, Benzyl Alcohol, Hectorite, Parfum, Ceteareth-33, Cetearyl Glucoside, Tocopheryl Acetate, Hydrogenated Lecithin, Decyl Glucoside, Salicylic Acid, Silica, Propylene Glycol, Sorbic Acid, Limonene, BHT, Alpha-Isomethyl Ionone, Xanthan Gum, Potassium Sorbate, Sodium Benzoate, Tocopherol, CI 77492, CI 77491, CI 77499'

In [31]:
facial_care_df.loc[414]

Names                                                                Crème de Jour
Links                            https://www.beaute-test.com/creme-jour-on-the-...
Types                                                                  cremes_jour
Brands                                                            On The Wild Side
Prices                                                                     47.00 €
Capacities                                                                   50 ml
Textures                                                                     Crème
Types of skin                                                               Toutes
Launch dates                                             No lauching date provided
Replacement for                                                                   
Price per liter/kilogram/unit                                              940 €/l
Name: 414, dtype: object

In [33]:
# Save the (still uncleaned) ingredient strings as a one-column CSV, without
# the index; row order is the order of `ingredients_uncleaned`.
ingredients_uncleaned_df = pd.DataFrame({'Ingredients': ingredients_uncleaned})
ingredients_uncleaned_df.to_csv('./Data/list_ingredients_to_becleaned.csv', index=False)

In [34]:
facial_care_df.head()

Unnamed: 0,Names,Links,Types,Brands,Prices,Capacities,Textures,Types of skin,Launch dates,Replacement for,Price per liter/kilogram/unit
0,Soin Revolumisant Intense Anti-Âge Jour - Revi...,https://www.beaute-test.com/soin-revolumisant-...,cremes_jour,L'Oréal Paris,11.08 €,50 ml,Crème,Toutes,Janvier 2021,Soin Revolumisant Anti-âge Jour - Revitalift F...,358 €/l
1,Crème de Jour Anti-Rides Q10 - Cien,https://www.beaute-test.com/day_cream_q10_-_ci...,cremes_jour,Lidl,12.00 €,50 ml,Crème,Toutes,No lauching date provided,Crème de Jour - Cien Beauty,59.80 €/l
2,BB Crème,https://www.beaute-test.com/soin_miracle_perfe...,cremes_jour,Garnier,5.49 €,50 ml,Crème,Toutes,Septembre 2011,,198 €/l
3,BB Skin Detox Fluid SPF 25,https://www.beaute-test.com/bb-skin-detox-flui...,cremes_jour,Clarins,26.90 €,45 ml,Crème,Toutes,No lauching date provided,,833 €/l
4,Soin Global Anti-Rides Jour - Lift+ Algo Rétinol,https://www.beaute-test.com/soin-global-anti-r...,cremes_jour,Diadermine,12.05 €,50 ml,Crème,Toutes,Janvier 2021,,246 €/l


In [35]:
facial_care_df.to_csv('./Data/bt_facial_care_products_without_ings.csv', index=False)