# Web Scraping

#### We're web scrapping the skin care cosmetics from Beauté test to get all the cosmetics names, their ingredients, their size content, their price, for which skin types it's more appropirate, and other features that may be interesting.

Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# from pprint import pprint
# from lxml import html
# from lxml.html import fromstring
# import urllib.request
# from urllib.request import urlopen
# import random
# import scrapy

We're going to first load the data we got from part1.

In [2]:
facial_care_df = pd.read_csv('./Data/beautetest_facial_care_products.csv')

In [3]:
facial_care_df.head()

Unnamed: 0,Names,Links,Types,Brands,Prices,Capacities
0,Soin Revolumisant Intense Anti-Âge Jour - Revi...,https://www.beaute-test.com/soin-revolumisant-...,cremes_jour,L'Oréal Paris,11.08 €,50 ml
1,Crème de Jour Anti-Rides Q10 - Cien,https://www.beaute-test.com/day_cream_q10_-_ci...,cremes_jour,Lidl,12.00 €,50 ml
2,BB Crème,https://www.beaute-test.com/soin_miracle_perfe...,cremes_jour,Garnier,5.49 €,50 ml
3,BB Skin Detox Fluid SPF 25,https://www.beaute-test.com/bb-skin-detox-flui...,cremes_jour,Clarins,26.90 €,45 ml
4,Soin Global Anti-Rides Jour - Lift+ Algo Rétinol,https://www.beaute-test.com/soin-global-anti-r...,cremes_jour,Diadermine,12.05 €,50 ml


In [4]:
def list_ings_cleaned(page_content):
    end_index = page_content.index('Voir les fiches composants')
    """
    The list of ingredients doesn't appear with the same separator. Sometimes, it's a big dot (which when scrapped appear as
    ' \x95'), a coma, a dot, a dash or sometimes just a space. It's quite difficult to get the ingredients. Some products
    also may not have any ingredient which are provided.
    For the product where there are no mention of "Ingrédients" or "Formule" we will treat it as if there are no ingredients
    provided. The same will be applied on the products with spaces as separators.
    """
    if 'Formule' in page_content:
        beginning_index = page_content.index('Formule') + len('Formule')
        ingredients = page_content[beginning_index : end_index].replace(' \x95',',').strip()
    elif 'Ingrédients' in page_content:
        beginning_index = page_content.index('Ingrédients :') + len('Ingrédients :')
        ingredients = page_content[beginning_index : end_index].replace(' \x95',',').strip()
    elif 'Ingredients' in page_content:
        beginning_index = page_content.index('Ingredients :') + len('Ingredients :')
        ingredients = page_content[beginning_index : end_index].replace(' \x95',',').strip()  
    else:
        ingredients = ['NOT PROVIDED']
        
    if ' \x95' not in ingredients:
    #Sometimes there are some ingredients that have dots as a separator but it could be also used to shorten the ingredient. We'll replace it if there are more than 3.
        ingredients = ingredients.replace('.',',')
        
    #Creating the list of ingredients   
    ingredients = [ing.strip().upper() for ing in ingredients.split(',') if ing.strip().upper()]
    
    
    #In case we have a string for ingredients that we couldn't slip because spaces were the separators, we will treat it 
    #as if there are no ingredients provided.
    if ingredients != ['NOT PROVIDED'] and ',' not in ingredients:
        ingredients = ['NOT PROVIDED']
    
    #Cleaning the list    
    if ":" in ingredients[0]:
        ingredients[0] = ingredients[0].partition(':')[1].strip()
    elif '.' in ingredients[-1]:
        ingredients[-1] = ingredients[-1][:ingredients[-1].index('.')]
    elif "*" in ingredients:
        ingredients = [ing.partition('*')[0] for ing in ingredients]

    ingredients = [ing[:-2] for ing in ingredients if ing[-2:] == ' 1']

    #Some ingredients match INCI naming with their other naming which is placed in braquets so we want to keep those
    inci_df = pd.read_csv('./Data/incibeauty_ingredients.csv')
    inci_list_of_ing = inci_df['Names'].tolist()
    
    for ing in ingredients:
        if '(' and ')' in ing:
            if ing[ing.index('(') + 1 : ing.index(')')] in inci_list_of_ing:
                ing = ing[ing.index('(') + 1 : ing.index(')')]
            else:
                to_delete = ing[ing.index('(') + 1 : ing.index(')')]
                ing = ing.replace(to_delete, '').replace('  ', ' ')
        elif '(' in ing:
            if ing[:ing.index('(')].rstrip() in inci_list_of_ing:
                ing = ing[:ing.index('(')].strip()
            else:
                ing = ing[ing.index('(') + 1 :]
        elif ')' == ing[-1]:
            ing = ing[:-1]
            
            
    for ing in ingredients:
        if '/' in ing:
            if ing not in inci_list_of_ing:
                ing = [ele.strip() for ele in ing.partition('/') if ele.strip() in inci_list_of_ing][0]
                
    return ingredients    

In [5]:
def get_ingredients(url):
    html = requests.get(url).content
    soup = BeautifulSoup(html, "html")
    page_content = soup.find("div", 
                             attrs={"class": "col-xs-12 col-sm-12 sticky-container"}).getText().strip().replace('\n', '')
    if 'Voir les fiches composants' in page_content:
        ingredients = list_ings_cleaned(page_content)
    else:
        ingredients = ['NOT PROVIDED']
   
    return ingredients

In [7]:
len(facial_care_df['Links'].tolist())

16862

In [13]:
facial_care_df['Ingredients'] = facial_care_df['Links'].apply(get_ingredients)

AttributeError: 'NoneType' object has no attribute 'getText'

In [9]:
facial_care_df.loc[10710:10715]

Unnamed: 0,Names,Links,Types,Brands,Prices,Capacities
10710,Baume à Lèvres - La véritable crème de Laponie,https://www.beaute-test.com/baume_a_levres_-_l...,baumes_levres,Polaar,25.94 €,15 g
10711,Baume Lèvres Propolis Karité,https://www.beaute-test.com/baume_levres_propo...,baumes_levres,Propolia,5.08 €,4 ml
10712,Tender Balm Crème Universelle,https://www.beaute-test.com/tender_balm_creme_...,baumes_levres,Oriflame,7.50 €,15 ml
10713,Baume à Lèvres - Fanta Topical,https://www.beaute-test.com/baume_a_levres_-_f...,baumes_levres,Lip Smacker,4.00 €,4 g
10714,Rosey Lip Balm,https://www.beaute-test.com/rosey_lip_balm_de_...,baumes_levres,De Mamiel,20.00 €,10 ml
10715,Hydracutane Stick,https://www.beaute-test.com/hydracutane-stick-...,baumes_levres,Laboratoire Roche,8.40 €,3.5 g


In [17]:
facial_care_df.Names.tolist().index('Lotion Tonique - Derma Pureté')

16048