In [1]:
import requests
import re
import pandas as pd 
import logging

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw product table. The CSV is sourced from a Kaggle dataset of
# skincare products listed on Lookfantastic.com.
sc_data = pd.read_csv(filepath_or_buffer='skincare_products_2021.csv')

In [3]:
# Rename the ingredients column. DataFrame.rename ignores missing keys by
# default, so this line is a no-op when the cell is executed a second time.
sc_data = sc_data.rename(columns={'clean_ingreds': 'ingredients'})

# Ensure the price column is simply a float amount, without the currency symbol.
# The values are cast to str first so the cell stays re-runnable: once 'price'
# has been converted to float, the bare `.str` accessor would raise
# AttributeError. Missing values become the text 'nan', which float() parses
# back to NaN; any other unparsable text still raises ValueError as before.
sc_data['price'] = (
    sc_data['price']
    .astype(str)
    .str.replace('£', '', regex=False)
    .astype(float)
)

In [4]:
# Seconds to wait for each product page. Without a timeout a single stalled
# connection blocks the whole loop indefinitely.
REQUEST_TIMEOUT = 15

# Columns filled in by the scrape.
SCRAPED_COLUMNS = ('updated_price', 'product_rating', 'product_image_url')


def _extract_product_fields(soup):
    """Pull price, star rating and image URL out of one parsed product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a product page.

    Returns
    -------
    dict
        One entry per name in ``SCRAPED_COLUMNS``. A value is ``None`` when the
        corresponding element is absent from the page; otherwise it is the raw
        string taken from the page.
    """
    fields = dict.fromkeys(SCRAPED_COLUMNS)

    # Price: first decimal or integer number inside the productPrice element.
    price_container = soup.find("div", class_="athenaProductPage_productDetailsContainer")
    price_element = price_container.find("div", class_="productPrice") if price_container else None
    if price_element:
        match = re.search(r"\d+\.\d+|\d+", price_element.text.strip())  # Match decimals or integers
        if match:
            fields['updated_price'] = match.group()

    # Rating: carried as a data attribute on the <main id="mainContent"> element.
    product_main = soup.find('main', {'id': 'mainContent'})
    if product_main and product_main.has_attr('data-product-star-rating'):
        fields['product_rating'] = product_main['data-product-star-rating']

    # Image: first carousel image that has a src attribute.
    image_tag = soup.find('img', class_='athenaProductImageCarousel_image')  # Adjust the class name as needed
    if image_tag and 'src' in image_tag.attrs:
        fields['product_image_url'] = image_tag['src']

    return fields


# Create the result columns up front so that every row has a value (None) even
# if its request fails, and so the price-change step below never hits a missing
# column. This also resets stale values when the cell is re-run.
for column in SCRAPED_COLUMNS:
    sc_data[column] = None

session = requests.Session()

# Iterate through the product URLs. `.items()` yields the real index label of
# each row, so the writes below stay correct even if sc_data does not have a
# default 0..n-1 index.
for idx, url in sc_data['product_url'].items():
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raises an HTTPError if the status code is 4XX/5XX

        soup = BeautifulSoup(response.content, "html.parser")
        for column, value in _extract_product_fields(soup).items():
            sc_data.at[idx, column] = value

    except requests.exceptions.RequestException as e:
        # Network/HTTP failure (including 404): keep the row's None values,
        # record what happened and move on to the next product.
        logging.warning("Skipping %s: %s", url, e)
    except Exception:
        # Still best-effort for anything unexpected, but leave a traceback
        # instead of dropping the error silently.
        logging.exception("Unexpected error while processing %s", url)

# Calculate price change. pd.to_numeric(errors='coerce') guarantees float
# columns (unparsable entries become NaN) instead of silently leaving the
# columns as objects when a conversion fails.
sc_data['updated_price'] = pd.to_numeric(sc_data['updated_price'], errors='coerce')
sc_data['price_change'] = sc_data['updated_price'] - pd.to_numeric(sc_data['price'], errors='coerce')

In [5]:
def trim_to_one_space(string):
    """Return ``string`` with every run of whitespace replaced by one space.

    Leading and trailing whitespace is collapsed to a single space as well;
    it is not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', string)

url = 'https://www.lookfantastic.com/brands.list'

# Fetch the brand directory page. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() turns an HTTP error page into an
# explicit exception instead of a silently empty brand list.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

soup = BeautifulSoup(html_content, 'html.parser')

# Each panel's text holds several brand names; newlines are flattened to spaces
# here and the entries are split into individual names by clean_brands().
brand_elements = soup.find_all('div', class_='responsiveBrandsPageScroll_panel')
brand_names = [brand.text.strip().replace('\n', ' ') for brand in brand_elements]

#Clean up the list, get the brand names
def clean_brands(brand_list):
    """Split raw panel strings into individual, trimmed brand names.

    Parameters
    ----------
    brand_list : iterable of str
        Raw strings in which brand names are separated by three spaces.

    Returns
    -------
    list of str
        Brand names in their original order, with surrounding whitespace
        removed and empty fragments dropped.
    """
    cleaned_brands = []

    for brand_entry in brand_list:
        for fragment in brand_entry.split('   '):  # Triple space
            # A separator longer than three spaces leaves stray spaces on the
            # neighbouring fragments (or a whitespace-only fragment); strip
            # them so they do not end up as bogus or padded brand names.
            name = fragment.strip()
            if name:
                cleaned_brands.append(name)

    return cleaned_brands

cleaned_brands = clean_brands(brand_names)

# Order the names longest-first (stable for equal lengths). replace_brand()
# returns the first brand that matches, so longer names are tried before
# shorter ones.
brand_list = list(cleaned_brands)
brand_list.sort(key=len, reverse=True)

print(brand_list)

['Kylie Cosmetics by Kylie Jenner', 'Obsessive Compulsive Cosmetics', 'Jean Paul Gaultier Fragrance', 'The Vintage Cosmetic Company', 'Lab Series Skincare for Men', 'Anthony Logistics for Men', 'Experimental Perfume Club', 'Shiseido Skincare For Men', 'Teeth Whitening Products', 'Wella Professionals Care', 'Anastasia Beverly Hills', 'Aromatherapy Associates', 'Eilish by Billie Eilish', 'NYX Professional Makeup', 'Self Glow by James Read', 'SOSU by Suzanne Jackson', 'Le Couvent des Minimes', 'Radiant Glow Botanical', 'Sebastian Professional', 'Shu Uemura Art of Hair', 'Akro By Olivier Cresp', 'Bellápierre Cosmetics', 'Compagnie de Provence', 'Garnier Ambre Solaire', 'Glasshouse Fragrances', 'Jamaican Mango & Lime', "L'Oréal Professionnel", 'Margaret Dabbs London', 'TIGI Bed Head For Men', 'Baxter of California', 'Spectrum Collections', 'The Konjac Sponge Co', 'Abercrombie & Fitch', 'Australian Bodycare', 'Centrum Supplements', 'Laboratory Perfumes', 'Spotlight Oral Care', 'System Profes

In [6]:
# Seed the 'brand' column with the lower-cased product name. It is overwritten
# with the matched brand by the `.apply(replace_brand, ...)` step at the end of
# this cell.
lowercased_names = sc_data['product_name'].str.lower()
sc_data['brand'] = lowercased_names

# Earlier matcher (plain case-insensitive substring test), kept for reference;
# the active replace_brand is defined below.
# def replace_brand(product_name, brand_list):
#     for brand in brand_list:
#         if brand.lower() in product_name.lower():
#             return brand.title()
#     return None  

def normalize_text(text):
    """Return ``text`` lower-cased with punctuation collapsed to single spaces.

    Every run of non-word characters (hyphens, punctuation, whitespace, ...)
    becomes one space, and leading/trailing spaces are removed.
    """
    return re.sub(r'\W+', ' ', text).strip().lower()


def replace_brand(product_name, brand_list):
    """Return the first brand from ``brand_list`` that occurs in ``product_name``.

    Matching is done on the normalised forms (see ``normalize_text``) and on
    whole words only, so a short brand such as "REN" matches "REN Clean
    Skincare" but not "Renewal Cream".

    Parameters
    ----------
    product_name : str
        Product name to search. Non-string values (e.g. NaN) yield ``None``.
    brand_list : iterable of str
        Candidate brands, tried in order; the first match wins.

    Returns
    -------
    str or None
        The matching brand in title case, or ``None`` when nothing matches.
    """
    if not isinstance(product_name, str):
        return None

    # Pad with spaces so a substring test only succeeds on whole-word
    # boundaries (normalize_text guarantees words are separated by one space).
    padded_product_name = ' ' + normalize_text(product_name) + ' '

    for brand in brand_list:
        normalized_brand = normalize_text(brand)
        if not normalized_brand:
            # An empty needle is "in" every string; skip blank/punctuation-only
            # entries instead of assigning them to every product.
            continue
        if ' ' + normalized_brand + ' ' in padded_product_name:
            return brand.title()  # Return the original brand with proper title case
    return None


# Resolve each entry of the 'brand' column to a brand from brand_list
# (None when no brand is found).
sc_data['brand'] = sc_data['brand'].apply(replace_brand, args=(brand_list,))

In [7]:
# Hand-maintained brand names, used as a fallback for products whose brand is
# still null after matching against the scraped brand list.
manually_added_brands = [
    'Avene',
    'Erno Laszlo',
    'NIOD',
    'DECLÉOR',
    'Darphin',
    'GLAMGLOW',
    'Freezeframe',
    'MONU',
    'Jurlique',
    'Salcura',
    'Benton',
    'FARMACY',
    'Instant Effects',
    'RapidEye',
    'MÁDARA',
    'Bubble T',
    'Sea Magik',
    'ManCave',
    'Love Boo',
]

def replace_null_brand(product_name, manually_added_brands):
    """Return the first manually listed brand contained in ``product_name``.

    The check is a case-insensitive substring test, with brands tried in list
    order. The matching brand is returned in title case; ``None`` is returned
    when no brand is contained in the name.
    """
    matches = (
        brand.title()
        for brand in manually_added_brands
        if brand.lower() in product_name.lower()
    )
    return next(matches, None)

# Back-fill rows whose brand is still null, using the manually added brand list.
missing_brand = sc_data['brand'].isnull()
sc_data.loc[missing_brand, 'brand'] = sc_data.loc[missing_brand, 'product_name'].apply(
    replace_null_brand, args=(manually_added_brands,)
)

# Map spelling variants onto one canonical brand label each.
# NOTE(review): the L'Oréal aliases mix straight (') and curly (’) apostrophes
# and the target is spelled with a lower-case "o" — confirm this is intended.
brand_aliases = (
    (['Avene'], 'Avène'),
    (['Bloom And Blossom'], 'Bloom & Blossom'),
    (["L'Oréal Men Expert", 'L’oréal Paris', 'L’Oréal Professionnel'], "L'oréal Paris"),
)
for aliases, canonical in brand_aliases:
    sc_data['brand'] = sc_data['brand'].replace(aliases, canonical)

sc_data.head()

Unnamed: 0,product_name,product_url,product_type,ingredients,price,updated_price,product_rating,product_image_url,price_change,brand
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"['capric triglyceride', 'cetyl alcohol', 'prop...",5.2,6.1,4.52,https://static.thcdn.com/images/large/original...,0.9,The Ordinary
1,CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"['homosalate', 'glycerin', 'octocrylene', 'eth...",13.0,13.2,4.33,https://static.thcdn.com/images/small/original...,0.2,Cerave
2,The Ordinary Hyaluronic Acid 2% + B5 Hydration...,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"['sodium hyaluronate', 'sodium hyaluronate', '...",6.2,8.8,4.68,https://static.thcdn.com/images/large/original...,2.6,The Ordinary
3,AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"['ammonium lactate', 'c12-15', 'glycerin', 'pr...",22.5,15.0,4.66,https://static.thcdn.com/images/large/original...,-7.5,Ameliorate
4,CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"['glycerin', 'cetearyl alcohol', 'capric trigl...",16.0,13.6,4.73,https://static.thcdn.com/images/large/original...,-2.4,Cerave


In [8]:
# Manually corrected brands, used for products that still have no brand.
corrected_brands_df = pd.read_csv('corrected_brands_dictionary.csv', index_col=0)

# Left-join the corrections onto the products. validate='many_to_one' makes
# pandas raise a MergeError if a product_name appears more than once in the
# corrections; such duplicates would add rows to the result and shift every
# later brand onto the wrong product.
merged_df = sc_data.merge(
    corrected_brands_df,
    on='product_name',
    how='left',
    suffixes=('', '_new'),
    validate='many_to_one',
)

# merge() returns a fresh 0..n-1 index while combine_first() aligns on index
# labels, so restore sc_data's labels before combining (row count and order
# are those of sc_data for a validated many-to-one left join).
merged_df.index = sc_data.index

# Keep existing brands; fill only the missing ones from the corrections.
sc_data['brand'] = sc_data['brand'].combine_first(merged_df['brand_new'])

In [9]:
# Preview the first five rows of the product table.
sc_data.head(n=5)

Unnamed: 0,product_name,product_url,product_type,ingredients,price,updated_price,product_rating,product_image_url,price_change,brand
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"['capric triglyceride', 'cetyl alcohol', 'prop...",5.2,6.1,4.52,https://static.thcdn.com/images/large/original...,0.9,The Ordinary
1,CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"['homosalate', 'glycerin', 'octocrylene', 'eth...",13.0,13.2,4.33,https://static.thcdn.com/images/small/original...,0.2,Cerave
2,The Ordinary Hyaluronic Acid 2% + B5 Hydration...,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"['sodium hyaluronate', 'sodium hyaluronate', '...",6.2,8.8,4.68,https://static.thcdn.com/images/large/original...,2.6,The Ordinary
3,AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"['ammonium lactate', 'c12-15', 'glycerin', 'pr...",22.5,15.0,4.66,https://static.thcdn.com/images/large/original...,-7.5,Ameliorate
4,CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"['glycerin', 'cetearyl alcohol', 'capric trigl...",16.0,13.6,4.73,https://static.thcdn.com/images/large/original...,-2.4,Cerave


In [11]:
# sc_data.to_csv('skincare_products_2024_v2.csv', index=False)
# sc_data = pd.read_csv('Downloads/skincare_products_2024_v2.csv', index_col = 0)

In [12]:
# Load the saved 2024 dataset, using the first CSV column as the index,
# and preview it.
df = pd.read_csv(
    filepath_or_buffer='skincare_products_2024_v2.csv',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,product_url,product_type,ingredients,price,updated_price,product_rating,product_image_url,price_change,brand
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
The Ordinary Natural Moisturising Factors + HA 30ml,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"['capric triglyceride', 'cetyl alcohol', 'prop...",5.2,6.1,4.52,https://static.thcdn.com/images/large/original...,0.9,The Ordinary
CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"['homosalate', 'glycerin', 'octocrylene', 'eth...",13.0,13.2,4.33,https://static.thcdn.com/images/small/original...,0.2,Cerave
The Ordinary Hyaluronic Acid 2% + B5 Hydration Support Formula 30ml,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"['sodium hyaluronate', 'sodium hyaluronate', '...",6.2,8.8,4.68,https://static.thcdn.com/images/large/original...,2.6,The Ordinary
AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"['ammonium lactate', 'c12-15', 'glycerin', 'pr...",22.5,15.0,4.66,https://static.thcdn.com/images/large/original...,-7.5,Ameliorate
CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"['glycerin', 'cetearyl alcohol', 'capric trigl...",16.0,13.6,4.73,https://static.thcdn.com/images/large/original...,-2.4,Cerave
