## Data Scraping/Data Cleaning

In [25]:
from bs4 import BeautifulSoup
import pandas as pd

In [60]:
# Parse the saved Sephora results page with BeautifulSoup and collect, for every
# product tile, its brand, name, price text, popularity text and rating label.

with open('./data/Sephora_Moisturizers.html', encoding='utf-8') as html_file:
    text = html_file.read()
dom = BeautifulSoup(text, 'html.parser')
boxes = dom.find_all('div', attrs={'class': 'css-1qe8tjm'})

brands, names, prices, popularity, ratings = [], [], [], [], []
for box in boxes:
    brand_tag = box.find('span', attrs={'class': 'css-12z2u5 eanm77i0'})
    brands.append(brand_tag.text)

    name_tag = box.find('span', attrs={'class': 'ProductTile-name css-h8cc3p eanm77i0'})
    names.append(name_tag.text)

    price_tag = box.find('b', attrs={'class': 'css-1f35s9q'})
    prices.append(price_tag.text)

    popularity_tag = box.find('div', attrs={'class': 'css-1xk97ib'})
    popularity.append(popularity_tag.text)

    # the rating is read from the first <span> in the tile that carries an aria-label
    rating_tag = box.find('span', {'aria-label': True})
    ratings.append(rating_tag.get('aria-label'))

In [61]:
# Normalise the scraped Sephora price and rating strings:
#   * prices  - for a range such as "$25.00 - $74.00" keep the upper bound,
#               otherwise drop the leading currency symbol
#   * ratings - keep the first numeric token found in the aria-label text

import re

# captures the upper bound of a "$low - $high" price range
price_range_pattern = re.compile(r'\$[\d\.]+\s*-\s*\$([\d\.]+)')

updated_prices = []
for price in prices:
    match = price_range_pattern.search(price)
    if match:
        updated_prices.append(match.group(1))
    else:
        updated_prices.append(price[1:])

updated_ratings = []
for rate in ratings:
    match = re.search(r'([\d.]+)', rate)
    # A label without any number used to crash on match[0]; write an empty
    # field instead, which the later pd.to_numeric(..., errors='coerce') call
    # on the Rating column converts to NaN.
    updated_ratings.append(match[0] if match else '')

In [62]:
# Create the CSV that collects the scraped moisturizer rows and write its header.
# The handle is intentionally left open: a later cell writes the data rows
# through `writer`.

import csv
# newline='' is what the csv module expects (otherwise line endings are doubled
# on Windows); the explicit UTF-8 encoding keeps product names containing
# symbols such as "™" readable by pandas.read_csv, which defaults to UTF-8.
file = open('scrapes_moisturizers.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(file)
writer.writerow(['Name', 'Brand', 'Price', 'Popularity', 'Rating', 'Store', 'Concern 1', 'Concern 2', 'Concern 3'])

72

In [37]:
# get skin concerns for each moisturizer using selenium

from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Open the first page of Sephora's moisturizer search results in Chrome.
# (The unrelated Amazon search snippet that used to sit here as a string
# literal was dead code and has been removed.)
SEPHORA_SEARCH_URL = 'https://www.sephora.com/search?keyword=skin%20care%20moisturizers&currentPage={page}'

driver = webdriver.Chrome()
driver.get(SEPHORA_SEARCH_URL.format(page=1))
driver.maximize_window()
# fixed pause before the next cell queries the page
time.sleep(2)

In [47]:
# Print the visible text of every product tile currently loaded in the browser.
# Approach adapted from:
# https://stackoverflow.com/questions/70515250/selenium-webscrape-not-scraping-all-item-information-on-amazon
elements = driver.find_elements(By.CLASS_NAME, 'css-1qe8tjm')
for tile in elements:
    print(tile.text)





Quicklook
Youth To The People
Superfood Air-Whip Lightweight Face Moisturizer with Hyaluronic Acid
1.3K
$48.00
NEW
Biossance
Squalane + Probiotic Balancing Gel Moisturizer
457
$54.00
iNNBEAUTY PROJECT
Next Level Moisturizer
96
$24.00
ONLINE ONLY
iNNBEAUTY PROJECT
Face Glaze Skin Barrier Protect & Glow Moisturizer
256
$25.00
iNNBEAUTY PROJECT
10 + 10 Moisturizer with 10% Vitamin C + 10% Peptide Complex + Ceramides
361
$35.00
Farmacy
Honey Halo Ultra-Hydrating Ceramide Moisturizer
1.6K
$25.00 - $74.00
fresh
Mini Rose & Hyaluronic Acid Deep Hydration Moisturizer
69
$18.00
fresh
Lotus Anti- Aging Daily Moisturizer
1.1K
$21.00 - $54.00
fresh
Lotus Anti-Aging Night Moisturizer
1.7K
$22.00 - $56.00
Topicals
Like Butter Moisturizer for Dry, Sensitive & Eczema-Prone Skin
375
$34.00
Biossance
Squalane + Omega Repair Deep Hydration Moisturizer
2.2K
$20.00 - $78.00 ($120.00 value)
alpyn beauty
Melt Moisturizer with Bakuchiol and Squalane
560
$25.00 - $60.00






































In [None]:
# NOTE: this entire cell is a single triple-quoted string literal, so none of
# the code inside it runs. It holds earlier selenium experiments (paging through
# the Sephora search results, and clicking each tile of the saved HTML page to
# read its "highlights" section).
# `moisturizer_targets`, which the next cell prints, is only assigned inside this
# string in the visible notebook, so that cell depends on leftover kernel state.
# The snippet also hardcodes an absolute local path to the saved HTML file.
'''
for i in range(1,9):
    driver = webdriver.Chrome()
    driver.get(f'https://www.sephora.com/search?keyword=skin%20care%20moisturizers&currentPage={i}')
    driver.maximize_window()
    driver.implicitly_wait(3)
    driver.quit()

driver.implicitly_wait(2)
search = driver.find_element('id', 'site_search_input')
search.send_keys('skin care moisturizers')
search.send_keys(Keys.ENTER)
moisturizers = []
resultsXpath = "//div[@data-component-type='s-search-result']"
WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, resultsXpath)))
results = driver.find_elements(By.XPATH, resultsXpath)


driver = webdriver.Chrome()
local_file = '/Users/nanamisantacruz/Downloads/Moisturizer-Recommender/moisturizer-recommender/data/Sephora_Moisturizers.html'
driver.get(f'file://{local_file}')
driver.maximize_window()
grid = driver.find_element(By.CLASS_NAME,'css-1322gsb')
moisturizer_elems = grid.find_elements(By.CLASS_NAME,'css-1qe8tjm')
driver.implicitly_wait(2)
moisturizer_targets = []
for elem in moisturizer_elems:
    elem.click()
    driver.implicitly_wait(2)
    indiv_html = driver.page_source
    indiv_soup = BeautifulSoup(indiv_html, 'html.parser')
    highlights = indiv_soup.find('div', attrs={'class':'css-h2sczi eanm77i0'})
    concerns = highlights.find_all('div', attrs={'class':'css-10t9fn0 eanm77i0'})
    if concerns:
        skin_concerns = []
        for concern in concerns:
            skin_concerns.append(concern.text)
        moisturizer_targets.append(skin_concerns)
    driver.back()
    driver.implicitly_wait(3)
driver.quit()
'''

In [22]:
# Show the list of concerns collected for each moisturizer, one list per line.
for concern_list in moisturizer_targets:
    print(concern_list)

['Good for: Loss of firmness', 'Good for: Anti-Aging', 'Good for: Dryness']


In [63]:
# Skin concerns that can be attached to a moisturizer.
skin_concerns = [
    'fine lines',
    'dryness',
    'dullness',
    'loss of firmness',
    'uneven texture',
    'redness',
    'dark spots',
]

In [64]:
import random

# Write one CSV row per Sephora product.
# NOTE: the three concerns are drawn at random from `skin_concerns`; they are
# placeholder values, not scraped data.
for name, brand, price, pop, rating in zip(names, brands, updated_prices, popularity, updated_ratings):
    # random.sample picks three distinct entries in one call, replacing the
    # earlier "re-draw until different" loops
    concern1, concern2, concern3 = random.sample(skin_concerns, 3)
    writer.writerow([name, brand, price, pop, rating, 'Sephora', concern1, concern2, concern3])

# The CSV handle is buffered and stays open; flush so that the rows are on disk
# before the file is read back with pandas.
file.flush()

In [65]:
import pandas as pd
# Load the scraped rows back into a DataFrame and display it.
scraped_csv = 'scrapes_moisturizers.csv'
df = pd.read_csv(scraped_csv)
df

Unnamed: 0,Travel Size Toleriane Double Repair Face Moisturizer with Niacinamide,La Roche-Posay,10.99,"1,306",4.0,Ulta,dryness,uneven texture,dullness
0,Rose Deep Hydration Face Cream,fresh,46.0,2191,4.7,Ulta,uneven texture,fine lines,dark spots
1,Hello Results Wrinkle-Reducing Daily Retinol S...,IT Cosmetics,69.0,2235,4.3,Ulta,dryness,dark spots,fine lines
2,Ultimate Miracle Worker SPF 30 Moisturizer,Philosophy,82.0,1507,4.5,Ulta,dullness,uneven texture,dryness
3,Calm Water Gel Weightless Moisturizer,Dermalogica,55.0,1212,4.6,Ulta,dryness,fine lines,dullness
4,Super Rich Repair Moisturizer,Dermalogica,94.0,1078,4.7,Ulta,loss of firmness,dryness,dullness
5,Oil and Pore Control Mattifier Broad Spectrum ...,Murad,49.0,269,3.9,Ulta,uneven texture,redness,dark spots
6,24-7 Moisture Intense Ultra Hydrating Day & Ni...,Tula,58.0,40,4.4,Ulta,uneven texture,loss of firmness,dullness
7,Hope In A Jar Water Cream Hyaluronic Glow Mois...,Philosophy,31.5,606,4.7,Ulta,dullness,dark spots,loss of firmness
8,Play Everyday Lotion SPF 50 with Sunflower Ext...,Supergoop!,22.0,1864,4.5,Ulta,loss of firmness,redness,dullness
9,Anti-Wrinkle Miracle Worker+ Line Correcting M...,Philosophy,68.0,620,4.4,Ulta,dark spots,dullness,redness


In [66]:
from bs4 import BeautifulSoup

# Read the saved Ulta results page and collect every product-card <li> element.
with open('./data/Ulta_Moisturizers.html', encoding='utf-8') as ulta_html:
    text = ulta_html.read()

soup = BeautifulSoup(markup=text, features='html.parser')
ulta_boxes = soup.find_all(name='li', attrs={'class': 'ProductListingResults__productCard'})

In [67]:
# Pull the raw text of the name, brand, pricing block and review-stars block
# out of every Ulta product card. `ulta` holds the review-stars text, which the
# later cells parse for both rating and review count.
ulta_names, ulta_brands, ulta_prices, ulta = [], [], [], []

for box in ulta_boxes:
    product_tag = box.find('span', attrs={'class': 'ProductCard__product'})
    ulta_names.append(product_tag.text)

    brand_tag = box.find('span', attrs={'class': 'ProductCard__brand'})
    ulta_brands.append(brand_tag.text)

    pricing_tag = box.find('div', attrs={'class': 'ProductPricing'})
    ulta_prices.append(pricing_tag.text)

    stars_tag = box.find('div', attrs={'class': 'ReviewStarsCard'})
    ulta.append(stars_tag.text)

In [68]:
import re

# Reduce every Ulta pricing string to a single number (as text, no "$"):
#   * sale with a range  -> upper bound of the sale range
#   * sale, single price -> the sale price
#   * regular range      -> upper bound of the range
#   * regular price      -> the price itself
# The sale format is detected by trying the range pattern first instead of
# guessing from the string length, and an unrecognised sale string raises a
# descriptive error instead of failing on a None match.
sale_range_pattern = re.compile(r'Sale Price (\$\d+\.\d+) - (\$\d+\.\d+)')
sale_single_pattern = re.compile(r'Sale Price (\$\d+\.\d+)\$\d+\.\d+ Original Price (\$\d+\.\d+)\$\d+\.\d+')
price_range_pattern = re.compile(r'\$[\d\.]+\s*-\s*\$([\d\.]+)')

updated_ulta_prices = []
for price in ulta_prices:
    if price[:4] == 'Sale':
        m = sale_range_pattern.search(price)
        if m:
            # group 2 is the upper sale price; [1:] strips the "$"
            updated_ulta_prices.append(m[2][1:])
            continue
        m = sale_single_pattern.search(price)
        if m is None:
            raise ValueError(f'Unrecognized Ulta sale price format: {price!r}')
        # group 1 is the sale price; [1:] strips the "$"
        updated_ulta_prices.append(m[1][1:])
    else:
        match = price_range_pattern.search(price)
        if match:
            updated_ulta_prices.append(match.group(1))
        else:
            updated_ulta_prices.append(price[1:])

In [69]:
# Sanity check: number of cleaned Ulta prices.
ulta_price_count = len(updated_ulta_prices)
print(ulta_price_count)

96


In [70]:
import re

# Extract the rating from each review-stars string: use the first "d.d" token
# when present, otherwise take the first character and append ".0".
decimal_rating = re.compile(r'(\d\.\d)')

ulta_ratings = []
for rate in ulta:
    found = decimal_rating.search(rate)
    ulta_ratings.append(found[0] if found else rate[0] + '.0')

In [71]:
import re

# The review-stars text is expected to look like
# "<rating> out of 5 stars ; <n> reviews(<count>)"; keep the parenthesised count.
review_summary = re.compile(r'^([\d.]+) out of 5 stars ; (\d+) reviews\(([\d,]+)\)$')

ulta_popularity = []
for rate in ulta:
    summary = review_summary.search(rate)
    ulta_popularity.append(summary[3])

In [72]:
import csv
# Reopen the scraped-data CSV in append mode for the Ulta rows.
# newline='' is what the csv module expects (otherwise line endings are doubled
# on Windows); UTF-8 is set explicitly so the file matches what pandas.read_csv
# assumes by default.
file = open('scrapes_moisturizers.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(file)

In [73]:
import random  # imported here so the cell does not depend on an earlier cell's import

# Append one CSV row per Ulta product.
# NOTE: the three concerns are drawn at random from `skin_concerns`; they are
# placeholder values, not scraped data.
for name, brand, price, pop, rate in zip(ulta_names, ulta_brands, updated_ulta_prices, ulta_popularity, ulta_ratings):
    # random.sample picks three distinct entries in one call, replacing the
    # earlier "re-draw until different" loops
    concern1, concern2, concern3 = random.sample(skin_concerns, 3)
    writer.writerow([name, brand, price, pop, rate, 'Ulta', concern1, concern2, concern3])

# The CSV handle is buffered and stays open; flush so that all rows are on disk
# before the file is read back with pandas.
file.flush()

In [79]:
import pandas as pd
# Reload the combined Sephora + Ulta rows and display the frame.
scraped_csv = 'scrapes_moisturizers.csv'
df = pd.read_csv(scraped_csv)
df

Unnamed: 0,Name,Brand,Price,Popularity,Rating,Store,Concern 1,Concern 2,Concern 3
0,Lala Retro™ Whipped Refillable Moisturizer wit...,Drunk Elephant,62.00,2.1K,4.0,Sephora,uneven texture,fine lines,dryness
1,Plum Plump Refillable Hyaluronic Acid Moisturizer,Glow Recipe,39.00,1.6K,4.5,Sephora,dryness,redness,uneven texture
2,Benefiance Wrinkle Smoothing Day Cream SPF 23,Shiseido,75.00,210,4.5,Sephora,fine lines,dullness,dark spots
3,Protini™ Polypeptide Firming Refillable Moistu...,Drunk Elephant,68.00,6.8K,4.0,Sephora,dark spots,uneven texture,fine lines
4,The Dewy Skin Cream Plumping & Hydrating Moist...,Tatcha,85.00,3.3K,4.0,Sephora,loss of firmness,dullness,uneven texture
...,...,...,...,...,...,...,...,...,...
129,Natural Moisturizing Factors + PhytoCeramides ...,The Ordinary,22.50,741,4.5,Ulta,dullness,dark spots,loss of firmness
130,Pro-Collagen Marine Cream,ELEMIS,138.00,1893,4.6,Ulta,dark spots,loss of firmness,redness
131,Face Freezie Cooling Hydration Moisturizer + P...,NYX Professional Makeup,20.00,455,4.4,Ulta,dryness,dark spots,uneven texture
132,Toleriane Dermallegro Ultra Soothing Repair Fa...,La Roche-Posay,31.99,404,4.3,Ulta,dark spots,redness,fine lines


In [80]:
# Convert ratings to floats (values that cannot be parsed become NaN) and keep
# one decimal place. Series.round is used so the cell does not need numpy:
# no visible cell imports `np`, so the previous np.round call fails on a fresh kernel.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').round(1)

In [81]:
# Display the frame again; the previous cell rewrote the Rating column.
df

Unnamed: 0,Name,Brand,Price,Popularity,Rating,Store,Concern 1,Concern 2,Concern 3
0,Lala Retro™ Whipped Refillable Moisturizer wit...,Drunk Elephant,62.00,2.1K,4.0,Sephora,uneven texture,fine lines,dryness
1,Plum Plump Refillable Hyaluronic Acid Moisturizer,Glow Recipe,39.00,1.6K,4.5,Sephora,dryness,redness,uneven texture
2,Benefiance Wrinkle Smoothing Day Cream SPF 23,Shiseido,75.00,210,4.5,Sephora,fine lines,dullness,dark spots
3,Protini™ Polypeptide Firming Refillable Moistu...,Drunk Elephant,68.00,6.8K,4.0,Sephora,dark spots,uneven texture,fine lines
4,The Dewy Skin Cream Plumping & Hydrating Moist...,Tatcha,85.00,3.3K,4.0,Sephora,loss of firmness,dullness,uneven texture
...,...,...,...,...,...,...,...,...,...
129,Natural Moisturizing Factors + PhytoCeramides ...,The Ordinary,22.50,741,4.5,Ulta,dullness,dark spots,loss of firmness
130,Pro-Collagen Marine Cream,ELEMIS,138.00,1893,4.6,Ulta,dark spots,loss of firmness,redness
131,Face Freezie Cooling Hydration Moisturizer + P...,NYX Professional Makeup,20.00,455,4.4,Ulta,dryness,dark spots,uneven texture
132,Toleriane Dermallegro Ultra Soothing Repair Fa...,La Roche-Posay,31.99,404,4.3,Ulta,dark spots,redness,fine lines


In [94]:
def alter_popularity(pop):
    """Convert a scraped review-count value into an integer.

    Accepted inputs:
      * abbreviated thousands such as "2.1K" (or "2.1k") -> 2100
      * comma-grouped numbers such as "1,306"            -> 1306
      * plain digit strings such as "741"                -> 741
      * values that are already numeric                  -> int(value)

    Surrounding whitespace is ignored. Accepting numeric input makes it safe
    to apply the function again to a column that was already converted.

    Raises:
        ValueError: if the text cannot be interpreted as a number.
    """
    if isinstance(pop, (int, float)):
        return int(pop)

    text = str(pop).strip().replace(',', '')
    # text[-1:] (rather than text[-1]) stays valid for an empty string
    if text[-1:] in ('K', 'k'):
        # round() instead of int(): float multiplication can land just below
        # the intended whole number, which int() would truncate
        return round(float(text[:-1]) * 1000)
    return int(float(text))

In [95]:
# Replace the text review counts with integers, element by element.
popularity_counts = df['Popularity'].apply(alter_popularity)
df['Popularity'] = popularity_counts

In [96]:
# Show the Popularity column.
df['Popularity']

0      2100
1      1600
2       210
3      6800
4      3300
       ... 
129     741
130    1893
131     455
132     404
133    1714
Name: Popularity, Length: 134, dtype: int64

In [97]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        134 non-null    object 
 1   Brand       134 non-null    object 
 2   Price       134 non-null    float64
 3   Popularity  134 non-null    int64  
 4   Rating      134 non-null    float64
 5   Store       134 non-null    object 
 6   Concern 1   134 non-null    object 
 7   Concern 2   134 non-null    object 
 8   Concern 3   134 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 9.5+ KB


In [98]:
# Count how often each product name occurs; a count above 1 is a repeated name.
df['Name'].value_counts()

Confidence in a Cream Anti-Aging Hydrating Moisturizer           2
Ultra Repair Face Moisturizer                                    2
Intensive Moisture Balance Moisturizer                           1
SA Cream                                                         1
Active Moist Oil-Free Moisturizer                                1
                                                                ..
Daily Greens Oil-Free Gel Moisturizer with Moringa and Papaya    1
The True Cream Moisturizing Bomb With Oak Husk and Vitamin B     1
Ultra Repair Oil-Control Moisturizer                             1
Squalane + Omega Repair Deep Hydration Moisturizer               1
Confidence In A Gel Lotion Oil-Free Moisturizer                  1
Name: Name, Length: 132, dtype: int64

In [99]:
# Keep only the first row for every product name ('first' is the default `keep`).
df_unique = df.drop_duplicates(subset=['Name'])

In [100]:
# Column summary (dtypes and non-null counts) of the de-duplicated frame.
df_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 0 to 133
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        132 non-null    object 
 1   Brand       132 non-null    object 
 2   Price       132 non-null    float64
 3   Popularity  132 non-null    int64  
 4   Rating      132 non-null    float64
 5   Store       132 non-null    object 
 6   Concern 1   132 non-null    object 
 7   Concern 2   132 non-null    object 
 8   Concern 3   132 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 10.3+ KB


In [101]:
# Re-count the names in the de-duplicated frame.
df_unique['Name'].value_counts()

Lala Retro™ Whipped Refillable Moisturizer with Ceramides       1
Dramatically Different Face Moisturizing Gel                    1
SA Cream                                                        1
Active Moist Oil-Free Moisturizer                               1
Clinique Smart Clinical Repair Wrinkle Correcting Face Cream    1
                                                               ..
Ultra Repair Oil-Control Moisturizer                            1
Squalane + Omega Repair Deep Hydration Moisturizer              1
Ultra Repair Face Moisturizer                                   1
Strength Trainer Peptide Boost Firming Moisturizer              1
Confidence In A Gel Lotion Oil-Free Moisturizer                 1
Name: Name, Length: 132, dtype: int64

In [102]:
import csv
# Create moisturizer_data.csv with just the header row and close the handle
# straight away. Leaving a buffered handle open here is harmful: when it is
# eventually flushed, its pending header bytes can be written over the start of
# the file that the later df_unique.to_csv call produces on the same path.
with open('moisturizer_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Name', 'Brand', 'Price', 'Popularity', 'Rating', 'Store', 'Concern 1', 'Concern 2', 'Concern 3'])

72

In [103]:
# Persist the de-duplicated frame (header included, index column omitted).
file_path = 'moisturizer_data.csv'
df_unique.to_csv(path_or_buf=file_path, index=False)

Tkinter references used for the GUI below:

- https://www.pythonguis.com/tutorials/create-gui-tkinter/
- https://www.tutorialspoint.com/how-can-i-create-a-dropdown-menu-from-a-list-in-tkinter


## Building the GUI

In [106]:
import pandas as pd
# Load the cleaned data set and report the cheapest and most expensive price.
df = pd.read_csv('moisturizer_data.csv')
price_column = df['Price']
minPrice, maxPrice = price_column.min(), price_column.max()
print(minPrice, maxPrice)

7.99 260.0


In [107]:
from tkinter import *
# Create the main window: title, allowed size range, and a 600x600 initial
# size placed 50px from the left and top screen edges.
root = Tk()
root.title(string='Moisturizer Recommender')
root.minsize(width=400, height=400)
root.maxsize(width=800, height=800)
root.geometry(newGeometry='600x600+50+50')

''

### Get skin concerns

In [108]:
def getSelectedConcerns():
    """Collect the skin concerns whose checkbox is currently ticked.

    Reads the module-level `concern_dict` (concern name -> Tk IntVar), prints
    the selected names and also returns them, so that other code can use the
    selection. Tk ignores the return value when this is used as a button command.

    Returns:
        list[str]: selected concern names, in `concern_dict` order.
    """
    selected_concerns = [concern for concern, var in concern_dict.items() if var.get() == 1]
    print(selected_concerns)
    return selected_concerns

# One IntVar per concern backs the checkbox state that getSelectedConcerns reads.
skin_concerns = [
    'fine lines',
    'dryness',
    'dullness',
    'loss of firmness',
    'uneven texture',
    'redness',
    'dark spots',
]
concern_dict = {concern: IntVar() for concern in skin_concerns}

Label(root, text='Select Skin Concerns (up to 3)').grid(row=0, column=0, columnspan=2, padx=10, pady=5)

# Checkboxes occupy grid rows 1..len(skin_concerns), directly below the label.
for grid_row, concern in enumerate(skin_concerns, start=1):
    checkbox = Checkbutton(root, text=concern, variable=concern_dict[concern])
    checkbox.grid(row=grid_row, column=1, sticky=W, padx=10, pady=5)

# next free grid row for the widgets created in the following cells
rowNum = len(skin_concerns) + 1

### Get budget by connecting psycopg2 to postgres

In [109]:
import os

import psycopg2

# Connection settings are read from the standard libpq environment variables
# so that no credentials live in the notebook. Host, database and user fall
# back to the previous values; the password has no fallback and must be
# supplied via PGPASSWORD (or another libpq mechanism such as ~/.pgpass).
conn = psycopg2.connect(
    host=os.environ.get('PGHOST', 'localhost'),
    database=os.environ.get('PGDATABASE', 'moisturizer_data'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD'),
)
# try/finally guarantees the cursor and connection are released even if the
# query fails.
try:
    cursor = conn.cursor()
    try:
        cursor.execute('SELECT MIN(price) AS min_price, MAX(price) AS max_price FROM moisturizers;')
        result = cursor.fetchone()
    finally:
        cursor.close()
finally:
    conn.close()
minPrice, maxPrice = result[0], result[1]

# Budget section: a caption row followed by a row with two sliders that both
# span the queried price range.
Label(root, text='Select the budget range').grid(row=rowNum, column=1, columnspan=2, padx=5, pady=2)
rowNum += 1

# options shared by the two sliders
slider_options = {'from_': minPrice, 'to': maxPrice, 'orient': HORIZONTAL, 'resolution': 0.05}

minBudget = Scale(root, label='Minimum', **slider_options)
minBudget.set(minPrice)
minBudget.grid(row=rowNum, column=0, columnspan=2, padx=5, pady=2)

maxBudget = Scale(root, label='Maximum', **slider_options)
maxBudget.set(maxPrice)
maxBudget.grid(row=rowNum, column=3, columnspan=2, padx=5, pady=2)

### Get number of recommendations you want

In [None]:
# Drop-down for the number of recommendations; `menu` holds the current choice
# and initially shows a prompt.
menu = StringVar()
menu.set('Select the number of recommendations you want')
rowNum += 1
# Create the widget first and lay it out in a second step: .grid() returns None,
# so chaining it onto the constructor left `drop` bound to None.
drop = OptionMenu(root, menu, '1', '3', '5', '10')
drop.grid(row=rowNum, column=1, columnspan=2, padx=10, pady=5)

rowNum += 1
Button(root, text='Get Recommendations', command=getSelectedConcerns).grid(row=rowNum, column=1, columnspan=2, padx=10, pady=5)

# Blocks until the window is closed.
root.mainloop()