In [4]:
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
import pandas as pd
import regex
import time

try:
    driver_path = r"C:\Program Files (x86)\ChromeDriver\chromedriver.exe"
    website = "https://www.jomashop.com/fragrances.html"
    
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1280,1440")
                            
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    
    driver.get(website)

    # Hitting ESC on the body element to close the modal that appears whenever you load the page.
    time.sleep(4)
    driver.find_element(by='xpath', value='//body').send_keys(Keys.ESCAPE)
    time.sleep(2)
    
    frags = driver.find_elements(by='xpath', value='//div[@class="product-details"]')
    types = [
        "Eau De Toilette",
        "Eau De Parfum",
        "Eau De Cologne",
        "Extrait De Parfum",
        "Cologne",
        "EDC",
        "EDP",
        "EDT",
        "Parfum"
    ]
    
    # Scroll the page incrementally 6 times and stop when the bottom is reached.
    def scroll_page(driver):
        scroll_count = 6
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_increment = last_height // scroll_count

        for _ in range(scroll_count):
            driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
            time.sleep(1.5)
            new_height = driver.execute_script("return document.body.scrollHeight")

            if new_height == last_height:  
                break
            last_height = new_height
    
    def find_frags(driver):
        seen_frags = set()
        frag_brand = []
        frag_name = []
        frag_type = []
        #frag_gender = []
        frag_price = []
        frag_retail_price = []
        frag_discount = []
        page_counter = 0
        
        name_pattern = regex.compile(r"^(?:(Men's|Ladies|Unisex)\s+)?(?:-\s*)?([^/]+?)(?=\s?Spray\s?|\s*/|\s+by|\s+\(|\s+\d+\.\d+|\s+(?:Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)|$)|\((m|u|w)\)", regex.IGNORECASE)
        type_pattern = regex.compile(r"(Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)", regex.IGNORECASE)
        
        """ 
        Use "while True" when finished testing as it will run until the last page is found (based on logic further down)
        """
        # while True:
        while page_counter < 1: # Set to 1 for testing.
            scroll_page(driver)

            new_frags = driver.find_elements(by='xpath', value='//div[contains(@class, "product-details")]')
            for frag in new_frags:
                if frag not in seen_frags:
                    # Avoids duplication by only adding unique fragrances to the set. Using a set as opposed to a list since we won't be pulling dupes and need this to not be slow. There will be like 250 pages lol.
                    seen_frags.add(frag)
                try:
                    orig_name = frag.find_element(by='xpath', value='.//span[@class="name-out-brand"]').text
                    name_match = regex.match(name_pattern, orig_name)
                    type_match = regex.search(type_pattern, orig_name)

                    for ftype in types:
                        if ftype not in orig_name:
                            continue
                        else:
                            break
                            
                    if name_match:
                        cleaned_name = name_match.group(2).strip()   
                    else:
                        cleaned_name = orig_name
                    if type_match:
                        extr_type = type_match.group(1).strip()
                    else:
                        extr_type = ""

                    pricing_elements = frag.find_elements(by='xpath', value='.//div[@class="price-wrapper"]')

                    for el in pricing_elements:
                        print(f"{el.text}\n")
                    
                    frag_name.append(cleaned_name)    
                    if "parfum" in cleaned_name.lower():
                        extr_type = "Parfum"
                    frag_type.append(extr_type)
                    #frag_gender.append(gender)
                    frag_brand.append(frag.find_element(by='xpath', value='.//span[@class="brand-name"]').text)
                    frag_price.append(frag.find_element(by='xpath', value='.//div[@class="now-price"]').text)
                    frag_retail_price.append(frag.find_element(by='xpath', value='.//div[@class="was-wrapper"]').text)
                    frag_discount.append(frag.find_element(by='xpath', value='.//span[@class="tag-item discount-label"]').text)
                except Exception as e:
                    print(f"Scraping Error: {e}")
                    continue
            try:
                next_page = driver.find_element(by='xpath', value="//li[contains(@class, 'pagination-next page-item')]//a[contains(@class, 'page-link')]")
                next_page.click()
                page_counter += 1
                time.sleep(2)
            except NoSuchElementException:
                print(f"Last page reached, scraping complete. Last item scraped: {cleaned_name}")
                break

        # Ensuring all cols are of equal length to fit inside the DataFrame.
        max_length = max(len(frag_brand), len(frag_name), len(frag_type), len(frag_price), len(frag_retail_price), len(frag_discount))
        frag_brand.extend([''] * (max_length - len(frag_brand)))
        frag_name.extend([''] * (max_length - len(frag_name)))
        frag_type.extend([''] * (max_length - len(frag_type)))
       #frag_gender.extend([''] * (max_length - len(frag_gender)))
        frag_price.extend([''] * (max_length - len(frag_price)))
        frag_retail_price.extend([''] * (max_length - len(frag_retail_price)))
        frag_discount.extend([''] * (max_length - len(frag_discount)))

        df_frags = pd.DataFrame({
            'BRAND': frag_brand,
            'NAME': frag_name,
            'TYPE': frag_type,
            #'GENDER': frag_gender, 
            'RETAIL_PRICE': frag_retail_price,
            'DISCOUNT': frag_discount,
            'PRICE': frag_price
        })

        df_frags.to_csv('frags.csv', index=False)

        # returning the dataframe in order to access it in the find_notes function.
        return df_frags
        
    df_frags_full = find_frags(driver)

    # Navigates to Parfumo (similar to fragrantica), searches for the fragrance, click the result, and scrape the notes & accords from the page.
    def find_notes(driver, frag_names):
        driver.get("https://www.parfumo.com/Perfumes")
    
        # Cookies modal pops up when loading the page for the first time, clicking the accept button.
        cookies_iframe = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//iframe[contains(@id, 'sp_message_iframe')]")))                                                 
        driver.switch_to.frame(cookies_iframe)
        driver.find_element(By.XPATH, "//button[contains(@class, 'sp_choice_type_11')]").click()
        driver.switch_to.default_content()

        name_pattern_no_conc = regex.compile(r"^(?:(Men's|Ladies|Unisex)\s+)?(?:-\s*)?([^/]+?)(?=\s?Spray\s?|\s*/|\s+by|\s+\(|\s+\d+\.\d+|\s+(?:Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)|$)|\((m|u|w)\)")
                                     
        # Some fragrances start with the same name, but have a different perfume concentration (EDP, EDT, Cologne, etc)
        results_already_scraped = set()

    # find_notes(driver, df_frags_full["NAME"])
    
except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()

$495.00
$299.99
$40.00 coupon
$259.99 after coupon
$80.00
$39.99
$125.00
$98.95
$375.00
$337.50
$460.00
$235.99
$20.00 coupon
$215.99 after coupon
$125.00
$49.99
$20.00 coupon
$29.99 after coupon
$125.00
$84.99
$138.00
$99.99
$295.00
$109.99
$20.00 coupon
$89.99 after coupon
$120.00
$86.99
$165.00
$123.75
$140.00
$88.99
$445.00
$193.99
$80.00
$29.99
$155.00
$109.99
$200.00
$99.99
$190.00
$89.99
$168.00
$105.00
$130.00
$59.99
$115.00
$58.99
$470.00
$219.95
$168.00
$88.99
$95.00
$25.99
$335.00
$185.00
$165.00
$114.99
$105.00
$49.99
$470.00
$229.00
$110.00
$49.99
$470.00
$219.00
$125.00
$47.99
$240.00
$149.99
$495.00
$229.99
$185.00
$109.99
$155.00
$116.25
$220.00
$84.99
$110.00
$69.99
$220.00
$59.99
$275.00
$164.99
$140.00
$89.99
$137.00
$92.99
$49.99
$20.00 coupon
$29.99 after coupon
Scraping Error: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div[@class="was-wrapper"]"}
  (Session info: chrome=131.0.6778.205)
Stacktrace:
	GetHandleVerifier [0x000