In [5]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
import pandas as pd
import regex
import time

try:
    driver_path = r"C:\Program Files (x86)\ChromeDriver\chromedriver.exe"
    website = "https://www.jomashop.com/fragrances.html"
    
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1280,1440")
                            
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    
    driver.get(website)

    # Hitting ESC on the body element to close the modal that appears whenever you load the page.
    time.sleep(4)
    driver.find_element(by='xpath', value='//body').send_keys(Keys.ESCAPE)
    time.sleep(2)
    
    frags = driver.find_elements(by='xpath', value='//div[@class="product-details"]')

    # These are the different types of fragrance strength, application (like what it is used in / for).
    fragrance_types = [
        "Aftershave", "Bath and Shower Products", "Body Spray", "Car Diffuser", "Cleansers",
        "Deodorant", "Diffuser", "Eau De Parfum", "Eau De Cologne", "Eau De Toilette", "Extrait de Parfum", "Eau de Cologne", "Eau de Parfum", "Eau de Toilette",
        "Extrait De Parfum", "Free Water", "Gift Set", "Lotion & Oils", "Lotions & Creams",
        "Mist", "Oil & Serums", "Parfum", "Perfume Oil", "Room Fragrance", "Room Spray",
        "Scented Candle", "Scented Cards", "Scrubs, Foams & Exfoliants", "Shower Gels", "Soap",
        "Solid Parfum", "Tools", "Wash", "EDP", "EDT"
    ]
    
    def find_frags(driver):
        seen_frags = set()
        frag_brand = []
        frag_name = []
        frag_type = []
        frag_size = []
        frag_gender = []
        frag_price = []
        frag_retail_price = []
        frag_discount = []
        page_counter = 0
        
        # Scroll the page incrementally 6 times and stop when the bottom is reached.
        def scroll_page(driver):
           
            scroll_count = 6
            last_height = driver.execute_script("return document.body.scrollHeight")
            scroll_increment = last_height // scroll_count
    
            for _ in range(scroll_count):
                driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
                time.sleep(1.5)
                new_height = driver.execute_script("return document.body.scrollHeight")
    
                if new_height == last_height:  
                    break
                last_height = new_height
        
        # I have the fragrance types trimmed down so it's easier to test, but will include them all after.

        # Currently having an issue with parsing out the "m" "w" "u" text at the end of some fragrances. Any that don't have "Men's" "Ladies" or "Unisex" in front
        # have either of the three single-character indicators, so it's not like anything could even be overwritten. Python recognizes that there are four groups, but it isn't parsing them. 
        # They get captured in testers I've used on Debuggex and Regex101 so I am not sure what the deal is here.
        pattern = regex.compile(
            r"^(?:(Men's|Ladies|Unisex)\s+)?([^/]+?)(?=\s*/|\s+by|\s+\(|\s+\d+\.\d+|\s+(?:Eau De Parfum|Eau De Cologne|Eau De Toilette|EDP|EDT)|$)|\((m|u|w)\)$", regex.IGNORECASE
        )
        # Set to 1 for testing, would technically be just any higher number than the total page count I guess (total results / 60 which is the # of items per page) since it stops when it reaches the last page anyway.
        while page_counter < 1:
            scroll_page(driver)

            new_frags = driver.find_elements(by='xpath', value='//div[contains(@class, "product-details")]')
            for frag in new_frags:
                if frag not in seen_frags:
                    # Avoids duplication by only adding unique fragrances to the set. Using a set as opposed to a list since we won't be pulling dupes and need this to not be slow. There will be like 250 pages lol.
                    seen_frags.add(frag)
                try:
                    orig_name = frag.find_element(by='xpath', value='.//span[@class="name-out-brand"]').text
                    match = regex.match(pattern, orig_name)
                    if match:
                        # Group 1 = "Men's", "Ladies", "Unisex"
                        # Group 2 = Fragrance Name, excluding the brand.
                        # Group 3 = "(m), "(w)", "(u)" (case insensitive) 
                        """ This portion is not working for some reason. It gets captured in various testers, but not here.
                            I have tried various forms of the patterns, eventually reducing the number of groups down to three. Python recognizes there are three groups but doesn't parse what it captures in the 3rd group."""
                        gender = match.group(1) or match.group(3)
                        cleaned_name = match.group(2).strip()
                             
                        print("Matched groups:", match.groups())
                    else:
                        cleaned_name = orig_name
                        
                    # Adding each found element to the list.
                    frag_name.append(cleaned_name)
                    frag_gender.append(gender)
                    frag_brand.append(frag.find_element(by='xpath', value='.//span[@class="brand-name"]').text)
                    frag_price.append(frag.find_element(by='xpath', value='.//div[@class="now-price"]').text)
                    frag_retail_price.append(frag.find_element(by='xpath', value='.//div[@class="was-wrapper"]').text)
                    frag_discount.append(frag.find_element(by='xpath', value='.//span[@class="tag-item discount-label"]').text)
                    frag_type.append("")
                    frag_size.append("")
                except Exception as e:
                    print(f"Scraping Error: {e}")
                    continue
    
            try:
                # Moving to the next page. 
                next_page = driver.find_element(by='xpath', value="//li[contains(@class, 'pagination-next page-item')]//a[contains(@class, 'page-link')]")
                next_page.click()
                page_counter += 1
                time.sleep(2)
            except Exception:
                break

        # Ensuring all cols are of equal length to fit inside the DataFrame.
        max_length = max(len(frag_brand), len(frag_name), len(frag_type), len(frag_size), len(frag_gender), len(frag_price), len(frag_retail_price), len(frag_discount))
        frag_brand.extend([''] * (max_length - len(frag_brand)))
        frag_name.extend([''] * (max_length - len(frag_name)))
        frag_type.extend([''] * (max_length - len(frag_type)))
        frag_size.extend([''] * (max_length - len(frag_size)))
        frag_gender.extend([''] * (max_length - len(frag_gender)))
        frag_price.extend([''] * (max_length - len(frag_price)))
        frag_retail_price.extend([''] * (max_length - len(frag_retail_price)))
        frag_discount.extend([''] * (max_length - len(frag_discount)))

        df_frags = pd.DataFrame({
            'BRAND': frag_brand,
            'NAME': frag_name,
            'TYPE': frag_type,
            'SIZE': frag_size,
            'GENDER': frag_gender, 
            'RETAIL_PRICE': frag_retail_price,
            'PRICE': frag_price,
            'DISCOUNT': frag_discount
        })
    
        df_frags.to_csv('frags.csv', index=False)
    
    find_frags(driver)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()

Matched groups: (None, 'Aventus', None)
Matched groups: (None, 'Y', None)
Matched groups: ("Men's", 'French Avenue Liquid Brun', None)
Matched groups: ("Men's", 'Layton', None)
Matched groups: (None, 'Emporio Stronger With You Intensely', None)
Matched groups: ('Unisex', 'Oud Wood', None)
Matched groups: ('Ladies', 'Delina Exclusif', None)
Matched groups: ("Men's", 'Hero', None)
Matched groups: ('Unisex', 'Club De Nuit Untold', None)
Matched groups: (None, 'Spicebomb Extreme', None)
Matched groups: ("Men's", 'The Most Wanted Parfum Spray', None)
Matched groups: ("Men's", 'Uomo Born In Roma Coral Fantasy', None)
Matched groups: (None, 'Instant Crush', None)
Matched groups: ('Unisex', 'Greenwich Village', None)
Matched groups: ("Men's", 'Myslf', None)
Matched groups: (None, 'The Most Wanted', None)
Matched groups: ('Unisex', 'Khamrah', None)
Matched groups: ('Unisex', 'Kirke Extrait de Parfum Spray', None)
Matched groups: ("Men's", 'Replica Jazz Club', None)
Matched groups: ('Ladies', 'D