In [3]:
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
import pandas as pd
import regex
import time

try:
    # Local ChromeDriver binary and the listing page the scrape starts from.
    driver_path = r"C:\Program Files (x86)\ChromeDriver\chromedriver.exe"
    website = "https://www.jomashop.com/fragrances.html"

    chrome_options = webdriver.ChromeOptions()
    # Uncomment to run without a visible browser window.
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1280,1440")

    service = Service(driver_path)
    # The options object has to be handed to the driver; otherwise none of the
    # arguments configured above are applied to the browser session.
    driver = webdriver.Chrome(service=service, options=chrome_options)

    driver.get(website)

    # Hitting ESC on the body element to close the modal that appears whenever you load the page.
    # The fixed sleeps presumably give the modal time to appear and then to close.
    time.sleep(4)
    driver.find_element(By.XPATH, '//body').send_keys(Keys.ESCAPE)
    time.sleep(2)

    # Listings present right after the initial load.
    # NOTE(review): `frags` is not referenced again in the visible code.
    frags = driver.find_elements(By.XPATH, '//div[@class="product-details"]')
    # Fragrance concentration labels (long forms and abbreviations).
    types = [
        "Eau De Toilette",
        "Eau De Parfum",
        "Eau De Cologne",
        "Extrait De Parfum",
        "Cologne",
        "EDC",
        "EDP",
        "EDT",
        "Parfum"
    ]
    
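    # Appends every pricing field for a listing to the column lists (everything except the discount %, which find_frags collects itself).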
    def append_pricing_info(frag, frag_price, frag_coupon, frag_price_after_coupon, frag_retail_price=None):
        """Append one listing's pricing fields to the caller's column lists.

        Exactly one value is appended to each supplied list on every call (an
        empty string when the corresponding element is missing), so the lists
        stay row-aligned with each other.

        Args:
            frag: Element for a single listing; it is searched with XPaths
                relative to itself via ``find_element``.
            frag_price: List receiving the text of the "now-price" div.
            frag_coupon: List receiving the "coupon-tag" text with a trailing
                " coupon" (any case) removed.
            frag_price_after_coupon: List receiving the "after-price" text with
                " after coupon" (any case) removed. Left blank when the listing
                has no coupon tag.
            frag_retail_price: Optional list receiving the "was-wrapper" text.
                Skipped entirely when ``None``.
        """
        def find_or_none(class_fragment):
            # Look up a <div> beneath `frag` whose class attribute contains the fragment.
            try:
                return frag.find_element(By.XPATH, f'.//div[contains(@class, "{class_fragment}")]')
            except NoSuchElementException:
                return None

        retail = price = coupon = price_after_coupon = ''
        try:
            # The original / retail price before a discount is applied. It is only
            # recorded when both the outer wrapper and the inner price element exist.
            was_price_wrapper = find_or_none("was-price-wrapper")
            was_wrapper = find_or_none("was-wrapper")
            if was_price_wrapper is not None and was_wrapper is not None:
                retail = was_wrapper.text

            # The current price with any coupons / discounts applied.
            now_price = find_or_none("now-price")
            if now_price is not None:
                price = now_price.text

            # The coupon amount in dollars.
            coupon_tag = find_or_none("coupon-tag")
            if coupon_tag is not None:
                coupon = regex.sub(r"(\s+coupon)", "", coupon_tag.text, flags=regex.IGNORECASE)

                # The now-price after the coupon is applied; only looked up for listings with a coupon.
                after_price = find_or_none("after-price")
                if after_price is not None:
                    price_after_coupon = regex.sub(
                        r"(\s+after\s+coupon)", "", after_price.text, flags=regex.IGNORECASE
                    )
        except Exception as e:
            # Best effort: keep whatever was read so far and still append below,
            # so an unexpected failure on one listing cannot misalign the columns.
            print(f"Error appending pricing info: {e}")

        if frag_retail_price is not None:
            frag_retail_price.append(retail)
        frag_price.append(price)
        frag_coupon.append(coupon)
        frag_price_after_coupon.append(price_after_coupon)
            
    # Scroll the page incrementally 6 times and stop when the bottom is reached.
    def scroll_page(driver, scroll_count=6, pause=1.5):
        """Scroll the page down in equal steps until the bottom is reached.

        The step size is the document height measured before scrolling divided
        by ``scroll_count``. After every step the function pauses and then
        checks whether the viewport has reached the end of the document; if it
        has, scrolling stops early.

        Args:
            driver: Object exposing ``execute_script`` (a Selenium WebDriver).
            scroll_count: Maximum number of scroll steps. Nothing happens when
                it is zero or negative.
            pause: Seconds to sleep after each step before checking the position.
        """
        if scroll_count <= 0:
            return

        initial_height = driver.execute_script("return document.body.scrollHeight")
        scroll_increment = initial_height // scroll_count

        for _ in range(scroll_count):
            driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
            time.sleep(pause)
            # Compare the viewport's lower edge with the current document height.
            # An unchanged document height alone does not mean the bottom was
            # reached. The 1px slack absorbs fractional scroll offsets.
            at_bottom = driver.execute_script(
                "return window.innerHeight + window.pageYOffset >= document.body.scrollHeight - 1;"
            )
            if at_bottom:
                break
    
    def find_frags(driver, max_pages=1):
        """Scrape the fragrance listings page by page into a DataFrame.

        For every listing the brand, a cleaned product name, the concentration
        type, the pricing fields and the discount label are collected as one
        row. Optional elements that are missing yield an empty string, so every
        row always has all columns. The result is also written to ``frags.csv``.

        Args:
            driver: Selenium WebDriver already showing the first listing page.
            max_pages: Number of listing pages to scrape. The default of 1 is
                the testing limit; pass ``None`` to follow the "next" link
                until it no longer exists.

        Returns:
            pandas.DataFrame with the columns BRAND, NAME, TYPE, RETAIL_PRICE,
            DISCOUNT, COUPON, PRICE_AFTER_COUPON and PRICE.
        """
        columns = ['BRAND', 'NAME', 'TYPE', 'RETAIL_PRICE', 'DISCOUNT', 'COUPON', 'PRICE_AFTER_COUPON', 'PRICE']

        # TODO: group 1 of name_pattern captures Men's/Ladies/Unisex, but no GENDER column is emitted yet.
        name_pattern = regex.compile(r"^(?:(Men's|Ladies|Unisex)\s+)?(?:-\s*)?([^/]+?)(?=\s?Spray\s?|\s*/|\s+by|\s+\(|\s+\d+\.\d+|\s+(?:Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)|$)|\((m|u|w)\)", regex.IGNORECASE)
        type_pattern = regex.compile(r"(Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)", regex.IGNORECASE)

        def first_text(element, xpath):
            # Text of the first match beneath `element`, or '' when nothing matches.
            # find_elements returns an empty list instead of raising, which suits optional elements.
            matches = element.find_elements(By.XPATH, xpath)
            return matches[0].text if matches else ''

        def only_value(values):
            # Value recorded by append_pricing_info, or '' if it recorded nothing.
            return values[0] if values else ''

        rows = []
        # Keyed on the listing's text rather than on the element object, so a
        # product that shows up again (e.g. on a later page) is skipped.
        seen_frags = set()
        page_counter = 0

        while max_pages is None or page_counter < max_pages:
            scroll_page(driver)

            for frag in driver.find_elements(By.XPATH, '//div[contains(@class, "product-details")]'):
                try:
                    orig_name = frag.find_element(By.XPATH, './/span[@class="name-out-brand"]').text
                    brand = frag.find_element(By.XPATH, './/span[@class="brand-name"]').text

                    listing_key = (brand, orig_name)
                    if listing_key in seen_frags:
                        continue
                    seen_frags.add(listing_key)

                    # Group 2 is empty when only the "(m|u|w)" alternative matched;
                    # fall back to the raw name in that case.
                    name_match = name_pattern.match(orig_name)
                    if name_match and name_match.group(2):
                        cleaned_name = name_match.group(2).strip()
                    else:
                        cleaned_name = orig_name

                    type_match = type_pattern.search(orig_name)
                    extr_type = type_match.group(1).strip() if type_match else ""
                    if "parfum" in cleaned_name.lower():
                        extr_type = "Parfum"

                    # Per-listing scratch lists keep this row complete no matter
                    # how many values the helper manages to record.
                    price, coupon, after_coupon, retail = [], [], [], []
                    append_pricing_info(frag, price, coupon, after_coupon, retail)

                    discount = first_text(frag, './/span[@class="tag-item discount-label"]')
                except Exception as e:
                    # Best effort per listing: report it and move on to the next one.
                    print(f"Scraping Error: {e}")
                    continue

                # The row is only added once every field was gathered, so the columns cannot drift apart.
                rows.append({
                    'BRAND': brand,
                    'NAME': cleaned_name,
                    'TYPE': extr_type,
                    'RETAIL_PRICE': only_value(retail),
                    'DISCOUNT': discount,
                    'COUPON': only_value(coupon),
                    'PRICE_AFTER_COUPON': only_value(after_coupon),
                    'PRICE': only_value(price),
                })

            page_counter += 1
            if max_pages is not None and page_counter >= max_pages:
                # Page limit reached; no need to load a page that will not be scraped.
                break

            try:
                next_page = driver.find_element(By.XPATH, "//li[contains(@class, 'pagination-next page-item')]//a[contains(@class, 'page-link')]")
                next_page.click()
                time.sleep(2)
            except NoSuchElementException:
                last_name = rows[-1]['NAME'] if rows else ''
                print(f"Last page reached, scraping complete. Last item scraped: {last_name}")
                break

        # Passing `columns` fixes the column order and keeps the header even when no rows were scraped.
        df_frags = pd.DataFrame(rows, columns=columns)

        df_frags.to_csv('frags.csv', index=False)

        # returning the dataframe in order to access it in the find_notes function.
        return df_frags
        
    # Run the listing scrape. The result is kept so that its NAME column can be
    # passed to find_notes (see the commented-out call further down).
    df_frags_full = find_frags(driver)

    def find_notes(driver, frag_names):
        """Open the Parfumo perfume index and dismiss its cookie-consent dialog.

        Work in progress: ``frag_names`` and the locals prepared at the end of
        the function are not used yet.

        Args:
            driver: Selenium WebDriver used to load the page.
            frag_names: Fragrance names to look up (currently unused).
        """
        driver.get("https://www.parfumo.com/Perfumes")

        # Cookies modal pops up when loading the page for the first time, clicking the accept button.
        # When it does not show up within the wait, there is nothing to dismiss.
        try:
            cookies_iframe = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//iframe[contains(@id, 'sp_message_iframe')]")))
        except TimeoutException:
            cookies_iframe = None

        if cookies_iframe is not None:
            driver.switch_to.frame(cookies_iframe)
            try:
                driver.find_element(By.XPATH, "//button[contains(@class, 'sp_choice_type_11')]").click()
            finally:
                # Always return to the main document, even if the button lookup or click fails.
                driver.switch_to.default_content()

        # TODO: not used yet. Same expression as the listing scraper's name pattern, but compiled without IGNORECASE.
        name_pattern_no_conc = regex.compile(r"^(?:(Men's|Ladies|Unisex)\s+)?(?:-\s*)?([^/]+?)(?=\s?Spray\s?|\s*/|\s+by|\s+\(|\s+\d+\.\d+|\s+(?:Eau De Parfum|Eau De Cologne|Eau De Toilette|Extrait De Parfum|EDP|EDT|EDC|Cologne)|$)|\((m|u|w)\)")

        # Some fragrances start with the same name, but have a different perfume concentration (EDP, EDT, Cologne, etc)
        # TODO: not used yet.
        results_already_scraped = set()

    # find_notes(driver, df_frags_full["NAME"])
    
except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()