### Selenium using *Remote* webdriver

A remote webdriver is used because some development is done on Apple arm64 silicon, which has insufficient support for native (browser) drivers such as geckodriver or chromedriver.


https://github.com/SeleniumHQ/docker-selenium

docker run -d -p 4444:4444 -p 7900:7900 --shm-size="2g" selenium/standalone-firefox:latest

Point your WebDriver tests to http://localhost:4444
To see what is happening inside the container, head to http://localhost:7900 (the password is `secret`)


In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from re import sub as re_sub
from collections import namedtuple
from decimal import Decimal
import time
import pickle
import pandas as pd

In [None]:
# Record types that hold the scraped data.
# ProductHierarchy: one menu entry (category, its subcategory, and the subcategory link).
ProductHierarchy = namedtuple(
    "ProductHierarchy",
    ["category", "subcategory", "href"],
)
# Products: one product row as collected from a subcategory listing page.
Products = namedtuple(
    "Products",
    ["name", "code", "subcategory", "url", "description", "unit_price"],
)

In [None]:
# Create a remote WebDriver session (Firefox, default options) against the
# Selenium endpoint listening on port 4444.
options = FirefoxOptions()
driver = webdriver.Remote(options=options, command_executor='http://172.17.0.3:4444')


In [None]:
# Load the Click landing page, find the element whose class contains
# 'tree dynamized', and within it the link whose text contains "Click Boards".
driver.get('https://www.mikroe.com/click')
menu = driver.find_element(
    by=By.XPATH,
    value="(//*[contains(@class, 'tree dynamized')])",
)
submenu = menu.find_element(by=By.PARTIAL_LINK_TEXT, value="Click Boards")

In [None]:
# Build the flat list of (category, subcategory, href) records from the menu tree.
product_hierarchy = []

# Categories that are intentionally left out of the scrape.
skipped_categories = ("Click Bundles", "Click Shields", "Legacy")

# The category entries are the children of the <ul> that shares a parent with
# the "Click Boards" link.
category_elements = submenu.find_elements(By.XPATH, './../ul/*')
for category_element in category_elements:
    category = category_element.find_element(By.TAG_NAME, 'a').get_attribute("textContent")
    # Drop any parenthesised fragment from the label and trim surrounding whitespace.
    category = re_sub(r"\([^()]*\)", "", category).strip()
    if category in skipped_categories:
        continue

    # A distinct loop variable keeps the outer category element from being shadowed.
    for subcategory_element in category_element.find_elements(By.XPATH, 'ul/*'):
        # Look the anchor up once and read both its label and its target from it.
        anchor = subcategory_element.find_element(By.TAG_NAME, 'a')
        subcategory = re_sub(r"\([^()]*\)", "", anchor.get_attribute("textContent")).strip()
        href = anchor.get_attribute("href")

        product_hierarchy.append(ProductHierarchy(category, subcategory, href))


In [None]:
# Save the scraped hierarchy records to disk, then load the same records into
# a DataFrame whose columns are the ProductHierarchy field names.
with open('mikroe_product_hierarchy.pickle', 'wb') as hierarchy_file:
    pickle.dump(product_hierarchy, hierarchy_file)

product_hierarchy_df = pd.DataFrame.from_records(
    data=product_hierarchy, columns=ProductHierarchy._fields
)

In [None]:
from selenium.common.exceptions import NoSuchElementException

# Visit every subcategory page, switch it to list view with the largest
# available page size, and collect one Products record per product shown.
products = []
subcategory_total = len(product_hierarchy)

try:
    for index, entry in enumerate(product_hierarchy):
        # Countdown shown in the progress line. It is derived from the loop
        # position so that skipped subcategories are counted down as well.
        subcategory_count = subcategory_total - index
        subcategory = entry.subcategory
        driver.get(str(entry.href))

        try:
            view = driver.find_element(By.XPATH, "(//*[contains(@class, 'product-view')])")
            view.find_element(By.ID, "list").click()
        except NoSuchElementException:
            print(f"Skipping {subcategory} - Cannot set the display to list view.")
            continue

        try:
            dropdown = driver.find_element(By.CLASS_NAME, "product-count").find_element(By.ID, "nb_item")
        except NoSuchElementException:
            print(f"Skipping {subcategory} - Cannot set the number of products displayed.")
            continue

        # find_elements returns an empty list (it does not raise) when nothing
        # matches, so a dropdown without options is handled explicitly.
        page_sizes = [
            int(option.get_attribute("value"))
            for option in dropdown.find_elements(By.TAG_NAME, "option")
        ]
        if not page_sizes:
            print(f"Skipping {subcategory} - Cannot set the number of products displayed.")
            continue

        # Pick the largest page size on offer.
        Select(dropdown).select_by_value(str(max(page_sizes)))
        # Fixed pause after changing the page size.
        # NOTE(review): presumably gives the listing time to refresh - confirm.
        time.sleep(2)

        # An empty result (not an exception) signals that the page has no products.
        product_elements = driver.find_elements(By.XPATH, "(//*[contains(@class, 'product-container')])")
        if not product_elements:
            print(f"Skipping {subcategory} - No product container on page.")
            continue

        product_count = len(product_elements)
        print(f"{subcategory_count} - {subcategory} has {product_count} products")

        for element in product_elements:
            code = element.find_element(By.CSS_SELECTOR, "div.content_price span.product-reference").get_attribute("textContent")
            # The product-name element carries both the display name and the product link.
            name_element = element.find_element(By.CLASS_NAME, "product-name")
            name = name_element.get_attribute("textContent")
            url = name_element.get_attribute("href")
            description = element.find_element(By.CLASS_NAME, "product-desc").get_attribute("textContent")
            unit_price = element.find_element(By.CLASS_NAME, "me-product-price").get_attribute("textContent")
            # Keep only digits and the decimal point before converting to Decimal.
            unit_price = Decimal(re_sub(r'[^\d.]', '', unit_price))

            products.append(Products(name, code, subcategory, url, description, unit_price))
finally:
    # Always end the remote session, even if scraping fails part-way through.
    driver.quit()

In [None]:
# Save the collected product records to disk, then load the same records into
# a DataFrame whose columns are the Products field names.
with open('mikroe_products.pickle', 'wb') as products_file:
    pickle.dump(products, products_file)

products_df = pd.DataFrame.from_records(data=products, columns=Products._fields)

In [None]:
# Attach each product's category by left-joining on the subcategory name.
# NOTE(review): the join key is the subcategory name alone; if the same name
# occurs under more than one category the merge will repeat product rows -
# confirm subcategory names are unique.
df = products_df.merge(
    product_hierarchy_df[["category", "subcategory"]],
    how="left",
    on="subcategory",
)
# Remove escaped (backslash + t/n/r) sequences as well as real tab, newline
# and carriage-return characters from every string value.
df = df.replace(
    to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"],
    value=["", ""],
    regex=True,
)
# Fix the column order used in the exported files.
cols = ['category', 'subcategory', 'code', 'name', 'url', 'description', 'unit_price']
df = df[cols]

# Export the combined table as tab-separated text and as a pickle.
df.to_csv("mikroe_all_products.tsv", sep="\t")
df.to_pickle("mikroe_all_products.pickle")

In [None]:
# Count the collected products per subcategory, join the counts onto the
# hierarchy, and display the subcategories that ended up with no products.
subcategory_counts = (
    products_df.groupby('subcategory')['subcategory']
    .count()
    .rename('cnt')
    .reset_index()
)
df2 = (
    product_hierarchy_df
    .merge(subcategory_counts, how="left", on="subcategory")
    .fillna(0)  # subcategories without any match get a count of 0
    .astype({'cnt': 'int32'})
)
df2[df2['cnt'] == 0]

In [None]:
# Round-trip the product records through 'mikroe.pickle': write them out,
# then read them straight back into `products`.
with open('mikroe.pickle', 'wb') as out_file:
    pickle.dump(products, out_file)

with open('mikroe.pickle', 'rb') as in_file:
    products = pickle.load(in_file)

In [None]:
# things = driver.find_elements(By.XPATH, "//div[@class='af_pl_wrapper']")
# things = driver.find_elements(By.CSS_SELECTOR, "li.ajax_block_product")
# things = driver.find_elements(By.CSS_SELECTOR, "div.product-container.clearfix")
# things = driver.find_elements(By.XPATH, "(//*[contains(@class, 'product-container')])")
# x = things[0].find_element(By.CSS_SELECTOR, "p.product-desc")