# Ring Lord Scraping

## Imports and Setup

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
import datetime
import csv

# Selenium Scraping imports
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from webdriver_manager.chrome import ChromeDriverManager

# import Selenium exceptions
import selenium.common.exceptions as sel_exceptions

# Set Variables
SLEEP_TIME = 5
STARTING_LINK = 'https://theringlord.com/rings/'
LINK_FILE_PATH = 'ring_lord_products.csv'

## Get Data

### Find All Product Pages

The layout of The Ring Lord's website can be described as a series of collection and product pages where a collection page's links of interest are to other collection pages or product pages. As we start on a collection page and the number of collection pages to reach a prodcut page is not fixed but not too high I feel that recursion is the most reasonable strategy to find all the product pages.


In [2]:

# Create a set to check for uniqueness in product pages found
links_encountered = set()

# Open output file
out_file = open(LINK_FILE_PATH, 'w', newline='')
writer = csv.writer(out_file)


# Define a function to find all the product pages
def page_parser(url: str) -> None:
    # Ensure responsibility interval is respected
    sleep(SLEEP_TIME)

    # Print basic info
    now = datetime.datetime.now()
    print(f'{now}: Checking {url}: ', end='')

    # Get soup object from URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Determine if page is a product or collection
    products = soup.find_all('li', {'class': 'product'})
    collection = len(products) > 0

    if collection:
        print('collection')
        
        # Get all unique product links
        unique_links = set()
        for product in products:
            for a in product.find_all('a'):
                link = a['href']
                if 'http' in link:
                    unique_links.add(link)
        
        # Parse all product links
        for link in unique_links:
            page_parser(link)

    else:
        print('product')
        if url not in links_encountered:
            links_encountered.add(url)
            writer.writerow([url])

try: 
    page_parser(STARTING_LINK)
finally:
    out_file.close()


2023-05-30 18:32:44.014042: Checking https://theringlord.com/rings/: collection
2023-05-30 18:32:50.954367: Checking https://theringlord.com/rings/rubber/: collection
2023-05-30 18:32:57.261073: Checking https://theringlord.com/epdm-rubber-rings-16-ga-5-16-id/: product
2023-05-30 18:33:05.756311: Checking https://theringlord.com/milky-clear-rubber-rings/: product
2023-05-30 18:33:13.898246: Checking https://theringlord.com/glow-in-the-dark-silicone-rubber-rings-16-ga-1-4-id/: product
2023-05-30 18:33:22.298764: Checking https://theringlord.com/glow-in-the-dark-silicone-rubber-rings-14-ga-5-16-id/: product
2023-05-30 18:33:30.562299: Checking https://theringlord.com/epdm-rubber-rings-16-ga-1-4-id/: product
2023-05-30 18:33:38.976192: Checking https://theringlord.com/16g-3-8-black-epdm-rings-sold-by-the-ounce/: product
2023-05-30 18:33:47.101019: Checking https://theringlord.com/epdm-rubber-rings-16-ga-3-16-id/: product
2023-05-30 18:33:55.357684: Checking https://theringlord.com/epdm-ru

### Get Data From Product Page

Now that there is a way to find product pages we need to find a way to scrape all of the following data from the product page:
* SKU
* Material
* Price
* Quantity
* Color(If applicable)
* Wire Gauge
* Wire Diameter
* Ring Aspect Ratio
* Stock

#### Using Beautiful Soup

As a first start and to learn how to parse values from the page I will use Beautiful Soup to learn how to scrape data From the page.


##### Get Sample Data


In [None]:
# To reduce the number of requests to the website the following code is run to download and store repeatable sample pages for testing.

# Determine sample pages
product_urls = [
    # Image option pages
    "https://theringlord.com/enameled-copper-20ga-7-64-id/",
    "https://theringlord.com/enameled-copper-20ga-3-32-id/",
    "https://theringlord.com/enameled-copper-19ga-1-8-id/",

    # Radio button option pages
    "https://theringlord.com/stainless-steel-24g/",
    "https://theringlord.com/saw-cut-stainless-steel-20g/",
    "https://theringlord.com/saw-cut-stainless-steel-22g/",
]

# Get Sample pages and store as local variable
product_pages = list()
for product_url in product_urls:
    response = requests.get(product_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    product_pages.append(soup)
    sleep(5)


#### Test Sample Data

In [None]:
# The purpose of this cell is to test the function I wrote to parse the data on the test pages.


# Define a function to parse the product page
def parse_product(soup: BeautifulSoup) -> None:

    # Get item with product information from page
    product_view = soup.find_all('div', {'class': 'productView'})[0]

    # Get and clean up title
    title = product_view.select('h1[class=productView-title]')[0].get_text()
    clean_title = ' '.join([x for x in title.split(' ') if x != ''])

    # Print out item title
    print(clean_title)

    

    # Get price currency
    currency = soup.find_all('main')[0]['data-currency-code']

    # Get price
    price = product_view.select('span[data-product-price-without-tax]')[0].get_text()
    print(f"\t{price} {currency}")

    # Get Quantity
    description_list = product_view.find_all('dl')[0]
    quantity_description = description_list.find(lambda tag: tag.name == 'dt' and 'Quantity:' in tag.text)
    if quantity_description:
        quantity_data = quantity_description.next_sibling.next_sibling
        quantity = ''.join([c for c in quantity_data.text if c.isdigit()])
    else:
        quantity = 'not provided'

    print(f"\t{quantity}")

    # Get wire outer diameters
    description_list = product_view.find_all('dl')[0]
    wire_diameter_description = description_list.find(lambda tag: tag.name == 'dt' and 'Wire OD:' in tag.text)
    wire_diameter = wire_diameter_description.next_sibling.next_sibling

    wire_text = wire_diameter.text

    wds = wire_text.replace('(', '').replace(')', '').replace('= ', '').split(' ')

    diameters = {
        'imperial': wds[0],
        'metric': wds[1],
        'gauge': f"{wds[2]} {wds[3]}"
    }

    print(f"\t{diameters}")

    # Get ring material
    breadcrumb = soup.find_all('nav', attrs={'aria-label': 'Breadcrumb'})[0]
    material_link = breadcrumb.find_all('li')[2]
    material = material_link.find_all('span')[0].text

    print(f'\t{material}')

    # Get SKU
    description_list = product_view.find_all('dl')[0]
    sku_dt = description_list.find(lambda tag: tag.name == 'dt' and 'SKU:' in tag.text)
    sku_tag = sku_dt.next_sibling.next_sibling
    sku = sku_tag.text

    print(f"\t{sku}")

    # Get option type
    options = product_view.find_all('section', attrs={'class': 'product-options'})[0]
    color_options = len(options.find_all('div', attrs={'class': 'form-option-wrapper'})) > 0
    option_type = 'color' if color_options else 'size'

    print(f"\t{option_type}")



# Run parse function on test products
for product_page in product_pages:
    parse_product(product_page)


#### Experimentation

In [None]:

# Get productView
page = product_pages[3]
product_view = page.find_all('div', attrs={'class': 'productView'})[0]


options = product_view.find_all('section', attrs={'class': 'product-options'})[0]
color_options = len(options.find_all('div', attrs={'class': 'form-option-wrapper'})) > 0
option_type = 'color' if color_options else 'size'

print(option_type)




#### Using Selenium

As The Ring Lord uses pages that dynamically change using javascript we will use Selenium to allow us to iterate over options on a page to get all the data.


##### Start Webdriver

Below we define a function to start the driver 

In [None]:

def start_driver():
    options = webdriver.ChromeOptions()
    options.headless = False
    options.page_load_strategy = 'none'

    chrome_path = ChromeDriverManager().install()
    chrome_service = Service(chrome_path)

    driver = Chrome(options=options, service=chrome_service)
    driver.implicitly_wait(5)

    return driver


##### Test Sample Data


In [None]:

# Determine sample pages
product_urls = [
    # Image option pages
    "https://theringlord.com/enameled-copper-20ga-7-64-id/",
    "https://theringlord.com/enameled-copper-20ga-3-32-id/",
    "https://theringlord.com/enameled-copper-19ga-1-8-id/",

    # Radio button option pages
    "https://theringlord.com/stainless-steel-24g/",
    "https://theringlord.com/saw-cut-stainless-steel-20g/",
    "https://theringlord.com/saw-cut-stainless-steel-22g/",
]

driver = start_driver()


def get_options_specific_values(option, per_option_quantity, per_option_id, diameters):

    # Get SKU
    sku = driver.find_element(By.XPATH, "//dt[contains(text(), 'SKU:')]/following-sibling::dd").text
    print(f"\t{sku}")


    # Get and print currency info
    product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
    price = product_view.find_element(By.CSS_SELECTOR, 'span[data-product-price-without-tax]').text
    currency = driver.find_element(By.CSS_SELECTOR, 'main').get_attribute('data-currency-code')
    print(f"\t\t{price} {currency}")


    # Get product stock
    stock = int(driver.find_element(By.CSS_SELECTOR, "span[data-product-stock]").text)
    print(f"\t\t{stock} bags in stock")


    # Get Quantity if available
    if per_option_quantity:
        qty_text = option.text.split('~')[1]
        qty = int(''.join([i for i in qty_text if i.isdigit()]))
        print(f"\t\t{qty} rings per bag")


    # Get Internal Diamater if applicable
    if per_option_id:
        text = option.text
        id_in = float(text.split(' ')[8].strip('"'))
        id_mm = float(text.split(' ')[9].strip('('))
        print(f"\t\tActual ID: {id_in} inches | {id_mm} milimeters")

        # Add calculation of Aspect Ratio
        od_in = float(diameters['imperial'].strip('"'))
        ar = id_in / od_in
        print(f"\t\tAspect Ratio: {ar}")


def parse_page(page):
    # Get the page and wait for it to load
    driver.get(page)
    sleep(2)

    
    # Get and print out the item title
    title = driver.find_element(By.CSS_SELECTOR, 'h1[class=productView-title]').text
    clean_title = ' '.join([x for x in title.split(' ') if x != ''])
    print(clean_title)


    # Get ring material
    bread_crumb = driver.find_element(By.CSS_SELECTOR, 'nav[aria-label=Breadcrumb]')
    material_link = bread_crumb.find_elements(By.TAG_NAME, 'li')[2]
    material = material_link.find_element(By.TAG_NAME, 'span').text
    print(f"\t{material}")


    # Get wire diameters
    product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
    wd_text = product_view.find_element(By.XPATH, "//dt[contains(text(), 'Wire OD:')]/following-sibling::dd").text
    wds = wd_text.replace('(', '').replace(')', '').replace('= ', '').split(' ')
    diameters = {
        'imperial': wds[0],
        'metric': wds[1],
        'gauge': f"{wds[2]} {wds[3]}"
    }
    print(f"\t{diameters}")    # Iterate through options


    # Get quantity per bag
    per_option_qty = False
    try:
        product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
        qty_text = product_view.find_element(By.XPATH, "//dt[contains(text(), 'Quantity:')]/following-sibling::dd").text
        qty = int(''.join([i for i in qty_text if i.isdigit()]))
        print(f"\t{qty} rings per bag")
    except sel_exceptions.NoSuchElementException:
        try:
            option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
            option = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")[0]
            per_option_qty = ('~' in option.text)
        except sel_exceptions.NoSuchElementException:
            print('\tno quantity info available')
    

    # Get Internal Diameter of weaves
    per_option_id = False
    try:
        internal_diameter = driver.find_element(By.XPATH, "//dt[contains(text(), 'Actual ID:')]/following-sibling::dd").text
        ids = internal_diameter.split('"')
        id_in = float(ids[0])
        id_mm = float(''.join([i for i in ids[1] if (i.isdigit() or i=='.')]))
        print(f"\tActual ID: {id_in} inches | {id_mm} milimeters")

        # Add calculation of Aspect Ratio
        od_in = float(diameters['imperial'].strip('"'))
        ar = id_in / od_in
        print(f"\tAspect Ratio: {ar}")

    except sel_exceptions.NoSuchElementException:
        try:
            option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
            option = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")[0]
            per_option_id = ' ID]' in option.text
        except sel_exceptions.NoSuchElementException:
            print('\tno internal diameter information available')


    # Get option type
    product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
    options = product_view.find_element(By.CSS_SELECTOR, "section.product-options")
    color_options = len(options.find_elements(By.CSS_SELECTOR, "div.form-option-wrapper")) > 0
    option_type = 'color' if color_options else 'size'


    ## Color options
    if option_type == 'color':
        option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
        options = option_container.find_elements(By.CSS_SELECTOR, "div.form-option-wrapper")

        for option in options:
            option.click()
            sleep(3)
            get_options_specific_values(option, per_option_qty, per_option_id, diameters)

    ## Size Options
    else:
        option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
        options = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")

        for option in options:
            option.click()
            sleep(3)
            get_options_specific_values(option, per_option_qty, per_option_id, diameters)


    # Wait to continue scraping
    sleep(5)

for page in product_urls:
    parse_page(page)


##### Experiment with Webdiver


In [None]:
### Start driver and get page
driver = start_driver()

driver.get("https://theringlord.com/enameled-copper-20ga-7-64-id/")
# driver.get("https://theringlord.com/stainless-steel-24g/")
# driver.get("https://theringlord.com/saw-cut-stainless-steel-20g/")
sleep(2)


In [None]:

"""
product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
wd_text = product_view.find_element(By.XPATH, "//dt[contains(text(), 'Wire OD:')]/following-sibling::dd").text
"""

# # Get material
# bread_crumb = driver.find_element(By.CSS_SELECTOR, 'nav[aria-label=Breadcrumb]')
# material_link = bread_crumb.find_elements(By.TAG_NAME, 'li')[2]
# material = material_link.find_element(By.TAG_NAME, 'span').text
# print(f"\t{material}")

# # Get sku
# sku = driver.find_element(By.XPATH, "//dt[contains(text(), 'SKU:')]/following-sibling::dd").text
# print(f"\t{sku}")

# # Get option type
# product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
# options = product_view.find_element(By.CSS_SELECTOR, "section.product-options")
# color_options = len(options.find_elements(By.CSS_SELECTOR, "div.form-option-wrapper")) > 0
# option_type = 'color' if color_options else 'size'
# print(f"\t{option_type}")

# # Iterate through options

# ## Color options
# if option_type == 'color':
#     option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
#     options = option_container.find_elements(By.CSS_SELECTOR, "div.form-option-wrapper")

#     for option in options:
#         option.click()
#         sleep(3)

# ## Size Options
# else:
#     option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
#     options = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")

#     for option in options:
#         option.click()
#         sleep(3)

# try:
#     product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
#     qty_text = product_view.find_element(By.XPATH, "//dt[contains(text(), 'Quantity:')]/following-sibling::dd").text
#     qty = int(''.join([i for i in qty_text if i.isdigit()]))
#     print(f"\t{qty} rings per bag")
# except sel_exceptions.NoSuchElementException:
#     try:
#         option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
#         option = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")[0]
#         per_option_qty = '~' in option.text
#     except sel_exceptions.NoSuchElementException:
#         print('\tno quantity info available')


# stock = int(driver.find_element(By.CSS_SELECTOR, "span[data-product-stock]").text)
# print(f"\t{stock} bags in stock")


# Get wire diameters
product_view = driver.find_element(By.CSS_SELECTOR, "div[class=productView]")
wd_text = product_view.find_element(By.XPATH, "//dt[contains(text(), 'Wire OD:')]/following-sibling::dd").text
wds = wd_text.replace('(', '').replace(')', '').replace('= ', '').split(' ')
diameters = {
    'imperial': wds[0],
    'metric': wds[1],
    'gauge': f"{wds[2]} {wds[3]}"
}
print(f"\t{diameters}") 


try:
    internal_diameter = driver.find_element(By.XPATH, "//dt[contains(text(), 'Actual ID:')]/following-sibling::dd").text
    ids = internal_diameter.split('"')
    id_in = float(ids[0])
    id_mm = float(''.join([i for i in ids[1] if (i.isdigit() or i=='.')]))
    print(f"\tActual ID: {id_in} inches | {id_mm} milimeters")
except sel_exceptions.NoSuchElementException:
    try:
        option_container = driver.find_element(By.CSS_SELECTOR, "div.productView-options")
        option = option_container.find_elements(By.CSS_SELECTOR, "label[data-product-attribute-value]")[0]
        per_option_id = ' ID]' in option.text
    except sel_exceptions.NoSuchElementException:
        print('\tno internal diameter information available')


od_in = float(diameters['imperial'].strip('"'))
ar = id_in / od_in
print(f"\tAspect Ratio: {ar}")


