# Ring Lord Scraping

## Imports and Setup

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
import datetime

# Selenium Scraping imports
from selenium import webdriver 
from selenium.webdriver import Chrome 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from webdriver_manager.chrome import ChromeDriverManager

# Set Variables
# Seconds to pause before a request so the site is not hit too quickly
SLEEP_TIME = 5
# Page the crawl starts from
STARTING_LINK = 'https://theringlord.com/rings/'

## Get Data

### Find All Product Pages

The layout of The Ring Lord's website can be described as a series of collection and product pages, where a collection page's links of interest point either to other collection pages or to product pages. Because we start on a collection page, and the number of collection pages that must be traversed to reach a product page is not fixed but also not very large, I feel that recursion is the most reasonable strategy for finding all the product pages.


In [None]:

# Define a function to find all the product pages
def page_parser(url: str, visited: "set[str] | None" = None) -> None:
    """Recursively crawl from *url*, printing each page's type.

    A page is treated as a collection when it contains at least one
    ``<li class="product">`` element; every absolute link found inside those
    elements is then crawled in turn. Any other page is reported as a product.

    Args:
        url: Address of the page to fetch and classify.
        visited: URLs already crawled during this run. Callers normally omit
            it; it is shared across the recursive calls so that each page is
            fetched only once and link cycles cannot recurse forever.
    """
    if visited is None:
        visited = set()

    # Skip pages that were already crawled, before sleeping or fetching
    if url in visited:
        return
    visited.add(url)

    # Ensure responsibility interval is respected
    sleep(SLEEP_TIME)

    # Print basic info
    now = datetime.datetime.now()
    print(f'{now}: Checking {url}: ', end='')

    # Get soup object from URL. A timeout keeps one stalled connection from
    # hanging the crawl, and a failed request is reported instead of being
    # misclassified as a product page or aborting the whole run.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'failed ({error})')
        return
    soup = BeautifulSoup(response.content, 'html.parser')

    # Determine if page is a product or collection
    products = soup.find_all('li', {'class': 'product'})
    if not products:
        print('product')
        return

    print('collection')

    # Get all unique absolute links; href=True skips anchors without an href
    unique_links = set()
    for product in products:
        for a in product.find_all('a', href=True):
            link = a['href']
            if link.startswith('http'):
                unique_links.add(link)

    # Parse all product links (sorted so the crawl order is reproducible)
    for link in sorted(unique_links):
        page_parser(link, visited)


# Start the crawl at STARTING_LINK
page_parser(STARTING_LINK)


### Get Data From Product Page

Now that there is a way to find product pages we need to find a way to scrape all of the following data from the product page:
* SKU
* Material
* Price
* Quantity
* Color (if applicable)
* Wire Gauge
* Wire Diameter
* Ring Aspect Ratio
* Stock

#### Using Beautiful Soup

As a first step, I will use Beautiful Soup to learn how to parse values and scrape data from the page.


##### Get Sample Data


In [2]:
# To reduce the number of requests to the website the following code is run to download and store repeatable sample pages for testing.

# Determine sample pages
product_urls = [
    "https://theringlord.com/enameled-copper-20ga-7-64-id/",  # Image option select
    "https://theringlord.com/stainless-steel-24g/",           # radio button option select
]

# Get Sample pages and store as local variable
product_pages = []
for product_url in product_urls:
    # A timeout keeps a stalled connection from hanging the cell
    response = requests.get(product_url, timeout=30)
    # Fail loudly rather than storing an error page as a test sample
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    product_pages.append(soup)
    # Use the shared request interval instead of a separate hard-coded delay
    sleep(SLEEP_TIME)


#### Test Sample Data

In [3]:
# The purpose of this cell is to test the function I wrote to parse the data on the test pages.


# Look up one value in the product's description list
def _description_value(description_list, label: str) -> str:
    """Return the text of the ``<dd>`` paired with the ``<dt>`` containing *label*.

    Raises:
        ValueError: If no ``<dt>`` contains *label*, or it has no following
            ``<dd>`` sibling.
    """
    term = description_list.find(lambda tag: tag.name == 'dt' and label in tag.text)
    if term is None:
        raise ValueError(f'No {label!r} entry found in the product description list')

    # find_next_sibling skips any whitespace text nodes between the tags, so
    # this does not depend on exactly one such node sitting between dt and dd.
    definition = term.find_next_sibling('dd')
    if definition is None:
        raise ValueError(f'The {label!r} entry has no value in the product description list')
    return definition.text


# Define a function to parse the product page
def parse_product(soup: BeautifulSoup) -> None:
    """Print the title, price, quantity, wire diameters, material and SKU of a product page.

    Args:
        soup: Parsed HTML of a single product page.

    Raises:
        ValueError: If a description-list field is missing or the wire
            diameter text does not have the expected four parts.
    """

    # Get item with product information from page
    product_view = soup.find_all('div', {'class': 'productView'})[0]

    # Get and clean up title (collapse every run of whitespace to one space)
    title = product_view.select('h1[class=productView-title]')[0].get_text()
    clean_title = ' '.join(title.split())

    # Print out item title
    print(clean_title)

    # Get price currency
    currency = soup.find_all('main')[0]['data-currency-code']

    # Get price
    price = product_view.select('span[data-product-price-without-tax]')[0].get_text()
    print(f"\t{price} {currency}")

    # The quantity, wire OD and SKU all live in the same description list
    description_list = product_view.find_all('dl')[0]

    # Get Quantity (keep only the digits of the value)
    quantity_text = _description_value(description_list, 'Quantity:')
    quantity = ''.join(c for c in quantity_text if c.isdigit())

    print(f"\t{quantity}")

    # Get wire outer diameters. Per the sample output, the value reads like
    # '0.032" (0.81mm) = 20g AWG'; drop the punctuation and split on whitespace.
    wire_text = _description_value(description_list, 'Wire OD:')
    wds = wire_text.replace('(', '').replace(')', '').replace('=', ' ').split()
    if len(wds) < 4:
        raise ValueError(f'Unexpected wire diameter format: {wire_text!r}')

    diameters = {
        'imperial': wds[0],
        'metric': wds[1],
        'gauge': f"{wds[2]} {wds[3]}"
    }

    print(f"\t{diameters}")

    # Get ring material (read from the third breadcrumb entry)
    breadcrumb = soup.find_all('nav', attrs={'aria-label': 'Breadcrumb'})[0]
    material_link = breadcrumb.find_all('li')[2]
    material = material_link.find_all('span')[0].text

    print(f'\t{material}')

    # Get SKU
    sku = _description_value(description_list, 'SKU:')

    print(f"\t{sku}")


# Run parse function on test products; each call prints the parsed fields
# of one of the sample pages stored in product_pages
for product_page in product_pages:
    parse_product(product_page)


Enameled copper 20ga 7/64'' ID
	$2.44 USD
	570
	{'imperial': '0.032"', 'metric': '0.81mm', 'gauge': '20g AWG'}
	Enameled Copper
	SX-EC-20764
Stainless Steel 24g
	$4.82 - $4.99 USD
	1000
	{'imperial': '0.02"', 'metric': '0.5mm', 'gauge': '24g AWG'}
	Stainless Steel
	MC-SS-24


#### Experimentation

In [None]:

# Get productView
page = product_pages[1]
product_view = page.find_all('div', attrs={'class': 'productView'})[0]

# Find the <dt> labelled 'SKU:' in the first description list
description_list = product_view.find_all('dl')[0]
sku_dt = description_list.find(lambda tag: tag.name == 'dt' and 'SKU:' in tag.text)
# find_next_sibling skips whitespace text nodes between the <dt> and its <dd>,
# so this does not depend on exactly one such node sitting between them
sku_tag = sku_dt.find_next_sibling('dd')
sku = sku_tag.text

print(sku)

