# Metal Designz Scraping

## Imports and Setup

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from time import sleep
import datetime
import csv
import re

# Selenium Scraping imports
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select

# Set Variables
# Intended pause (in seconds) between requests.
# NOTE(review): the visible cells call sleep(5) directly instead of using this constant.
SLEEP_TIME = 5
# Page handed to page_parser() as the entry point for discovering category links
STARTING_LINK = 'https://www.metaldesignz.com/jump-rings/'
# CSV file the product links are written to and later read back from
LINK_FILE_PATH = '../../product_page_listings/metal_designz/metal_designz_products.csv'


## Get Data

### Find All Product Pages

Looking at their website, it appears that the basic structure is a main page with sidebar links to sub-pages for each ring category. The sub-pages link to product pages, and when a category has enough products, its listing is paginated.


In [None]:

# Helper function that gets the product links from a category page
def get_product_links(soup: BeautifulSoup, writer: csv.writer) -> None:
    """Write the link of every product listed on a category page.

    Args:
        soup: Parsed HTML of a category page.
        writer: CSV writer that receives one single-column row per link.

    A page without a ``ul.productGrid`` element writes nothing, and product
    cards without a usable link are skipped instead of raising.
    """
    product_grid = soup.find('ul', {'class': 'productGrid'})
    if product_grid is None:
        # find() returns None when nothing matches (e.g. an empty category)
        print('No product grid found on page; no links written')
        return

    for product in product_grid.find_all('li', {'class': 'product'}):
        link_tag = product.find('a', {'class': 'card-figure__link'})
        if link_tag is None or not link_tag.get('href'):
            continue
        writer.writerow([link_tag['href']])


# Get link to next page if applicable
def get_next_page(soup: BeautifulSoup) -> str:
    """Return the URL of the next listing page, or None when there is none.

    Args:
        soup: Parsed HTML of a category listing page.

    Returns:
        The ``href`` of the link labelled "Next" inside the pagination nav.
        None is returned when the page has no pagination nav, no "Next" link,
        or the link carries no ``href``.
    """
    pagination = soup.find('nav', {'class': 'pagination'})
    if pagination is None:
        # No pagination nav on this page, so there is nothing to follow
        return None

    link_tag = pagination.find('a', {'aria-label': 'Next'})
    if link_tag is None:
        return None

    return link_tag.get('href')


# Download a page and create a parsable object from it
def _fetch_soup(url: str) -> BeautifulSoup:
    """Return the parsed HTML of ``url``.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


# From the starting link find all product pages
def page_parser(url: str, writer: csv.writer) -> None:
    """Crawl every jump ring category and write all product links found.

    Args:
        url: Starting page whose sidebar lists the category pages.
        writer: CSV writer handed to ``get_product_links`` for every page.

    Raises:
        RuntimeError: if the starting page has no "Jump Rings" sidebar section.
        requests.RequestException: if a page cannot be downloaded.
    """
    soup = _fetch_soup(url)

    # Get links to category pages
    sidebar = soup.find('aside', {'class': 'page-sidebar'})
    category_sidebar = None
    if sidebar is not None:
        category_sidebar = sidebar.find(
            lambda tag: tag.name == 'div' and tag.find('h2', string='Jump Rings') is not None
        )
    if category_sidebar is None:
        raise RuntimeError(f'No "Jump Rings" sidebar section found on {url}')

    # De-duplicate while keeping page order so repeated runs crawl in the same
    # order; anchors without an href are ignored
    category_links = list(dict.fromkeys(
        tag['href'] for tag in category_sidebar.find_all('a', href=True)
    ))

    # Iterate through top level links
    for num, link in enumerate(category_links):
        print(f"Checking cateogry: {num} -  {link}")

        # Walk the category's first page and then every following page
        page_url = link
        while page_url:
            sleep(SLEEP_TIME)
            soup = _fetch_soup(page_url)

            # Get the product links from the page
            get_product_links(soup, writer)

            # Get link to the next page if applicable
            page_url = get_next_page(soup)
            if page_url:
                print(f'Found New Page: {page_url}')


# Crawl from the starting page and save every product link to the link file
with open(LINK_FILE_PATH, mode='w', newline='') as link_file:
    page_parser(STARTING_LINK, csv.writer(link_file))


### Get Data From Product Pages

#### Try Scraping Using Beautiful Soup


##### Store Test Data Locally


In [None]:

# Product pages used as test cases
urls = [
    "https://www.metaldesignz.com/stainless-steel-jump-rings-14-gauge-9-32-id/",
    "https://www.metaldesignz.com/rainbowed-anodized-titanium-jump-rings-16-gauge-3-8-id/",
]

# Download each test page once (pausing before every request) and keep the
# parsed HTML in memory so the parsing code can be re-run without new requests
pages = []
for url in urls:
    sleep(5)
    pages.append(BeautifulSoup(requests.get(url).content, 'html.parser'))



##### Define a Function to Get Data from the Page

In [None]:

def parse_page(page: BeautifulSoup) -> dict:
    """Extract product details from a parsed product page.

    Args:
        page: Parsed HTML of a single product page.

    Returns:
        A dict with the keys defined in ``out`` below. Wire diameter, internal
        diameter and aspect ratio stay None when the product title does not
        contain a recognised gauge or an inch fraction such as ``9/32"``.
        ``color``, ``bags_in_stock`` and ``rings_per_bag`` are never filled in
        by this function.

    Raises:
        AttributeError: if the SKU, title, breadcrumb or price element is
            missing from the page.
        IndexError: if the breadcrumb has fewer than three entries.
    """
    # Define output
    # NOTE(review): the "time_accessed " key carries a trailing space; it is
    # kept as-is so anything consuming these records keeps working.
    out = {
        "time_accessed ":       str(datetime.datetime.now()),
        "sku":                  None,
        "product_name":         None,
        "material":             None,
        "price":                None,
        "currency":             "CAD",
        "wire_diameter_in":     None,
        "wire_diameter_mm":     None,
        "wire_diameter_gauge":  None,
        "internal_diameter_in": None,
        "internal_diameter_mm": None,
        "aspect_ratio":         None,
        "color":                None,
        "bags_in_stock":        None,
        "rings_per_bag":        None,
    }

    # Get sku
    out['sku'] = page.find('dd', {'data-product-sku': True}).text

    # Get title (reused below for the gauge and inner diameter)
    title = page.find('h1', {'class': 'productView-title'}).text
    out['product_name'] = title

    # Get Material: the third breadcrumb entry
    bread_crumbs = page.find('nav', {'aria-label': 'Breadcrumb'})
    out['material'] = bread_crumbs.find_all('li')[2].text.strip()

    # Get Price (kept as the raw text shown on the page)
    price_section = page.find('div', {'class': 'productView-price'})
    price_tag = price_section.find('span', {'data-product-price-without-tax': True})
    out['price'] = price_tag.text

    # Get Wire diameter info
    gauge_info = {
        # Follows stated rule that Metal Designz uses SWG for 14-19 or AWG for 20-24
        # Each entry is a tuple in the format: (inches, millimetres, Gauge & System)
        '14': (0.080, 2.032, '14 SWG'),
        '15': (0.072, 1.829, '15 SWG'),
        '16': (0.064, 1.626, '16 SWG'),
        '17': (0.056, 1.422, '17 SWG'),
        '18': (0.048, 1.219, '18 SWG'),
        '19': (0.040, 1.016, '19 SWG'),
        '20': (0.0320, 0.812, '20 AWG'),
        '21': (0.0285, 0.723, '21 AWG'),
        '22': (0.0253, 0.644, '22 AWG'),
        '23': (0.0226, 0.573, '23 AWG'),
        '24': (0.0201, 0.511, '24 AWG'),
    }
    # Only the number directly in front of "g"/"gauge" is the gauge; other
    # digits earlier in the title must not be glued onto it
    gauge_match = re.search(r'(\d+) ?g', title, re.IGNORECASE)
    if gauge_match and gauge_match.group(1) in gauge_info:
        (out['wire_diameter_in'],
         out['wire_diameter_mm'],
         out['wire_diameter_gauge']) = gauge_info[gauge_match.group(1)]

    # Get Ring Inner Diameter from an inch fraction such as 9/32"
    id_match = re.search(r'(\d+)/(\d+)"', title)
    if id_match:
        numerator, denominator = (float(part) for part in id_match.groups())
        if denominator:
            id_in = numerator / denominator
            out['internal_diameter_in'] = id_in
            out['internal_diameter_mm'] = id_in * 25.4

    # Get Aspect Ratio (needs both diameters)
    if out['internal_diameter_in'] and out['wire_diameter_in']:
        out['aspect_ratio'] = out['internal_diameter_in'] / out['wire_diameter_in']

    # Return data
    return out


##### Run All Test Cases

In [None]:

# Parse every locally stored test page and print the extracted record
for page in pages:
    results = parse_page(page)
    print(results)



##### Experimentation

In [None]:

# Scratch cell: repeat the inner-diameter extraction on the first stored test page
page = pages[0]

# Product title text
title = page.find('h1', {'class': 'productView-title'}).text
# Inch fraction such as 9/32" taken from the title
id_in_str = re.search(r'\d+\/\d+"', title, re.IGNORECASE).group()
numerator, denominator = id_in_str.strip('"').split('/')
# Fraction as a decimal number of inches, then converted to millimetres
id_in = float(numerator) / float(denominator)
id_mm = id_in * 25.4


#### Using Selenium

##### Setup

In [2]:

# Define a function that converts unclean strings into clean numbers
def get_number(string):
    """Return the digits and decimal points found in ``string`` as a float.

    Every other character (currency symbols, letters, spaces, commas) is
    dropped before the conversion. ``float`` raises ValueError when the
    remaining characters do not form a valid number.
    """
    kept = []
    for char in string:
        if char == '.' or char.isdigit():
            kept.append(char)
    return float(''.join(kept))


# Create an easy way to start the webdriver
def start_driver():
    """Start a Selenium-controlled Chrome browser and return its driver."""
    # Default Chrome options; the headless argument is left disabled
    # (uncomment the line below to enable it)
    browser_options = Options()
    # browser_options.add_argument('--headless=new')

    # ChromeDriverManager().install() supplies the driver path given to Service
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(options=browser_options, service=service)


# Define a function that yields info about the products on a page given a url and a webdriver
from typing import Iterator

# Wire gauge lookup. Follows stated rule that Metal Designz uses SWG up to
# 19 gauge and AWG for 20-24 gauge.
# Each entry is a tuple in the format: (inches, millimetres, Gauge & System)
_GAUGE_INFO = {
    '12': (0.104, 2.642, '12 SWG'),
    '13': (0.092, 2.337, '13 SWG'),
    '14': (0.080, 2.032, '14 SWG'),
    '15': (0.072, 1.829, '15 SWG'),
    '16': (0.064, 1.626, '16 SWG'),
    '17': (0.056, 1.422, '17 SWG'),
    '18': (0.048, 1.219, '18 SWG'),
    '19': (0.040, 1.016, '19 SWG'),
    '20': (0.0320, 0.812, '20 AWG'),
    '21': (0.0285, 0.723, '21 AWG'),
    '22': (0.0253, 0.644, '22 AWG'),
    '23': (0.0226, 0.573, '23 AWG'),
    '24': (0.0201, 0.511, '24 AWG'),
}

# Material prefixes used to build a SKU when the page does not show one
_MATERIAL_CODES = {
    "Anodized Aluminum": "AA",
}


def _assign_selects(selects, labels):
    """Return ``(qty_select, color_select)`` for the option drop-downs.

    The first label decides which drop-down is the colour one: it is treated
    as a colour option when its text mentions "colour" or "matte". Either
    entry is None when the page has no such drop-down.
    """
    if not selects:
        return None, None

    first_is_color = False
    if labels:
        label_text = labels[0].text.lower()
        first_is_color = 'colour' in label_text or 'matte' in label_text

    if len(selects) == 1:
        only_select = Select(selects[0])
        return (None, only_select) if first_is_color else (only_select, None)

    if first_is_color:
        return Select(selects[1]), Select(selects[0])
    return Select(selects[0]), Select(selects[1])


def _parse_bag_quantity(option_text):
    """Return the ring count named in a quantity option, or None if absent."""
    # Prefer the number that follows "of" (e.g. "Bag of 100"); otherwise fall
    # back to the first number anywhere in the text
    after_of = re.search(r'of\D*(\d[\d,]*)', option_text, re.IGNORECASE)
    if after_of:
        digits = after_of.group(1)
    else:
        any_number = re.search(r'\d[\d,]*', option_text)
        if any_number is None:
            return None
        digits = any_number.group()
    return int(digits.replace(',', ''))


def _read_price(driver):
    """Return the price currently displayed on the page as a float."""
    price_section = driver.find_element(By.CSS_SELECTOR, 'div.productView-price')
    price_text = price_section.find_element(By.CSS_SELECTOR, 'span[data-product-price-without-tax]').text
    return get_number(price_text)


def _read_sku(driver, out, gauge, id_fraction):
    """Return the SKU currently displayed, or a generated one when it is blank.

    The generated SKU is ``<material code><gauge><numerator><denominator>-<rings
    per bag>``. None is returned when the page shows no SKU and any of those
    parts is unknown.
    """
    sku = driver.find_element(By.CSS_SELECTOR, 'dd[data-product-sku]').text
    if sku:
        return sku

    material_code = _MATERIAL_CODES.get(out['material'])
    if (material_code is None or gauge is None or id_fraction is None
            or out['rings_per_bag'] is None):
        return None

    numerator, denominator = id_fraction
    return f"{material_code}{gauge}{numerator}{denominator}-{out['rings_per_bag']}"


def parse_page(url: str, driver: webdriver.Chrome) -> Iterator[dict]:
    """Yield one record per purchasable variant of the product at ``url``.

    The page is loaded in ``driver`` and every combination of the quantity and
    colour drop-downs (skipping the first entry of each) is selected in turn.
    A page without drop-downs yields a single record.

    Args:
        url: Product page to load.
        driver: Selenium driver used to load and interact with the page.

    Yields:
        A fresh dict per variant with the keys defined in ``out`` below, so
        callers may safely collect the records into a list. Values that cannot
        be determined are None.
    """
    # Get the page and wait a bit for it to load
    driver.get(url)
    sleep(3)

    # Define output dictionary
    # NOTE(review): the "time_accessed " key carries a trailing space; it is
    # kept as-is so anything consuming these records keeps working.
    out = {
        "time_accessed ":       str(datetime.datetime.now()),
        "sku":                  None,
        "product_name":         None,
        "material":             None,
        "price":                None,
        "currency":             "CAD",
        "wire_diameter_in":     None,
        "wire_diameter_mm":     None,
        "wire_diameter_gauge":  None,
        "internal_diameter_in": None,
        "internal_diameter_mm": None,
        "aspect_ratio":         None,
        "color":                None,
        "bags_in_stock":        None,
        "rings_per_bag":        None,
    }

    # Get product name
    out['product_name'] = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
    name_lower = out['product_name'].lower()

    # Get material: the third breadcrumb entry
    breadcrumbs = driver.find_element(By.CSS_SELECTOR, 'nav[aria-label=Breadcrumb]')
    out['material'] = breadcrumbs.find_elements(By.CSS_SELECTOR, 'li')[2].text.strip()

    ## Alternate method for invalid materials: take the material from the
    ## product name, capitalised consistently
    if out['material'] == 'Square Jump Rings':
        for needle, material in (
            ('stainless steel', 'Stainless Steel'),
            ('copper', 'Copper'),
            ('bronze', 'Bronze'),
            ('anodized aluminum', 'Anodized Aluminum'),
        ):
            if needle in name_lower:
                out['material'] = material
                break

    # Get Wire diameter info; gauge stays None unless it is a known gauge
    gauge = None
    gauge_match = re.search(r'\d+ ?g', out['product_name'], re.IGNORECASE)
    if gauge_match:
        gauge_digits = ''.join(char for char in gauge_match.group() if char.isdigit())
        if gauge_digits in _GAUGE_INFO:
            gauge = gauge_digits
            (out['wire_diameter_in'],
             out['wire_diameter_mm'],
             out['wire_diameter_gauge']) = _GAUGE_INFO[gauge]

    # Get Ring Inner Diameter from an inch fraction such as 9/32"
    id_fraction = None
    id_in_match = re.search(r'\d+\/\d+"', out['product_name'], re.IGNORECASE)
    if id_in_match:
        numerator, denominator = id_in_match.group().strip('"').split('/')
        id_fraction = (numerator, denominator)
        id_in = float(numerator) / float(denominator)
        out['internal_diameter_in'] = id_in
        out['internal_diameter_mm'] = id_in * 25.4

    # Get Aspect Ratio
    if out['internal_diameter_in'] and out['wire_diameter_in']:
        out['aspect_ratio'] = out['internal_diameter_in'] / out['wire_diameter_in']

    # Parse options; a page without an option container is treated as having
    # no drop-downs
    selects, labels = [], []
    option_divs = driver.find_elements(By.CSS_SELECTOR, 'div[data-product-option-change]')
    if option_divs:
        selects = option_divs[0].find_elements(By.CSS_SELECTOR, 'Select')
        labels = option_divs[0].find_elements(By.CSS_SELECTOR, 'label')
    qty_select, color_select = _assign_selects(selects, labels)

    # Iterate through quantity options (index 0 is skipped)
    if qty_select is not None:
        for qty_index in range(1, len(qty_select.options)):
            qty_select.select_by_index(qty_index)
            sleep(2)

            # Get Selected Quantity
            out['rings_per_bag'] = _parse_bag_quantity(qty_select.first_selected_option.text)

            if color_select is not None:
                # Handle colors: SKU and price are read only once both options
                # are selected, so they describe this exact variant
                for color_index in range(1, len(color_select.options)):
                    color_select.select_by_index(color_index)
                    sleep(2)
                    out['color'] = color_select.first_selected_option.text
                    out['sku'] = _read_sku(driver, out, gauge, id_fraction)
                    out['price'] = _read_price(driver)
                    yield dict(out)
            else:
                out['sku'] = _read_sku(driver, out, gauge, id_fraction)
                out['price'] = _read_price(driver)
                yield dict(out)

    elif color_select is not None:
        for color_index in range(1, len(color_select.options)):
            color_select.select_by_index(color_index)
            sleep(2)

            out['color'] = color_select.first_selected_option.text
            out['sku'] = _read_sku(driver, out, gauge, id_fraction)
            out['price'] = _read_price(driver)
            yield dict(out)

    else:
        # Get qty from the product name first, so a generated SKU can use it
        qty_match = re.search(r'\d+ rings', out['product_name'], re.IGNORECASE)
        if qty_match:
            out['rings_per_bag'] = int(''.join(char for char in qty_match.group() if char.isdigit()))

        out['price'] = _read_price(driver)
        out['sku'] = _read_sku(driver, out, gauge, id_fraction)
        yield dict(out)




##### Run Parsing Function on All Product Pages

In [None]:
driver = start_driver()

# Choose which link to start at (good for resuming after potential failures)
start_at = 100

try:
    # Open and read in list of product pages, ignoring blank rows
    with open(LINK_FILE_PATH, 'r', newline='') as product_links:
        links = [row[0] for row in csv.reader(product_links) if row]

    # Iterate through links starting at the one defined by the user
    for i, link in enumerate(links[start_at:], start=start_at):
        print(f"Testing Link {i:03}: {link}")

        # Read data from the product page
        for product in parse_page(link, driver):
            print(product)

        sleep(5)
finally:
    # Close the browser even after a failure; a resumed run starts a new one
    driver.quit()

##### Run Parsing Function on Testing Pages

In [3]:

# Testing URLs (each comment names the case the page is meant to exercise)
urls = [
    "https://www.metaldesignz.com/stainless-steel-jump-rings-14-gauge-9-32-id/",                # Basic parsing
    "https://www.metaldesignz.com/rainbowed-anodized-titanium-jump-rings-16-gauge-3-8-id/",     # Basic parsing
    "https://www.metaldesignz.com/anodized-aluminum-16-gauge-3-16/",                            # Color Test
    "https://www.metaldesignz.com/anodized-aluminum-jump-rings-16-gauge-3-16/",                 # Aberrant page: unique select order - lack of proper sku - unique bag qty text
    "https://www.metaldesignz.com/anodized-aluminum-jump-rings-20-gauge-3-16-matte-colours/",   # Different type of color option box
    "https://www.metaldesignz.com/square-wire-stainless-steel-jump-rings-18-gauge-9-64-id/",    # New product name pattern and incorrect material
    "https://www.metaldesignz.com/stainless-steel-half-round-rings-14g-7-32/",                  # New type of ring gauge in product name
    "https://www.metaldesignz.com/stainless-steel-jump-rings-16-gauge-3-4-id-bag-of-10-rings/", # Has no selects
    "https://www.metaldesignz.com/bright-aluminum-mix-sold-by-the-ounce-limited-quantities/",   # Non-specific product
    "https://www.metaldesignz.com/bright-aluminum-jump-rings-12-gauge-9-32-id/",                # Rings are 12 gauge
    "https://www.metaldesignz.com/rubber-jump-rings-10-mm/",                                    # Has only color option
]


# Start a browser; this cell does not quit it afterwards
driver = start_driver()

# Print every record produced for each test page, pausing between pages
for url in urls:
    for product in parse_page(url, driver):
        print(product)
    sleep(5)


{'time_accessed ': '2023-07-27 20:26:33.422328', 'sku': 'SS14932-100', 'product_name': 'Stainless Steel Jump Rings 14 Gauge 9/32" id.', 'material': 'Stainless Steel', 'price': 11.25, 'currency': 'CAD', 'wire_diameter_in': 0.08, 'wire_diameter_mm': 2.032, 'wire_diameter_gauge': '14 SWG', 'internal_diameter_in': 0.28125, 'internal_diameter_mm': 7.14375, 'aspect_ratio': 3.515625, 'color': None, 'bags_in_stock': None, 'rings_per_bag': 100}
{'time_accessed ': '2023-07-27 20:26:33.422328', 'sku': 'SS14932-250', 'product_name': 'Stainless Steel Jump Rings 14 Gauge 9/32" id.', 'material': 'Stainless Steel', 'price': 25.88, 'currency': 'CAD', 'wire_diameter_in': 0.08, 'wire_diameter_mm': 2.032, 'wire_diameter_gauge': '14 SWG', 'internal_diameter_in': 0.28125, 'internal_diameter_mm': 7.14375, 'aspect_ratio': 3.515625, 'color': None, 'bags_in_stock': None, 'rings_per_bag': 250}
{'time_accessed ': '2023-07-27 20:26:33.422328', 'sku': 'SS14932-500', 'product_name': 'Stainless Steel Jump Rings 14 Ga

##### Experimentation

In [None]:
# Scratch cell: start a browser and load one page for manual inspection
driver = start_driver()

sleep(5)

# Exactly one of the pages below is loaded; the others are kept commented out
# so they can be switched in by hand
# driver.get("https://www.metaldesignz.com/stainless-steel-jump-rings-14-gauge-9-32-id/")
# driver.get("https://www.metaldesignz.com/anodized-aluminum-16-gauge-3-16/")
# driver.get("https://www.metaldesignz.com/anodized-aluminum-jump-rings-16-gauge-3-16/")
# driver.get("https://www.metaldesignz.com/anodized-aluminum-jump-rings-20-gauge-3-16-matte-colours/")
# driver.get("https://www.metaldesignz.com/square-wire-stainless-steel-jump-rings-18-gauge-9-64-id/")
# driver.get("https://www.metaldesignz.com/stainless-steel-half-round-rings-14g-7-32/")
# driver.get("https://www.metaldesignz.com/stainless-steel-jump-rings-16-gauge-3-4-id-bag-of-10-rings/")
# driver.get("https://www.metaldesignz.com/bright-aluminum-mix-sold-by-the-ounce-limited-quantities/")
driver.get("https://www.metaldesignz.com/rubber-jump-rings-10-mm/")



In [None]:

# Scratch cell: work out which drop-down on the loaded page is the quantity
# option and which is the colour option
option_div = driver.find_element(By.CSS_SELECTOR, 'div[data-product-option-change]')
selects = option_div.find_elements(By.CSS_SELECTOR, 'Select')
labels = option_div.find_elements(By.CSS_SELECTOR, 'label')

## Both stay None unless a matching drop-down is found
qty_select = None
color_select = None

# The first label decides which drop-down is the colour one
if len(selects) > 0:
    first_label_text = labels[0].text.lower()
    first_is_color = 'colour' in first_label_text or 'matte' in first_label_text

    if len(selects) == 1:
        if first_is_color:
            color_select = Select(selects[0])
        else:
            qty_select = Select(selects[0])
    elif first_is_color:
        color_select = Select(selects[0])
        qty_select = Select(selects[1])
    else:
        qty_select = Select(selects[0])
        color_select = Select(selects[1])

