# Metal Designz Scraping

## Imports and Setup

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from time import sleep
import datetime
import csv
import re

# Set Variables
# Pause between HTTP requests; presumably seconds, as taken by time.sleep
SLEEP_TIME = 5
# Category landing page handed to page_parser as the crawl entry point
STARTING_LINK = 'https://www.metaldesignz.com/jump-rings/'
# CSV file (path relative to the working directory) that receives the product links
LINK_FILE_PATH = '../../product_page_listings/metal_designz/metal_designz_products.csv'


## Get Data

### Find All Product Pages

Looking at their website, the basic structure appears to be a main page with sidebar links to sub-pages for each ring category. Each sub-page links to product pages; when a category has enough products, its listing is split across multiple (paginated) pages.


In [None]:

# Helper function that gets the product links from a category page
def get_product_links(soup: BeautifulSoup, writer: csv.writer) -> None:
    """Write the link of every product listed on a category page to ``writer``.

    Args:
        soup: Parsed HTML of a category (listing) page.
        writer: CSV writer that receives one single-column row per product link.

    Pages without a ``ul.productGrid`` element, and product cards without a
    ``card-figure__link`` anchor carrying an ``href``, are skipped instead of
    raising ``AttributeError`` / ``KeyError``.
    """
    product_grid = soup.find('ul', {'class': 'productGrid'})
    if product_grid is None:
        # find() returns None when nothing matches (e.g. an empty category)
        return

    for product in product_grid.find_all('li', {'class': 'product'}):
        link_tag = product.find('a', {'class': 'card-figure__link'})
        if link_tag is None:
            continue

        link = link_tag.get('href')
        if link:
            writer.writerow([link])


# Get link to next page if applicable
def get_next_page(soup: BeautifulSoup) -> 'str | None':
    """Return the link to the next page of a paginated listing, if there is one.

    Args:
        soup: Parsed HTML of a category (listing) page.

    Returns:
        The ``href`` of the anchor labelled ``Next`` inside the pagination
        ``nav``, or ``None`` when the page has no pagination block, no such
        anchor, or the anchor has no ``href``.
    """
    pagination = soup.find('nav', {'class': 'pagination'})
    if pagination is None:
        # No pagination nav on this page: there is nothing further to follow
        return None

    link_tag = pagination.find('a', {'aria-label': 'Next'})
    if link_tag is None:
        return None

    return link_tag.get('href')


# Download a page and turn it into a parsable object
def _fetch_soup(url: str, timeout: float = 30) -> BeautifulSoup:
    """Fetch ``url`` and return the parsed HTML.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status (via ``raise_for_status``), so an error page is never
            parsed as if it were a listing.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


# From the starting link find all product pages
def page_parser(url: str, writer: csv.writer) -> None:
    """Crawl every jump ring category reachable from ``url`` and record product links.

    The sidebar block headed 'Jump Rings' on the starting page supplies the
    category links. Each category is then walked page by page (following the
    'Next' link) and every product link found is written to ``writer``.

    Args:
        url: Address of the main page whose sidebar lists the categories.
        writer: CSV writer that receives the product links.

    Raises:
        ValueError: If the starting page has no 'Jump Rings' sidebar block.
        requests.RequestException: If any page cannot be downloaded.
    """
    soup = _fetch_soup(url)

    # Get links to category pages
    sidebar = soup.find('aside', {'class': 'page-sidebar'})
    category_sidebar = None
    if sidebar is not None:
        category_sidebar = sidebar.find(
            lambda tag: tag.name == 'div' and tag.find('h2', string='Jump Rings') is not None
        )
    if category_sidebar is None:
        raise ValueError(f"No 'Jump Rings' category sidebar found on {url}")

    # dict.fromkeys removes duplicates while keeping the order the links appear in,
    # so repeated runs visit the categories in the same order
    category_links = list(dict.fromkeys(
        tag['href'] for tag in category_sidebar.find_all('a', href=True)
    ))

    # Iterate through top level links
    for num, link in enumerate(category_links):
        print(f"Checking cateogry: {num} -  {link}")

        # Walk the category one page at a time; `visited` stops a pagination
        # link that points back to an earlier page from looping forever
        visited = set()
        page_url = link
        while page_url and page_url not in visited:
            visited.add(page_url)
            sleep(SLEEP_TIME)

            # Get the page and write its product links
            soup = _fetch_soup(page_url)
            get_product_links(soup, writer)

            # Get next page if applicable
            page_url = get_next_page(soup)
            if page_url and page_url not in visited:
                print(f'Found New Page: {page_url}')


# Use function to write links to file
# newline='' lets the csv module control line endings; the explicit encoding keeps
# the output identical regardless of the platform's default text encoding
with open(LINK_FILE_PATH, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    page_parser(STARTING_LINK, writer)


### Get Data From Product Pages

#### Try Scraping Using Beautiful Soup


##### Store Test Data Locally


In [2]:

# Test Case URLs
urls = [
    "https://www.metaldesignz.com/stainless-steel-jump-rings-14-gauge-9-32-id/",
    "https://www.metaldesignz.com/rainbowed-anodized-titanium-jump-rings-16-gauge-3-8-id/",
]

# Store all test pages locally so the parser can be developed without re-downloading
pages = []
for url in urls:
    # Same delay as the crawler, to stay polite to the server
    sleep(SLEEP_TIME)
    response = requests.get(url, timeout=30)
    # Fail loudly rather than storing an error page as test data
    response.raise_for_status()
    pages.append(BeautifulSoup(response.content, 'html.parser'))



##### Define a Function to Get Data from the Page

In [3]:

# Wire diameter lookup keyed by the gauge number as it appears in the product title.
# Follows stated rule that Metal Designz uses SWG for 14-19 or AWG for 20-24
# Each value is a tuple in the format: (inches, millimetres, Gauge & System)
_GAUGE_INFO = {
    '14': (0.080, 2.032, '14 SWG'),
    '15': (0.072, 1.829, '15 SWG'),
    '16': (0.064, 1.626, '16 SWG'),
    '17': (0.056, 1.422, '17 SWG'),
    '18': (0.048, 1.219, '18 SWG'),
    '19': (0.040, 1.016, '19 SWG'),
    '20': (0.0320, 0.812, '20 AWG'),
    '21': (0.0285, 0.723, '21 AWG'),
    '22': (0.0253, 0.644, '22 AWG'),
    '23': (0.0226, 0.573, '23 AWG'),
    '24': (0.0201, 0.511, '24 AWG'),
}

# The number directly in front of the word "Gauge", e.g. "14 Gauge"
_GAUGE_RE = re.compile(r'(\d+)\s*Gauge', re.IGNORECASE)

# A fraction followed by an inch mark, e.g. 9/32"
_ID_FRACTION_RE = re.compile(r'(\d+)/(\d+)"')


def parse_page(page: BeautifulSoup) -> dict:
    """Extract jump ring data from a parsed product page.

    Args:
        page: Parsed HTML of a single product page.

    Returns:
        A dict with the keys defined below. Any value that cannot be found on
        the page or derived from the title is left as ``None``.
        ``aspect_ratio``, ``color``, ``bags_in_stock`` and ``rings_per_bag``
        are not populated yet. ``price`` is the raw text shown on the page
        (it may be a range such as ``'$11.25 - $83.84'``).
    """
    # Define output
    out = {
        # NOTE(review): this key ends with a space; left unchanged so the shape of
        # the returned dict stays the same - confirm whether it can be renamed
        "time_accessed ":       str(datetime.datetime.now()),
        "sku":                  None,
        "product_name":         None,
        "material":             None,
        "price":                None,
        "currency":             "CAD",
        "wire_diameter_in":     None,
        "wire_diameter_mm":     None,
        "wire_diameter_gauge":  None,
        "internal_diameter_in": None,
        "internal_diameter_mm": None,
        "aspect_ratio":         None,
        "color":                None,
        "bags_in_stock":        None,
        "rings_per_bag":        None,
    }

    # Get sku
    sku_tag = page.find('dd', {'data-product-sku': True})
    if sku_tag is not None:
        out['sku'] = sku_tag.text

    # Get title (looked up once; the gauge and inner diameter are both parsed from it)
    title = ''
    title_tag = page.find('h1', {'class': 'productView-title'})
    if title_tag is not None:
        title = title_tag.text
        out['product_name'] = title

    # Get Material: the third breadcrumb entry
    bread_crumbs = page.find('nav', {'aria-label': 'Breadcrumb'})
    if bread_crumbs is not None:
        crumbs = bread_crumbs.find_all('li')
        if len(crumbs) > 2:
            out['material'] = crumbs[2].text.strip()

    # Get Price
    price_section = page.find('div', {'class': 'productView-price'})
    if price_section is not None:
        price_tag = price_section.find('span', {'data-product-price-without-tax': True})
        if price_tag is not None:
            out['price'] = price_tag.text

    # Get Wire diameter info
    # Only the number immediately before "Gauge" is used, so other digits earlier
    # in the title cannot corrupt the lookup key; unknown gauges leave the
    # wire diameter fields as None instead of raising KeyError
    gauge_match = _GAUGE_RE.search(title)
    if gauge_match is not None:
        gauge_entry = _GAUGE_INFO.get(gauge_match.group(1))
        if gauge_entry is not None:
            (out['wire_diameter_in'],
             out['wire_diameter_mm'],
             out['wire_diameter_gauge']) = gauge_entry

    # Get Ring Inner Diameter
    id_match = _ID_FRACTION_RE.search(title)
    if id_match is not None:
        numerator, denominator = (float(part) for part in id_match.groups())
        if denominator:
            id_in = numerator / denominator
            out['internal_diameter_in'] = id_in
            out['internal_diameter_mm'] = id_in * 25.4

    # Return data
    return out


#### Run All Test Cases

In [4]:

# Parse every locally stored test page and print the resulting record for inspection
for page in pages:
    results = parse_page(page)
    print(results)



{'time_accessed ': '2023-07-07 19:30:32.372057', 'sku': 'SS14932', 'product_name': 'Stainless Steel Jump Rings 14 Gauge 9/32" id.', 'material': 'Stainless Steel', 'price': '$11.25 - $83.84', 'currency': 'CAD', 'wire_diameter_in': 0.08, 'wire_diameter_mm': 2.032, 'wire_diameter_gauge': '14 SWG', 'internal_diameter_in': 0.28125, 'internal_diameter_mm': 7.14375, 'aspect_ratio': None, 'color': None, 'bags_in_stock': None, 'rings_per_bag': None}
{'time_accessed ': '2023-07-07 19:30:32.380067', 'sku': 'RTi1638', 'product_name': 'Rainbowed Anodized Titanium Jump Rings 16 Gauge 3/8" id.', 'material': 'Titanium', 'price': '$18.00 - $134.14', 'currency': 'CAD', 'wire_diameter_in': 0.064, 'wire_diameter_mm': 1.626, 'wire_diameter_gauge': '16 SWG', 'internal_diameter_in': 0.375, 'internal_diameter_mm': 9.524999999999999, 'aspect_ratio': None, 'color': None, 'bags_in_stock': None, 'rings_per_bag': None}


##### Experimentation

In [None]:

# Work against the first locally stored test page
page = pages[0]

# Read the product title from the page heading
title_tag = page.find('h1', {'class': 'productView-title'})
title = title_tag.text

# Locate the fractional inner diameter (a fraction followed by an inch mark) in the title
fraction_match = re.search(r'\d+\/\d+"', title, re.IGNORECASE)
id_in_str = fraction_match.group()

# Split the fraction into its two parts and convert to inches, then millimetres
fraction_parts = id_in_str.strip('"').split('/')
numerator, denominator = fraction_parts
id_in = float(numerator) / float(denominator)
id_mm = id_in * 25.4
