In [181]:
import requests
from bs4 import BeautifulSoup
import re
import os
import os.path
import base64
import urllib.parse

# Cached Retrieval Helpers

In [182]:
cache_dir = os.path.expanduser("cache/")
def retrieve_and_cache(url,verbose=False):
    # This caching has a MAJOR FLAW: It does not check whether the page has been updated.
    # Once downloaded, it will keep the file forever. 
    # This is acceptable since this is to aid development when I don't want to repeatedly hit the 
    # server and wait for a download, but, this is NOT well-suited for scraping the site to check for updates.
    # I suggest deleting the cache folder before doing a big scrape
    try:
        if verbose:
            print("Initializing cache dir")
        os.makedirs(cache_dir)
    except:
        if verbose:
            print("Cache dir exists")
        pass
    filename = urllib.parse.quote(url, '')
    filepath = os.path.join(cache_dir, filename)
    if verbose:
        print("Checking cache:", filepath)
    raw_html = ""
    if os.path.exists(filepath):
        if verbose:
            print("Retrieving from cache")
        with open(filepath, 'r') as file:
            raw_html = file.read()
    else:
        if verbose:
            print("Fetching and saving to cache")
        response = requests.get(url)
        raw_html = response.text
        with open(filepath, 'x') as file:
            file.write(raw_html)
    return raw_html

# Retrieving the manufacturer list

In [183]:
root_url = "https://www.motorcyclespecs.co.za/Manufacturer.htm"
manufacturers_html = retrieve_and_cache(root_url,verbose=True)

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2FManufacturer.htm
Retrieving from cache


In [184]:
manufacturers_html
manufacturers_soup = BeautifulSoup(manufacturers_html, 'lxml')

# Retrieving each manufacturer's page

In [185]:
# Find all the links
hook = manufacturers_soup.find_all(string=re.compile(".*Classic Bikes.*"))
assert(len(hook)==1)  # Assert we find one and exactly one NavigatableString containing "Make Model"
hook = hook[0]
table = hook.find_parent('table')
rows = table.find_all('a')
urls = []
ignorelist = ["Technical.htm", "converter.html", "links.htm","video_clips.htm", "bikes/Classics.htm", "bikes/custom_bikes.htm", "bikes/Individual.html", "bikes/racing_bikes.html", "bikes/designs.html"]
for link in rows:
    href = link.attrs['href']
    if href in ignorelist:
        print("Skipping", href)
        pass
    else:
        urls.append(urllib.parse.urljoin(root_url, href))
# Filter out some of the links we're not interested in

urls

Skipping bikes/Classics.htm
Skipping bikes/custom_bikes.htm
Skipping bikes/Individual.html
Skipping bikes/racing_bikes.html
Skipping bikes/designs.html
Skipping video_clips.htm
Skipping Technical.htm
Skipping converter.html
Skipping links.htm


['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
 'https://www.motorcyclespecs.co.za/bikes/Arial.htm',
 'https://www.motorcyclespecs.co.za/bikes/Arlen_Ness.html',
 'https://www.motorcyclespecs.co.za/bikes/atk.htm',
 'https://www.motorcyclespecs.co.za/bikes/Wakan.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bajaj.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bakar.htm',
 'https://www.motorcyclespecs.co.za/bikes/Barigo.htm',
 'https://www.motorcyclespecs.co.za/bikes/beneli.html',
 'https://www.motorcyclespecs.co.za/bikes/Beta.htm',
 'https://www.motorcyclespecs.co.za/bikes/big_bear.html',
 'https://www.motorcyclespecs.co.za/bikes/big_dog.html',
 'https://www.motorcyclespecs.co.za/bikes/bimota.h

In [186]:
for url in urls:
    retrieve_and_cache(url,verbose=True)

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAC_Schnitzer.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAdler.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJP.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJS.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2FIndividual%2FAlfer.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAprilia.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAri

# Getting each page of the paginated manufacturer's pages

In [187]:
# There's a bunch of different ones. Let's filter out the different pages. 
test_urls = ["https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html", "https://www.motorcyclespecs.co.za/bikes/bmw.htm", "https://www.motorcyclespecs.co.za/bikes/triumph.html"]

In [206]:
def get_all_pages_of_mfct(mfct_url):
    mfct = BeautifulSoup(retrieve_and_cache(mfct_url), 'lxml')
    base_file = os.path.splitext(os.path.split(urllib.parse.urlparse(mfct_url).path)[1])[0]

    all_mfct_urls = [mfct_url]
    # First let's retrieve all the sub-pages, if there are any
    for link in mfct.find_all("a"):
        # Arbitrarily limiting to 30 pages per manufacturer, so we don't accidentally scrape models with only numeric numbers
        if link.text.isnumeric() and int(link.text) < 30 and base_file in link.attrs['href']:
            full_url = urllib.parse.urljoin(mfct_url, link.attrs['href'])
            if full_url not in all_mfct_urls:
                all_mfct_urls.append(full_url)
    return all_mfct_urls

In [207]:
expanded_urls = []
for url in urls:
    expanded_urls.extend(get_all_pages_of_mfct(url))
    

['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia3.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia4.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia5.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia6.html',
 'https://www.motorcyclespecs.co.za/bikes/Arial.htm',
 'https://www.motorcyclespecs.co.za/bikes/Arlen_Ness.html',
 'https://www.motorcyclespecs.co.za/bikes/atk.htm',
 'https://www.motorcyclespecs.co.za/bikes/Wakan.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bajaj.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bakar.htm',
 'https://www.motorcyclespecs.co.za/bikes/Barigo.htm',
 'https://www.motorcyclespecs.co.za/bikes/

In [None]:
for url in expanded_urls:
    retrieve_and_cache(url,verbose=True)

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAC_Schnitzer.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAdler.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJP.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJS.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2FIndividual%2FAlfer.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAprilia.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FApr

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati3.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati5.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati6.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati7.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati8.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FDucati9.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2F

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki3.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki4.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki5.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki6.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki7.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FKawasaki8.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/ht

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki3.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki4.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki5.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki6.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki7.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fsuzuki8.html
Fetching and saving to cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2F

# Enumerating each of the models