In [345]:
import requests
from bs4 import BeautifulSoup
import re
import os
import os.path
import base64
import urllib.parse
import time

# Cached Retrieval Helpers

We carry two structures: a list of all URLs, and a dictionary tree structure of manufacturer to urls

In [502]:
cache_dir = os.path.expanduser("cache/")
session = requests.Session()
def retrieve_and_cache(url,verbose=False,session=None):
    # pass in a requests.Session() object if you want to make many calls, keeping the connection open.
    # This gives you a 2x speedup.
    
    # This caching has a MAJOR FLAW: It does not check whether the page has been updated.
    # Once downloaded, it will keep the file forever. 
    # This is acceptable since this is to aid development when I don't want to repeatedly hit the 
    # server and wait for a download, but, this is NOT well-suited for scraping the site to check for updates.
    # I suggest deleting the cache folder before doing a big scrape
    #
    # We can also try out CacheControl https://cachecontrol.readthedocs.io/en/latest/
    
    start_time = time.time()
    if not os.path.exists(cache_dir):
        if verbose:
            print("Initializing cache directory")
        os.makedirs(cache_dir)
    filename = urllib.parse.quote(url, '')
    filepath = os.path.join(cache_dir, filename)
    if verbose:
        print("Checking cache for", filename)
    raw_html = ""

    if os.path.exists(filepath):
        if verbose:
            print("  Retrieving from cache")
        with open(filepath, 'r') as file:
            raw_html = file.read()
    else:
        if verbose:
            print("  Fetching and saving to cache")
        response = None
        if session:
            response = session.get(url)
        else:
            response = requests.get(url)
        
        raw_html = response.text
        with open(filepath, 'x') as file:
            file.write(raw_html)
    print("  Took", time.time() - start_time,"seconds")
    return raw_html

# Retrieving the manufacturer list

In [491]:
root_url = "https://www.motorcyclespecs.co.za/Manufacturer.htm"
manufacturers_html = retrieve_and_cache(root_url,verbose=True)

Initializing cache dir
  Took 0.002068042755126953 seconds


In [492]:
manufacturers_html
manufacturers_soup = BeautifulSoup(manufacturers_html, 'lxml')
manufacturers_soup

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html dir="ltr" itemscope="" itemtype="http://schema.org/Article" xmlns="http://www.w3.org/1999/xhtml">
<!-- #BeginTemplate "head.dwt" -->
<head>
<style>


                        .Customized_Leaderboard { width: 320px; height: 100px; }
                        @media(min-width: 500px) { .Customized_Leaderboard { width: 320px; height: 100px; } }
                        @media(min-width: 801px) { .Customized_Leaderboard { width: 728px; height: 90px; } }
                        @media(min-width: 1025px) { .Customized_Leaderboard { width: 970px; height: 250px; } }
                       </style>
<style>


                         .Billboard { width: 320px; height: 100px; }
                        @media(min-width: 500px) { .Billboard { width: 320px; height: 100px; } }
                        @media(min-width: 801px) { .Billboard { width: 728px; height: 90px; } }
       

# Retrieving each manufacturer's page

In [461]:
# Find all the links
# Fill out both a list of just the URLs, and a list of dictionaries of the manufacturer name and base url
hook = manufacturers_soup.find_all(string=re.compile(".*Classic Bikes.*"))
assert(len(hook)==1)  # Assert we find one and exactly one NavigatableString containing "Make Model"
hook = hook[0]
table = hook.find_parent('table')
rows = table.find_all('a')
urls = []
mfct_urls = []
ignorelist = ["Technical.htm", "converter.html", "links.htm","video_clips.htm", "bikes/Classics.htm", "bikes/custom_bikes.htm", "bikes/Individual.html", "bikes/racing_bikes.html", "bikes/designs.html"]
for link in rows:
    href = link.attrs['href']
    mfct_name = " ".join(" ".join(link.find_all(text=True)).split()).strip()
    full_url = urllib.parse.urljoin(root_url, href)
    if href in ignorelist or mfct_name == "" or full_url in urls:
        print("Skipping", href)
        continue
    else:
        urls.append(full_url)
        mfct_urls.append({'mfct':mfct_name, 'base_url':full_url})
# Filter out some of the links we're not interested in

#urls = list(dict.fromkeys(urls)) # Remove duplicates while preserving order
urls

Skipping bikes/Classics.htm
Skipping bikes/custom_bikes.htm
Skipping bikes/Individual.html
Skipping bikes/racing_bikes.html
Skipping bikes/CF_Moto.htm
Skipping bikes/confederate.htm
Skipping bikes/designs.html
Skipping video_clips.htm
Skipping Technical.htm
Skipping converter.html
Skipping links.htm


['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
 'https://www.motorcyclespecs.co.za/bikes/Arial.htm',
 'https://www.motorcyclespecs.co.za/bikes/Arlen_Ness.html',
 'https://www.motorcyclespecs.co.za/bikes/atk.htm',
 'https://www.motorcyclespecs.co.za/bikes/Wakan.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bajaj.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bakar.htm',
 'https://www.motorcyclespecs.co.za/bikes/Barigo.htm',
 'https://www.motorcyclespecs.co.za/bikes/beneli.html',
 'https://www.motorcyclespecs.co.za/bikes/Beta.htm',
 'https://www.motorcyclespecs.co.za/bikes/big_bear.html',
 'https://www.motorcyclespecs.co.za/bikes/big_dog.html',
 'https://www.motorcyclespecs.co.za/bikes/bimota.h

In [462]:
# Retrieve the URLs
for url in urls:
    retrieve_and_cache(url,verbose=True)

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAC_Schnitzer.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAdler.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJP.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJS.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2FIndividual%2FAlfer.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAprilia.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAri

Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Ftriumph.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FTVS.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2Fural.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FVelocette.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FVespa.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FVictory.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbik

# Getting each page of the paginated manufacturer's pages

In [463]:
def get_all_pages_of_mfct(mfct_url):
    mfct = BeautifulSoup(retrieve_and_cache(mfct_url), 'lxml')
    base_file = os.path.splitext(os.path.split(urllib.parse.urlparse(mfct_url).path)[1])[0]

    all_mfct_urls = [mfct_url]
    # First let's retrieve all the sub-pages, if there are any
    for link in mfct.find_all("a"):
        # Arbitrarily limiting to 30 pages per manufacturer, so we don't accidentally scrape models with only numeric numbers
        try:
            if link.text.strip().isnumeric() and int(link.text.strip()) < 30 and base_file in link.attrs['href'] and "model" not in link.attrs['href']:
                full_url = urllib.parse.urljoin(mfct_url, link.attrs['href'])
                if full_url not in all_mfct_urls:
                    all_mfct_urls.append(full_url)
        except:
            print("Skipping", link.text, "on", mfct_url)
            pass
    return all_mfct_urls

In [464]:
expanded_urls = []
for entry in mfct_urls:
    all_mfct_urls = get_all_pages_of_mfct(entry['base_url'])
    entry['all_mfct_urls'] = all_mfct_urls
    expanded_urls.extend(all_mfct_urls)
expanded_urls

Skipping 
9½  on https://www.motorcyclespecs.co.za/bikes/moto_morini.html
Skipping 
9½ on https://www.motorcyclespecs.co.za/bikes/xtr.html


['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia2.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia3.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia4.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia5.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia6.html',
 'https://www.motorcyclespecs.co.za/bikes/Arial.htm',
 'https://www.motorcyclespecs.co.za/bikes/Arlen_Ness.html',
 'https://www.motorcyclespecs.co.za/bikes/atk.htm',
 'https://www.motorcyclespecs.co.za/bikes/Wakan.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bajaj.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bakar.htm',
 'https://www.motorcyclespecs.co.za/bik

In [465]:
mfct_urls

[{'mfct': 'AC Schnitzer',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html']},
 {'mfct': 'Adler',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/Adler.html']},
 {'mfct': 'AJP',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/AJP.htm']},
 {'mfct': 'AJS',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/AJS.htm']},
 {'mfct': 'Alfer',
  'base_url': 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm']},
 {'mfct': 'Aprilia',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/Aprilia.

In [466]:
for url in expanded_urls:
    retrieve_and_cache(url,verbose=True)

Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAC_Schnitzer.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAdler.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJP.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAJS.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2FIndividual%2FAlfer.htm
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FAprilia.html
Retrieving from cache
Initializing cache dir
Cache dir exists
Checking cache: cache/https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fbikes%2FApr

In [467]:
expanded_urls

['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
 'https://www.motorcyclespecs.co.za/bikes/Adler.html',
 'https://www.motorcyclespecs.co.za/bikes/AJP.htm',
 'https://www.motorcyclespecs.co.za/bikes/AJS.htm',
 'https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia2.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia3.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia4.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia5.html',
 'https://www.motorcyclespecs.co.za/bikes/Aprilia6.html',
 'https://www.motorcyclespecs.co.za/bikes/Arial.htm',
 'https://www.motorcyclespecs.co.za/bikes/Arlen_Ness.html',
 'https://www.motorcyclespecs.co.za/bikes/atk.htm',
 'https://www.motorcyclespecs.co.za/bikes/Wakan.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bajaj.htm',
 'https://www.motorcyclespecs.co.za/bikes/Bakar.htm',
 'https://www.motorcyclespecs.co.za/bik

In [468]:
## Checking that we maintained integrity between these two data structures.
# why am i doing this, this is just adding complexity
a1 = [entry['all_mfct_urls'] for entry in mfct_urls]
a2 = [item for sublist in a1 for item in sublist]
assert(expanded_urls == a2)


In [469]:
expanded_urls[0]

'https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html'

# Enumerating each of the models on a manufacturer page

Known possibilities on a manufacturer page:

* No models, just info about a company. 
* One table of models
* Multiple tables of models

There may be multiple pages for a manufacturer too. Hmm, maybe we should make these URLs so that we have an entry for the manufacturer?

The links themselves are a total mess:

* A single line sometimes contain multiple links for different letters of the model name, pointing to the same page. 

In [470]:
# There's a bunch of different ones. Let's filter out the different pages. 
test_urls = [
    "https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html", 
    "https://www.motorcyclespecs.co.za/bikes/bmw.htm", 
    "https://www.motorcyclespecs.co.za/bikes/triumph.html"]

In [471]:
def get_models_on_page(mfct_url,verbose=True):
    soup = BeautifulSoup(retrieve_and_cache(mfct_url))
    models_on_page = []
    for link in soup.find_all("a"):
        if 'href' in link.attrs and "/model/" in link.attrs['href']:
            href = link['href']
            full_url = urllib.parse.urljoin(mfct_url, link.attrs['href'])
            models_on_page.append(full_url)
    
    if verbose:
        print("Extracted",len(models_on_page),"models from", mfct_url)
    
    return models_on_page

#get_models_on_page(test_urls[1])
get_models_on_page("https://www.motorcyclespecs.co.za/bikes/Wunderlich.htm")

Extracted 14 models from https://www.motorcyclespecs.co.za/bikes/Wunderlich.htm


['https://www.motorcyclespecs.co.za/model/Custom/wunderlich_f_800_gs.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/wunderlich_hp2_sport_speedcruise.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/bmw_k1300r_by_wunderlich.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/bmw_r_1200gs_wunderlich_jararaca.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/BMW_R1200GS_by_Wunderlich.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/BMW_R1200R_Digimoto.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/BMW_R1200R_Digimoto.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/BMW_R_Nine_TT.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/Wunderlich_Hybrid_BMW_R1200GS_LC_2WD.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/Wunderlich_Hybrid_BMW_R1200GS_LC_2WD.htm',
 'https://www.motorcyclespecs.co.za/model/Projects_Designs/Wunderlich_R1600_C_Vision.htm',
 'https://www.motorcyclespecs.co.za/model/Custom/wunderlich_bmw_s1000rr_piranha.htm',
 'https:

In [472]:
all_models = []
for mfct in mfct_urls:
    print("Retrieving models for", mfct['mfct'])
    mfct['models'] = []
    for mfct_page in mfct['all_mfct_urls']:
        print("  Retrieving", mfct_page)
        models_on_page = get_models_on_page(mfct_page)
        all_models.extend(models_on_page)
        mfct['models'].extend(models_on_page)
        
        

Retrieving models for AC Schnitzer
  Retrieving https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html
Extracted 23 models from https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html
Retrieving models for Adler
  Retrieving https://www.motorcyclespecs.co.za/bikes/Adler.html
Extracted 12 models from https://www.motorcyclespecs.co.za/bikes/Adler.html
Retrieving models for AJP
  Retrieving https://www.motorcyclespecs.co.za/bikes/AJP.htm
Extracted 15 models from https://www.motorcyclespecs.co.za/bikes/AJP.htm
Retrieving models for AJS
  Retrieving https://www.motorcyclespecs.co.za/bikes/AJS.htm
Extracted 33 models from https://www.motorcyclespecs.co.za/bikes/AJS.htm
Retrieving models for Alfer
  Retrieving https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm
Extracted 0 models from https://www.motorcyclespecs.co.za/model/Individual/Alfer.htm
Retrieving models for Aprilia
  Retrieving https://www.motorcyclespecs.co.za/bikes/Aprilia.html
Extracted 61 models from https://www.mot

Extracted 94 models from https://www.motorcyclespecs.co.za/bikes/Ducati6.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Ducati7.html
Extracted 109 models from https://www.motorcyclespecs.co.za/bikes/Ducati7.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Ducati8.html
Extracted 71 models from https://www.motorcyclespecs.co.za/bikes/Ducati8.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Ducati9.html
Extracted 55 models from https://www.motorcyclespecs.co.za/bikes/Ducati9.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Ducati10.html
Extracted 45 models from https://www.motorcyclespecs.co.za/bikes/Ducati10.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Ducati11.html
Extracted 51 models from https://www.motorcyclespecs.co.za/bikes/Ducati11.html
Retrieving models for Excelsior
  Retrieving https://www.motorcyclespecs.co.za/bikes/excelsior.html
Extracted 4 models from https://www.motorcyclespecs.co.za/bikes/excelsior.html
Retrieving models 

Extracted 85 models from https://www.motorcyclespecs.co.za/bikes/Kawasaki13.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/Kawasaki14.html
Extracted 61 models from https://www.motorcyclespecs.co.za/bikes/Kawasaki14.html
Retrieving models for KTM
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM.htm
Extracted 78 models from https://www.motorcyclespecs.co.za/bikes/KTM.htm
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM2.html
Extracted 74 models from https://www.motorcyclespecs.co.za/bikes/KTM2.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM3.html
Extracted 67 models from https://www.motorcyclespecs.co.za/bikes/KTM3.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM4.html
Extracted 91 models from https://www.motorcyclespecs.co.za/bikes/KTM4.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM5.html
Extracted 58 models from https://www.motorcyclespecs.co.za/bikes/KTM5.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/KTM6

Extracted 73 models from https://www.motorcyclespecs.co.za/bikes/suzuki12.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/suzuki13.html
Extracted 118 models from https://www.motorcyclespecs.co.za/bikes/suzuki13.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/suzuki14.html
Extracted 114 models from https://www.motorcyclespecs.co.za/bikes/suzuki14.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/suzuki15.html
Extracted 80 models from https://www.motorcyclespecs.co.za/bikes/suzuki15.html
  Retrieving https://www.motorcyclespecs.co.za/bikes/suzuki16.html
Extracted 103 models from https://www.motorcyclespecs.co.za/bikes/suzuki16.html
Retrieving models for SWM
  Retrieving https://www.motorcyclespecs.co.za/bikes/swm.htm
Extracted 8 models from https://www.motorcyclespecs.co.za/bikes/swm.htm
Retrieving models for SYM
  Retrieving https://www.motorcyclespecs.co.za/bikes/SYM.htm
Extracted 44 models from https://www.motorcyclespecs.co.za/bikes/SYM.htm
Retrieving mode

In [473]:
print(len(all_models))
all_models[10]

11759


'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_bmw_k1300r.htm'

# That's a lot of models! Test retrieving models for a few test manufacturers

In [474]:
[[i, x['mfct']] for i,x in enumerate(mfct_urls)]

[[0, 'AC Schnitzer'],
 [1, 'Adler'],
 [2, 'AJP'],
 [3, 'AJS'],
 [4, 'Alfer'],
 [5, 'Aprilia'],
 [6, 'Ariel'],
 [7, 'Arlen Ness'],
 [8, 'ATK'],
 [9, 'Avinton / Wakan'],
 [10, 'Bajai'],
 [11, 'Bakker'],
 [12, 'Barigo'],
 [13, 'Benelli'],
 [14, 'Beta'],
 [15, 'Big Bear'],
 [16, 'Big Dog'],
 [17, 'Bimota'],
 [18, 'BMS Choppers'],
 [19, 'BMW'],
 [20, 'Boss Hoss'],
 [21, 'Boxer'],
 [22, 'Brammo'],
 [23, 'Britten'],
 [24, 'Brixton Motorcycles'],
 [25, 'Brough Superior Motorcycles'],
 [26, 'BRP Cam-Am'],
 [27, 'BSA'],
 [28, 'Buell / EBR'],
 [29, 'Bultaco'],
 [30, 'Cagiva'],
 [31, 'Campagna'],
 [32, 'CCM'],
 [33, 'CF Moto'],
 [34, 'Confederate / Combat Motors'],
 [35, 'CR&S'],
 [36, 'CZ Motorcycles'],
 [37, 'Daelim'],
 [38, 'Derbi'],
 [39, 'Deus'],
 [40, 'DP Customs'],
 [41, 'Ducati'],
 [42, 'Excelsior'],
 [43, 'Exile'],
 [44, 'GASGAS'],
 [45, 'Ghezzi Brian'],
 [46, 'Gilera'],
 [47, 'GIMA'],
 [48, 'Harley-Davidson'],
 [49, 'Harris'],
 [50, 'Hartford'],
 [51, 'HDT USA'],
 [52, 'Hesketh'],
 [53, 

In [501]:
BMW = 19
TRIUMPH = 114
HARLEY = 48
VICTORY = 119
print(len(mfct_urls[VICTORY]['models']))

selected_model_urls = []
session = requests.Session()
for model_url in mfct_urls[BMW]['models']:
    selected_model_urls.append(model_url)
    retrieve_and_cache(model_url,verbose=True,session=session)

94
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-concept-6.html
  Retrieving from cache
  Took 0.0028901100158691406 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-concopt-9cento.html
  Retrieving from cache
  Took 0.00046896934509277344 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-motorrad-vision.html
  Retrieving from cache
  Took 0.00048613548278808594 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-concept-10.html
  Retrieving from cache
  Took 0.0003979206085205078 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-concept-ce.html
  Retrieving from cache
  Took 0.0004069805145263672 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-c1-concept.html
  Retrieving from cache
  Took 0.0005030632019042969 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespec

  Took 1.8333749771118164 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k100-86.html
  Fetching and saving to cache
  Took 0.8085722923278809 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k100lt-86.html
  Fetching and saving to cache
  Took 2.4529919624328613 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-K100lt-87.html
  Fetching and saving to cache
  Took 1.068068027496338 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k100rs-83.html
  Fetching and saving to cache
  Took 1.8053247928619385 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k100rs-84.html
  Fetching and saving to cache
  Took 0.8173019886016846 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k100rs-85.html
  Fetching and saving to cache
  Took 0.8218197822570801 seconds
Checking cache 

  Took 0.7511591911315918 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-11.html
  Fetching and saving to cache
  Took 0.7491822242736816 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-12.html
  Fetching and saving to cache
  Took 0.7570681571960449 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-se-12.html
  Fetching and saving to cache
  Took 0.7594788074493408 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-13.html
  Fetching and saving to cache
  Took 0.7683351039886475 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-30th-anniversary.html
  Fetching and saving to cache
  Took 0.7581980228424072 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-k1300s-14.html
  Fetching and saving to cache
  Took 0.7602770328521729 sec

  Took 0.8186030387878418 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r18-b-22.html
  Fetching and saving to cache
  Took 0.8171837329864502 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r18-transcontinental-22.html
  Fetching and saving to cache
  Took 0.822998046875 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r18-100th-anniversary-23.html
  Fetching and saving to cache
  Took 0.7733750343322754 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r18-roctane-2023.html
  Fetching and saving to cache
  Took 0.8639311790466309 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r20.html
  Fetching and saving to cache
  Took 0.8322570323944092 seconds
Checking cache for https%3A%2F%2Fwww.motorcyclespecs.co.za%2Fmodel%2Fbmw%2Fbmw-r23.html
  Fetching and saving to cache
  Took 0.8026988506317139 s

KeyboardInterrupt: 

In [487]:
selected_model_urls

['https://www.motorcyclespecs.co.za/model/bmw/bmw-concept-6.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-concopt-9cento.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-motorrad-vision.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-concept-10.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-concept-ce.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-c1-concept.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-concept-c-scooter.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-escooter-concept.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-electric-scooter-concept.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-c1-e.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-simple_and-clever.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-electric-roadster.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-err-concept.html',
 'https://www.motorcyclespecs.co.za/model/bmw/bmw-e-power-roadster.html'

In [483]:
time.time() - a

12.673673152923584