In [2]:
import pickle
import requests
from bs4 import BeautifulSoup
import re
import os
import os.path
import base64
import urllib.parse
import time

# Load the pickled inputs from the working directory.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load pickle files that you created yourself.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

mfct_urls = _load_pickle('mfct_urls.pickle')
expanded_urls = _load_pickle('expanded_urls.pickle')
all_models = _load_pickle('all_models.pickle')

In [3]:
import collections

# Sanity check: no entry of all_models may occur more than once.
model_counts = collections.Counter(all_models)
repeated_models = [model for model, occurrences in model_counts.items() if occurrences > 1]
assert not repeated_models


In [29]:
cache_dir = os.path.expanduser("cache/")

def retrieve_and_cache(url,verbose=False,session=None):
    """Return the body text of `url`, reading it from the on-disk cache when present.

    The cache file lives in `cache_dir` and is named after the fully
    percent-encoded URL. On a cache miss the page is fetched with
    `session.get(url)` if a session is given, otherwise `requests.get(url)`.

    Args:
        url: Address of the page to fetch.
        verbose: When true, print progress and timing information.
        session: Optional requests.Session(). Pass one if you want to make
            many calls, keeping the connection open. This gives you a 2x speedup.

    Returns:
        The page text as a str. The text of an error response (4xx/5xx) is
        still returned, but it is not written to the cache.

    Notes:
        This caching has a MAJOR FLAW: it does not check whether the page has
        been updated. Once downloaded, it will keep the file forever. This is
        acceptable as a development aid (no repeated hits on the server), but
        it is NOT well-suited for scraping the site to check for updates.
        Delete the cache folder before doing a big scrape, or try CacheControl:
        https://cachecontrol.readthedocs.io/en/latest/

        Cache files are read and written as UTF-8 without newline translation,
        so a cached read returns exactly the text that was fetched. Cache
        files written earlier on a platform whose default encoding is not
        UTF-8 should be deleted.
    """
    start_time = time.time()
    if verbose and not os.path.exists(cache_dir):
        print("Initializing cache directory")
    # exist_ok: the directory may appear between the check above and this call.
    os.makedirs(cache_dir, exist_ok=True)
    filename = urllib.parse.quote(url, '')
    filepath = os.path.join(cache_dir, filename)
    if verbose:
        print("Checking cache for", filename)

    if os.path.exists(filepath):
        if verbose:
            print("  Retrieving from cache")
        with open(filepath, 'r', encoding='utf-8', newline='') as file:
            raw_html = file.read()
    else:
        if verbose:
            print("  Fetching and saving to cache")
        if session:
            response = session.get(url)
        else:
            response = requests.get(url)

        raw_html = response.text
        if response.ok:
            with open(filepath, 'x', encoding='utf-8', newline='') as file:
                file.write(raw_html)
        elif verbose:
            # Never cache an error page: it would be served forever afterwards.
            print("  Not caching, server answered with status", response.status_code)
    if verbose:
        print("  Took", time.time() - start_time,"seconds")
    return raw_html

def dedupe_list_preserving_order(items):
    """Return a list of the distinct elements of `items`, in first-seen order.

    Elements must be hashable.
    """
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

def parse_table_to_dict(text,url=None):
    """Parse the first "Make Model" table found in the HTML `text` into a dict.

    Every string matching "Make Model" is located, and the first one that sits
    inside a <table> selects the table to parse. Each row of that table with
    exactly two <td> cells becomes one key:value entry; any other row is
    reported and skipped.

    Args:
        text: HTML source of the page.
        url: Page address, used only in the diagnostic messages.

    Returns:
        A dict mapping the stripped text of the first cell to the stripped,
        tab-free text of the second cell. The dict is empty when the page has
        no "Make Model" string or none of the matches is inside a table.

    Raises:
        AssertionError: if the selected table has fewer than two rows.
    """
    soup = BeautifulSoup(text, 'lxml') # important to pip install lxml
    output = {}

    # Find the Table
    hook = soup.find_all(string=re.compile(".*Make Model.*"))
    if len(hook) == 0:
        return output
    if len(hook) > 1:
        print("Woah more than one table at",url)

    # Let's find the first match that is inside a table.
    table = None
    for h in hook:
        table = h.find_parent('table')
        if table is not None:
            break
    if table is None:
        # Treat this like the "no match" case above instead of aborting a long scrape.
        print("Found 'Make Model' text but no enclosing table at", url)
        return output
    rows = table.find_all('tr')
    assert(len(rows) > 1) # Assert we've found and parsed a table with many rows

    # Extract the key:value pairs
    for row in rows:
        cols = row.find_all('td')
        if len(cols) != 2:
            # Only two-cell rows can be read as key:value; anything else is dropped.
            print("Skipping row with", len(cols), "columns (expected 2) on page", url)
            continue
        key = cols[0].get_text().strip()
        value = cols[1].get_text().strip().replace("\t", "")
        output[key] = value
    return output

def decorate_table(table_dict, key_vals):
    """Return a new dict holding `table_dict` overlaid with `key_vals`.

    When both contain a key, the value from `key_vals` is kept. Neither
    argument is modified.
    """
    decorated = table_dict | key_vals
    return decorated

def get_mfct(name):
    """Return the first entry of `mfct_urls` whose 'mfct' equals `name`, or None."""
    return next((entry for entry in mfct_urls if entry['mfct'] == name), None)
        
def clean_keys(table_dict,url=None):
    """Return a copy of `table_dict` with normalised keys.

    Each key has punctuation removed ("/" is kept), whitespace collapsed, and
    is converted to Title Case. A few spelling variants are then folded onto
    one canonical key.

    DANGER: folding can make two input keys collide. When that happens and the
    values differ (ignoring surrounding whitespace), they are joined as
    "old | new"; when they are equal, the first value is kept.

    Args:
        table_dict: Mapping of raw key -> string value.
        url: Accepted for symmetry with the other cleaners; not used here.

    Returns:
        A new dict with cleaned keys. Values are unchanged apart from the
        collision merging described above.
    """
    # Characters stripped from keys. The backslash is written as "\\" so the
    # literal holds no invalid escape sequence; the removed set is unchanged.
    strip_punctuation = str.maketrans('', '', '!"#$%&\'()*+,-.:;<=>?@[\\]^_`{|}~')

    # Applied in this order. DANGER, THIS WILL COLLIDE TABLE ENTRIES
    special_cases = (
        ("Rear Wheels", "Rear Wheel"),
        ("Wheels Rear", "Rear Wheel"),
        ("Front Wheels", "Front Wheel"),
        ("Wheels Front", "Front Wheel"),
        ("Final Reduction Ratio", "Final Reduction"),
        ("Gear Ratios", "Gear Ratio"),
        ("Primary Reduction Ratio", "Primary Reduction"),
    )

    d = {}
    for k, v in table_dict.items():
        # Remove punctuation
        k = k.translate(strip_punctuation)

        # Remove extraneous whitespace
        k = " ".join(k.split())

        # Ensure Title Case
        k = k.title()

        # Restore the acronym, but only when "Abs" is not the start of a longer
        # word: "Shock Absorber" must not turn into "Shock ABSorber".
        k = re.sub(r"Abs(?![a-z])", "ABS", k)

        # Collapse certain special cases
        for variant, canonical in special_cases:
            k = k.replace(variant, canonical)

        # Save the new key
        if k in d:
            # Combine entries if there's a colliding key with a different value.
            if d[k].strip() != v.strip():
                #print("Found colliding entries for key", k, "combining old value [",d[k],"] and new value [",v,"] for",url)
                d[k] = d[k] + " | " + v
        else:
            d[k] = v

    return d

def clean_vals(table_dict,url=None):
    """Return a copy of `table_dict` whose string values have whitespace tidied.

    Runs of whitespace inside a value become a single space and leading or
    trailing whitespace is dropped. Keys are untouched; `url` is not used.
    """
    return {key: " ".join(raw.split()) for key, raw in table_dict.items()}

def clean_model_table(table_dict,url=None):
    """Normalise the keys of `table_dict` with clean_keys, then its values with clean_vals."""
    with_clean_keys = clean_keys(table_dict, url=url)
    return clean_vals(with_clean_keys, url=url)

In [30]:
def get_table_for_url(url):
    """Fetch (or read from cache) the page at `url` and return its cleaned spec table.

    The parsed table gets an extra "Link" entry holding `url` before the keys
    and values are cleaned.
    """
    parsed = parse_table_to_dict(retrieve_and_cache(url), url=url)
    with_link = decorate_table(parsed, {"Link":url})
    return clean_model_table(with_link, url=url)

In [31]:
def get_all_model_tables_for_mfct(mfct_name):
    """Return one cleaned spec table per URL in the 'models' list of manufacturer `mfct_name`.

    The manufacturer record is looked up with get_mfct; the tables come back
    in the same order as the URLs.
    """
    manufacturer = get_mfct(mfct_name)
    return [get_table_for_url(model_url) for model_url in manufacturer['models']]

In [32]:
# Trial run on a single manufacturer: build the cleaned spec tables for every Kawasaki model URL.
cleaned_tables = get_all_model_tables_for_mfct('Kawasaki')

We found more than 2 columns, using only the first two, on page https://www.motorcyclespecs.co.za/model/kawasaki/kawasaki_s3_400%2075.htm
We found more than 2 columns, using only the first two, on page https://www.motorcyclespecs.co.za/model/kawasaki/kawasaki_s3_400%2075.htm
Found colliding entries for key Ground Clearance combining old value [ 1435 mm / 56.5 in. ] and new value [ 125 mm / 4.9 in ] for https://www.motorcyclespecs.co.za/model/kawasaki/kawasaki_z1000r_17.htm
Found colliding entries for key Ground Clearance combining old value [ 1435 mm / 56.5 in. ] and new value [ 125 mm / 4.9 in ] for https://www.motorcyclespecs.co.za/model/kawasaki/kawasaki_z1000r_19.html


In [34]:
import pandas as pd

# One row per table in cleaned_tables; the union of all keys becomes the columns.
df = pd.DataFrame(cleaned_tables)
# Write the trial result to kawitest.csv in the working directory.
df.to_csv('kawitest.csv')

In [17]:
# Display the loaded manufacturer records; each one shows 'mfct', 'base_url', 'all_mfct_urls' and 'models' keys.
mfct_urls

[{'mfct': 'AC Schnitzer',
  'base_url': 'https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html',
  'all_mfct_urls': ['https://www.motorcyclespecs.co.za/bikes/AC_Schnitzer.html'],
  'models': ['https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_f_650cs.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_F800R.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_800s_twinstar.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_hp2.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_k_1200rGT.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_k1200r.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_k_1200rs.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_k_1200s.htm',
   'https://www.motorcyclespecs.co.za/model/AC%20Schnitzer/ac_schnitzer_k1300s.htm',
   'https://www.motorcyclespecs.

## WARNING: THE CODE BELOW PROCESSES ALL 11k MODELS AND SAVES TO OVER 100 CSV FILES

In [15]:
# Scrape every manufacturer in turn. For each one: collect its cleaned tables
# into all_tables, attach a DataFrame to its record under 'df', and write a
# numbered CSV file named after the percent-encoded manufacturer name.
all_tables = []
last_index = len(mfct_urls) - 1
for i, manufacturer in enumerate(mfct_urls):
    mfct_name = manufacturer['mfct']
    print("Processing", i,"of",last_index,":",mfct_name)
    cleaned_tables = get_all_model_tables_for_mfct(mfct_name)
    all_tables.extend(cleaned_tables)
    df = pd.DataFrame(cleaned_tables)
    manufacturer['df'] = df
    csv_name = "models {index:03d} - {mfct_name}.csv".format(index = i, mfct_name=urllib.parse.quote(mfct_name,''))
    df.to_csv(csv_name)

Processing 0 of 132 : AC Schnitzer
Processing 1 of 132 : Adler
Processing 2 of 132 : AJP
Processing 3 of 132 : AJS
Processing 4 of 132 : Alfer
Processing 5 of 132 : Aprilia
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125r-nero-06.html
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125r-06.html
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125r-07.html
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125-lorenzo-07.html
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125r-08.html
Overwriting key Final Drive old value: Chain new value 17:40 for https://www.motorcyclespecs.co.za/model/aprilia/aprilia-rs125r-rosso-f

We found more than 2 columns on page https://www.motorcyclespecs.co.za/model/ducati/ducati_multistrada_1200%20S%20GT%2014.htm
We found more than 2 columns on page https://www.motorcyclespecs.co.za/model/ducati/ducati_multistrada_1200%20S%20pikes%20peak%2014.htm
Processing 42 of 132 : Excelsior
Processing 43 of 132 : Exile
Processing 44 of 132 : GASGAS
Processing 45 of 132 : Ghezzi Brian
Processing 46 of 132 : Gilera
Processing 47 of 132 : GIMA
Processing 48 of 132 : Harley-Davidson
Processing 49 of 132 : Harris
Processing 50 of 132 : Hartford
Processing 51 of 132 : HDT USA
Processing 52 of 132 : Hesketh
Processing 53 of 132 : Hero
Processing 54 of 132 : Highland
Processing 55 of 132 : Honda
We found more than 2 columns on page https://www.motorcyclespecs.co.za/model/Honda/honda_cbr600rr_se_09.html
We found more than 2 columns on page https://www.motorcyclespecs.co.za/model/Honda/honda_cbr600rr_se_09.html
We found more than 2 columns on page https://www.motorcyclespecs.co.za/model/Honda

Processing 86 of 132 : MTT
Processing 87 of 132 : Münch
Processing 88 of 132 : MV Agusta
Overwriting key Electronic Quickshift old value: MV EAS 3.0 (Electronically Assisted Shift up & down) new value MV EAS 2.0 (Electronically Assisted Shift up & down) for https://www.motorcyclespecs.co.za/model/mv/mv_agusta_brutale_800_ross_21.html
Overwriting key Electronic Quickshift old value: MV EAS 3.0 (Electronically Assisted Shift up & down) new value MV EAS 2.0 (Electronically Assisted Shift up & down) for https://www.motorcyclespecs.co.za/model/mv/mv_agusta_brutale_800rr_21.html
Overwriting key Electronic Quickshift old value: MV EAS 3.0 (Electronically Assisted Shift up & down) new value MV EAS 2.0 (Electronically Assisted Shift up & down) for https://www.motorcyclespecs.co.za/model/mv/mv_agusta_brutale_800rr_scs_21.html
Overwriting key Electronic Quickshift old value: MV EAS 3.0 (Electronically Assisted Shift up & down) new value MV EAS 2.0 (Electronically Assisted Shift up & down) for htt

# Combine all the tables into one giant table

In [10]:
# Build a single frame from all_tables, the list filled by the bulk-processing loop (one dict per model page).
df = pd.DataFrame(all_tables)

# Export to CSV

In [13]:
# Write the combined frame to ALL_MODELS.csv in the working directory.
df.to_csv('ALL_MODELS.csv')

# Export to Excel

Install the required package before running the export cell below:

> pip install XlsxWriter


In [24]:
# Alternative, kept for reference: everything on one sheet.
#with pd.ExcelWriter("ALL_MODELS.xlsx") as writer:
#    df.to_excel(writer, sheet_name="models", index=False)

# One worksheet per manufacturer, taken from the 'df' stored on each record.
with pd.ExcelWriter("models_on_sheets.xlsx") as workbook:
    for manufacturer in mfct_urls:
        # Percent-encode the name and keep at most 31 characters
        # (presumably the spreadsheet sheet-name length limit; TODO confirm).
        sheet_title = urllib.parse.quote(manufacturer['mfct'], '')[0:31]
        manufacturer['df'].to_excel(workbook, sheet_name=sheet_title, index=False)

# Debugging

In [133]:
# Debug a single page: fetch (or read from cache) the Voxan GTV 1200 page and run the table parser on it.
url = "https://www.motorcyclespecs.co.za/model/voxan/Voxan_GTV_1200.html"
raw_html = retrieve_and_cache(url)
table_dict = parse_table_to_dict(raw_html,url=url)


Woah more than one table at https://www.motorcyclespecs.co.za/model/voxan/Voxan_GTV_1200.html


AttributeError: 'NoneType' object has no attribute 'find_all'

In [142]:
# Step through the table lookup by hand for the page held in raw_html.
soup = BeautifulSoup(raw_html, 'lxml') # important to pip install lxml
output = {}

# Collect every string on the page that matches "Make Model".
make_model_pattern = re.compile(".*Make Model.*")
hook = soup.find_all(string=make_model_pattern)
if len(hook) > 1:
    print("Woah more than one table at",url)
assert len(hook) >= 1  # At least one string containing "Make Model" must exist

# Keep the table around the first match that has one.
for candidate in hook:
    enclosing = candidate.find_parent('table')
    if enclosing:
        table = enclosing
        break
table


Woah more than one table at https://www.motorcyclespecs.co.za/model/voxan/Voxan_GTV_1200.html


<table height="821" id="table8" width="100%">
<tr>
<td class="xl24" height="39" style="border-style: solid" width="618">
<p align="justify"><font face="Arial">Make Model</font></p></td>
<td class="xl25" height="38" style="border-left-style: solid; border-left-width: medium; border-right-style: solid; border-top-style: solid; border-bottom-style: solid" width="1230">
<font face="Arial">Voxan GTV 1200</font></td>
</tr>
<tr>
<td class="xl24" height="39" style="border-left-style: solid; border-right-style: solid; border-top-style: solid; border-top-width: medium; border-bottom-style: solid" width="618">
<p align="justify"><font face="Arial">Year</font></p></td>
<td class="xl26" height="38" style="border-left-style: solid; border-left-width: medium; border-right-style: solid; border-top-style: solid; border-top-width: medium; border-bottom-style: solid" width="1230" x:num="">
<font face="Arial">2010</font></td>
</tr>
<tr>
<td class="xl24" height="39" style="border-left-style: solid; border-

In [141]:
# Show the strings that matched the "Make Model" search.
hook

['Make Model', 'Make Model']

In [None]:
hook = hook[0] # For now we're only parsing the first one. This rebinds `hook` from the list of matches to its first element.
# NOTE(review): find_parent('table') presumably returns None when the match is not inside a <table>,
# in which case the find_all call below would fail; confirm against the AttributeError recorded for this page.
table = hook.find_parent('table')
rows = table.find_all('tr')


In [1]:
import xlsxwriter

In [23]:
# Scratch check: slicing [0:31] on a string shorter than 31 characters returns it unchanged.
"hkjansdflkjnasdf"[0:31]

'hkjansdflkjnasdf'