In [None]:
import csv
import os.path
import time

from tqdm import tqdm

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException

In [None]:
# address of the page to scrape
SOURCE = "https://projects.jsonline.com/database/2021/2/city-of-milwaukee-salaries-2020.html"

# Chrome launch options: run the browser without opening a window
CHROME_OPTIONS = webdriver.ChromeOptions()
CHROME_OPTIONS.add_argument("--headless")

In [None]:
def _get_number_of_pages(browser):
    """Read off the total number of pages from the source
    """
    return int(browser.find_element_by_xpath(
        '//label[@data-cb-name="LabelTotal"]'
    ).text)


def _navigate_to_page(browser, number):
    """Click the right button to navigate to the nth page of data
    """
    option = browser.find_element_by_xpath(f'//option[@value="{number}"]')
    option.click()  # load the data on page `number`


def _get_page_data(browser):
    """Scrape tabular data off the current page
    """
    def scrape_from_page(browser):
        return [[item.text for item in row.find_elements_by_tag_name("td")]
                for row in browser.find_elements_by_tag_name("tr")[1::]]

    try:
        return scrape_from_page(browser)
    except StaleElementReferenceException:
        # retry with enough time to load the data
        return scrape_from_page(browser)

In [None]:
# start a Chrome session using the options configured above
browser = webdriver.Chrome(options=CHROME_OPTIONS)

# load the source page in that session
browser.get(SOURCE)

# empty accumulator for the scraped table rows
rows = list()

In [None]:
# get start time
start = time.time()

# read off table column headers
# ("tag name" is the value of `By.TAG_NAME`; `find_elements_by_tag_name`
# was removed in Selenium 4.3)
header = [item.text for item in
          browser.find_elements("tag name", "th")]

# start from an empty list so that re-running this cell does not append
# a second copy of every record
rows = []

# retrieve data from the table, one page at a time (pages are 1-indexed)
total = _get_number_of_pages(browser)
for page_no in tqdm(range(1, total + 1), desc="Progress: "):
    _navigate_to_page(browser, page_no)
    rows.extend(_get_page_data(browser))

# consistency check on number of rows
print(f"Total no. of records: {len(rows)}")

# report the time taken
minutes = (time.time() - start) / 60
print(f"Total time taken: {minutes} minutes")

In [None]:
# end the browser session; `quit()` closes every window and shuts down the
# driver, whereas `close()` only closes the current window
browser.quit()

In [None]:
# prepare the output file: name it after the last path component of the
# source URL, up to the first "."
name = os.path.basename(SOURCE).split(".")[0]
filename = f"{name}.tsv"

# write the data to file
# newline="" lets the csv module control line endings (otherwise blank rows
# appear on Windows); explicit UTF-8 keeps the output independent of the
# platform's default encoding
with open(filename, "w", newline="", encoding="utf-8") as fileobj:
    writer = csv.writer(fileobj, delimiter="\t")
    writer.writerow(header)  # column headers
    writer.writerows(rows)  # column data