In [None]:
import csv
import functools
import os.path
import re
import time

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By

In [None]:
# Chrome options: run headless, i.e. without opening a visible browser window
CHROME_OPTIONS = webdriver.ChromeOptions()
CHROME_OPTIONS.add_argument("--headless")

# page to scrape
SOURCE = "https://projects.jsonline.com/apps/Milwaukee-Homicide-Database/"

# year to select on the page; also used in the output file name
YEAR = 2020

In [None]:
class wait_til_success(object):
    """Decorator that retries a call once after a stale-element error

    The wrapped function is called with the given arguments. If it raises
    `StaleElementReferenceException`, the decorator pauses for
    `retry_delay` seconds and calls the function one more time. An
    exception raised by the second attempt propagates to the caller.
    """

    # seconds to pause before the retry, so the page has time to load
    retry_delay = 1.0

    def __init__(self, f):
        self.f = f
        # keep the wrapped function's name and docstring on the wrapper
        functools.update_wrapper(self, f)

    def __call__(self, *args, **kwargs):
        try:
            return self.f(*args, **kwargs)
        except StaleElementReferenceException:
            # retry with enough time to load the data
            time.sleep(self.retry_delay)
            return self.f(*args, **kwargs)


@wait_til_success
def _navigate_to_year(browser, year=None):
    """Click the right button to navigate to the requested year

    `browser` is the Selenium driver showing the source page. `year` is
    the value of the `<option>` element to click; when omitted, the
    module-level `YEAR` is used.
    """
    if year is None:
        year = YEAR
    # `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`,
    # which no longer exists in Selenium 4.3+
    option = browser.find_element(By.XPATH, f'//option[@value="{year}"]')
    option.click()  # load the data on page `year`


@wait_til_success
def _navigate_to_all_data(browser):
    """Click the right button to load all data

    `browser` is the Selenium driver showing the source page; the
    `<option>` element whose value is "all" is located and clicked.
    """
    # `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`,
    # which no longer exists in Selenium 4.3+
    option = browser.find_element(By.XPATH, '//option[@value="all"]')
    option.click()


@wait_til_success
def _expand_entry(card):
    """Expand the associated entry into view

    `card` is the page element of one entry. The button inside the
    card's "jss235" element is clicked.
    """
    # the generic `find_element(By..., ...)` form replaces the
    # `find_element_by_*` helpers, which no longer exist in Selenium 4.3+
    div = card.find_element(By.CLASS_NAME, "jss235")
    button = div.find_element(By.TAG_NAME, "button")
    button.click()


@wait_til_success
def _get_page_data(browser):
    """Scrape tabular data off the current page

    `browser` is the Selenium driver showing the source page.

    Returns a list of rows, one list per record, with the fields in the
    order: name, age, date, address, charges ("Yes"/"No"), details,
    coverage (newline-separated links).
    """
    def get_text_elements(classname):
        # text of every element on the page carrying this class
        return [item.text for item in
                browser.find_elements(By.CLASS_NAME, classname)]

    def read_paragraphs(card):
        # text of every <p> element inside the card
        return [item.text for item in
                card.find_elements(By.TAG_NAME, "p")]

    def extract_text(card):
        _expand_entry(card)  # click into view
        pars = read_paragraphs(card)
        if not all(pars):
            # try again if text is not present
            pars = read_paragraphs(card)
        return "\n".join(pars)

    def extract_age(text):
        # age in whole years, or "" when the details do not mention one
        match = re.search(r"[0-9]+ years", text)
        if match is None:
            return ""
        return int(match[0].split(" ")[0])

    def extract_coverage(element):
        links = [item.get_attribute("href") for item in
                 element.find_elements(By.TAG_NAME, "a")]
        # an anchor without an href yields None, which would break the join
        return "\n".join(link for link in links if link is not None)

    # retrieve data from the page
    cards = browser.find_elements(By.CLASS_NAME, "jss227")
    name = get_text_elements("jss234")
    date = get_text_elements("jss236")
    address = get_text_elements("jss237")
    details = [extract_text(card) for card in cards]
    age = [extract_age(text) for text in details]
    charges = ["Yes" if ("Charges" in text) else "No"
               for text in details]
    coverage = [extract_coverage(card) for card in cards]
    # NOTE(review): zip stops at the shortest column, so records are
    # silently dropped if the page-level lists and the cards differ in
    # length; presumably there is one name/date/address per card (confirm)
    return list(map(
        list,
        zip(name, age, date, address,
            charges, details, coverage),
    ))

In [None]:
# connect via Chrome, using the options configured in CHROME_OPTIONS
browser = webdriver.Chrome(options=CHROME_OPTIONS)

# navigate to the page
browser.get(SOURCE)

In [None]:
# start the timer; perf_counter is monotonic, so the measured duration is
# not affected by system clock adjustments
start = time.perf_counter()

# column headers for the output table, in the same order as the fields
# of each row returned by `_get_page_data`
header = [
    "Name",
    "Age (years)",
    "Date",
    "Address",
    "Charges filed?",
    "Details",
    "Media coverage",
]

# navigate to the right year
_navigate_to_year(browser)

# select all data
_navigate_to_all_data(browser)
rows = _get_page_data(browser)

# consistency check on number of rows
print(f"Total no. of records: {len(rows)}")

# report the time taken
seconds = (time.perf_counter() - start)
print(f"Total time taken: {seconds} seconds")

In [None]:
# shut down the browser; `quit()` ends the whole WebDriver session and its
# driver process, whereas `close()` only closes the current window
browser.quit()

In [None]:
# prepare the output file: named after the last path component of the
# source URL (lower-cased) and the selected year
name = SOURCE.split("/")[-2].lower()
filename = f"{name}_{YEAR}.csv"

# write the data to file; `newline=""` lets the csv module control line
# endings (some fields contain embedded newlines), and the explicit
# encoding keeps the output independent of the platform default
with open(filename, "w", newline="", encoding="utf-8") as fileobj:
    writer = csv.writer(fileobj)
    writer.writerow(header)  # column headers
    writer.writerows(rows)  # column data