## Imports

In [1]:
import base64
import os
import time
from bs4 import BeautifulSoup

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as expected
from selenium.webdriver.support.wait import WebDriverWait

## Parse the existing report

In [2]:
# Load the previously saved report and parse it with the stdlib-backed parser.
with open('report-filtered.html') as html_file:
    soup = BeautifulSoup(html_file, features='html.parser')

In [192]:
# Drop the first three <tr> elements (presumably header rows - confirm against the report).
data_rows = soup.find_all('tr')[3:]
# Keep only rows with no inline style, or an empty one.
visible_rows = [row for row in data_rows if not row.get('style')]

In [193]:
# Sanity check: how many rows passed the style filter.
len(visible_rows)

10

In [194]:
def get_description(tr):
    """Return the whitespace-stripped text of the second <td> cell of a row."""
    cells = tr('td')
    description_cell = cells[1]
    return description_cell.text.strip()

In [195]:
def get_graph_link(tr):
    """Return the href attribute of the first <a> element found in a row."""
    anchor = tr.find('a')
    return anchor['href']

In [196]:
# One record per visible row: its description and the link to its graph.
data = [
    dict(title=get_description(row), graph_link=get_graph_link(row))
    for row in visible_rows
]

## Supplement with graph images

In [197]:
# Candidate geckodriver locations, checked in order.
# The Windows path is a raw string: '\d' and '\g' are not valid escape
# sequences, and relying on Python leaving them untouched is deprecated.
geckodriver_candidates = (
    r'C:\dev\geckodriver',
    '/usr/local/bin/geckodriver',
)

for candidate in geckodriver_candidates:
    if os.path.exists(candidate):
        executable_path = candidate
        break
else:
    # No candidate exists on this machine.
    raise OSError('Geckodriver not found')

print(f'Using {executable_path} for geckodriver path')

Using /usr/local/bin/geckodriver for geckodriver path


In [201]:
import hashlib
import time

def get_graph_screenshot(url):
    """Load ``url`` in a fresh Firefox instance and screenshot its first <canvas>.

    Uses the module-level ``executable_path`` and ``options`` to start the
    browser.

    Parameters
    ----------
    url : str
        Address of the page containing the graph.

    Returns
    -------
    The canvas element's ``screenshot_as_base64`` value.

    Notes
    -----
    The browser is always shut down, even if loading the page or locating
    the canvas raises.
    """
    # Debug aid: print a digest of the URL being fetched.
    print(hashlib.md5(url.encode('utf-8')).hexdigest())

    # This isn't ideal, but reusing one driver across multiple URLs did not
    # work reliably; a new driver per URL guarantees no duplicated graphs.
    driver = Firefox(executable_path=executable_path, firefox_options=options)
    try:
        driver.get(url)
        # Canvas write is async, no easy event to follow - just wait a second for now
        time.sleep(1)

        canvas = driver.find_element(By.TAG_NAME, 'canvas')
        return canvas.screenshot_as_base64
    finally:
        # Runs on success and on failure so no Firefox process is left behind.
        driver.quit()

In [203]:
# Firefox options handed to every driver created in get_graph_screenshot.
options = Options()
# Pass Firefox's '-headless' flag (presumably so no browser window is shown - confirm).
options.add_argument('-headless')

In [205]:
import hashlib

# Attach a base64 screenshot of its graph to every record.
for item in data:
    item['graph_bytes'] = get_graph_screenshot(item['graph_link'])

# Debug aid: print a digest of each captured screenshot.
for item in data:
    print(hashlib.md5(item['graph_bytes'].encode('utf-8')).hexdigest())

# No driver cleanup here: get_graph_screenshot creates and quits its own
# driver, and no module-level `driver` exists on a fresh kernel.

1c174f787543453eeb23a35ca45c7509
0a2a69cae67b1f8c18cff7f5258f15c2
5eb6db107f34e07a5e340fdf4b4cc4f9
57e772abe65a767bff5be7610cb2435b
05fe26bfdf7a68a66656be3fab1c707c
39be67f54548f65076d5bfde5ba097da
8b6c016c6a7bc7c9c3e6af5d99606b77
5cb5c0e06ba6fff192028e96515e5607
568cf012dbed31d48acd59d737ae79f0
4e43266447f76ece598a1900e5a0efda
ae6369ac6e173c74b732ff0fb52f9586
d651794212f59ddfc3ea377c35a73f0b
aab1e448b000154934d01b6ae45c2bf0
e0c7c0f3c7ddb829b673d4492f29a1c5
6fc499709ea2b23e4fa7d178141d58ec
f10f1b59159b5b1021361912acda9c8e
a1276b070db1b0dc056908a1d9a8bd53
00334caf816aea9e89daea674cf9cab5
456f7c8a93f913f561a4cf6f66c7c7c6
300d171f96f8324d415c78831aff6ad9


In [206]:
# Compare the total number of screenshots with the number of distinct ones;
# equal counts mean no two graphs produced the same image.
bs = [entry['graph_bytes'] for entry in data]
print(len(bs), len(set(bs)), sep='\n')

10
10


## Generate output

In [207]:
def make_image_element(image_bytes):
    """Return an <img> tag embedding ``image_bytes`` as a base64 PNG data URI."""
    return "<img src='data:image/png;base64,{}'>".format(image_bytes)

In [208]:
# Build one <img> element per record.
eles = list(map(make_image_element, (entry['graph_bytes'] for entry in data)))

In [209]:
def write_item(item):
    """Render one record as an HTML fragment: heading, source link and graph image.

    Parameters
    ----------
    item : dict
        Must provide the keys 'title', 'graph_link' and 'graph_bytes'.

    Returns
    -------
    str
        An <h3> heading with the title and a link to the graph, followed by
        the embedded image and a line break.
    """
    # Imported locally because this notebook later rebinds the global name
    # `html` to the output document string.
    from html import escape

    # The title and link come from a scraped report; escape them so characters
    # such as '&', '<' or quotes cannot break the generated markup.
    title = escape(item['title'])
    link = escape(item['graph_link'], quote=True)

    # Keeps the same spacing between parts as the previous layout, so output
    # for inputs without special characters is unchanged.
    gap = ' ' * 9
    return (
        f"<h3>{title}{gap}"
        f"<small><a href='{link}'>(View in RAGE)</a></small>{gap}"
        f"</h3>{gap}"
        f"{make_image_element(item['graph_bytes'])}<br>"
    )

In [210]:
# Render every record and join the fragments, one per line.
written_items = '\n'.join(map(write_item, data))

In [211]:
# Wrap the joined item fragments in a minimal HTML document.
html = f"""<html>
    <body>
        {written_items}
    </body>
</html>
"""

In [212]:
# Save the generated document, followed by a trailing newline.
with open('output.html', 'w') as f:
    f.write(f'{html}\n')