### Imports

In [13]:
import os
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from tabulate import tabulate

import logging # TODO NEXT: use this!

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as expected
from selenium.webdriver.support.wait import WebDriverWait

## Script inputs (to be parsed)

In [2]:
report_file = 'report-filtered.html'

# TODO: make this a passed param
# Known geckodriver install locations, checked in order. The Windows path is a
# raw string so its backslashes stay literal ('\d' and '\g' are not valid
# escape sequences, and newer Python versions warn about them).
geckodriver_candidates = (
    r'C:\dev\geckodriver.exe',
    '/usr/local/bin/geckodriver',
)

# First candidate that exists on this machine, or None when none of them do.
executable_path = next(
    (path for path in geckodriver_candidates if os.path.exists(path)),
    None,
)
if executable_path is None:
    raise OSError('Geckodriver not found')

print(f'Using {executable_path} for geckodriver path')

Using C:\dev\geckodriver.exe for geckodriver path


## Parse the existing report

Input to this module is just the report_file

In [3]:
# Parse the saved report into a BeautifulSoup tree with the built-in HTML parser.
with open(report_file) as report_handle:
    soup = BeautifulSoup(report_handle, 'html.parser')

In [4]:
# Skip the first three <tr> rows; everything after them is treated as a data row.
data_rows = soup.find_all('tr')[3:]
# Keep only rows whose 'style' attribute is missing or empty
# (presumably hidden rows carry an inline style - confirm against the report).
visible_rows = [row for row in data_rows if not row.attrs.get('style')]

In [5]:
def get_description(tr):
    """Return the whitespace-stripped text of the second <td> cell of a row."""
    description_cell = tr('td')[1]
    return description_cell.text.strip()

In [6]:
def get_graph_link(tr):
    """Return the href of the first <a> element found inside a row."""
    anchor = tr.find('a')
    return anchor['href']

In [7]:
def get_data_tds(tr):
    """Return the data cells of a row: every <td> from the fourth one onward."""
    cells = tr('td')
    return cells[3:]

In [8]:
# Column headings come from the third <tr>; its first three cells are skipped
# and the remaining cells are kept as headings for the data columns.
data_headings = soup.find_all('tr')[2].find_all('td')[3:]

In [9]:
# One record per visible row: its description, its graph URL and its data cells.
data = [
    dict(
        title=get_description(row),
        graph_link=get_graph_link(row),
        tds=get_data_tds(row),
    )
    for row in visible_rows
]

Output of this module is data_headings, data

## Supplement with graph images

Input to this module is executable_path (the geckodriver path), and data from the report file

In [10]:
def get_graph_screenshot(url, timeout=300):
    """Load ``url`` in the shared browser and return a screenshot of its graph.

    Uses the module-level ``driver``, which must already be started.

    Args:
        url: Address of the graph page to render.
        timeout: Maximum number of seconds to wait for each stage of the
            drawing (spinner appearing, then spinner disappearing).

    Returns:
        The base64-encoded screenshot of the first ``<canvas>`` element on
        the page.

    Raises:
        selenium.common.exceptions.TimeoutException: If the spinner does not
            appear, or does not go away, within ``timeout`` seconds.
        selenium.common.exceptions.NoSuchElementException: If the page has
            no ``<canvas>`` element.
    """
    # Reset the wait each time
    wait = WebDriverWait(driver, timeout=timeout)
    driver.get(url)

    # On draw starting the progress_img element appears (spinner). When done it disappears.
    spinner = (By.ID, 'progress_img')
    wait.until(expected.visibility_of_element_located(spinner))
    wait.until(expected.invisibility_of_element_located(spinner))

    # find_element(By.TAG_NAME, ...) is used instead of find_element_by_tag_name,
    # which newer Selenium releases no longer provide.
    canvas = driver.find_element(By.TAG_NAME, 'canvas')
    return canvas.screenshot_as_base64

In [11]:
# Firefox launch options: pass the '-headless' argument to the browser.
options = Options()
options.add_argument('-headless')

In [14]:
# `options=` is the supported keyword for passing browser options; the older
# `firefox_options=` spelling is deprecated and rejected by Selenium 4.
driver = Firefox(executable_path=executable_path, options=options)

try:
    # Attach a base64 screenshot of its graph to every item.
    for item in tqdm(data):
        item['graph_bytes'] = get_graph_screenshot(item['graph_link'])
finally:
    # Always shut the browser down, even when a screenshot fails, so no
    # headless Firefox process is left running.
    driver.quit()

100%|████████████████████████████████████████████████████████| 10/10 [01:03<00:00,  6.33s/it]


Output of this module is just the augmented data

## Generate output

Input to this module is data_headings (from report parsing), data (after augmenting with graphs)

In [15]:
def make_image_element(image_bytes):
    """Return an <img> tag whose source is a base64 PNG data URI built from ``image_bytes``."""
    template = "<img src='data:image/png;base64,{}' />"
    return template.format(image_bytes)

In [16]:
# <img> markup for every item's graph screenshot, in the same order as `data`.
eles = [make_image_element(entry['graph_bytes']) for entry in data]

In [17]:
def join_cells(tds):
    """Concatenate the string form of every cell in ``tds``, with no separator."""
    return ''.join(map(str, tds))

In [18]:
def make_table_element(heading_cells, data_cells):
    """Return a two-row <table>: the heading cells first, then the data cells."""
    heading_row = f"<tr>{join_cells(heading_cells)}</tr>"
    data_row = f"<tr>{join_cells(data_cells)}</tr>"
    return f"<table>{heading_row}{data_row}</table>"

In [19]:
def write_item(item):
    """Render one report item as an HTML fragment.

    The fragment is a heading (the title plus a link to the graph), a table of
    the item's data cells under the shared ``data_headings``, and the graph
    screenshot as an inline image.

    Args:
        item: Dict with the keys 'title', 'graph_link', 'tds' and 'graph_bytes'.

    Returns:
        The HTML fragment as a single-line string.
    """
    # Imported locally: the notebook later rebinds the global name `html` to
    # the generated page, which would shadow a top-level `import html`.
    from html import escape

    # The title is plain text and the link is a decoded attribute value, so
    # both must be escaped before being placed back into markup; otherwise
    # characters such as '<', '&' or a quote would corrupt the output.
    title = escape(item['title'])
    link = escape(item['graph_link'])

    # Pieces are joined with the same run of nine spaces that the previous
    # backslash-continued string literal produced, keeping the output layout.
    separator = ' ' * 9
    parts = [
        f"<h3>{title}",
        f"<small><a href='{link}'>(View in RAGE)</a></small>",
        "</h3>",
        f"<div>{make_table_element(data_headings, item['tds'])}</div>",
        f"<div>{make_image_element(item['graph_bytes'])}</div>",
    ]
    return separator.join(parts)

In [20]:
# Render every item and place one rendered fragment per line.
written_items = '\n'.join(write_item(entry) for entry in data)

In [21]:
# Wrap the rendered items in a minimal HTML document.
html = (
    "<html>\n"
    "    <body>\n"
    f"        {written_items}\n"
    "    </body>\n"
    "</html>\n"
)

In [22]:
# Save the page to disk; the markup is followed by one trailing newline.
with open('output.html', 'w') as output_file:
    output_file.write(html + '\n')