In [1]:
import pyquery, urllib, re, os, base64
import json
import urllib.parse
import lxml
import lxml.etree
import lxml.html
# Short alias for the PyQuery class; used below to build and query documents.
Q = pyquery.PyQuery
# Alias for the standard-library percent-decoder (e.g. "%7B" -> "{").
unquote = urllib.parse.unquote

In [2]:
def fix_test_results(doc, target, encoding="utf-8"):
    """Shorten the result links in an HTML report to the bare result name.

    The report at ``doc`` is loaded, every ``tr`` after the first one
    (presumably a header row -- confirm against the report layout) is
    inspected, and the ``div div p`` paragraphs found in its last column are
    collected.  Each paragraph whose text contains ``/Results/`` has its text
    replaced by the path segment that follows ``/Results/``.  The modified
    document is then written to ``target``.

    Args:
        doc: Path or ``file:///`` URL of the report to read.
        target: Path or ``file:///`` URL of the file to write.
        encoding: Text encoding used for both reading and writing.  Defaults
            to UTF-8, which is what ``fix_img_links`` assumes when it parses
            the file produced here.

    Raises:
        OSError: If ``doc`` cannot be read or ``target`` cannot be written.
    """
    doc = url2fp(doc)
    target = url2fp(target)
    with open(doc, 'r', encoding=encoding) as f:
        html = f.read()

    q = Q(html)
    q.remove_namespaces()  # simplifies query

    cells = []
    for row in q("tr")[1:]:
        # Only rows with exactly four or two children carry a result cell;
        # it is always the last child.
        if len(row) == 4:
            cell = row[3]
        elif len(row) == 2:
            cell = row[1]
        else:
            # Any other layout has no result cell.  Skipping it avoids
            # re-using the cell of the previous row (or failing on the
            # very first row, where no cell has been seen yet).
            continue
        cells.extend(Q(cell)("div div p"))

    pattern = re.compile(".*?/Results/([^/]*)?.*")
    for paragraph in cells:
        # Paragraphs may be empty (text is None) or hold something other
        # than a result link; those are left untouched.
        match = pattern.match(paragraph.text or "")
        if match is not None:
            paragraph.text = match.group(1)

    with open(target, 'w', encoding=encoding) as f:
        f.write(q.outer_html())
    

In [3]:
def url2fp(url):
    """Convert a ``file:///`` URL (or a plain path) into a filesystem path.

    * ``file:///C:/dir/x.html`` becomes ``C:/dir/x.html``.
    * ``file:///home/me/x.html`` becomes ``/home/me/x.html`` -- the leading
      slash is kept when the remainder does not start with a drive letter,
      so absolute POSIX paths stay absolute.
    * Anything not starting with ``file:///`` is taken as a path already.

    If the resulting path does not exist it is percent-decoded.  An existing
    path is returned as-is, so a file whose name really contains ``%`` is not
    mangled.  Note that a path which does not exist *yet* (an output file) is
    always percent-decoded.

    Args:
        url: A ``file:///`` URL or a filesystem path.

    Returns:
        The filesystem path as a string.
    """
    fp = url
    if fp.startswith("file:///"):
        rest = fp[8:]
        # A drive-letter prefix ("C:") means the URL's third slash was only
        # the URL root; otherwise that slash is the filesystem root.
        has_drive = rest[:1].isalpha() and rest[1:2] == ":"
        fp = rest if has_drive else "/" + rest
    if not os.path.exists(fp):
        fp = unquote(fp)
    return fp
    
class ImgEncoderCache():
    """Encode image files as base64 ``data:`` URIs on demand and cache them.

    Indexing the cache with a file name (``cache["logo.png"]``) returns the
    ``data:image/...;base64,...`` URI for that file, reading and encoding the
    file only the first time it is requested.

    Args:
        path: Directory against which relative file names are resolved.
    """

    # File extensions whose MIME subtype is not simply the extension.
    # Without this, ".jpg" would be labelled "image/jpg" and ".svg"
    # "image/svg", neither of which is a registered media type.
    _SUBTYPES = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "svg": "svg+xml",
        "tif": "tiff",
    }

    def __init__(self, path):
        self._path = path
        self._imgs = {}  # file name as requested -> data URI

    def __getitem__(self, key):
        """Return the data URI for ``key``, encoding the file on first use.

        Raises:
            OSError: If the file has to be read and cannot be opened.
        """
        if key not in self._imgs:
            self.add_img(key)
        return self._imgs[key]

    def add_img(self, file):
        """Read ``file``, encode it and store the data URI under ``file``.

        Relative names are resolved against the directory given to the
        constructor; absolute names are used unchanged.  The media type is
        derived from the (lower-cased) file extension.

        Raises:
            OSError: If the file cannot be opened.
        """
        if os.path.isabs(file):
            fp = file
        else:
            fp = os.path.join(self._path, file)

        with open(fp, 'rb') as f:
            data = base64.b64encode(f.read()).decode('utf-8')

        ext = os.path.splitext(file)[1][1:].lower()
        subtype = self._SUBTYPES.get(ext, ext)
        self._imgs[file] = "data:image/%s;base64,%s" % (subtype, data)
        

def images_2_js(images):
    """Build the JavaScript that restores image sources from embedded data.

    The generated script defines an ``images`` object mapping each key of
    ``images`` to its value, then assigns ``img.src`` for every ``<img>``
    that carries a ``data-img-src`` attribute whose value is a key of that
    object.  Images without the attribute, or with an unknown key, are left
    untouched.

    Keys and values are emitted as JSON string literals, so backslashes
    (Windows paths), quotes and non-ASCII characters survive unchanged, and
    ``</`` is escaped so that no value can terminate the surrounding
    ``<script>`` element early.

    Args:
        images: Mapping of original ``src`` value -> replacement ``src``
            (typically a ``data:`` URI).

    Returns:
        The JavaScript source as a string.
    """
    src = """
var images = {};
%s
document.querySelectorAll("img[data-img-src]").forEach((img) => {
    const key = img.getAttribute('data-img-src');
    if (Object.prototype.hasOwnProperty.call(images, key)) {
        img.src = images[key];
    }
});
"""

    def js_literal(value):
        # JSON string syntax is valid JavaScript; "<\/" is the same string
        # to JavaScript but is not seen as a closing tag by the HTML parser.
        return json.dumps(value).replace("</", "<\\/")

    images_as_src = [
        "images[%s] = %s;" % (js_literal(key), js_literal(data_uri))
        for key, data_uri in images.items()
    ]

    return src % "\n".join(images_as_src)

def fix_img_links(url, out):
    """Embed the images referenced by an HTML file into the file itself.

    The document at ``url`` is parsed as UTF-8 HTML.  Every ``<img>`` with a
    ``src`` gets that value copied into a ``data-img-src`` attribute, and a
    ``<script>`` produced by ``images_2_js`` is appended to the root element;
    at load time the script swaps each image's ``src`` for a base64
    ``data:`` URI of the file.  Relative image paths are resolved against
    the directory of the input document.

    Images without a ``src`` are skipped.  Sources that are not local files
    (``data:``, ``http://``, ``https://``) are mapped to themselves, so they
    keep pointing where they did instead of being opened as file names.

    Args:
        url: Path or ``file:///`` URL of the HTML file to read.
        out: Path or ``file:///`` URL of the HTML file to write.

    Raises:
        OSError: If the document or a referenced local image cannot be read,
            or the output cannot be written.
    """
    fp = url2fp(url)
    out = url2fp(out)
    cache = ImgEncoderCache(os.path.dirname(fp))

    parser = lxml.html.HTMLParser(encoding='utf8')
    with open(fp, 'rb') as f:
        html = lxml.html.parse(f, parser=parser)
    root = html.getroot()

    non_file_prefixes = ("data:", "http://", "https://")
    images = {}
    for img in root.xpath(".//img"):
        src = img.get('src')
        if not src:
            continue  # nothing to embed
        if src not in images:
            if src.lower().startswith(non_file_prefixes):
                # Not a local file: identity mapping keeps the source as is.
                images[src] = src
            else:
                images[src] = cache[src]
        img.set('data-img-src', src)

    js = images_2_js(images)
    script = lxml.etree.SubElement(root, "script", {"type": "application/javascript"})
    script.text = js

    with open(out, 'wb') as f:
        html.write(f, pretty_print=True, method='html')

In [4]:
# --- Input ------------------------------------------------------------------
# Report to process, given as a percent-encoded file:// URL.
doc = "file:///C:/Users/nstar/AppData/Local/Temp/TTf%7B69e50632-9838-4c2d-9e5e-20aa615cf5c0%7D.html"

# --- Output -----------------------------------------------------------------
# Folder and file name of the finished report.
target_folder = "C:\\Users\\nstar\\OneDrive - PBS Biotech\\OneDrive Shared\\3.1.1 DHF\\ID00020\\ID00020 - Attachments"
target = "3.1.1 Test Run Build Report.html"
output = os.path.join(target_folder, target)

# Intermediate file: same URL as `doc` with the file name swapped for tmp.html.
tmp = doc.replace(os.path.basename(doc), "tmp.html")

# --- Run --------------------------------------------------------------------
# Stage 1 shortens the result links, stage 2 embeds the images.
fix_test_results(doc, tmp)
fix_img_links(tmp, output)