In [2]:
import os
import re

from tabulate import tabulate

import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1) # None for other python versions

In [52]:
file = '/home/laura/ANALYSIS/covidma/Quality/processed/20089101.trimmed_R1_fastqc.html'
folder = '/home/laura/ANALYSIS/covidma/Quality/'

In [67]:
def extract_processed_html(fastqc_folder, sample, suffix):
    for root, _, files in os.walk(fastqc_folder):
        for name in files:
            fileName = os.path.join(root, name)
            if 'Quality/processed' in fileName and name.endswith(suffix) and name.startswith(sample):
                return fileName


In [57]:
extract_processed_html(folder, '20089101')

'/home/laura/ANALYSIS/covidma/Quality/processed/20089101.trimmed_R2_fastqc.html'

In [83]:
def extract_files_html(fastqc_folder):
    html_pairs = {}
    count = 0
    for root, _, files in os.walk(fastqc_folder):
        for name in files:
            fileName = os.path.join(root, name)
            if 'Quality/raw' in fileName and name.endswith('fastqc.html'):
                sample = name.split('.')[0]
                suffix = ('.').join(name.split('.')[-2:])
                raw_html = fileName
                processed_html = extract_processed_html(fastqc_folder, sample, suffix)
                html_pairs[count] = [raw_html, processed_html]
                count = count + 1
    return html_pairs


In [84]:
pair_dict = extract_files_html(folder)
pair_dict

{0: ['/home/laura/ANALYSIS/covidma/Quality/raw/20091271.R1_fastqc.html',
  '/home/laura/ANALYSIS/covidma/Quality/processed/20091271.trimmed_R1_fastqc.html'],
 1: ['/home/laura/ANALYSIS/covidma/Quality/raw/20089101.R2_fastqc.html',
  '/home/laura/ANALYSIS/covidma/Quality/processed/20089101.trimmed_R2_fastqc.html'],
 2: ['/home/laura/ANALYSIS/covidma/Quality/raw/20089101.R1_fastqc.html',
  '/home/laura/ANALYSIS/covidma/Quality/processed/20089101.trimmed_R1_fastqc.html'],
 3: ['/home/laura/ANALYSIS/covidma/Quality/raw/20091271.R2_fastqc.html',
  '/home/laura/ANALYSIS/covidma/Quality/processed/20091271.trimmed_R2_fastqc.html']}

In [41]:
def extract_quality_graph(html_file):
    with open(html_file, 'r') as f:
        content = f.read()
        image_tag = re.search(r'<img class="indented" src=.*alt="Per base quality graph" width="1020" height="600"/>', content)

    return image_tag.group(0)


In [103]:
def extract_basic_stats(html_file):
    with open(html_file, 'r') as f:
        content = f.read()
        table_tag = re.search(r'Basic Statistics</h2>(<table>(.+?)</table>)', content)

    return table_tag.group(1)

In [104]:
extract_basic_stats(file)

'<table><thead><tr><th>Measure</th><th>Value</th></tr></thead><tbody><tr><td>Filename</td><td>20089101.trimmed_R1.fastq.gz</td></tr><tr><td>File type</td><td>Conventional base calls</td></tr><tr><td>Encoding</td><td>Sanger / Illumina 1.9</td></tr><tr><td>Total Sequences</td><td>493241</td></tr><tr><td>Sequences flagged as poor quality</td><td>0</td></tr><tr><td>Sequence length</td><td>25-301</td></tr><tr><td>%GC</td><td>38</td></tr></tbody></table>'

In [113]:
def format_html_image(output_folder, files):
    html_template = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>covidma quality output</title>
    <style type="text/css">
        body {
        margin: 0 auto;
        }
        </style>
    </head>
    <body>
    IMAGEHTMLPAIRED
    </body>
    </html>
    """
    output_file = os.path.join(output_folder, 'fastq_image_report.html')
    all_images_tables = ''
    for number, pair in files.items():
        div_structure = """
        <div class="container">
        <table>
            <tr>
            <th>FILENAMERAW</th>
            <th>FILENAMETRIMMED</th>
            </tr>
            <tr>
            <td>TABLEQUALRAW</td>
            <td>TABLEQUALTRIMMED</td>
            </tr>
            <tr>
            <td>IMAGEQUALRAW</td>
            <td>IMAGEQUALTRIMMED</td>
            </tr>
        </table>
        <br>
        </div>
        """
        div_structure = div_structure.replace('FILENAMERAW', pair[0])
        div_structure = div_structure.replace('FILENAMETRIMMED', pair[1])
        div_structure = div_structure.replace('TABLEQUALRAW', extract_basic_stats(pair[0]))
        div_structure = div_structure.replace('TABLEQUALTRIMMED', extract_basic_stats(pair[1]))
        div_structure = div_structure.replace('IMAGEQUALRAW', extract_quality_graph(pair[0]))
        div_structure = div_structure.replace('IMAGEQUALTRIMMED', extract_quality_graph(pair[1]))

        

        all_images_tables = all_images_tables + div_structure

    final_html_template = html_template.replace('IMAGEHTMLPAIRED', all_images_tables)
    with open(output_file, 'w+') as f:
        f.write(final_html_template)


In [114]:
format_html_image(folder, pair_dict)