In [None]:
import binascii
import gzip
import json
import os
import sys
import re
import subprocess

from io import BytesIO

import pandas as pd
import numpy as np
import json
import altair as alt
import math

from glob import glob

from Bio import SeqIO
from IPython.display import HTML
from onecodex.notebooks.report import set_logo, set_style, title
import onecodex
from pathlib import Path

import matplotlib.pyplot as plt
import dnaplotlib as dpl
from matplotlib import gridspec
import matplotlib.ticker as mticker
from IPython.display import display, Image


import warnings # to avoid printing "FixedFormatter should only be used together with FixedLocator"

warnings.filterwarnings("ignore") # This only works sometimes...
warnings.simplefilter('ignore')

plt.rcParams.update({'font.family':'Fira Sans', "font.size": 12})

%matplotlib inline

In [None]:
# Show a custom report logo only when ONE_CODEX_REPORT_LOGO is set to a non-empty value.
logo_path = os.environ.get("ONE_CODEX_REPORT_LOGO")

if logo_path is not None and len(logo_path) > 0:
    logo = set_logo(logo_path)
    display(logo)

In [None]:
### Use the snpEff-generated tsv (variants.vcf with annotations and irrelevant information filtered out)
### as the basis for the variants table.
### The "allele_reads_by_strand" column is either the Medaka-generated SR or bcftools-generated DP4

def read_tsv_as_dataframe(path, instrument_vendor=None):
    """Load the snpEff variants TSV and derive per-variant depth columns.

    The "allele reads by strand" column is parsed as comma-separated integer
    counts. The first two values are summed as the reference depth and the
    remaining values as the alternate depth. Rows whose counts sum to zero
    are dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated variants file.
    instrument_vendor : str, optional
        "Illumina" or "Oxford Nanopore". Selects the name given to the
        position column. Defaults to the notebook-level INSTRUMENT_VENDOR.

    Returns
    -------
    pandas.DataFrame
        Variants indexed by the position column, with the renamed annotation
        columns plus "Missense mutation", "Alt depth", "Ref depth",
        "Total depth" and "Alt frequency (%)" (depths/frequencies as
        formatted strings). The frame has zero rows, but still carries these
        columns, when the file holds no usable variants.

    Raises
    ------
    ValueError
        If the sequencing platform is not recognised, or if an
        "allele reads by strand" value is not a list of integers.
    """
    if instrument_vendor is None:
        # Fall back to the notebook setting so one-argument calls keep working.
        instrument_vendor = INSTRUMENT_VENDOR

    position_columns = {
        "Illumina": "Position (first ref nt)",
        "Oxford Nanopore": "Position",
    }
    if instrument_vendor not in position_columns:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Invalid sequencing platform: {instrument_vendor}")
    position_column = position_columns[instrument_vendor]

    depth_columns = ["Alt depth", "Ref depth", "Total depth", "Alt frequency (%)"]

    df_snpeff = pd.read_csv(
        path,
        sep="\t",
        dtype={
            "POS": "int32",
            "REF": "str",
            "ALT": "str",
            "allele reads by strand": "str",
            "ANN[0].EFFECT": "str",
            "ANN[0].HGVS P": "str",
            "BCSQ": "str",
        },
    )

    if df_snpeff.empty:  # If there are no variants
        # Return an empty frame that still exposes every column callers select.
        expected_columns = [
            position_column,
            "Ref",
            "Alt",
            "allele reads by strand",
            "Variant type",
            "Variant effect",
            "Linkage",
            "Missense mutation",
        ] + depth_columns
        empty = pd.DataFrame(columns=expected_columns)
        empty = empty.astype({position_column: "int32"})
        return empty.set_index(position_column)

    df_snpeff = df_snpeff.rename(
        columns={
            "REF": "Ref",
            "ALT": "Alt",
            "ANN[0].HGVS P": "Variant effect",
            "ANN[0].EFFECT": "Variant type",
            "BCSQ": "Linkage",
            "POS": position_column,
        }
    )
    df_snpeff["Missense mutation"] = ""
    df_snpeff = df_snpeff.reset_index()

    # Create the derived columns up front so they exist even if every row is dropped.
    for column in depth_columns:
        df_snpeff[column] = ""

    zero_depth_rows = []
    for i in df_snpeff.index:
        depths = [
            int(x) for x in str(df_snpeff.loc[i, "allele reads by strand"]).split(",")
        ]
        total_depth = sum(depths)
        if total_depth == 0:
            zero_depth_rows.append(i)
            continue

        alt_depth = sum(depths[2:])
        ref_depth = sum(depths[:2])
        df_snpeff.loc[i, "Alt depth"] = "{:.0f}".format(alt_depth)
        df_snpeff.loc[i, "Ref depth"] = "{:.0f}".format(ref_depth)
        df_snpeff.loc[i, "Total depth"] = "{:.0f}".format(total_depth)

        alt_freq = alt_depth / total_depth * 100
        # Show fixed variants as "100" and everything else with two decimals.
        if alt_freq == 100:
            df_snpeff.loc[i, "Alt frequency (%)"] = "{:.0f}".format(alt_freq)
        else:
            df_snpeff.loc[i, "Alt frequency (%)"] = "{:.2f}".format(alt_freq)

    df_snpeff = df_snpeff.drop(index=zero_depth_rows)
    df_snpeff = df_snpeff.set_index(position_column)

    return df_snpeff


In [None]:
# Instantiate the One Codex API client purely for its side effect (per the
# original note, this initializes plot embedding); the client object is discarded.
onecodex.Api() # initialize plot embedding
pass # don't print anything

In [None]:
# Directory holding the pipeline outputs; the RESULTS_DIR environment variable
# is required (a missing variable raises KeyError here).
RESULTS_DIR = Path(os.environ["RESULTS_DIR"])

In [None]:
# Run configuration. INSTRUMENT_VENDOR and ARTIC_PRIMER_VERSION are required
# environment variables (a missing one raises KeyError).
INSTRUMENT_VENDOR = os.environ["INSTRUMENT_VENDOR"]
ARTIC_PRIMER_VERSION = os.environ["ARTIC_PRIMER_VERSION"]

# Minimum depth used throughout the report for confident variant calls.
if INSTRUMENT_VENDOR == 'Illumina':
    MIN_DEPTH = 10
elif INSTRUMENT_VENDOR == 'Oxford Nanopore':
    MIN_DEPTH = 50
else:
    # ValueError is a subclass of Exception, so broad handlers still catch it.
    raise ValueError(f"Invalid sequencing platform: {INSTRUMENT_VENDOR}")

# Input reads: explicit SAMPLE_PATH wins, otherwise use a FASTQ found in RESULTS_DIR.
SAMPLE_PATH = os.environ.get("SAMPLE_PATH")
if not SAMPLE_PATH:
    # Sorted so the choice is deterministic when several files match.
    fastq_candidates = sorted(glob(os.path.join(RESULTS_DIR, "*.fastq.gz")))
    if not fastq_candidates:
        raise FileNotFoundError(
            f"SAMPLE_PATH is not set and no *.fastq.gz file was found in {RESULTS_DIR}"
        )
    SAMPLE_PATH = fastq_candidates[0]

# outputs of bioinformatics pipeline (default paths)
VARIANTS_VCF_PATH = RESULTS_DIR / "variants.vcf"
VARIANTS_SNPEFF_PATH = RESULTS_DIR / "variants.snpeff.tsv"
NEXTCLADE_JSON = RESULTS_DIR / "nextclade.json"
NEXTCLADE_TSV_PATH = RESULTS_DIR / "nextclade.tsv"
PANGOLIN_CSV_PATH = RESULTS_DIR / "pangolin.csv"
CONSENSUS_PATH = RESULTS_DIR / "consensus.fa"
SNPS_DEPTH_PATH = RESULTS_DIR / "snps.depth"
AA_CODES_PATH = RESULTS_DIR / "aa_codes.txt"

# databases
REFERENCE_PATH = os.environ.get("FASTA_REFERENCE", "/share/nCoV-2019.reference.fasta")


In [None]:
# count total reads

def is_gz_file(filepath):
    """Return True when the file at `filepath` starts with the gzip magic bytes.

    Only the first two bytes are read; files shorter than that are reported
    as not gzip-compressed.
    See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
    """
    gzip_magic = b"\x1f\x8b"
    with open(filepath, "rb") as stream:
        leading_bytes = stream.read(len(gzip_magic))
    return leading_bytes == gzip_magic

# Choose the opener that matches the file's compression, then count lines once.
read_opener = gzip.open if is_gz_file(SAMPLE_PATH) else open

with read_opener(SAMPLE_PATH, "rt") as handle:
    fastq_line_count = sum(1 for _ in handle)

# Each read is taken to occupy four lines, so the read count is lines / 4 (a float).
total_reads = fastq_line_count / 4

In [None]:
# load reference genome
# NOTE(review): this parses CONSENSUS_PATH (the sample consensus FASTA), not
# REFERENCE_PATH, so `reference_length` is the length of the first consensus
# record — confirm this is intended, since coverage fractions divide by it.
# An empty consensus file makes `reference[0]` raise IndexError.
reference = list(SeqIO.parse(CONSENSUS_PATH, "fasta"))
reference_length = len(reference[0])

In [None]:
# Accumulates human-readable warnings; later cells append to it, and it is
# both written to the JSON export and rendered at the end of the report.
warning_messages = []

In [None]:
# TODO: generate before
# The pipeline writes the mapped-read count as a single integer in this file.
total_mapped_reads = int((RESULTS_DIR / "total_mapped_reads.txt").read_text())

In [None]:
# Parse the per-position depth file (tab-separated: reference, position, depth)
# into a DataFrame with those three columns.
depth_table = []

with open(SNPS_DEPTH_PATH) as handle:
    for line in handle:
        row = line.strip().split("\t")
        if len(row) == 1:
            # Blank lines split into a single field; skip them.
            continue
        depth_table.append(
            {"reference": row[0], "position": int(row[1]), "depth": int(row[2])}
        )
depth_table = pd.DataFrame(depth_table, columns=["reference", "position", "depth"])

# Only the depth column is relevant here; the previous whole-table comparison
# also inspected the reference and position columns.
if (depth_table["depth"] == 0).any():
    warning_messages.append("One or more bases are spanned by zero reads.")

In [None]:
# Genome coverage: the fraction of reference positions seen at depth >= 1 and
# at depth >= MIN_DEPTH, relative to the fixed `reference_length`.

depth_values = depth_table["depth"]

# Distinct positions reaching each depth threshold.
covered_sites = set(depth_table.loc[depth_values >= 1, "position"].tolist())
covered_sites_mindepth = set(
    depth_table.loc[depth_values >= MIN_DEPTH, "position"].tolist()
)

cov = len(covered_sites) / reference_length
cov_mindepth = len(covered_sites_mindepth) / reference_length

if cov <= 0.9:
    warning_messages.append(
        "The consensus sequence is too incomplete for GISAID submission (reads must span >90% of the reference)."
    )

In [None]:
# Summary depth statistics; both fall back to 0 when no depth rows were read.
if depth_table.empty:
    mean_depth = 0
    median_depth = 0
else:
    mean_depth = depth_table["depth"].mean()
    median_depth = depth_table["depth"].median()

In [None]:
# Read the Pangolin table (the Nextclade JSON is read in the next cell).

have_pangolin = os.path.exists(PANGOLIN_CSV_PATH)

if have_pangolin:
    pangolin_table = pd.read_csv(PANGOLIN_CSV_PATH, sep=",")
else:
    warning_messages.append("No pangolin output")
    # Keep the name defined so later cells that reference it do not hit a NameError.
    pangolin_table = pd.DataFrame()



In [None]:
# Load the Nextclade JSON report.
# Note: everything in the Nextclade JSON (nt positions, ranges, codon positions)
# is 0-indexed, whereas SARS-CoV-2 variants (and most things) are reported 1-indexed.
nextclade_json = json.loads(NEXTCLADE_JSON.read_text())

# Nextclade version, shown in the report text.
nextclade_version = nextclade_json.get("nextcladeAlgoVersion")

In [None]:
# Build `variants_table` plus the summary values derived from Nextclade.
# Three outcomes: Nextclade reported errors, Nextclade found no substitutions,
# or there are variants to tabulate (per sequencing platform).
if len(nextclade_json.get("errors") or []) > 0:
    have_nextclade = False
    nextclade_lineage = None
    for errors1 in nextclade_json["errors"]:
        for errors2 in errors1["errors"]:
            warning_messages.append('(Nextclade Error) '
                                    + errors2)
    n_variants_mindepth = None
    n_variants = None
    nextclade_pm_count = None
    variants_table = pd.DataFrame()

elif int(nextclade_json.get('results')[0]['totalSubstitutions']) == 0:
    warning_messages.append("No variants detected")
    have_nextclade = True
    nextclade_lineage = None
    n_variants_mindepth = 0
    n_variants = 0
    nextclade_pm_count = 0
    variants_table = pd.DataFrame()

else:
    have_nextclade = True
    # Add in gene info
    df_orfs = pd.read_csv(
        RESULTS_DIR / "annot_table.orfs.txt",
        sep="\t",
        header=None,
        usecols=[0, 1, 2],
        names=["gene", "start", "stop"],
        dtype={"gene": "str", "start": "int32", "stop": "int32"}
    )

    ################## If ONT, do not report indels and use Nextclade as the source of SNV information
    if INSTRUMENT_VENDOR == 'Oxford Nanopore':

        # One row per Nextclade substitution (each substitution is a dictionary)
        rows_list = []
        for subst in nextclade_json.get('results')[0]['substitutions']:
            dict1 = {}
            # JSON positions are 0-indexed; convert to 1-index
            dict1["Position"] = subst["pos"] + 1
            dict1["Ref"] = subst["refNuc"]
            dict1["Alt"] = subst["queryNuc"]
            if len(subst["aaSubstitutions"]) != 0:
                # JSON codons are 0-indexed; convert to 1-index.
                # When several amino-acid substitutions are listed, the last one is kept.
                for mutation in subst["aaSubstitutions"]:
                    dict1["Missense mutation"] = (
                        mutation["refAA"] + str(mutation["codon"] + 1) + mutation["queryAA"]
                    )
            else:
                dict1["Missense mutation"] = ""
            rows_list.append(dict1)

        df_nextclade = pd.DataFrame(rows_list)
        # Defaults for mutations outside of genes / outside low-complexity regions
        df_nextclade['Gene'] = ''
        df_nextclade["Low complexity region"] = ""

        # Add in low-complexity region info
        df_low_complexity = pd.read_csv(RESULTS_DIR / "low_complexity_regions.txt", sep="\t", header=None, usecols=[1,2], names=["start","stop"])

        # Annotate each SNV with its ORF and a low-complexity flag
        for i in df_nextclade.index:
            snv_position = df_nextclade.loc[i, "Position"]

            orf_hits = df_orfs[
                (df_orfs["start"] <= snv_position) & (snv_position <= df_orfs["stop"])
            ]
            if not orf_hits.empty:
                # When ORFs overlap, the last matching row wins.
                df_nextclade.loc[i, "Gene"] = orf_hits["gene"].iloc[-1]

            # Flag the SNV if it falls inside ANY low-complexity region
            # (previously each later region overwrote the flag set by an earlier one).
            in_low_complexity = (
                (df_low_complexity["start"] <= snv_position)
                & (snv_position <= df_low_complexity["stop"])
            ).any()
            if in_low_complexity:
                df_nextclade.loc[i, "Low complexity region"] = "X"

        variant_table = df_nextclade.set_index("Position")

        # load variants VCF
        vcf_columns = ["Variant type","Variant effect","Alt depth","Ref depth","Total depth","Alt frequency (%)"]
        df_vcf = read_tsv_as_dataframe(VARIANTS_SNPEFF_PATH)

        if df_vcf.empty:
            # No usable VCF rows: merge against an empty table so depths stay blank.
            df_vcf = pd.DataFrame(
                columns=vcf_columns,
                index=pd.Index([], dtype="int64", name="Position"),
            )
        else:
            # Check that the same variant is always called from both amplicons spanning a position
            df_vcf = df_vcf.reset_index()
            df_dup = df_vcf[df_vcf["Position"].duplicated(keep=False)]
            for i in np.unique(df_dup["Position"]):
                alt_list = df_dup[df_dup["Position"]==i]["Alt"].tolist()
                assert alt_list[0] == alt_list[1],"Different variants are called between pools 1 and 2 for at least one position."

            # Instead of summing depths (will give much higher reads for positions with duplicate calls),
            # just keep the first call (this is the one that is annotated with BCSQ).
            for dup_position in np.unique(df_dup["Position"].tolist()):
                drop_index = max(df_dup[df_dup["Position"]==dup_position].index.tolist())
                df_vcf = df_vcf.drop(index=drop_index)
            df_vcf = df_vcf.set_index("Position")
            df_vcf = df_vcf[vcf_columns]

        # Left join: keep every Nextclade SNV, adding VCF annotations where available
        variants_table = variant_table.merge(
            df_vcf, left_index=True, right_index=True, how="left"
        )

        display_columns = [
            "Ref",
            "Alt",
            "Alt depth",
            "Total depth",
            "Alt frequency (%)",
            "Gene",
            "Variant type",
            "Missense mutation",
            "Low complexity region"
        ]

        variants_table = variants_table[display_columns]

    ############ End ONT

    ############ If Illumina, indels are valid. Just use SnpEff annotations for the variants table

    elif INSTRUMENT_VENDOR == "Illumina":

        illumina_position_column = "Position (first ref nt)"
        illumina_columns = ["Ref", "Alt", "Alt depth", "Total depth", "Alt frequency (%)",
                            "Gene", "Variant type", "Missense mutation"]

        df_aa_codes = pd.read_csv(AA_CODES_PATH, sep='\t', index_col="Three-Letter Code")

        df_snpeff = read_tsv_as_dataframe(VARIANTS_SNPEFF_PATH)

        if df_snpeff.empty:
            # No usable variants in the TSV: produce an empty table with the display columns.
            df_snpeff = pd.DataFrame(
                columns=illumina_columns,
                index=pd.Index([], dtype="int64", name=illumina_position_column),
            )
        else:
            # One position can have multiple variants, so don't use position as index
            df_snpeff = df_snpeff.reset_index()

            # In case no SNVs are located in genes
            df_snpeff["Gene"] = ""

            # Pass 1: gene and missense annotation for every variant
            for i in df_snpeff.index:
                variant_position = df_snpeff.loc[i, illumina_position_column]

                #### Add in gene information (compare the genomic position, not the row number)
                orf_hits = df_orfs[
                    (df_orfs["start"] <= variant_position) & (variant_position <= df_orfs["stop"])
                ]
                if not orf_hits.empty:
                    df_snpeff.loc[i, "Gene"] = orf_hits["gene"].iloc[-1]

                ### Fill in all missense variants with missense mutation (convert HGVS notation to one-letter code)
                variant_type = str(df_snpeff.loc[i, 'Variant type'])
                variant_effect = df_snpeff.loc[i, 'Variant effect']
                if (
                    'missense variant' in variant_type
                    and isinstance(variant_effect, str)
                    and '.' in variant_effect
                ):
                    aa_mut = variant_effect.rsplit('.')[1]
                    if aa_mut[-3:] in df_aa_codes.index and aa_mut[0:3] in df_aa_codes.index:
                        queryAA = df_aa_codes.loc[aa_mut[-3:], "One-Letter Code"]
                        refAA = df_aa_codes.loc[aa_mut[0:3], "One-Letter Code"]
                        codon = "".join(z for z in aa_mut if z.isdigit())
                        df_snpeff.loc[i, "Missense mutation"] = refAA + codon + queryAA
                else:
                    df_snpeff.loc[i, "Missense mutation"] = ''

            # Pass 2: substitutions linked to another variant on the same codon take
            # that variant's missense mutation. Done after pass 1 so the linked row
            # is already annotated regardless of row order.
            for i in df_snpeff.index:
                linkage = str(df_snpeff.loc[i, "Linkage"])
                if '@' not in linkage:
                    continue
                position_linked = int(linkage.rsplit('@')[1].rsplit(',')[0])
                linked_rows = df_snpeff[df_snpeff[illumina_position_column] == position_linked]
                # Only follow the link when it points at another substitution
                is_substitution = linked_rows["Variant type"].astype(str).str.contains(
                    "synonymous|missense"
                )
                if is_substitution.any():
                    df_snpeff.loc[i, "Missense mutation"] = (
                        linked_rows.loc[is_substitution, "Missense mutation"].iloc[0]
                    )

            df_snpeff = df_snpeff.drop(columns=["allele reads by strand"])
            df_snpeff = df_snpeff.fillna("")
            df_snpeff = df_snpeff[[illumina_position_column] + illumina_columns]
            df_snpeff = df_snpeff.set_index(illumina_position_column)

        variants_table = df_snpeff

    n_variants = variants_table.shape[0]
    n_variants_mindepth = int((variants_table["Total depth"].astype(float) > MIN_DEPTH).sum())

    nextclade_lineage = nextclade_json.get('results')[0]['customNodeAttributes']['clade_nextstrain']
    nextclade_pm_count = nextclade_json.get('results')[0]['privateNucMutations']['totalPrivateSubstitutions']
    
    
    
    
    

In [None]:
# Resolve the reported lineages. Without Pangolin output, placeholders are used;
# otherwise the lineage is taken from the first Pangolin row and then withheld
# if the consensus sequence fails the quality checks below.
if not have_pangolin:
    warning_messages.append("Pangolin failed")
    pangolin_lineage = "Undetected or error"
    pangolin_version = "NA"
else:
    pangolin_lineage = pangolin_table['lineage'][0]
    pangolin_version = pangolin_table['pangolin_version'][0]

    # Do not assign a Pangolin or Nextclade lineage if consensus does not pass QC
    for record in SeqIO.parse(CONSENSUS_PATH, "fasta"):

        # QC 1: too many N bases
        n_count = record.seq.count("N")
        if n_count > 20000:
            pangolin_lineage = "unassigned"
            nextclade_lineage = "unassigned"
            warning_messages.append(
                f"The consensus sequence has too many ambiguous bases: {n_count:,} N's against the {reference_length:,} base reference sequence."
            )

        # QC 2: longest stretch made up only of A/T/G/C
        runs = re.split(r"[^ATGC]", str(record.seq))
        max_len = max(len(run) for run in runs)

        if max_len < 10000:
            pangolin_lineage = "Cannot be confidently assigned"
            nextclade_lineage = "Cannot be confidently assigned"
            warning_messages.append(
                f"The consensus sequence is too incomplete for GISAID submission: the longest stretch of unambiguous bases is only {max_len:,} bases (must be over 10,000)."
            )



In [None]:
# Report heading, produced by the `title` helper imported from onecodex.notebooks.report.
title("SARS-CoV-2 (COVID-19) Sequencing Overview")

In [None]:
# Assemble the narrative summary as HTML fragments; the final expression renders them.
text = []

if INSTRUMENT_VENDOR == "Oxford Nanopore":
    variant_description = "single-nucleotide variants (SNVs)"
elif INSTRUMENT_VENDOR == "Illumina":
    variant_description = "variants"

# Guard against an empty FASTQ (zero reads), which would otherwise divide by zero.
mapped_fraction = total_mapped_reads / total_reads if total_reads else 0

text.append(f"""
<p>
This report summarizes the detection of SARS-CoV-2 {variant_description} in sample 
<strong>{os.path.basename(SAMPLE_PATH)}</strong>, generated on the <strong>{INSTRUMENT_VENDOR}</strong> sequencing platform with ARTIC V{ARTIC_PRIMER_VERSION} primers.
</p>
""") 

text.append(f"""
<p>This sample contained <strong>{int(total_reads):,}</strong> read{'' if total_reads == 1 else 's'}, with
<strong>{mapped_fraction:.1%}</strong> mapping to the 
<a href='https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3/' target='_blank'>Wuhan-Hu-1 reference</a>.
Reads span <strong>{cov:.0%}</strong> of the genome, with a mean depth of <strong>{mean_depth:.0f}x</strong>, a median depth of <strong>{median_depth:.0f}x</strong>, and {cov_mindepth:.0%} of the genome covered at depth >{MIN_DEPTH:}x.
</p>
""")

# Variant and lineage paragraphs are only meaningful when at least one read mapped.
if total_mapped_reads >= 1:

    if INSTRUMENT_VENDOR == "Oxford Nanopore":
        text.append(f"""
        <p>A total of <strong>{n_variants_mindepth}</strong> SNV{'s were' if n_variants_mindepth != 1 else ' was'} detected 
    at depths >{MIN_DEPTH:}x, the minimum depth chosen for confident SNV detection based on 
        <a href="https://doi.org/10.1038/s41467-020-20075-6">benchmarking</a> of Oxford Nanopore sequencing data. 
        Vertical black lines on the coverage plot below show the depth of high quality reads (may be less than total reads) for each SNV. </p>

        <p>SNV detection in low complexity regions (ex: homopolymer-rich) is less accurate and flagged in the table below.
        Oxford Nanopore sequencing is unsuitable for detection of small indel variants, which we do not report here.
        </p>
        """)

    elif INSTRUMENT_VENDOR == "Illumina":
        text.append(f"""
        <p>A total of <strong>{n_variants_mindepth}</strong> variant{'s were' if n_variants_mindepth != 1 else ' was'} detected 
    at depths >{MIN_DEPTH:}x, the minimum depth chosen for confident variant detection using Illumina sequencing data. 
    Vertical black lines on the coverage plot below show the depth of high quality reads (may be less than total reads) for each variant.</p>
        """)

    if pangolin_lineage == "Cannot be confidently assigned":
        text.append(f"""<p>The genome quality is too low to confidently assign a Pangolin or Nextclade lineage (see warning messages).</p>""")
    else:
        text.append(f"""
    <p>This genome is classified as Pangolin lineage <strong>{pangolin_lineage}</strong> using Pangolin
    version {pangolin_version} and Nextclade lineage <strong>{nextclade_lineage}</strong> using Nextclade version {nextclade_version} with <strong>{nextclade_pm_count} 
    private mutation{'s' if nextclade_pm_count != 1 else ''}</strong> detected.
    </p>
    """)

HTML("".join(text))

In [None]:
#######################

# Un-smoothed coverage plot in matplotlib

#######################

if total_mapped_reads >= 1: # Do not plot at all if there are no reads
    ############### Define genome diagram design

    # Genomic window shown on both the gene diagram and the coverage plot
    cur_region = [0, 30000]

    # Colors
    col_map = {
        'grey': "#6e6e6e",
        'ocx_signature_green': "#128887",
        'ocx_navy_blue': "#16347B",
        'ocx_blue': "#0072C7",
        'ocx_light_blue': "#01ACEC",
        'ocx_cyan': "#97E9FC",
        'ocx_forest_green': "#0A605E",
        'ocx_kelly_green': "#1DA893",
        'ocx_blue_green': "#3DD8BE",
        'ocx_pastel_green': "#ABEFE2",
        'ocx_dark_purple': "#37257D",
        'ocx_purple': "#9C78E0",
        'ocx_pastel_purple': "#CBC0F9",
        'ocx_light_purple': "#E3DDFF",
        'ocx_burnt_sienna': "#BC5B00",
        'ocx_orange': "#EB984A",
        'ocx_yellow': "#FCE34D",
        'ocx_light_yellow': "#FEF2A3",
        'ocx_dark_red': "#950303",
        'ocx_red': "#DD3A3A",
        'ocx_coral': "#FF8D8B",
        'ocx_peach': "#FFD5CB",
        'ocx_dark_magenta': "#771354",
        'ocx_magenta': "#C13A8B",
        'ocx_pink': "#F28BBF",
        'ocx_light_pink': "#F9D9E7",
    }

    # dnaplotlib formatting options

    # Some additional parameters that can be set:
    # 'label_style':'italic'
    # 'linewidth':1.0
    # 'arrowhead_length':8,

    Y_OFFSET=8
    LABEL_ROTATION=45
    LABEL_SIZE=9.5 # font size
    LINEWIDTH=0.1
    LINECOLOR="#3b3b3b"
    SNP_LINEWIDTH=0.85
    EDGE_COLOR=col_map['grey']

    # Labels alternate above (+Y_OFFSET) and below (-Y_OFFSET) the backbone
    opt_orf1ab = {'label':'orf1ab', 'label_color':col_map['ocx_blue'], 'label_y_offset':Y_OFFSET,
                  'color':col_map['ocx_blue'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                  'linewidth':LINEWIDTH, 'linecolor':LINECOLOR, 'edgecolor':EDGE_COLOR}
    opt_spike = {'label':'spike', 'label_color':col_map['ocx_red'], 'label_y_offset':Y_OFFSET,
                 'color':col_map['ocx_red'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'linecolor':LINECOLOR, 'edgecolor':EDGE_COLOR}
    opt_orf3a = {'label':'orf3a', 'label_color':col_map['ocx_orange'], 'label_y_offset':-Y_OFFSET,
                 'color':col_map['ocx_orange'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':-500, 'edgecolor':EDGE_COLOR}
    opt_geneE = {'label':'geneE', 'label_color':"#a296d6", 'label_y_offset':Y_OFFSET,
                 'color':col_map['ocx_pastel_purple'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':500, 'edgecolor':EDGE_COLOR}
    opt_geneM = {'label':'geneM', 'label_color':col_map['ocx_light_blue'], 'label_y_offset':-Y_OFFSET,
                 'color':col_map['ocx_light_blue'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':-500, 'edgecolor':EDGE_COLOR}
    opt_orf6 = {'label':'orf6', 'label_color':"#a9db7d", 'label_y_offset':Y_OFFSET,
                'color':"#cdffa1", 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                'linewidth':LINEWIDTH, 'label_x_offset':400, 'edgecolor':EDGE_COLOR}
    opt_orf7a = {'label':'orf7a', 'label_color':col_map['ocx_coral'], 'label_y_offset':-Y_OFFSET,
                 'color':col_map['ocx_coral'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':-400, 'edgecolor':EDGE_COLOR}
    opt_orf8 = {'label':'orf8', 'label_color':col_map['ocx_magenta'], 'label_y_offset':Y_OFFSET,
                'color':col_map['ocx_magenta'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                'linewidth':LINEWIDTH, 'label_x_offset':500, 'edgecolor':EDGE_COLOR}
    opt_geneN = {'label':'geneN', 'label_color':'#90decf', 'label_y_offset':-Y_OFFSET,
                 'color':col_map['ocx_pastel_green'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':-500, 'edgecolor':EDGE_COLOR}
    opt_orf10 = {'label':'orf10', 'label_color':"#e6bcce", 'label_y_offset':Y_OFFSET,
                 'color':col_map['ocx_light_pink'], 'label_rotation':LABEL_ROTATION, 'label_size':LABEL_SIZE,
                 'linewidth':LINEWIDTH, 'label_x_offset':500, 'edgecolor':EDGE_COLOR}
    opt_snv = {'color':'black', 'linewidth':SNP_LINEWIDTH}

    # Define ORFs (if want arrows instead of blocks, switch to 'type':'CDS')
    ORF1AB = {'type':'UserDefined', 'name':'orf1ab', 'start':266,  'end':21555, 'fwd':True, 'opts':opt_orf1ab}
    SPIKE = {'type':'UserDefined', 'name':'spike', 'start':21563, 'end':25384, 'fwd':True, 'opts':opt_spike}
    ORF3A = {'type':'UserDefined', 'name':'orf3a', 'start':25393, 'end':26220, 'fwd':True, 'opts':opt_orf3a}
    GENEE = {'type':'UserDefined', 'name':'geneE', 'start':26245, 'end':26472, 'fwd':True, 'opts':opt_geneE}
    GENEM = {'type':'UserDefined', 'name':'geneM', 'start':26523, 'end':27191, 'fwd':True, 'opts':opt_geneM}
    ORF6 = {'type':'UserDefined', 'name':'orf6', 'start':27202, 'end':27387, 'fwd':True, 'opts':opt_orf6}
    # Part name corrected: this entry was previously also named 'orf6'.
    ORF7A = {'type':'UserDefined', 'name':'orf7a', 'start':27394, 'end':27759, 'fwd':True, 'opts':opt_orf7a}
    ORF8 = {'type':'UserDefined', 'name':'orf8', 'start':27894, 'end':28259, 'fwd':True, 'opts':opt_orf8}
    GENEN = {'type':'UserDefined', 'name':'geneN', 'start':28274, 'end':29533, 'fwd':True, 'opts':opt_geneN}
    ORF10 = {'type':'UserDefined', 'name':'orf10', 'start':29558, 'end':29674, 'fwd':True, 'opts':opt_orf10}

    # A design is merely a list of parts and their properties
    design = [ORF1AB, SPIKE, ORF3A, GENEE, GENEM, ORF6, ORF7A, ORF8, GENEN, ORF10]

    # Add SNVs to the design (one 1-nt wide part per variant row)
    if not variants_table.empty:
        for position, row in variants_table.iterrows():
            START=position
            END=position+1
            design.append({'type':'UserDefined', 'name':'snv', 'start':START,  'end':END, 'fwd':True, 'opts':opt_snv})


    ################# Plot genome diagram

    # Create the overall figure
    fig = plt.figure(figsize=(8,2), dpi=300)
    gs = gridspec.GridSpec(2, 1, height_ratios=[0.4, 1])

    # Create the DNAplotlib renderer
    dr = dpl.DNARenderer(scale=15, linewidth=0.9)

    # Render the orfs to axis
    ax_dna = plt.subplot(gs[0])
    start, end = dr.renderDNA(ax_dna, design, dr.trace_part_renderers(), plot_backbone=True)
    ax_dna.set_xlim(cur_region)
    ax_dna.set_ylim([-5,8])
    ax_dna.axis('off')


    ################# Plot coverage

    # Generate axes for coverage plot
    ax = plt.subplot(gs[1])


    ###### x-axis

    ax.set_xlim(cur_region)
    ax.set_xlabel('Genomic Coordinate (kb)', fontsize=12, labelpad=5)

    # Show x-axis tick labels in kb. A formatter keeps labels tied to tick
    # positions and avoids the FixedFormatter/FixedLocator warning.
    ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, p: str(int(float(x)/1000))))

    # Set 5 minor ticks per major tick
    minor_locator = mticker.AutoMinorLocator(5)
    ax.xaxis.set_minor_locator(minor_locator)

    ###### y-axis

    ax.set_ylabel('Depth', fontsize=12, labelpad=5)

    # Set y-axis limits according to maximum depth in sample (next power of ten).
    # At least one decade is always shown so the log axis never collapses to (1, 1).
    ax.set_yscale('log')
    max_depth = depth_table['depth'].max()
    if max_depth > 0:
        exp = max(1, math.ceil(math.log10(max_depth)))
    else:
        exp = 1
    yaxis_max = 10**exp
    ax.set_ylim((1,yaxis_max))

    # y-axis major ticks at every multiple of 10
    locmaj = mticker.LogLocator(base=10, numticks=20) # numticks should be > number of ticks to display
    ax.yaxis.set_major_locator(locmaj)

    # y-axis major tick labels: commas at thousands
    ax.get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))

    # y-axis minor ticks (5 per major tick)
    locmin = mticker.LogLocator(base=10.0,subs=(0.2,0.4,0.6,0.8),numticks=20)
    ax.yaxis.set_minor_locator(locmin)
    ax.yaxis.set_minor_formatter(mticker.NullFormatter())


    ##### axis colors and spine visiblity

    # Set axis colors to grey; show only the bottom and left spines
    for SPINE in ['bottom', 'left']:
        ax.spines[SPINE].set_color(col_map['grey'])
    for SPINE in ['right', 'top']:
        ax.spines[SPINE].set_visible(False)
    for AXIS in ['x', 'y']:
        ax.tick_params(axis=AXIS, which='both', colors=col_map['grey'], labelsize=10)
    ax.yaxis.set_ticks_position('left')

    ax.xaxis.set_ticks_position('bottom')


    ##### Plot depths as a colored fill
    plt.fill_between(depth_table['position'], depth_table['depth'], color=col_map['ocx_kelly_green'])

    ##### Show min depth as grey dashed line
    plt.hlines(y=MIN_DEPTH, xmin=0, xmax=30000, linestyle='--', linewidth=0.5, color=col_map['grey'])

    ##### Mark SNVs above the minimum depth with a black line up to the local depth
    if not variants_table.empty:
        confident_variants = variants_table[variants_table["Total depth"].astype(float) > MIN_DEPTH]
        for position, row in confident_variants.iterrows():
            depth_at_position = depth_table.loc[depth_table['position'] == position, 'depth']
            if depth_at_position.empty:
                # No depth row for this position; nothing to draw.
                continue
            yvalue = depth_at_position.iloc[0]
            xvalue = np.arange(position-0.45, position+0.45)
            plt.fill_between(xvalue, yvalue, color="black", linewidth=SNP_LINEWIDTH)


    ############# Update subplot spacing

    plt.subplots_adjust(hspace=0.45, left=0.01, right=0.99, top=0.99, bottom=0.01)

    ############# Close and save

    # Closing first prevents the inline backend from rendering the figure a
    # second time; `fig` stays usable for savefig.
    plt.close()

    fig.savefig("covplot.png", bbox_inches='tight', dpi=300)

    # Embed the saved PNG directly in the report as a base64 data URI
    import base64
    with open("covplot.png", "rb") as handle:
        data = handle.read()
    plot_data = base64.b64encode(data).decode('utf-8')

    display(HTML(f'<img src="data:image/png;base64, {plot_data}"/>'))
    

In [None]:
# Lift the row limit so the full variants table is always rendered.
pd.set_option('display.max_rows', 200000)

if not variants_table.empty: # If there are variants

    variants_table_display = variants_table.rename(columns={"Alt depth": "Alt depth (high quality reads)",
                                                            "Total depth": "Total depth (high quality reads)"})
    # Only variants above the minimum depth are shown
    variants_table_display_filtered = variants_table_display[variants_table_display['Total depth (high quality reads)'].astype(float) > MIN_DEPTH]

    if variants_table_display_filtered.empty:
        display(HTML(f"No variants detected."))
    else:
        display(variants_table_display_filtered)
        legend_text = "SARS-CoV-2 variants."

        # Variants that were called but fall at or below the minimum depth
        n_extra_variants = n_variants - n_variants_mindepth

        if n_extra_variants > 0:
            legend_text += f" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} <{MIN_DEPTH}× depth {'are' if n_extra_variants > 1 else 'is'} not shown."

        if os.environ.get("ONE_CODEX_REPORT_UUID"):
            legend_text += f""" 
                 A variants TSV and consensus FASTA is available <a target="_blank" href=\"{'https://app.onecodex.com/report/' + os.environ['ONE_CODEX_REPORT_UUID'] + '/files'}\">here</a>.
                """
        try:
            # No shell is involved, so the format string must not carry its own
            # quotes (they would be printed as part of the hash).
            git_commit_no = subprocess.check_output(['git', 'log', '-n', '1', '--pretty=format:%H'], stderr=subprocess.STDOUT).decode('utf-8')
            commit_text = f" This report was generated with commit number {git_commit_no}."
        except subprocess.CalledProcessError as e:
            commit_text = "Could not retrieve git commit hash. rc=" + str(e.returncode) + "; output=" + str(e.output)
        except OSError as e:
            # git executable missing or not runnable: report it instead of aborting the notebook.
            commit_text = "Could not retrieve git commit hash. output=" + str(e)

        display(HTML(
            '<div style="text-align: center; padding-top: 10px; font-size: 0.7em; color: #777;"><em>'
            + legend_text
            + '<p style="text-align:center">'
            + commit_text
            + "</p></em></div>"
        ))

else:
    display(HTML(f"No variants detected."))

### Additional Resources

- Additional bioinformatics pipeline details are [available on GitHub](https://github.com/onecodex/sars-cov-2)
- [Nextstrain](https://nextstrain.org/ncov) maintains an up-to-date analysis of SARS-CoV-2 (HCoV-19).
- The [Global Initiative on Sharing All Influenza Data (GISAID)](https://www.gisaid.org/) hosts viral genomes from ongoing outbreaks. Please [contact us](mailto:hello@onecodex.com) for help submitting your data.

In [None]:
# Page footer for the PDF: the One Codex report ID (when available) followed by
# the disclaimer, for reproducibility/data provenance (not yet in v0.7.2).
report_uuid = os.environ.get("ONE_CODEX_REPORT_UUID")
footer_prefix = report_uuid + " -" if report_uuid else ""

HTML(
    f"""
<style type='text/css'>
@page {{
    @bottom-center {{
        content: "{footer_prefix} NOT FOR DIAGNOSTIC USE" !important;
    }}
}}
</style>
"""
)

In [None]:
# Save a machine-readable JSON summary too; it includes variants below MIN_DEPTH.

def _json_default(value):
    """Convert numpy scalars (e.g. integer positions) to builtin types for json.dumps."""
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

results = {
    "n_reads": total_reads,
    "n_mapped_reads": total_mapped_reads,
    "report_id": os.environ.get("ONE_CODEX_REPORT_UUID"),
    "sample_id": os.environ.get("ONE_CODEX_SAMPLE_UUID"),
    # reset_index() keeps the genomic position, which is the table's index and
    # would otherwise be dropped by orient='records'.
    "variants": variants_table.reset_index().to_dict(orient='records') if n_variants else None,
    "coverage": cov,
    "coverage_over_min_depth": cov_mindepth,
    "min_depth": MIN_DEPTH,
    "mean_depth": mean_depth,
    "median_depth": median_depth,
    "nextclade_results": nextclade_json,
    "nextclade_lineage": nextclade_lineage,
    # The Pangolin table only exists when the Pangolin CSV was found.
    "pangolin_results": pangolin_table.to_dict(orient='records') if have_pangolin else None,
    "pangolin_lineage": pangolin_lineage,
    "warnings": warning_messages,
}

with gzip.open(f"{os.path.basename(SAMPLE_PATH)}.report.json.gz", "w") as f:
    f.write(json.dumps(results, default=_json_default).encode())

In [None]:
# Render all warnings as one well-formed HTML fragment (heading, then list).
if len(warning_messages) > 0:
    # De-duplicate while keeping first-seen order so the report is reproducible.
    unique_messages = list(dict.fromkeys(warning_messages))
    warning_items = "".join(f"<li>{message}</li>" for message in unique_messages)
    display(HTML(f"<h1>Warning Messages</h1><ul>{warning_items}</ul>"))