In [None]:
import gzip
import html
import json
import os
import shlex

import altair as alt
import pandas as pd
from Bio import SeqIO
from IPython.display import HTML

from onecodex import Api
from onecodex.notebooks.report import set_style, title

In [None]:
# One Codex API client; used below to look up the sample in production mode.
ocx = Api()

In [None]:
# Report mode. "production" fetches the sample from One Codex; any other value
# (the default is "draft") reads the local file named by TEST_INPUT_FASTQ.
ENVIRONMENT = os.environ.get("ONE_CODEX_REPORT_ENV", "draft")

if ENVIRONMENT != "production":
    sample_filename = os.environ["TEST_INPUT_FASTQ"]
else:
    sample_uuid = os.environ["ONE_CODEX_SAMPLE_UUID"]
    sample = ocx.Samples.get(sample_uuid)
    assert sample is not None, "Sample does not exist"
    sample_filename = sample.filename
    # Only download when no file with that name exists yet
    already_downloaded = os.path.exists(sample_filename)
    if not already_downloaded:
        sample.download()

In [None]:
# File locations used by the cells below.
# note our DB actually has v1 of this assembly as of Feb 2020
VARIANTS_TSV_PATH = "variants.tsv"
BAM_PATH = "covid19.bam"
# The reference FASTA and the BED file can be overridden through the environment.
REFERENCE_PATH = os.getenv("FASTA_REFERENCE", "reference/nCoV-2019.reference.fasta")
BED_FILE_PATH = os.getenv("BED_FILE_PATH", "reference/artic-v1/ARTIC-V1.bed")

In [None]:
!bash covid19_call_variants.sh {REFERENCE_PATH} {sample_filename} {BED_FILE_PATH} command > variants.log 2>&1

In [None]:
# Read every record of the reference FASTA into a list
reference = [record for record in SeqIO.parse(REFERENCE_PATH, "fasta")]

In [None]:
# calculate mapping depth
# Writes the `samtools depth` output for BAM_PATH to snps.depth, which a later
# cell parses into a table.
# NOTE(review): no `-a` flag is passed, so zero-coverage positions are presumably
# absent from the output and any mean computed from it skips them - confirm intended.
!samtools depth $BAM_PATH > snps.depth

In [None]:
# Total number of reads in the sample.
# `sample` only exists in production mode, so in draft mode the reads are
# counted directly from the local FASTQ (gzip-compressed or plain).
if ENVIRONMENT == "production":
    n_reads = sample.primary_classification.results()["n_reads"]
else:
    fastq_opener = gzip.open if sample_filename.endswith(".gz") else open
    with fastq_opener(sample_filename, "rt") as fastq_handle:
        n_reads = sum(1 for _ in SeqIO.parse(fastq_handle, "fastq"))

In [None]:
# see https://www.biostars.org/p/138116/
n_aligned_reads = !samtools view -F 0x4 $BAM_PATH | cut -f 1 | sort | uniq | wc -l
n_aligned_reads = int(n_aligned_reads[0])

In [None]:
# Parse snps.depth (tab-separated: reference, position, depth) into a table.
DEPTH_COLUMNS = ["reference", "position", "depth"]
depth_rows = []

with open("snps.depth") as handle:
    for line in handle:
        if not line.strip():
            continue  # ignore blank lines
        fields = line.strip().split("\t")
        depth_rows.append(
            {"reference": fields[0], "position": int(fields[1]), "depth": int(fields[2])}
        )

# Naming the columns explicitly keeps them (with integer dtypes) even when the
# file is empty, so later cells can still index "position" and "depth".
depth_table = pd.DataFrame(depth_rows, columns=DEPTH_COLUMNS).astype(
    {"position": "int64", "depth": "int64"}
)

In [None]:
# Genome coverage: the number of distinct positions with depth >= min_depth,
# divided by the length of the first reference record.
min_depth = 1
reference_length = len(reference[0])

if depth_table.empty:
    covered_sites = set()
else:
    deep_enough = depth_table["depth"] >= min_depth
    covered_sites = set(depth_table.loc[deep_enough, "position"])

cov = len(covered_sites) / reference_length

In [None]:
# Average depth over fixed-width windows because altair can't handle > 5k points.
MAX_PLOT_POINTS = 4500
# max(1, ...) keeps the step valid for references shorter than MAX_PLOT_POINTS
window_width = max(1, reference_length // MAX_PLOT_POINTS)

# Windows are half-open, [start, start + window_width), and tile positions
# 1..reference_length, so every position falls in exactly one window.
window_starts = list(range(1, reference_length + 1, window_width))

if depth_table.empty:
    window_means = pd.Series(dtype="float64")
else:
    # Start coordinate of the window containing each (1-based) position
    position_window = (depth_table["position"] - 1) // window_width * window_width + 1
    window_means = depth_table.groupby(position_window)["depth"].mean()

# Windows without any reported position get NaN, as the mean of an empty window
binned_depths = pd.DataFrame(
    {
        "position": window_starts,
        "depth": window_means.reindex(window_starts).to_numpy(),
    }
)

In [None]:
# Mean depth over the positions present in depth_table (shown in the summary text)
depth = depth_table["depth"].mean()

In [None]:
# Load the tab-separated variants table from VARIANTS_TSV_PATH
snp_table = pd.read_csv(VARIANTS_TSV_PATH, sep="\t")

In [None]:
# Number of rows in the variants table
n_snps = len(snp_table.index)

In [None]:
# Report heading (title() is imported from onecodex.notebooks.report)
title("SARS-CoV-2 (COVID-19) Sequencing Overview")

In [None]:
# Summary paragraph, rendered as HTML.
# - The sample file name is escaped because it is interpolated into markup.
# - The singular verb carries its own leading space so the sentence reads
#   "1 variant was detected" instead of "1 variantwas detected".
text = f"""
This report summarizes the detection of SARS-CoV-2, the causative agent of COVID-19, in sample 
<strong>{html.escape(sample_filename)}</strong>. 
This sample contained <strong>{n_reads:,}</strong> reads, with
<strong>{n_aligned_reads:,}</strong> mapping to the 
<a href='https://www.ncbi.nlm.nih.gov/nuccore/NC_045512' target='_blank'>reference</a>. 
Reads cover <strong>{cov:.1%}</strong> of the SARS-CoV-2 genome, with a mean depth of <strong>{depth:.1f}x</strong>.
A total of <strong>{n_snps}</strong> variant{'s were' if n_snps != 1 else ' was'} detected."""

HTML(text)

In [None]:
# An updated theme not yet in onecodex v0.7.2
def onecodex_theme_alt():
    """Return the altair theme configuration used for this report.

    The result is a dict with a single "config" key holding the heatmap colour
    range, axis/legend/title fonts, area fill, default view size and a
    transparent background.
    """
    font = "Palatino"
    # Axis and legend share the same label/title typography
    text_style = {
        "labelFont": font,
        "labelFontSize": 12,
        "titleFont": font,
        "titleFontSize": 12,
    }
    # Listed light to dark; the heatmap range uses the reverse order
    palette_light_to_dark = [
        "#ffffcc",
        "#c7e9b4",
        "#7fcdbb",
        "#41b6c4",
        "#2c7fb8",
        "#264153",
    ]

    config = {
        "range": {"heatmap": palette_light_to_dark[::-1]},
        "axis": {**text_style, "grid": False},
        "area": {"fill": "#128887"},
        "legend": dict(text_style),
        "title": {"font": font},
        "view": {"width": 400, "height": 400, "strokeWidth": 0},
        "background": "transparent",
    }
    return {"config": config}


# Register the theme under its own name and make it the active altair theme
theme_name = "onecodex2"
alt.themes.register(theme_name, onecodex_theme_alt)
alt.themes.enable(theme_name)
None  # keeps the return value of enable() out of the cell output

In [None]:
# Coverage plot: rolling mean of the binned depths along the genome.
# The chart title shows the reference file name without a trailing ".gbk".
# The suffix is removed explicitly because str.rstrip(".gbk") strips any run of
# the characters ".", "g", "b", "k" and can eat part of the name itself.
reference_label = os.path.basename(REFERENCE_PATH)
if reference_label.endswith(".gbk"):
    reference_label = reference_label[: -len(".gbk")]

plot = (
    alt.Chart(binned_depths)
    .mark_area()
    .transform_window(rolling_mean="mean(depth)", frame=[-50, 50])
    .encode(
        x=alt.X(
            "position",
            title="Genomic Coordinate",
            scale=alt.Scale(domain=[0, reference_length]),
        ),
        y=alt.Y("rolling_mean:Q", scale=alt.Scale(type="linear"), title="Depth"),
    )
    .properties(
        title=f"SARS-CoV-2 ({reference_label})",
        width=550,
        height=150,
    )
)
plot

In [None]:
# Variants table: either a "no SNPs" notice or the variants with ALT_DP >= 10.
if snp_table.empty:
    table = HTML(
        "<div style='text-align: center; color: #555; width: 100%;'><em>No SNPs found.</em><br /><br /></div>"
    )
else:
    # Display columns are added to snp_table itself, so they are also present
    # wherever snp_table is used afterwards.
    snp_table["Position"] = snp_table["POS"]
    snp_table["Variant"] = [
        f"{ref_base} → {alt_base}"
        for ref_base, alt_base in zip(snp_table["REF"], snp_table["ALT"])
    ]
    snp_table["Depth"] = snp_table["ALT_DP"].apply(lambda x: f"{x}×")

    # snp_table["Gene"] = "TODO"
    shown_variants = snp_table.loc[
        snp_table["ALT_DP"] >= 10, ["Position", "Variant", "Depth"]
    ]  # , "Gene"]]
    table = HTML(shown_variants.to_html(index=False))
table

In [None]:
# Caption under the variants table: counts the variants hidden by the 10x
# depth filter and, when a report UUID is set, links to the report files.
legend_text = "SARS-CoV-2 variants."

if snp_table.empty:
    n_extra_variants = 0
else:
    n_extra_variants = int((snp_table["ALT_DP"] < 10).sum())

if n_extra_variants:
    several = n_extra_variants > 1
    noun_suffix = "s" if several else ""
    verb = "are" if several else "is"
    legend_text += f" An additional {n_extra_variants} variant{noun_suffix} <10× depth {verb} not shown."

report_uuid = os.environ.get("ONE_CODEX_REPORT_UUID")
if report_uuid:
    files_url = "https://app.onecodex.com/report/" + report_uuid + "/files"
    # Whitespace is spelled out so the emitted text matches the original layout
    legend_text += (
        " \n"
        "         A variants TSV and consensus FASTA is available "
        f'<a target="_blank" href="{files_url}">here</a>.\n'
        "        "
    )

HTML(
    f'<div style="text-align: center; padding-top: 10px; font-size: 0.7em; color: #777;"><em>{legend_text}</em></div>'
)

### Additional Resources

- Additional bioinformatics pipeline details are [available on GitHub](https://github.com/onecodex/sars-cov-2).
- [Nextstrain](https://nextstrain.org/ncov) maintains an up-to-date analysis of SARS-CoV-2 (HCoV-19).
- The [Global Initiative on Sharing All Influenza Data (GISAID)](https://www.gisaid.org/) hosts viral genomes from ongoing outbreaks. Please [contact us](mailto:hello@onecodex.com) for help submitting your data.

In [None]:
# Add One Codex report ID to footer for reproducibility/data provenance (not yet in v0.7.2)
# When ONE_CODEX_REPORT_UUID is unset or empty, the footer shows only the disclaimer.
report_uuid = os.environ.get("ONE_CODEX_REPORT_UUID")
footer_prefix = report_uuid + " -" if report_uuid else ""

footer_css = f"""
<style type='text/css'>
@page {{
    @bottom-center {{
        content: "{footer_prefix} NOT FOR DIAGNOSTIC USE" !important;
    }}
}}
</style>
"""
HTML(footer_css)

In [None]:
# Save a JSON too, including filtered variants <10x
# Missing values are written as null. json.dump would otherwise emit the bare
# token NaN, which is not valid JSON and is rejected by strict parsers.
results = [
    {column: (None if pd.isna(value) else value) for column, value in row.items()}
    for _, row in snp_table.iterrows()
]

with open("results.json", "w") as f:
    json.dump(results, f)

In [None]:
# Clean up intermediate files.
# The sample file is removed only in production mode, where it was downloaded
# by this notebook. In draft mode `sample` is not defined and the input is the
# local TEST_INPUT_FASTQ, which is left in place.
cleanup_paths = ["snps.depth", "variants.log", "covid19.bam"]
if ENVIRONMENT == "production":
    cleanup_paths.append(sample.filename)

for cleanup_path in cleanup_paths:
    try:
        os.remove(cleanup_path)
    except FileNotFoundError:
        pass  # like `rm -f`, a missing file is not an error