# This pipeline assumes ARTIC V3 amplicon sequencing using ONT

In [1]:
import binascii
import gzip
import json
import os
import sys

from io import BytesIO

import pandas as pd
import altair as alt

from altair_saver import save
#from Bio import SeqIO
from IPython.display import HTML
from onecodex import Api
from onecodex.notebooks.report import set_style, title

In [2]:
ocx = Api()

In [15]:
ENVIRONMENT = os.environ.get("ONE_CODEX_REPORT_ENV", "draft")

if ENVIRONMENT == "production":
    sample_uuid = os.environ["ONE_CODEX_SAMPLE_UUID"]
    sample = ocx.Samples.get(sample_uuid)
    assert sample is not None, "Sample does not exist"
    sample_filename = sample.filename
    if not os.path.exists(sample_filename):
        sample.download()
else:
    sample_filename = "sample.fastq"

In [16]:
# note our DB actually has v1 of this assembly as of Feb 2020
VARIANTS_TSV_PATH = "variants.tsv"
NEXTCLADE_TSV_PATH = "nextclade.tsv"
PANGOLIN_CSV_PATH = "pangolin.csv"
BAM_PATH = "covid19.bam"
REFERENCE_PATH = os.environ.get(
    "FASTA_REFERENCE", "share/nCoV-2019.reference.fasta"
)
#BED_FILE_PATH = os.environ.get("BED_FILE_PATH", "reference/artic-v3/ARTIC-V3.bed")
REFERENCE_NAME = os.path.basename(REFERENCE_PATH).rstrip('.fasta')

In [20]:
#if os.getenv("ORBITER_SEQUENCING_PLATFORM") == "Oxford Nanopore":

mindepth = 50
!bash covid19_call_variants.artic.sh {sample_filename} #> variants.log 2>&1


/
Processing 1 files in ./
guppyplexed_.fastq	1345877
[32m[22mRunning: [39m[22m artic-tools validate_scheme /artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.scheme.bed
[02:20:36] [artic-tools::validate_scheme] starting primer scheme validator
[02:20:36] [artic-tools::validate_scheme] reading scheme
[02:20:36] [artic-tools::validate_scheme] collecting scheme stats
[02:20:36] [artic-tools::validate_scheme] 	primer scheme file:	/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.scheme.bed
[02:20:36] [artic-tools::validate_scheme] 	reference sequence:	MN908947.3
[02:20:36] [artic-tools::validate_scheme] 	number of pools:	2
[02:20:36] [artic-tools::validate_scheme] 	number of primers:	218 (includes 22 alts)
[02:20:36] [artic-tools::validate_scheme] 	minimum primer size:	22
[02:20:36] [artic-tools::validate_scheme] 	maximum primer size:	57
[02:20:36] [artic-tools::validate_scheme] 	number of amplicons:	98
[02:20:36] [artic-tools::validate_scheme] 	mean amplicon size:	343
[02:20:3

In [21]:
# load reference genome
#reference = list(SeqIO.parse(REFERENCE_PATH, "fasta"))
#reference_length = len(reference[0])

reference_length = 29903


In [22]:
!samtools depth $BAM_PATH > snps.depth 2> /dev/null

In [23]:
#n_reads = sample.primary_classification.results()["n_reads"]
n_reads = 55000

In [24]:
samtools_view_output = !samtools view -F 2308 $BAM_PATH | wc -l
n_mapped_reads = int(samtools_view_output[0])
percent_mapped_reads = n_mapped_reads/n_reads*100

In [25]:
depth_table = []

with open("snps.depth") as handle:
    for line in handle:
        row = line.strip().split("\t")
        depth_table.append(
            {"reference": row[0], "position": int(row[1]), "depth": int(row[2])}
        )
depth_table = pd.DataFrame(depth_table, columns=["reference", "position", "depth"])

In [26]:
# Calculate genome coverage (what percent of bases are coveraged at X coverage)
# Use a fixed reference length that we use for `samtools depth` above

covered_sites = set()
covered_sites_mindepth = set()

for _, row in depth_table.iterrows():
    row = row.to_dict()
    if row["depth"] >= 1:
        covered_sites.add(row["position"])
    if row["depth"] >= mindepth:
        covered_sites_mindepth.add(row["position"])        

cov = len(covered_sites) / reference_length
cov_mindepth = len(covered_sites_mindepth) / reference_length

In [27]:
# get mean over windows because altair can't handle > 5k points ...
binned_depths = []
window_width = reference_length // 4500

for i in range(1, reference_length, window_width):
    window = depth_table.loc[
        (depth_table["position"] > i) & (depth_table["position"] < i + window_width)
    ]

    binned_depths.append(
        {"position": i, "depth": window["depth"].mean(),}
    )

binned_depths = pd.DataFrame(binned_depths)
mean_depth = depth_table["depth"].mean() if not depth_table.empty else 0
median_depth = depth_table["depth"].median() if not depth_table.empty else 0

In [28]:
# Read from variants.tsv
snp_table = pd.read_csv(VARIANTS_TSV_PATH, sep="\t")
# Read Nextclade and Pangolin tables
nextclade_table = pd.read_csv(NEXTCLADE_TSV_PATH, sep="\t")
pangolin_table = pd.read_csv(PANGOLIN_CSV_PATH, sep=",")

In [29]:
n_snps = snp_table.shape[0]
n_snps_mindepth = snp_table[snp_table['depth']>=mindepth].shape[0]
nextclade_lineage = nextclade_table['clade'].iloc[0]
nextclade_pm_count = nextclade_table['qc.privateMutations.total'].iloc[0]
pangolin_lineage = pangolin_table['lineage'].iloc[0]
pangolin_version = pangolin_table['pangoLEARN_version'].iloc[0]

In [30]:
title("SARS-CoV-2 (COVID-19) Sequencing Overview")

In [41]:
text = f"""
This report summarizes the detection of SARS-CoV-2 in sample 
<strong>{sample_filename}</strong>. 

<p>This sample contained <strong>{n_reads:,}</strong> reads, with
<strong>{percent_mapped_reads:.1%}</strong> mapping to the 
<a href='https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3/' target='_blank'>Wuhan-Hu-1 reference</a>.
Reads span <strong>{cov:.0%}</strong> of the genome, with {cov_mindepth:.0%} of the genome covered at depths >{mindepth:}x, with a mean depth of <strong>{mean_depth:.0f}x</strong>.</p>

<p>A total of <strong>{n_snps_mindepth}</strong> variant{'s were' if n_snps_mindepth != 1 else 'was'} detected at depths over {mindepth:}x.
This genome is classified as Pangolin lineage <strong>{pangolin_lineage}</strong> using PangoLEARN verison {pangolin_version} and Nextclade lineage <strong>{nextclade_lineage}</strong> with {nextclade_pm_count} private mutation{'s' if nextclade_pm_count != 1 else ''}.</p>"""

HTML(text)

In [42]:
# Coverage plot
plot = (
    alt.Chart(binned_depths)
    .mark_area()
    .transform_window(rolling_mean="mean(depth)", frame=[-50, 50])
    .encode(
        x=alt.X(
            "position",
            title="Genomic Coordinate",
            scale=alt.Scale(domain=[0, reference_length]),
        ),
        y=alt.Y("rolling_mean:Q", scale=alt.Scale(type="linear"), title="Depth"),
    )
    .properties(
        title=f"SARS-CoV-2 ({REFERENCE_NAME})",
        width=550,
        height=150,
    )
)
plot

internal/modules/cjs/loader.js:965
  throw err;
  ^

Error: Cannot find module 'vega'
Require stack:
- /usr/lib/node_modules/vega-lite/build/vega-lite.js
- /usr/lib/node_modules/vega-lite/bin/vl2vg
    at Function.Module._resolveFilename (internal/modules/cjs/loader.js:962:15)
    at Function.Module._load (internal/modules/cjs/loader.js:838:27)
    at Module.require (internal/modules/cjs/loader.js:1022:19)
    at require (internal/modules/cjs/helpers.js:72:18)
    at /usr/lib/node_modules/vega-lite/build/vega-lite.js:2:105
    at Object.<anonymous> (/usr/lib/node_modules/vega-lite/build/vega-lite.js:5:2)
    at Module._compile (internal/modules/cjs/loader.js:1118:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:1138:10)
    at Module.load (internal/modules/cjs/loader.js:982:32)
    at Function.Module._load (internal/modules/cjs/loader.js:875:14) {
  code: 'MODULE_NOT_FOUND',
  requireStack: [
    '/usr/lib/node_modules/vega-lite/build/vega-lite.js',
    '/usr/li

CalledProcessError: Command '['/usr/bin/vl2vg']' returned non-zero exit status 1.

alt.Chart(...)

In [63]:
# Can use the previously defined mindepth instead of DEPTH_FILTER

if snp_table.empty or snp_table.loc[snp_table["depth"] >= mindepth, :].shape[0] == 0:
    table = pd.DataFrame(columns=["POS", "Variant", "Gene"])
    # should display mindepth instead of 10x
    table = HTML(
        "<div style='text-align: center; color: #555; width: 100%;'><em>No SNPs found >= 50x depth.</em><br /><br /></div>"
    )
else:
    snp_table["Position"] = snp_table["position"]
    snp_table["Variant"] = [f"{r['ref allele']} → {r['alt allele']}" for _, r in snp_table.iterrows()]
    snp_table["Depth"] = snp_table["depth"]
    snp_table["Frequency"] = snp_table["alt frequency"] # Already expressed as a percentage to 2 decimal places
    snp_table["Gene"] = snp_table["orf"]
    snp_table["Protein mutation"] = snp_table["protein sequence variant"]

    table = snp_table.loc[
        snp_table["Depth"] >= mindepth, ["Position", "Variant", "Depth", "Frequency", "Gene", "Protein mutation"]
    ]  # , "Gene", "Protein change"]]
    table = HTML(table.to_html(index=False))
table

Position,Variant,Depth,Frequency,Gene,Protein mutation
14408,C → T,357,96.08%,nsp12 in orf1ab,
23403,A → G,351,97.72%,spike,


In [67]:
legend_text = "SARS-CoV-2 variants."
n_extra_variants = (
    snp_table[snp_table["Depth"] < mindepth].shape[0] if not snp_table.empty else 0
)
if n_extra_variants:
    legend_text += f" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} <{mindepth}× depth {'are' if n_extra_variants > 1 else 'is'} not shown."
if os.environ.get("ONE_CODEX_REPORT_UUID"):
    legend_text += f""" 
         A variants TSV and consensus FASTA is available <a target="_blank" href=\"{'https://app.onecodex.com/report/' + os.environ['ONE_CODEX_REPORT_UUID'] + '/files'}\">here</a>.
        """
HTML(
    '<div style="text-align: center; padding-top: 10px; font-size: 0.7em; color: #777;"><em>'
    + legend_text
    + "</em></div>"
)

### Additional Resources

- Additional bioinformatics pipeline details are [available on GitHub](https://github.com/onecodex/sars-cov-2)
- [Nextstrain](https://nextstrain.org/ncov) maintains an up-to-date analysis of SARS-CoV-2 (HCoV-19).
- The [Global Initiative on Sharing All Influenza Data (GISAID)](https://www.gisaid.org/) hosts viral genomes from ongoing outbreaks. Please [contact us](mailto:hello@onecodex.com) for help submitting your data.

In [68]:
# Add One Codex report ID to footer for reproducibility/data provenance (not yet in v0.7.2)
HTML(
    f"""
<style type='text/css'>
@page {{
    @bottom-center {{
        content: "{os.environ['ONE_CODEX_REPORT_UUID'] + ' -' if os.environ.get('ONE_CODEX_REPORT_UUID') else ''} NOT FOR DIAGNOSTIC USE" !important;
    }}
}}
</style>
"""
)

In [69]:
# Save a JSON too, including filtered variants <50x
results = {
    "n_reads": n_reads,
    "n_mapped_reads": n_mapped_reads,
    "report_id": os.environ.get("ONE_CODEX_REPORT_UUID"), 
    "sample_id": os.environ.get("ONE_CODEX_SAMPLE_UUID"),
    "variants": [r.to_dict() for _, r in snp_table.iterrows()],
    "coverage": cov,
    "coverage_over_50x": cov_mindepth,
    "mean_depth": mean_depth,
    "median_depth": median_depth,
}

with gzip.open(f"{sample.filename}.report.json.gz", "w") as f:
    f.write(json.dumps(results).encode())


NameError: name 'sample' is not defined

In [None]:
# Clean up files
!rm -f {sample.filename} snps.depth variants.log covid19.bam.bai