# GIAB Nanopore Quality Control

Runs per-sample QC on Oxford Nanopore data generated for the Genome in a Bottle (GIAB) Consortium. Reads must already have been basecalled and aligned to the reference genome using the included pipeline.

In [1]:
import datetime
# Stamp the rendered notebook with the time of this execution.
run_timestamp = datetime.datetime.today().strftime("%b %d, %Y at %I:%M %p")
print("Ran on: ", run_timestamp)

Ran on:  Apr 23, 2019 at 09:01 AM


## Pre-processing
Load the experiment metadata, load the FASTQ and BAM statistics, and combine the statistics.

In [9]:
import collections
import numpy
import os
import pandas
import sys
import tqdm

from biorpy import r, iimage

# Extend the module search path before importing the local helper modules.
# NOTE(review): presumably this directory holds experiments.py and qc.py;
# it is an absolute, site-specific path, so the notebook will only run where
# that location exists -- confirm or make configurable.
sys.path.append("/oak/stanford/groups/msalit/ndolson/ONT-pipe-run-logs")
import experiments
import qc

In [10]:
import importlib
# Re-execute the already-imported qc module so that edits made to its source
# are picked up without restarting the kernel. The returned module object is
# bound to `_` instead of being left as the cell's final expression.
_ = importlib.reload(qc)

In [11]:
# Experiment metadata, as consumed by the cells that follow: iterating
# metadata.items() yields (flowcell label, flowcell info) pairs, and
# flowcell_info["datasets"] is a sequence of per-run entries that each carry
# a "name" key.
metadata = experiments.load_experiment_metadata()

In [13]:
# Build a table of per-read FASTQ statistics for every run.
#
# fastq_results maps run name -> DataFrame indexed by "read_id", with repeated
# read ids collapsed to their first occurrence. Runs for which no FASTQ
# statistics were found are reported and omitted from the mapping.
fastq_results = {}

for flowcell_label, flowcell_info in metadata.items():
    for run_info in flowcell_info["datasets"]:
        run_label = run_info["name"]

        # Start from an empty list for *each run*. Initialising this once per
        # flowcell would carry earlier runs' reads into the tables of later
        # runs on the same flowcell and would hide genuinely empty runs.
        results = [
            qc.get_fastq_stats(fastq)
            for fastq in qc.fastq_paths(flowcell_label, run_label)
        ]

        if not results:
            print(f"WARNING: run fastq is apparently empty: {run_label}")
            continue

        cur_results = pandas.concat(results).set_index("read_id")

        # Boolean mask marking every repeat of a read id after its first
        # occurrence; computed once and reused for reporting and filtering.
        is_duplicate = cur_results.index.duplicated()
        if is_duplicate.any():
            print(f"WARNING: duplicated reads exist in fastq for run {run_label}!")
            print(is_duplicate.sum())
            print(cur_results.loc[is_duplicate])

        # Keep only the first occurrence of each read id.
        cur_results = cur_results.loc[~is_duplicate]
        fastq_results[run_label] = cur_results