In [32]:
from subprocess import call
from os import remove
from os.path import join
import shutil
import pysam as ps
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [33]:
# Show floats in rendered DataFrames with thousands separators and three decimal places.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [41]:
# Root directory holding the truth sets and the per-caller de novo call sets.
# NOTE(review): this and the rtg paths below are absolute, machine-specific paths;
# the notebook will not run elsewhere without editing them.
trio_dir = "/Users/dcooke/Genomics/octopus/paper/denovo/"

# Truth VCFs keyed by truth-set name; the key is used as a column suffix in the
# per-caller metric table (e.g. 'tp: sanger').
truth_vcfs = {
    'sanger': join(trio_dir, "eval/sanger/sanger_validated.vcf.gz"),
    'igv': join(trio_dir, "eval/igv/strong/igv_validated.vcf.gz")
}

# De novo call set to evaluate for each caller, keyed by caller name.
caller_vcfs = {
    'octopus': join(trio_dir, "calls/octopus/octopus.v0.5.2-beta.WGS500.AW-CS-trio.denovo.vcf.gz"),
    'GATK4': join(trio_dir, "calls/gatk4/gatk4.aw_sc_trio.wgs500.bwa.b37.vqsr.cpg.denovo.vcf.gz"),
    'Strelka2': join(trio_dir, "calls/strelka2/strelka.aw_sc_trio.wgs500.bwa.b37.denovo.PASS.all_samples.vcf.gz"),
    'Platypus': join(trio_dir, "calls/platypus/platypus.aw_sc_trio.wgs500.bwa.b37.denovo.vcf.gz"),
    'DeepVariant': join(trio_dir, "calls/deepvariant/deepvariant.aw-trio.wgs500.b37.bwa-mem.denovo.vcf.gz"),
    'FreeBayes': join(trio_dir, "calls/freebayes/freebayes.aw-sc_trio.wgs500.dedup.b37.filter.denovo.norm.vcf.gz")
}

# Score field handed to `rtg vcfeval -f` for each caller (looked up by evaluate()).
# Must have an entry for every key of caller_vcfs.
caller_score_metric = {
    'octopus': 'INFO.PP',
    'GATK4': 'QUAL',
    'Strelka2': 'FORMAT.GQX',
    'Platypus': 'QUAL',
    'DeepVariant': 'GQ',
    'FreeBayes': 'GQ'
}

# One colour per caller, keyed like caller_vcfs.
# NOTE(review): not referenced by any cell visible here — presumably used for plots further on.
caller_colours = {
    'octopus': sns.xkcd_rgb["windows blue"],
    'GATK4': sns.xkcd_rgb["pale red"],
    'Strelka2': sns.xkcd_rgb["amber"],
    'Platypus': sns.xkcd_rgb["greyish"],
    'DeepVariant': sns.xkcd_rgb["faded green"],
    'FreeBayes': sns.xkcd_rgb["dusty purple"]
}

# RTG Tools launcher and the reference passed to `rtg vcfeval -t`.
rtg_bin = '/Users/dcooke/Genomics/apps/rtg-tools/rtg'
rtg_ref = '/Users/dcooke/Genomics/references/human/GRCh37_SDF'

# NOTE(review): not referenced by any cell visible here (evaluations write under a temp dir instead).
rtg_out_dir = '/Users/dcooke/Genomics/syntumour/eval'

# Sample name passed to `rtg vcfeval --sample`, i.e. the sample whose calls are scored.
offspring_name = 'AW_SC_4659'

In [35]:
def merge(src_vcf_names, dst_vcf_name):
    """Concatenate several VCFs into one bgzipped VCF and index the result.

    Runs ``bcftools concat -a -Oz -o dst_vcf_name <src...>`` and then calls
    ``index`` on the output.

    Args:
        src_vcf_names: iterable of input VCF paths (any iterable, not only a list).
        dst_vcf_name: path of the merged VCF to write.

    Raises:
        RuntimeError: if ``bcftools`` exits with a non-zero status. Previously the
            exit status was ignored, so a failed merge went unnoticed until some
            later step read a missing or truncated file.
    """
    merge_cmd = ['bcftools', 'concat', '-a', '-Oz', '-o', dst_vcf_name] + list(src_vcf_names)
    status = call(merge_cmd)
    if status != 0:
        raise RuntimeError('bcftools concat failed with exit status {}: {}'.format(
            status, ' '.join(map(str, merge_cmd))))
    index(dst_vcf_name)

def index(vcf_path):
    """Build a tabix index for a bgzipped VCF and return the index path.

    ``-f`` makes tabix rebuild the index even when one already exists; without it
    tabix may refuse to overwrite, leaving a stale index next to a VCF that has
    been rewritten in place. ``-p vcf`` selects the VCF preset explicitly instead
    of relying on the file name.

    Args:
        vcf_path: path of the bgzipped VCF to index.

    Returns:
        ``vcf_path + '.tbi'``.

    Raises:
        RuntimeError: if ``tabix`` exits with a non-zero status (previously ignored).
    """
    status = call(['tabix', '-f', '-p', 'vcf', vcf_path])
    if status != 0:
        raise RuntimeError('tabix failed with exit status {} for {}'.format(status, vcf_path))
    return vcf_path + '.tbi'

def remove_vcf(vcf_path):
    """Delete a VCF and, if present, its ``.tbi`` index.

    The previous version called ``exists``, which is never imported in this
    notebook, so it raised NameError right after deleting the VCF. Attempting the
    removal and ignoring "not found" needs no extra import.

    Args:
        vcf_path: path of the VCF to delete.

    Raises:
        OSError: if ``vcf_path`` itself cannot be removed (e.g. it does not exist).
    """
    remove(vcf_path)
    try:
        remove(vcf_path + '.tbi')
    except FileNotFoundError:
        # No index was ever built for this VCF; nothing to clean up.
        pass

In [36]:
def run_rtg_vcfeval(caller_vcf, truth_vcf, out_dir,
                    score_metric=None, bed_regions=None,
                    include_filtered=False):
    """Run ``rtg vcfeval`` comparing a call set against a truth set.

    The command always uses the module-level ``rtg_bin``, ``rtg_ref`` (``-t``) and
    ``offspring_name`` (``--sample``), and passes ``--squash-ploidy``.

    Args:
        caller_vcf: VCF of calls to evaluate (``-c``).
        truth_vcf: baseline/truth VCF (``-b``).
        out_dir: output directory for the evaluation (``-o``).
        score_metric: optional score field name passed with ``-f``.
        bed_regions: optional BED file passed with ``--bed-regions``.
        include_filtered: when true, add ``--all-records``.

    Raises:
        RuntimeError: if rtg exits with a non-zero status. The status used to be
            ignored, so a failed run let callers go on to read results that were
            missing or left over from an earlier run.
    """
    rtg_cmd = [rtg_bin, 'vcfeval', '-t', rtg_ref, '-b', truth_vcf, '-c', caller_vcf, '-o', out_dir,
               '--squash-ploidy', '--sample', offspring_name]
    if bed_regions is not None:
        rtg_cmd += ['--bed-regions', bed_regions]
    if score_metric is not None:
        rtg_cmd += ['-f', score_metric]
    if include_filtered:
        rtg_cmd.append('--all-records')
    status = call(rtg_cmd)
    if status != 0:
        raise RuntimeError('rtg vcfeval failed with exit status {}: {}'.format(
            status, ' '.join(map(str, rtg_cmd))))

def evaluate(caller, caller_vcf, truth_vcf, out_dir,
             bed_regions=None, include_filtered=False):
    """Run rtg vcfeval for one caller, scoring by that caller's configured metric.

    Args:
        caller: caller name; must be a key of ``caller_score_metric``.
        caller_vcf: VCF of the caller's calls.
        truth_vcf: truth VCF to compare against.
        out_dir: output directory for the evaluation.
        bed_regions: optional BED file restricting the evaluation. It used to be
            accepted here but silently dropped; it is now forwarded.
        include_filtered: forwarded to ``run_rtg_vcfeval``.

    Raises:
        KeyError: if ``caller`` has no entry in ``caller_score_metric``.
    """
    run_rtg_vcfeval(caller_vcf, truth_vcf, out_dir,
                    score_metric=caller_score_metric[caller],
                    bed_regions=bed_regions,
                    include_filtered=include_filtered)

In [37]:
def filter_trio_vcf(in_vcf_name, is_good, out_vcf_name):
    """Copy the records accepted by ``is_good`` into a new VCF and index it.

    Both files are opened as context managers so they are closed even if
    ``is_good`` or the write raises; the previous version leaked both handles in
    that case.

    Args:
        in_vcf_name: path of the VCF to read.
        is_good: callable ``(record, samples) -> bool``; ``samples`` is the input
            header's sample collection.
        out_vcf_name: path of the VCF to write (opened in mode ``'w'`` with the
            input header).
    """
    with ps.VariantFile(in_vcf_name) as in_vcf:
        samples = in_vcf.header.samples  # same for every record; look it up once
        with ps.VariantFile(out_vcf_name, 'w', header=in_vcf.header) as out_vcf:
            for rec in in_vcf:
                if is_good(rec, samples):
                    out_vcf.write(rec)
    # Index only after the output has been closed and fully flushed.
    index(out_vcf_name)

def passes_format_thresholds(rec, samples, metric='GQ', threshold=10):
    """Return True when every sample's ``metric`` value is at least ``threshold``.

    A sample whose value is missing (``None``) fails the threshold. Previously
    that case raised TypeError on Python 3, because ``None >= threshold`` is not
    a valid comparison.

    Args:
        rec: record exposing ``rec.samples[sample][metric]``.
        samples: iterable of sample names to check.
        metric: per-sample field to compare.
        threshold: minimum acceptable value (inclusive).

    Returns:
        bool; True for an empty ``samples``.
    """
    for sample in samples:
        value = rec.samples[sample][metric]
        if value is None or not value >= threshold:
            return False
    return True

In [38]:
def is_snv(record):
    """Return True when every ALT allele has the same length as REF.

    Equal-length substitutions longer than one base also count as "SNV" here;
    a record with an empty ``alts`` yields True.
    """
    for alt in record.alts:
        if len(alt) != len(record.ref):
            return False
    return True

def count_records(vcf_path, variant_type='both'):
    """Count the records in a VCF, optionally restricted to one variant class.

    The file is opened as a context manager so the handle is released when
    counting finishes; it was previously never closed.

    Args:
        vcf_path: path of the VCF to read.
        variant_type: ``'both'`` counts every record; any value containing
            ``'sn'`` (e.g. ``'snv'``) counts records for which ``is_snv`` is
            true; any other value (e.g. ``'indel'``) counts the remainder.

    Returns:
        int number of matching records.
    """
    with ps.VariantFile(vcf_path) as vcf:
        if variant_type == 'both':
            return sum(1 for _ in vcf)
        want_snv = 'sn' in variant_type
        return sum(1 for rec in vcf if bool(is_snv(rec)) == want_snv)

def count_tp(rtg_eval_dir, variant_type='both'):
    """Count records in ``tp-baseline.vcf.gz`` of an rtg evaluation directory."""
    tp_vcf = join(rtg_eval_dir, 'tp-baseline.vcf.gz')
    return count_records(tp_vcf, variant_type=variant_type)

def count_fp(rtg_eval_dir, variant_type='both'):
    """Count records in ``fp.vcf.gz`` of an rtg evaluation directory."""
    fp_vcf = join(rtg_eval_dir, 'fp.vcf.gz')
    return count_records(fp_vcf, variant_type=variant_type)

def count_fn(rtg_eval_dir, variant_type='both'):
    """Count records in ``fn.vcf.gz`` of an rtg evaluation directory."""
    fn_vcf = join(rtg_eval_dir, 'fn.vcf.gz')
    return count_records(fn_vcf, variant_type=variant_type)

def read_caller_metric_df(caller, caller_vcf, truth_vcfs, all_truth_vcf,
                          include_filtered=False,
                          temp_dir='/tmp'):
    """Evaluate one caller against the truth sets and return a one-row metric table.

    True positives and false negatives are counted per truth set (overall, SNV and
    INDEL) and summed across truth sets. False positives are counted once, from a
    separate evaluation against ``all_truth_vcf``.

    Each evaluation directory is removed before the run and again afterwards, even
    if evaluation or counting raises. Previously a failure left the directory
    behind, and rtg vcfeval will not write into an output directory that already
    exists, so the next run would fail or read stale results.

    Args:
        caller: caller name (used for the 'caller' column, the score-metric
            lookup in ``evaluate`` and the temp directory names).
        caller_vcf: VCF of the caller's calls.
        truth_vcfs: mapping of truth-set name -> truth VCF path.
        all_truth_vcf: VCF holding all truth sets combined, used for the fp count.
        include_filtered: forwarded to ``evaluate``.
        temp_dir: directory under which the evaluation directories are created.

    Returns:
        A one-row DataFrame with columns, in order: 'caller'; for each truth set
        'tp: X', 'tp SNV: X', 'tp INDEL: X', 'fn: X', 'fn SNV: X', 'fn INDEL: X';
        then 'tp', 'fn', 'tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'fp',
        'recall', 'precision', 'FPR', 'FNR', 'FDR', 'F-measure'.
        Note that 'FPR' is computed as fp divided by the number of truth variants.
    """
    # (column-label suffix, variant_type understood by the count_* helpers)
    variant_classes = (('', 'both'), (' SNV', 'snv'), (' INDEL', 'indel'))
    outcome_counters = (('tp', count_tp), ('fn', count_fn))

    caller_df = pd.DataFrame([[caller]], columns=['caller'])
    totals = {outcome + suffix: 0
              for outcome, _ in outcome_counters
              for suffix, _ in variant_classes}

    for truth_set_name, truth_vcf in truth_vcfs.items():
        rtg_eval_dir = join(temp_dir, caller + '.' + truth_set_name + '.eval')
        # Clear anything a previous, interrupted run left behind.
        shutil.rmtree(rtg_eval_dir, ignore_errors=True)
        try:
            evaluate(caller, caller_vcf, truth_vcf, rtg_eval_dir, include_filtered=include_filtered)
            for outcome, count in outcome_counters:
                for suffix, variant_type in variant_classes:
                    n_records = count(rtg_eval_dir, variant_type=variant_type)
                    caller_df[outcome + suffix + ': ' + truth_set_name] = n_records
                    totals[outcome + suffix] += n_records
        finally:
            shutil.rmtree(rtg_eval_dir, ignore_errors=True)

    # Column order is part of the table's contract; keep it explicit.
    for column in ('tp', 'fn', 'tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL'):
        caller_df[column] = totals[column]

    rtg_eval_dir = join(temp_dir, caller + '.eval')
    shutil.rmtree(rtg_eval_dir, ignore_errors=True)
    try:
        evaluate(caller, caller_vcf, all_truth_vcf, rtg_eval_dir, include_filtered=include_filtered)
        caller_df['fp'] = count_fp(rtg_eval_dir)
    finally:
        shutil.rmtree(rtg_eval_dir, ignore_errors=True)

    n_truth = totals['tp'] + totals['fn']
    caller_df['recall'] = caller_df['tp'] / n_truth
    caller_df['precision'] = caller_df['tp'] / (caller_df['tp'] + caller_df['fp'])
    caller_df['FPR'] = caller_df['fp'] / n_truth
    caller_df['FNR'] = 1. - caller_df['recall']
    caller_df['FDR'] = 1. - caller_df['precision']
    caller_df['F-measure'] = 2 * ((caller_df['precision'] * caller_df['recall']) / (caller_df['precision'] + caller_df['recall']))
    return caller_df

def read_metric_df(caller_vcfs, truth_vcfs, include_filtered=False):
    """Evaluate every caller and stack the per-caller metric rows into one table.

    The truth VCFs are merged into a temporary combined VCF (used for the
    false-positive evaluation). The merged VCF and its index are removed
    afterwards even if an evaluation raises, and any copy left over from an
    earlier interrupted run is discarded before merging.

    Args:
        caller_vcfs: mapping of caller name -> call-set VCF path.
        truth_vcfs: mapping of truth-set name -> truth VCF path.
        include_filtered: forwarded to ``read_caller_metric_df``.

    Returns:
        DataFrame with one row per caller, in ``caller_vcfs`` order.
    """
    merged_truth_vcf = "/tmp/merged_truth.vcf.gz"

    def _discard_merged_truth():
        # Best-effort removal of the merged VCF and its index.
        for path in (merged_truth_vcf, merged_truth_vcf + ".tbi"):
            try:
                remove(path)
            except OSError:
                pass

    _discard_merged_truth()
    try:
        merge(list(truth_vcfs.values()), merged_truth_vcf)
        caller_dfs = [
            read_caller_metric_df(caller, caller_vcf, truth_vcfs, merged_truth_vcf, include_filtered)
            for caller, caller_vcf in caller_vcfs.items()
        ]
    finally:
        _discard_merged_truth()
    return pd.concat(caller_dfs)

In [42]:
# Score every caller against the truth sets with the default include_filtered=False.
metric_df = read_metric_df(caller_vcfs=caller_vcfs, truth_vcfs=truth_vcfs)

In [40]:
# Show only the per-caller totals (summed over truth sets), dropping the per-truth-set and rate columns.
metric_df[['caller', 'tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,caller,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,octopus,72,13,3,8,85,11,10
0,GATK4,46,4,29,17,50,46,90
0,Strelka2,75,5,0,16,80,16,1548
0,Platypus,72,4,3,17,76,20,163
0,DeepVariant,70,0,5,21,70,26,10306
0,FreeBayes,73,5,2,16,78,18,339


In [31]:
# Same evaluation, but with include_filtered=True so rtg is run with --all-records.
metric_unfiltered_df = read_metric_df(caller_vcfs=caller_vcfs,
                                      truth_vcfs=truth_vcfs,
                                      include_filtered=True)

In [32]:
# Display the full metric table for the run that included filtered records.
# NOTE(review): the output saved with this cell has no SNV/INDEL columns and no DeepVariant row,
# both of which the current code would produce — presumably it predates those changes; re-run to confirm.
metric_unfiltered_df

Unnamed: 0,caller,tp: sanger,fn: sanger,tp: igv,fn: igv,tp,fn,fp,recall,precision,FPR,FNR,FDR,F-measure
0,octopus,57,0,39,0,96,0,20101,1.0,0.005,209.385,0.0,0.995,0.009
0,GATK4,39,18,14,25,53,43,834,0.552,0.06,8.688,0.448,0.94,0.108
0,Strelka2,57,0,24,15,81,15,7432,0.844,0.011,77.417,0.156,0.989,0.021
0,Platypus,56,1,25,14,81,15,3694,0.844,0.021,38.479,0.156,0.979,0.042
0,FreeBayes,57,0,22,17,79,17,4810,0.823,0.016,50.104,0.177,0.984,0.032


In [19]:
# Re-score Strelka2 after keeping only records whose GQX is >= 10 in every sample.
# NOTE(review): this cell is repeated with other thresholds; a helper taking
# (caller, metric, threshold) would remove the duplication.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_strelka2_vcf = caller_vcfs['Strelka2'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['Strelka2'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQX', threshold=10),
    out_vcf_name=filtered_strelka2_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('Strelka2', filtered_strelka2_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,75,5,0,16,80,16,492


In [29]:
# Re-score Strelka2 after keeping only records whose GQX is >= 20 in every sample.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_strelka2_vcf = caller_vcfs['Strelka2'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['Strelka2'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQX', threshold=20),
    out_vcf_name=filtered_strelka2_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('Strelka2', filtered_strelka2_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,68,4,7,17,72,24,114


In [30]:
# Re-score Strelka2 after keeping only records whose GQX is >= 22 in every sample.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_strelka2_vcf = caller_vcfs['Strelka2'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['Strelka2'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQX', threshold=22),
    out_vcf_name=filtered_strelka2_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('Strelka2', filtered_strelka2_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,68,4,7,17,72,24,106


In [31]:
# Re-score Strelka2 after keeping only records whose GQX is >= 23 in every sample.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_strelka2_vcf = caller_vcfs['Strelka2'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['Strelka2'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQX', threshold=23),
    out_vcf_name=filtered_strelka2_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('Strelka2', filtered_strelka2_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,46,4,29,17,50,46,41


In [28]:
# Re-score Strelka2 after keeping only records whose GQX is >= 25 in every sample.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_strelka2_vcf = caller_vcfs['Strelka2'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['Strelka2'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQX', threshold=25),
    out_vcf_name=filtered_strelka2_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('Strelka2', filtered_strelka2_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,46,4,29,17,50,46,31


In [27]:
# Re-score DeepVariant after keeping only records whose GQ is >= 46 in every sample.
merged_truth_vcf = "/tmp/merged_truth.vcf.gz"
filtered_deepvariant_vcf = caller_vcfs['DeepVariant'].replace('.vcf', '.filtered.vcf')
filter_trio_vcf(
    in_vcf_name=caller_vcfs['DeepVariant'],
    is_good=lambda rec, samples: passes_format_thresholds(rec, samples, metric='GQ', threshold=46),
    out_vcf_name=filtered_deepvariant_vcf,
)
merge(list(truth_vcfs.values()), merged_truth_vcf)
df = read_caller_metric_df('DeepVariant', filtered_deepvariant_vcf, truth_vcfs, merged_truth_vcf)
df[['tp SNV', 'tp INDEL', 'fn SNV', 'fn INDEL', 'tp', 'fn', 'fp']]

Unnamed: 0,tp SNV,tp INDEL,fn SNV,fn INDEL,tp,fn,fp
0,9,0,66,21,9,87,90
