In [1]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Load client settings from the 'turbo' profile.
# NOTE(review): presumably a named profile in the local Seven Bridges credentials file - confirm.
config = sbg.Config(profile='turbo')
# Shared API client used by every cell below. The two handlers come from
# sevenbridges.http.error_handlers; going by their names they wait out rate limits
# and maintenance windows rather than failing - confirm against the library docs.
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])
# Project in which files are looked up and tasks are created throughout this notebook.
project = 'brownm28/kfdrc-benchmarking'

### Tag files

In [2]:
# Manifest column header -> metadata field name written onto each BAM.
header_key = {"ICGC Donor": "case_id", "Specimen ID": "aliquot_id", "Sample ID": "sample_id", "Project": "disease_type"}
# meta_dict[file name] -> metadata fields to apply to that file.
meta_dict = {}
# Context managers close both inputs; the original cell left them open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/repository_1560614429.tsv') as bam_manifest:
    head = next(bam_manifest)
    header = head.rstrip('\n').split('\t')
    # Fail once with a readable message if an expected column is absent, instead of
    # dropping into the debugger on every row (the original used a bare except + pdb).
    missing = [key for key in header_key if key not in header]
    if missing:
        raise ValueError('BAM manifest is missing expected column(s): ' + ', '.join(missing))
    col_idx = {key: header.index(key) for key in header_key}
    for line in bam_manifest:
        info = line.rstrip('\n').split('\t')
        fname = info[3]
        # Keep only the text before the first '-' in column 9.
        # NOTE(review): presumably this is the "Project" column - confirm against the header.
        info[9] = info[9].split('-')[0]
        # Column 6 holds "<sample type> - <composition>"; normalise British spelling first.
        spec_type = info[6].replace('tumour', 'tumor')
        (sample_type, Composition) = spec_type.split(' - ')
        meta_dict[fname] = {}
        meta_dict[fname]['reference_genome'] = 'hg19'
        for key in header_key:
            meta_dict[fname][header_key[key]] = info[col_idx[key]]
        meta_dict[fname]['sample_type'] = sample_type
        meta_dict[fname]['Composition'] = Composition

with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jun-20_set.csv') as cav_file_list:
    head = next(cav_file_list)  # discard the header row
    for line in cav_file_list:
        fields = line.rstrip('\n').split(',')
        # First column is the platform file ID, second is the file name used as the manifest key.
        (fid, fname) = (fields[0], fields[1])
        file_obj = api.files.get(fid)
        for key in meta_dict[fname]:
            file_obj.metadata[key] = meta_dict[fname][key]
        file_obj.save()
        # Report only after the save succeeded, so the log never claims a tag that was not written.
        sys.stderr.write('Tagged ' + fname + '\n')
    

Tagged PCAWG.11d167fc-2ff3-42e0-b064-b11ad64d456f.bam
Tagged PCAWG.20b85de1-8cc4-4a57-baa8-656cdde95fa9.bam
Tagged PCAWG.222f5df2-cab8-4e0f-8e65-cfb1862779ca.bam
Tagged PCAWG.29034096-d0af-4a86-b348-1c918253a9ef.bam
Tagged PCAWG.3b8eb3f9-0ebb-4da8-bb18-7d1bf75ce527.bam
Tagged PCAWG.47071d63-9223-4faf-9a50-3af9c6c9492e.bam
Tagged PCAWG.67455c36-aa47-4cc4-8b6d-9a9012b616ed.bam
Tagged PCAWG.80304de2-8f90-4d66-a991-f1102cfb3eb9.bam
Tagged PCAWG.b75e14f5-11dd-4a0c-a072-304f9ea40885.bam


### Index bams

In [3]:
app_name = project + '/samtools-index'
# The context manager closes the file list, which the original cell left open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jun-20_set.csv') as cav_file_list:
    head = next(cav_file_list)  # discard the header row
    for line in cav_file_list:
        info = line.rstrip('\n').split(',')
        # The first column is the platform file ID of the BAM to index.
        bam = api.files.get(info[0])
        in_dict = {}
        in_dict['input_reads'] = bam
        in_dict['threads'] = 4
        task_name = 'SAMTOOLS INDEX: ' + bam.metadata['sample_id']
        # Unlike the benchmark cells, this task is created with run=True.
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=True)
        task.save()

## Benchmark Lancet

In [1]:
def get_refs(api, project):
    """Build the reference inputs shared by every Lancet benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    Commented-out alternatives in the original cell: interval file
    'PCAWG_super_merged_target.bed', exome_flag 'Y' and window 600.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'indexed_reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        'wgs_calling_interval_list': _first_match('b37_wgs_calling_regions.bed'),
        'exome_flag': 'N',
        'select_vars_mode': 'gatk',
        'window': 500,
    }

In [4]:
app_name = project + '/kfdrc-lancet-wf-benchmark'
ref_objs = get_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_wgs_bams.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
    # Earlier exome runs used the prefix 'LANCET EXOME REGION: ' instead.
    task_name = 'LANCET WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

## Benchmark mutect2

In [22]:
def get_mutect_refs(api, project):
    """Build the reference inputs shared by every Mutect2 benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'indexed_reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        'wgs_calling_interval_list': _first_match('b37_wgs_calling_regions.bed'),
        'af_only_gnomad_vcf': _first_match('af-only-gnomad.raw.sites.b37.vcf.gz'),
        'exac_common_vcf': _first_match('small_exac_common_3_b37.vcf.gz'),
        'exome_flag': 'N',
        'select_vars_mode': 'gatk',
    }


In [23]:
app_name = project + '/kfdrc-mutect2-sans-vep'
ref_objs = get_mutect_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_wgs_bams.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
            in_dict['input_normal_name'] = normal
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
            in_dict['input_tumor_name'] = tumor
    task_name = 'MUTECT2 WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

## Benchmark strelka2

In [18]:
def get_strelka2_refs(api, project):
    """Build the reference inputs shared by every Strelka2 benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'indexed_reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        # NOTE(review): the key says hg38 but the file is the hs37d5 BED; presumably
        # the key is just the workflow's input name - confirm against the app.
        'hg38_strelka_bed': _first_match('hs37d5_strelka2_canonical.bed.gz'),
        'exome_flag': 'N',
        'select_vars_mode': 'gatk',
    }

In [19]:
app_name = project + '/kfdrc-strelka2-benchmark-wf'
ref_objs = get_strelka2_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_but_GBM_bam_manifest.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
            in_dict['input_normal_name'] = normal
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
            in_dict['input_tumor_name'] = tumor
    task_name = 'STRELKA2 WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

## Benchmark Vardict Java

In [20]:
def get_vardict_refs(api, project):
    """Build the reference inputs shared by every VarDict benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'indexed_reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        'wgs_calling_interval_list': _first_match('b37_wgs_calling_regions.bed'),
        'exome_flag': 'N',
        'select_vars_mode': 'gatk',
    }

In [21]:
app_name = project + '/kfdrc-vardict-benchmark'
ref_objs = get_vardict_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_wgs_bams.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
            in_dict['input_normal_name'] = normal
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
            in_dict['input_tumor_name'] = tumor
    task_name = 'VARDICT WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

### Tag benchmark outputs

In [2]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# Pattern searched for in each completed task's name.
phrase = "KFDRC CONSENSUS CALLER: DO8264"
# Input whose metadata is copied onto the outputs; the code below expects a single
# file here, so it needs changing if this input is an array.
in_key = 'lancet_vcf'
for task in tasks:
    if not re.search(phrase, task.name):
        continue
    sys.stderr.write('Valid task found ' + task.name + '\n')
    metadata = task.inputs[in_key].metadata
    for out_key in task.outputs:
        try:
            produced = task.outputs[out_key]
            # Treat a single-file output as a one-element list so both shapes share one loop.
            targets = produced if type(produced) is list else [produced]
            for target in targets:
                file_obj = api.files.get(target.id)
                for key in metadata:
                    file_obj.metadata[key] = metadata[key]
                file_obj.save()
        except Exception as e:
            # Best effort: report the failure and carry on with the next output.
            print(e)
            print("Skipping " + task.name + " due to error")

Valid task found KFDRC CONSENSUS CALLER: DO8264


### Tag from nested inputs

In [27]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# Pattern searched for in each completed task's name.
phrase = "BCBIO ENSEMBLE CONSENSUS"
# Input holding a list of file groups; the first file of group i supplies the
# metadata for element i of every output.
in_key = 'input_vcfs'
for task in tasks:
    if not re.search(phrase, task.name):
        continue
    sys.stderr.write('Valid task found ' + task.name + '\n')
    fset = task.inputs[in_key]
    for i, group in enumerate(fset):
        metadata = group[0].metadata
        for out_key in task.outputs:
            try:
                file_obj = api.files.get(task.outputs[out_key][i])
                for key in metadata:
                    file_obj.metadata[key] = metadata[key]
                file_obj.save()
            except Exception as e:
                # Best effort: report the failure and carry on with the next output.
                print(e)
                print("Skipping " + task.name + " due to error")

Valid task found BCBIO ENSEMBLE CONSENSUS SNV MNV: PLUS VARDICT LANCET
Valid task found BCBIO ENSEMBLE CONSENSUS INDEL: PLUS VARDICT LANCET
Valid task found BCBIO ENSEMBLE CONSENSUS INDEL: PLUS VARDICT
Valid task found BCBIO ENSEMBLE CONSENSUS SNV MNV: PLUS VARDICT
Valid task found BCBIO ENSEMBLE CONSENSUS SNV MNV: PLUS LANCET
Valid task found BCBIO ENSEMBLE CONSENSUS INDEL: PLUS LANCET


### Tag batch outputs

In [3]:
# ID of the parent batch task whose children's outputs get tagged.
batch_id = 'cc6f68cc-3aea-498b-ba08-70cce50a5477'
batch_task = api.tasks.get(batch_id)
for task in batch_task.get_batch_children():
    sys.stderr.write('Valid task found ' + task.name + '\n')
    # Every output of a child inherits the metadata of that child's 'input_vcf'.
    metadata = task.inputs['input_vcf'].metadata
    for out_key in task.outputs:
        try:
            produced = task.outputs[out_key]
            file_obj = api.files.get(produced.id)
            for key in metadata:
                file_obj.metadata[key] = metadata[key]
            file_obj.save()
        except Exception as e:
            # Best effort: report the failure and carry on with the next output.
            print(e)
            print("Skipping " + task.name + " due to error")

Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: d42a3496-1367-4880-87ca-bdebb0624bf4.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: 4181ddf3-2c95-4ac8-bee3-f04d2971efb2.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: ecf49c6e-2e82-49e5-88e6-3fb66f02d399.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: 85c4023d-74a3-4beb-999b-87a14c8e7cee.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: 4886c27f-fcdb-4993-bd08-f9c0a6d37806.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: fa419682-ad0f-4d4f-ad35-576804ce5cde.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: 92fd1cc1-30c6-42da-9c4a-e9c246d23dbc.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: d83802a6-1976-4c6f-a638-f912cbe7f537.vardict.PASS.vcf.gz
Valid task found BCFTOOLS NORM SPLIT: VARDICT RPT: file: 7c8cb026-27aa-4c0b-bbbb-2486b14

## Add note to outputs

In [5]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# Pattern searched for in each completed task's name.
phrase = "window 300 "
# Text stored under the 'notes' metadata key of every output file.
note = "Run with non default 300bp window to test change in accuracy with increase of speed"
for task in tasks:
    if re.search(phrase, task.name):
        sys.stderr.write('Valid task found ' + task.name + '\n')
        for out_key in task.outputs:
            produced = task.outputs[out_key]
            # Handle list-valued outputs like the other tagging cells do, and skip
            # outputs that produced nothing instead of crashing on `.id`.
            targets = produced if isinstance(produced, list) else [produced]
            for target in targets:
                if target is None:
                    continue
                file_obj = api.files.get(target.id)
                file_obj.metadata['notes'] = note
                file_obj.save()

## Benchmark PINDEL

In [12]:
def get_pindel_refs(api, project):
    """Build the reference inputs shared by every Pindel benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        'wgs_calling_bed': _first_match('b37_wgs_calling_regions.bed'),
        'exome_flag': 'N',
        'genome_assembly': 'hs37d5',
        'insert_length': 250,
    }

In [13]:
app_name = project + '/kfdrc-pindel-benchmark-wf'
ref_objs = get_pindel_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_but_GBM_bam_manifest.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
            in_dict['input_normal_name'] = normal
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
            in_dict['input_tumor_name'] = tumor
    task_name = 'PINDEL WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

### bcf filter pindel unfiltered outputs

In [2]:
app_name = project + '/bcftools-filter-pindel'
# Filter settings passed unchanged to every task.
tool_name = 'pindel'
depth = 20
vaf = 0.1
# The context manager closes the manifest, which the original cell left open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/pindel_re-filter/pindel_unfiltered-manifest.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID and case ID are third- and second-to-last.
        # (The last column was read into an unused `stype` in the original; dropped here.)
        case_id = info[-2]
        sample_id = info[-3]
        fid = info[0]
        in_dict = {}
        in_dict['vaf'] = vaf
        in_dict['depth'] = depth
        in_dict['tool_name'] = tool_name
        in_dict['input_vcf'] = api.files.get(fid)
        task_name = 'BCFTOOLS PINDEL FILTER: ' + case_id + ' ' + sample_id
        # Created as a draft (run=False) so the new task's ID can be set as its output basename.
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()

## BENCHMARK CAVEMAN

In [14]:
def get_caveman_refs(api, project):
    """Build the reference inputs shared by every CaVEMan benchmark task.

    Each reference file is taken as the first hit of a name query in ``project``;
    the remaining entries are fixed settings.

    :param api: client exposing ``files.query(project=..., names=[...])``.
    :param project: project to search for the reference files.
    :return: dict of input name -> file object or setting value.
    :raises IndexError: if a query returns no match.
    """
    def _first_match(file_name):
        # First file in the project carrying exactly this name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'bed_refs_tar': _first_match('caveman_hs37d5_bed_refs.merged.tar.gz'),
        'blacklist': _first_match('hs37d5_blacklist.tsv'),
        'flag_config': _first_match('caveman_flags_config.ini'),
        'flag_convert': _first_match('caveman_flag_convert.ini'),
        'indexed_reference_fasta': _first_match('hs37d5.fa'),
        'reference_dict': _first_match('hs37d5.dict'),
        'split_size': 128,
        'assay_type': 'WGS',
        'species': 'Human',
        'threads': 48,
        'genome_assembly': 'hs37d5',
    }

In [15]:
app_name = project + '/kfdrc-caveman-snv-benchmark-wf'
ref_objs = get_caveman_refs(api, project)
# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': platform file} for each manifest row.
bam_dict = {}
# Read the manifest inside a context manager so the handle is closed once parsing
# finishes (the original cell opened it and never closed it).
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/all_but_GBM_bam_manifest.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }
# Draft one task per case, pairing its 'Normal' BAM with its other BAM.
for case_id, samples in bam_dict.items():
    # Copy the shared references so every task gets its own inputs dict.
    in_dict = dict(ref_objs)
    tumor = ''
    normal = ''
    for stype, sample in samples.items():
        if stype == 'Normal':
            in_dict['input_normal_aligned'] = sample['file_obj']
            normal = sample['sid']
            in_dict['input_normal_name'] = normal
        else:
            # Any sample type other than 'Normal' is treated as the tumor.
            in_dict['input_tumor_aligned'] = sample['file_obj']
            tumor = sample['sid']
            in_dict['input_tumor_name'] = tumor
    task_name = 'CAVEMAN SNV WGS BENCHMARK: ' + case_id + ' ' + tumor + ' ' + normal
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    

## PCAWG Annotate

### Variant bam

In [2]:
def get_varbam_refs(api):
    """Fetch the reference input for the variant-BAM tasks.

    :param api: client exposing ``files.get(<file id>)``.
    :return: dict with the single key 'reference_fasta' mapped to the file
        fetched by its hard-coded ID.
    """
    return {'reference_fasta': api.files.get('5d0a7b64e4b07ea2bda1b7df')}

In [3]:
app_name = project +"/pcawg-variant-bam"
ref_objs = get_varbam_refs(api)

# Each manifest is read inside a context manager so its handle is closed after
# parsing; the original cell opened all three and never closed them.

# bam_dict[case_id][stype] -> {'sid': sample ID, 'file_obj': BAM file}.
bam_dict = {}
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/bams-manifest.csv') as bam_manifest:
    head = next(bam_manifest)  # discard the header row
    for line in bam_manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam_dict.setdefault(case_id, {})[stype] = {
            'sid': sample_id,
            'file_obj': api.files.get(fid),
        }

# snv_dict[case_id][tool] -> SNV VCF, where tool is the VCF's first tag.
snv_dict = {}
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/snv_vcf-manifest.csv') as snv_manifest:
    head = next(snv_manifest)
    for line in snv_manifest:
        info = line.rstrip('\n').split(',')
        case_id = info[-2]
        fid = info[0]
        cur = api.files.get(fid)
        tool = cur.tags[0]
        snv_dict.setdefault(case_id, {})[tool] = cur

# indel_dict[case_id][tool] -> indel VCF; tool_dict collects every tool seen here.
indel_dict = {}
tool_dict = {}
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/indel_vcf-manifest.csv') as indel_manifest:
    head = next(indel_manifest)
    for line in indel_manifest:
        info = line.rstrip('\n').split(',')
        case_id = info[-2]
        fid = info[0]
        cur = api.files.get(fid)
        tool = cur.tags[0]
        tool_dict[tool] = 1
        indel_dict.setdefault(case_id, {})[tool] = cur

# Draft one task per (case, tool, BAM) combination.
for case_id in bam_dict:
    for tool in tool_dict:
        for stype in bam_dict[case_id]:
            in_dict = {}
            in_dict['reference_fasta'] = ref_objs['reference_fasta']
            in_dict['snv_vcf'] = snv_dict[case_id][tool]
            in_dict['indel_vcf'] = indel_dict[case_id][tool]
            in_dict['tool_name'] = tool
            in_dict['input_bam_aligned'] = bam_dict[case_id][stype]['file_obj']
            task_name = 'PCAWG VARIANT BAM: ' + case_id + " " + bam_dict[case_id][stype]['sid'] + " " + tool
            task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
            task.save()
            

In [21]:
# Collect every draft task before running any of them. `.all()` walks all result
# pages (the bare query only yields the first page), and building the list first
# keeps paging stable, since running a task removes it from the DRAFT filter.
tasks = list(api.tasks.query(project=project, status="DRAFT").all())
for task in tasks:
    task.run()

### annotate vcfs

In [6]:
def get_annot_refs(api):
    """Fetch the reference input for the VCF annotation tasks.

    :param api: client exposing ``files.get(<file id>)``.
    :return: dict with the single key 'bgzipped_reference_fasta' mapped to the
        file fetched by its hard-coded ID.
    """
    return {'bgzipped_reference_fasta': api.files.get('5d24fb80e4b07ea29a76e786')}

#### annotate snvs

In [17]:
app_name = project + "/pcawg-annot-snv"
ref_objs = get_annot_refs(api)

# Tags that identify which caller a variant BAM belongs to.
tool_list = ['LANCET', 'STRELKA2', 'MUTECT2', 'VARDICT', 'SANGER', 'CAVEMAN', 'PINDEL']
# bam_dict[case_id][tool][stype] -> {'sid': sample ID, 'file_obj': variant BAM}.
bam_dict = {}
# Context managers close both manifests; the original cell left them open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/variant_bams_only.csv') as bam_manifest:
    head = next(bam_manifest)  # discard the header row
    for line in bam_manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam = api.files.get(fid)
        # The first tag that names a known tool wins; '' when none matches.
        tool = ''
        for tag in bam.tags:
            if tag in tool_list:
                tool = tag
                break
        bam_dict.setdefault(case_id, {}).setdefault(tool, {})[stype] = {
            'sid': sample_id,
            'file_obj': bam,
        }

# snv_dict[case_id][tool] -> SNV VCF, where tool is the VCF's first tag;
# tool_dict collects every tool seen in the SNV manifest.
snv_dict = {}
tool_dict = {}
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/snv_vcf-manifest.csv') as snv_manifest:
    head = next(snv_manifest)
    for line in snv_manifest:
        info = line.rstrip('\n').split(',')
        case_id = info[-2]
        fid = info[0]
        cur = api.files.get(fid)
        tool = cur.tags[0]
        snv_dict.setdefault(case_id, {})[tool] = cur
        tool_dict[tool] = 1

# Draft one annotation task per (case, tool).
for case_id in bam_dict:
    for tool in tool_dict:
        in_dict = {}
        in_dict['bgzipped_reference_fasta'] = ref_objs['bgzipped_reference_fasta']
        in_dict['input_snv_vcf'] = snv_dict[case_id][tool]
        # Reset per task: the original never initialised these, so a missing sample
        # either raised NameError or silently reused the previous task's sample ID.
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id][tool]:
            if stype == 'Normal':
                in_dict['input_normal_variant_bam'] = bam_dict[case_id][tool][stype]['file_obj']
                normal = bam_dict[case_id][tool][stype]['sid']
            else:
                # Any sample type other than 'Normal' is treated as the tumor.
                in_dict['input_tumor_variant_bam'] = bam_dict[case_id][tool][stype]['file_obj']
                tumor = bam_dict[case_id][tool][stype]['sid']
        task_name = 'PCAWG ANNOT SNV RERUN: ' + case_id + " " + tumor + " " + normal + " " + tool
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.save()
            

#### annotate indels

In [20]:
app_name = project + "/pcawg-annot-indel"
ref_objs = get_annot_refs(api)

# Tags that identify which caller a variant BAM belongs to.
tool_list = ['LANCET', 'STRELKA2', 'MUTECT2', 'VARDICT', 'SANGER', 'CAVEMAN', 'PINDEL']
# bam_dict[case_id][tool][stype] -> {'sid': sample ID, 'file_obj': variant BAM}.
bam_dict = {}
# Context managers close both manifests; the original cell left them open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/variant_bams_only.csv') as bam_manifest:
    head = next(bam_manifest)  # discard the header row
    for line in bam_manifest:
        info = line.rstrip('\n').split(',')
        # File ID is the first column; sample ID, case ID and sample type are the last three.
        fid = info[0]
        sample_id, case_id, stype = info[-3], info[-2], info[-1]
        bam = api.files.get(fid)
        # The first tag that names a known tool wins; '' when none matches.
        tool = ''
        for tag in bam.tags:
            if tag in tool_list:
                tool = tag
                break
        bam_dict.setdefault(case_id, {}).setdefault(tool, {})[stype] = {
            'sid': sample_id,
            'file_obj': bam,
        }

# indel_dict[case_id][tool] -> indel VCF, where tool is the VCF's first tag.
# Initialised here: the original cell only worked if an earlier cell had already
# defined `indel_dict` in the kernel, and failed with NameError on a fresh run.
indel_dict = {}
tool_dict = {}
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/PCAWG_TEST/indel_vcf-manifest.csv') as indel_manifest:
    head = next(indel_manifest)
    for line in indel_manifest:
        info = line.rstrip('\n').split(',')
        case_id = info[-2]
        fid = info[0]
        cur = api.files.get(fid)
        tool = cur.tags[0]
        indel_dict.setdefault(case_id, {})[tool] = cur
        tool_dict[tool] = 1

# Draft one annotation task per (case, tool).
for case_id in bam_dict:
    for tool in tool_dict:
        in_dict = {}
        in_dict['bgzipped_reference_fasta'] = ref_objs['bgzipped_reference_fasta']
        in_dict['input_indel_vcf'] = indel_dict[case_id][tool]
        # Reset per task: the original never initialised these, so a missing sample
        # either raised NameError or silently reused the previous task's sample ID.
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id][tool]:
            if stype == 'Normal':
                in_dict['input_normal_variant_bam'] = bam_dict[case_id][tool][stype]['file_obj']
                normal = bam_dict[case_id][tool][stype]['sid']
            else:
                # Any sample type other than 'Normal' is treated as the tumor.
                in_dict['input_tumor_variant_bam'] = bam_dict[case_id][tool][stype]['file_obj']
                tumor = bam_dict[case_id][tool][stype]['sid']
        task_name = 'PCAWG ANNOT INDEL RERUN: ' + case_id + " " + tumor + " " + normal + " " + tool
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.save()


#### Review failed mutect2 tasks

In [11]:
tasks = api.tasks.query(project=project, status="FAILED").all()
for task in tasks:
    if not re.search("MUTECT2 WGS", task.name):
        continue
    # One tab-separated row per failed task: name, ID, creation time, failure message.
    fields = [task.name, task.id, str(task.created_time), task.execution_status.message]
    print("\t".join(fields))

MUTECT2 WGS BENCHMARK: DO10900 SA135301 SA135424	9628f79f-3ab9-434d-996d-ad7793cb472b	2019-06-19 17:41:02	Command /gatk Mutect2 --java-options "-Xmx6000m" -R /sbgenomics/Projects/87fde798-4ab5-40e1-8ef9-7468e4a905de/hg19.fa -I /sbgenomics/Projects/87fde798-4ab5-40e1-8ef9-7468e4a905de/BAM/PCAWG.64d83e97-f798-45d1-b9e6-efaa635b4abb.bam -I /sbgenomics/Projects/87fde798-4ab5-40e1-8ef9-7468e4a905de/BAM/PCAWG.fa4fa49d-6d53-4ffa-9759-ffb884b28d17.bam -tumor SA135301 -normal SA135424 --disable-read-filter MateOnSameContigOrNoMappedMateReadFilter -L /sbgenomics/workspaces/87fde798-4ab5-40e1-8ef9-7468e4a905de/tasks/9628f79f-3ab9-434d-996d-ad7793cb472b/gatk_intervallisttools/temp_0041_of_44/scattered.interval_list.0041.bed --germline-resource /sbgenomics/Projects/87fde798-4ab5-40e1-8ef9-7468e4a905de/af-only-gnomad.raw.sites.hg19.vcf.gz --f1r2-tar-gz PCAWG.64d83e97-f798-45d1-b9e6-efaa635b4abb.scattered.interval_list.0041.f1r2_counts.tar.gz -O PCAWG.64d83e97-f798-45d1-b9e6-efaa635b4abb.scattered.in

#### Repeat failed tasks

In [17]:
app_name = project + '/kfdrc-mutect2-sans-vep'
# One task ID per line. The context manager closes the file, which the original
# opened inline in the `for` statement and never closed.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/mutect2_re-run/failed_tasks_to_rerun.txt') as task_id_file:
    for task_id in task_id_file:
        # Strip all surrounding whitespace (including '\r') and skip blank lines so
        # an empty or padded ID is never sent to the API.
        task_id = task_id.strip()
        if not task_id:
            continue
        old_task = api.tasks.get(task_id)
        # Draft a fresh task with the failed task's name and inputs.
        new_task = api.tasks.create(name=old_task.name, project=project, app=app_name, inputs=old_task.inputs, run=False)
        new_task.inputs['output_basename'] = new_task.id
        new_task.save()


In [1]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets the notebook's `.container` element to 75% width.
display(HTML("<style>.container { width:75% !important; }</style>"))

## Merge pindel mnv, caveman snv

In [8]:
# pt_dict[patient ID] -> list of [file ID, file name] pairs to merge for that patient.
pt_dict = {}
# The context manager closes the manifest even if parsing raises.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/2019-Jul-12_big_run/pindel_re-filter/updated_cave_pindel_merge.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # First two columns are file ID and file name; column 8 is the patient ID.
        finfo = info[0:2]
        pt_id = info[8]
        pt_dict.setdefault(pt_id, []).append(finfo)
dict_file = api.files.get('5d0a7b63e4b07ea2bda1b7da')
tool_name = 'caveman_plus_pindel_mnv.bcfNorm'
app_name = project + "/gatk4-mergevcfs"
for pt_id in pt_dict:
    in_dict = {}

    in_dict['input_vcfs'] = []
    for finfo in pt_dict[pt_id]:
        try:
            in_dict['input_vcfs'].append(api.files.get(finfo[0]))
        except Exception as e:
            # Say which file failed, then re-raise to stop the cell. The original
            # dropped into pdb and called exit(1), which is not appropriate in a notebook.
            sys.stderr.write('Got error processing ' + finfo[1] + ' with ID ' + finfo[0] + '\n')
            sys.stderr.write(str(e) + '\n')
            raise
    in_dict['reference_dict'] = dict_file
    in_dict['tool_name'] = tool_name
    task_name = 'GATK MERGE CAVEMANsnv PINDELmnv UPDATED: ' + pt_id
    # Created as a draft (run=False) so the new task's ID can be set as its output basename.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    
    

## BCBIO ensemble consensus call

In [11]:
# Callers whose VCFs feed the consensus; a file is matched to a tool by searching
# for the tool name in the file name, and the tuple order fixes the slot order.
tool_list = ('mutect2', 'strelka2', 'lancet', 'vardict_vaf10')
short = 'vaf10_indel'
task_name = 'BCBIO ENSEMBLE CONSENSUS INDEL: VAF10'
reference = api.files.get('5d0a7b64e4b07ea2bda1b7df')
# vcf_dict[case_id] -> list with one slot per tool (None until a matching VCF is seen).
vcf_dict = {}
# The context manager closes the manifest, which the original cell left open.
with open('/Users/brownm28/Documents/2019-Jun-6_benchmarking/CONSENSUS_CALLS/vardict_filtering/indel_to_call.csv') as manifest:
    head = next(manifest)  # discard the header row
    for line in manifest:
        info = line.rstrip('\n').split(',')
        # File ID and file name are the first two columns; case ID is the last.
        # (The unused `sample_id` read from the second-to-last column was dropped.)
        fid = info[0]
        fname = info[1]
        case_id = info[-1]
        for i in range(len(tool_list)):
            if re.search(tool_list[i], fname):
                if case_id not in vcf_dict:
                    vcf_dict[case_id] = [None] * len(tool_list)
                vcf_dict[case_id][i] = api.files.get(fid)
                # Only the first matching tool claims the file.
                break
tools_csv = ",".join(tool_list)
app_name = project + '/kfdrc-bcbio-consensus-wf'
# One task for all cases: 'output_basename' and 'input_vcfs' are parallel lists,
# one entry per case.
in_dict = {}
in_dict['reference'] = reference
in_dict['tool_name_csv'] = tools_csv
in_dict['output_basename'] = []
in_dict['input_vcfs'] = []
for case_id in vcf_dict:
    in_dict['output_basename'].append(case_id + '_' + short)
    in_dict['input_vcfs'].append(vcf_dict[case_id])

task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)