In [None]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Build the API client used by every cell below.
# NOTE(review): 'turbo' is presumably a profile name in the local Seven Bridges
# credentials file — confirm it exists on the machine running this notebook.
config = sbg.Config(profile='turbo')
# The two handlers are passed as error_handlers; judging by their names they
# make the client wait on rate-limit / maintenance responses — TODO confirm.
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])

In [None]:
def get_relevant_file_objs(api, project):
    """Look up the reference files needed by the somatic workflow.

    Each reference is found by querying ``api.files`` for an exact file name
    inside ``project`` and taking the first match.

    Args:
        api: Client object exposing ``files.query(project=..., names=[...])``.
        project: Project identifier the files are searched in.

    Returns:
        dict mapping workflow input key -> first matching file object.

    Raises:
        IndexError: if a file name has no match in ``project``; the message
            names the missing file and the project.
    """
    # workflow input key -> exact file name to search for
    ref_names = (
        ('exome_target_bed', 'Strexome_targets_intersect_sorted_padded100.GRCh38.bed.gz'),
        ('ref_tar_gz', 'hg38_snpeff.tgz'),
        ('vep_cache', 'homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'),
        ('reference', 'Homo_sapiens_assembly38.fasta'),
    )
    ref_obj_dict = {}
    for key, fname in ref_names:
        matches = api.files.query(project=project, names=[fname])
        try:
            ref_obj_dict[key] = matches[0]
        except IndexError:
            # Same exception type as before, but say which file was missing.
            raise IndexError(
                'No file named {!r} found in project {!r}'.format(fname, project)
            ) from None
    return ref_obj_dict
    

In [None]:
def get_tn_pairs(fn):
    """Read a tab-delimited table into a dict of column-1 -> column-4 values.

    The first line is treated as a header and skipped. For every remaining
    non-blank line, the value in field index 1 becomes the key and the value
    in field index 4 becomes the value (0-based indices). ``create_tasks``
    consumes the result as tumor ID -> normal ID.

    Args:
        fn: Path to the tab-delimited text file.

    Returns:
        dict built from the data rows; empty if the file has no data rows.

    Raises:
        IndexError: if a non-blank data row has fewer than five fields.
    """
    tn_dict = {}
    # Context manager guarantees the handle is closed, even on a bad row.
    with open(fn) as fh:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(fh, None)
        for line in fh:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            info = row.split('\t')
            tn_dict[info[1]] = info[4]
    return tn_dict

In [None]:
def get_inputs(api, project, tn_dict):
    """Map an ID parsed from each completed task's name to its 'cram' output.

    Queries all tasks in ``project`` with status 'COMPLETED'. For each one,
    the task name is split on '-' and the second piece is used as the key;
    the value is the task's ``outputs['cram']``.

    Args:
        api: Client object exposing ``tasks.query(project=..., status=...)``.
        project: Project identifier whose tasks are scanned.
        tn_dict: Accepted for call compatibility; not used here.

    Returns:
        dict of name-derived key -> 'cram' output of that task.
    """
    completed = api.tasks.query(project=project, status='COMPLETED').all()
    return {done.name.split('-')[1]: done.outputs['cram'] for done in completed}

In [None]:
def create_tasks(api, project, tn_dict, bs_dict, refs, out_fn='/Users/brownm28/Documents/2018-Nov-6_open_dipg/somatic_calls/pnoc_somatic_call_tasks.txt'):
    """Create one draft somatic-workflow task per tumor/normal pair.

    For every tumor ID in ``tn_dict`` a task is created (``run=False``) with
    the shared reference inputs plus the pair-specific CRAMs and IDs. The
    task's ``output_basename`` input is then set to the task's own ID and the
    task is saved. A line ``<task name>\\t<task id>`` is written to ``out_fn``
    for each task.

    Args:
        api: Client object exposing ``tasks.create(...)``.
        project: Project identifier; also the prefix of the app ID.
        tn_dict: dict of tumor ID -> normal ID.
        bs_dict: dict of sample ID -> CRAM input for that sample.
        refs: dict of reference inputs shared by every task. Not modified.
        out_fn: Path of the task log to (over)write. Defaults to the path
            this notebook originally hard-coded.

    Raises:
        KeyError: if a tumor or normal ID is missing from ``bs_dict``.
    """
    app_name = project + '/kfdrc-pnoc-wes-somatic-workflow'
    # `with` closes the log even if a lookup or API call fails mid-loop.
    with open(out_fn, 'w') as out_task:
        for bs_id, norm in tn_dict.items():
            # Copy so the caller's refs dict is not mutated and tasks do not
            # share one inputs object.
            inputs = dict(refs)
            task_name = 'PNOC-WES-somatic-' + bs_id + '_' + norm
            inputs['tumor_cram'] = bs_dict[bs_id]
            inputs['normal_cram'] = bs_dict[norm]
            inputs['normal_id'] = norm
            inputs['tumor_id'] = bs_id
            task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
            # The task ID is only known after creation, hence the second save.
            task.inputs['output_basename'] = task.id
            task.save()
            out_task.write('\t'.join((task_name, task.id)) + '\n')

In [None]:
# Draft the somatic-calling tasks: fetch the reference files, load the table of
# sample pairs, collect 'cram' outputs from completed tasks, then create the
# draft tasks (create_tasks uses run=False, so nothing is started here).
# NOTE(review): `pnoc_wes` is not assigned in any cell visible here; it must
# hold the project ID before this cell runs — confirm where it is defined.
refs = get_relevant_file_objs(api=api, project=pnoc_wes)
tbl = '/Users/brownm28/Documents/2018-Nov-6_open_dipg/pnoc_inputs.txt'
tn_pairs = get_tn_pairs(tbl)
bs_inputs = get_inputs(api, pnoc_wes, tn_pairs)
create_tasks(api, pnoc_wes, tn_pairs, bs_inputs, refs)

In [None]:
def run_task(api, entry):
    """Start the task named on one log line, if it is still a draft.

    Args:
        api: Client object exposing ``tasks.get(id=...)``.
        entry: One whitespace-separated line whose second field is a task ID.

    Returns:
        None. Tasks whose status is not 'DRAFT' are left untouched.
    """
    fields = entry.rstrip('\n').split()
    task_id = fields[1]
    task = api.tasks.get(id=task_id)
    if task.status != 'DRAFT':
        return
    task.run()

In [None]:
# Start every drafted task listed in the task log, eight at a time.
task_fn = '/Users/brownm28/Documents/2018-Nov-6_open_dipg/somatic_calls/pnoc_somatic_call_tasks.txt'
# Read the log up front so the file handle is closed before any work starts.
with open(task_fn) as task_fh:
    entries = list(task_fh)
with concurrent.futures.ThreadPoolExecutor(8) as executor:
    results = {executor.submit(run_task, api, entry): entry for entry in entries}
    # Exceptions raised in a worker are stored on its future; report them
    # instead of letting them disappear, but keep processing the rest.
    for future in concurrent.futures.as_completed(results):
        err = future.exception()
        if err is not None:
            sys.stderr.write('Failed to run task entry ' + repr(results[future]) + ': ' + repr(err) + '\n')

## WES Test

### HC Germ Pre-process

In [None]:
def get_germ_refs(api, project=None):
    """Look up the reference files for the germline genotyping app.

    Each reference is found by querying ``api.files`` for an exact file name
    inside the project and taking the first match.

    Args:
        api: Client object exposing ``files.query(project=..., names=[...])``.
        project: Project identifier to search. When omitted, the notebook-level
            global ``project`` is used, which is what this function always
            relied on before the parameter existed.

    Returns:
        dict mapping app input key -> first matching file object.

    Raises:
        NameError: if ``project`` is omitted and no global ``project`` exists.
        IndexError: if a file name has no match; the message names the file.
    """
    if project is None:
        # Backward-compatible fallback to the notebook global.
        try:
            project = globals()['project']
        except KeyError:
            raise NameError("name 'project' is not defined") from None
    # app input key -> exact file name to search for
    ref_names = (
        ('axiomPoly_resource_vcf', 'Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz'),
        ('reference_dict', 'Homo_sapiens_assembly38.dict'),
        ('dbsnp_vcf', 'Homo_sapiens_assembly38.dbsnp138.vcf'),
        ('hapmap_resource_vcf', 'hapmap_3.3.hg38.vcf.gz'),
        ('mills_resource_vcf', 'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
        ('omni_resource_vcf', '1000G_omni2.5.hg38.vcf.gz'),
        ('one_thousand_genomes_resource_vcf', '1000G_phase1.snps.high_confidence.hg38.vcf.gz'),
        ('reference_fasta', 'Homo_sapiens_assembly38.fasta'),
        ('unpadded_intervals_file', 'Strexome_canonical_unpadded_GRCh38.sorted.merged.interval_list'),
        ('wgs_evaluation_interval_list', 'Strexome_canonical_100bp_padded_GRCh38.bed'),
        ('snp_sites', '1000G_phase3_v4_20130502.sites.hg38.vcf'),
    )
    ref_dict = {}
    for key, fname in ref_names:
        matches = api.files.query(project=project, names=[fname])
        try:
            ref_dict[key] = matches[0]
        except IndexError:
            # Same exception type as before, but say which file was missing.
            raise IndexError(
                'No file named {!r} found in project {!r}'.format(fname, project)
            ) from None
    return ref_dict

In [None]:
# Draft (do not run) a single-sample genotyping test task in the test project.
project = 'zhangb1/kf-somatic-tools-test'
ref_obj = get_germ_refs(api)
app_name = project + "/kfdrc-single-genotype-basic"
# Start from the shared references, then add the sample-specific input.
in_dict = {
    **ref_obj,
    'input_vcfs': [api.files.get('5dcaf4eee4b0549589782983')],
}
task_name = "SINGLE GENOTYPE GATK: P-01 TEST"
task = api.tasks.create(
    name=task_name,
    project=project,
    app=app_name,
    inputs=in_dict,
    execution_settings={'use_memoization': True},
    run=False,
)
# The task ID only exists after creation, so set the basename and save again.
task.inputs['output_basename'] = task.id
task.save()

### Workflow

In [7]:
def get_wf_refs(api, project=None):
    """Look up the reference inputs for the somatic WES variant/CNV workflow.

    Each file reference is found by querying ``api.files`` for an exact file
    name inside the project and taking the first match. The non-file input
    ``cfree_ploidy`` is added as the list ``[2, 3, 4]``.

    Args:
        api: Client object exposing ``files.query(project=..., names=[...])``.
        project: Project identifier to search. When omitted, the notebook-level
            global ``project`` is used, which is what this function always
            relied on before the parameter existed.

    Returns:
        dict mapping workflow input key -> file object (or ploidy list).

    Raises:
        NameError: if ``project`` is omitted and no global ``project`` exists.
        IndexError: if a file name has no match; the message names the file.
    """
    if project is None:
        # Backward-compatible fallback to the notebook global.
        try:
            project = globals()['project']
        except KeyError:
            raise NameError("name 'project' is not defined") from None
    # workflow input key -> exact file name to search for
    ref_names = (
        ('cfree_chr_len', 'hs38_chr.len'),
        ('cnvkit_annotation_file', 'refFlat_HG38.txt'),
        ('hg38_strelka_bed', 'Strexome_targets_intersect_sorted_padded100.GRCh38.bed.gz'),
        ('indexed_reference_fasta', 'Homo_sapiens_assembly38.fasta'),
        ('mutect2_af_only_gnomad_vcf', 'af-only-gnomad.hg38.vcf.gz'),
        ('mutect2_exac_common_vcf', 'small_exac_common_3.hg38.vcf.gz'),
        ('padded_capture_regions', 'Strexome_canonical_100bp_padded_GRCh38.bed'),
        ('reference_dict', 'Homo_sapiens_assembly38.dict'),
        ('reference_fai', 'Homo_sapiens_assembly38.fasta.fai'),
        ('unpadded_capture_regions', 'Strexome_canonical_unpadded_GRCh38.sorted.merged.bed'),
        ('vep_cache', 'homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'),
    )
    ref_dict = {}
    for key, fname in ref_names:
        matches = api.files.query(project=project, names=[fname])
        try:
            ref_dict[key] = matches[0]
        except IndexError:
            # Same exception type as before, but say which file was missing.
            raise IndexError(
                'No file named {!r} found in project {!r}'.format(fname, project)
            ) from None
    ref_dict['cfree_ploidy'] = [2, 3, 4]
    return ref_dict
    

In [9]:
# Draft (do not run) a production somatic WES workflow test task.
project = 'zhangb1/kf-somatic-tools-test'
app_name = project + '/kfdrc-production-somatic-wes-variant-cnv-wf'

ref_objs = get_wf_refs(api)
# Start from the shared references, then layer on the sample-specific inputs.
in_dict = dict(ref_objs)
in_dict.update({
    'input_tumor_aligned': api.files.get('5d375182e4b0359d9af6350e'),
    'input_tumor_name': 'BS_GBT44HST',
    'input_normal_aligned': api.files.get('5d375182e4b0359d9af6350f'),
    'input_normal_name': 'BS_JHMWZ3NH',
    'cfree_sex': 'XX',
    'cnvkit_sex': 'Female',
    'b_allele': api.files.get('5dcb0ab4e4b09d9a3247e994'),
})
task_name = "KFDRC SOMATIC PROD TEST: P-01 BS_GBT44HST BS_JHMWZ3NH"
task = api.tasks.create(
    name=task_name,
    project=project,
    app=app_name,
    inputs=in_dict,
    run=False,
)
# The task ID only exists after creation, so set the basename and save again.
task.inputs['output_basename'] = task.id
task.save()

<Task: id=2677e12f-a8ff-4cd2-a735-f6c02e9ff6d1>