In [4]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Build the API client used by every cell below from the 'turbo' profile.
config = sbg.Config(profile='turbo')
# NOTE(review): the two handlers presumably sleep and retry on rate-limit and
# maintenance responses -- confirm against the sevenbridges documentation.
api = sbg.Api(
    config=config,
    error_handlers=[
        rate_limit_sleeper,
        maintenance_sleeper,
    ],
)

In [None]:
def get_refs(api, project, exome_flag, calling_list, mode):
    """Collect the reference inputs shared by every Mutect2 task in a project.

    Args:
        api: Client exposing ``files.query(project=..., names=[...])``.
        project: Project in which the reference files are looked up.
        exome_flag: Stored unchanged under ``'exome_flag'``.
        calling_list: File name looked up for ``'wgs_calling_interval_list'``.
        mode: Stored unchanged under ``'select_vars_mode'``.

    Returns:
        dict mapping workflow input names to the first file matching each
        file name, plus the two pass-through values.

    Raises:
        IndexError: If a file name has no match in ``project``. The message
            names the missing file instead of the bare "index out of range".
    """
    def first_match(file_name):
        # One query per file name; only the first hit is used.
        matches = api.files.query(project=project, names=[file_name])
        try:
            return matches[0]
        except IndexError as err:
            raise IndexError(
                'Reference file {!r} not found in project {!r}'.format(file_name, project)
            ) from err

    ref_dict = {}
    ref_dict['indexed_reference_fasta'] = first_match('Homo_sapiens_assembly38.fasta')
    ref_dict['reference_dict'] = first_match('Homo_sapiens_assembly38.dict')
    ref_dict['af_only_gnomad_vcf'] = first_match('af-only-gnomad.hg38.vcf.gz')
    ref_dict['exac_common_vcf'] = first_match('small_exac_common_3.hg38.vcf.gz')
    ref_dict['wgs_calling_interval_list'] = first_match(calling_list)
    ref_dict['exome_flag'] = exome_flag
    ref_dict['vep_cache'] = first_match('homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz')
    ref_dict['select_vars_mode'] = mode
    return ref_dict

In [None]:
def get_inputs(src_task, prefix):
    """Create a draft task that repeats a source task with the current references.

    Reads these notebook-level names: ``api``, ``project``, ``app_name``,
    ``ref_objs``, ``task_dict``, ``task_prefix``, ``suffix`` and the
    ``src_*`` input key names.

    Args:
        src_task: Source task; its ``name``, ``id`` and ``inputs`` are read.
        prefix: Regex searched for in ``src_task.name``.

    Returns:
        ``'<task name>\\t<task id>\\n'`` for the created draft task, or ``None``
        when the name does not match ``prefix``/``suffix`` or the new task
        name was already seen.

    Raises:
        SystemExit: With code 1, after logging, if anything fails while
            building or creating the task.
    """
    if not re.search(prefix, src_task.name):
        return None
    if suffix is not None and not re.search(suffix, src_task.name):
        return None
    try:
        tumor_id = src_task.inputs[src_tumor_id]
        normal_id = src_task.inputs[src_normal_id]
        new_task_name = task_prefix + tumor_id + " " + normal_id
        # NOTE(review): this check-then-insert on the shared task_dict is not
        # guarded by a lock, yet the function is submitted to a thread pool;
        # confirm duplicates cannot slip through.
        if new_task_name in task_dict:
            sys.stderr.write('Duplicated, check inputs for ' + new_task_name + '\n')
            return None
        task_dict[new_task_name] = 1
        # Copy so the shared reference dict is never mutated.
        inputs = dict(ref_objs)
        inputs['input_tumor_name'] = tumor_id
        inputs['input_normal_name'] = normal_id
        inputs['input_tumor_aligned'] = src_task.inputs[src_tumor_align]
        inputs['input_normal_aligned'] = src_task.inputs[src_normal_align]
        task = api.tasks.create(name=new_task_name, project=project, app=app_name, inputs=inputs, run=False)
        # The task id only exists after creation, so the basename is set afterwards.
        task.inputs['output_basename'] = task.id
        task.save()
        return task.name + '\t' + task.id + '\n'
    except Exception as e:
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write('Failed to parse and process input task ' + src_task.name + ' ' + src_task.id + '\n')
        # sys.exit instead of the interactive-only exit() helper.
        sys.exit(1)

In [None]:
# Set up draft repeat ("RPT") Mutect2 tasks from completed tasks in this project.
project = 'kfdrc-harmonization/sd-bhjxbdqk-10'
exome_flag = 'N'
# set input names as different versions of pipe have them named differently
src_tumor_align = 'input_tumor_aligned'
src_normal_align = 'input_normal_aligned'
src_tumor_id = 'input_tumor_name'
src_normal_id = 'input_normal_name'
task_prefix = 'CNMC_MUTECT2_SOMATIC RPT: '
prefix = 'CNMC_MUTECT2_SOMATIC:'
suffix = None

# NOTE(review): get_refs as defined earlier in this notebook takes
# (api, project, exome_flag, calling_list, mode), so this three-argument call
# raises TypeError against it. The intended calling_list and mode for this
# run are not visible here -- supply them before running.
ref_objs = get_refs(api, project, exome_flag)

tasks = api.tasks.query(project=project, status='COMPLETED').all()
app_name = project + '/kfdrc-mutect2-wf'
i = 1
n = 50
task_dict = {}
# The with-block closes the output file even if a worker raises.
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/pnoc_wgs_tasks.txt', 'w') as out_fh:
    with concurrent.futures.ThreadPoolExecutor(8) as executor:
        results = {executor.submit(get_inputs, task, prefix): task for task in tasks}
        for result in concurrent.futures.as_completed(results):
            # Progress message every n finished futures.
            if i % n == 0:
                sys.stderr.write(str(i) + ' tasks set up\n')
            i += 1
            task_line = result.result()
            if task_line is not None:
                out_fh.write(task_line)


### get error logs

In [None]:
# Download the job.err.log of every FAILED job for each task id listed in the file.
project = 'kfdrc-harmonization/sd-bhjxbdqk-8'
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/mutect2_fail_task_ids.txt') as fail_ids:
    for line in fail_ids:
        tid = line.strip()
        if not tid:
            # Skip blank lines rather than requesting a task with an empty id.
            continue
        task = api.tasks.get(tid)
        for job in task.get_execution_details().jobs:
            if job.status == 'FAILED':
                log_obj = api.files.get(id=job.logs['job.err.log'].id)
                # Saved as <task id>_<job name>.<log file name>.
                log_obj.download('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/' + tid + '_' + job.name + '.' + log_obj.name)

### remove failed run outputs

In [None]:
# List FAILED tasks, then log every project file whose name starts with a
# failed task id. Deletion itself stays commented out, so this is a dry run.
project = 'kfdrc-harmonization/sd-bhjxbdqk-08'
tasks = api.tasks.query(project=project, status='FAILED').all()
files = api.files.query(project=project).all()
fail_list = []
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/2019-04-16_1538_FAILED.txt', 'w') as out_fail:
    out_fail.write('Task ID\tTask Name\n')
    for task in tasks:
        fail_list.append(task.id)
        out_fail.write(task.id + '\t' + task.name + '\n')
# Set lookup avoids rescanning the whole list for every file.
failed_ids = set(fail_list)
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/2019-04-16_1538_FAILED_DEL.log', 'w') as del_log:
    for fobj in files:
        # The text before the first '.' in the file name is compared to task ids.
        parts = fobj.name.split('.')
        if parts[0] in failed_ids:
            del_log.write('DELETING file from failed task ' + parts[0] + ' ' + fobj.name + '\n')
            del_log.flush()
            # fobj.delete()

### remove deprecated run outputs

In [None]:
# Record COMPLETED tasks whose name marks them as deprecated CBTTC Mutect2 runs.
project = 'kfdrc-harmonization/sd-bhjxbdqk-08'
tasks = api.tasks.query(project=project, status='COMPLETED').all()
# Only used by the commented-out deletion pass below.
files = api.files.query(project=project).all()
fail_list = []
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/2019-04-17_1530_DEPRECATED.txt', 'w') as out_fail:
    # Header matches the three columns written per task.
    out_fail.write('Task ID\tTask Name\tTask Status\n')
    for task in tasks:
        if re.search('CBTTC_MUTECT2_SOMATIC:', task.name):
            fail_list.append(task.id)
            out_fail.write(task.id + '\t' + task.name + '\t' + task.status + '\n')
#pdb.set_trace()
# del_log = open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/2019-04-16_1600_DEPRECATED_DEL.log', 'w')
# for fobj in files:
#     parts = fobj.name.split('.')
#     if parts[0] in fail_list:
#         # pdb.set_trace()
#         del_log.write('DELETING file from failed task ' + parts[0] + ' ' + fobj.name + '\n')
#         del_log.flush()
#         fobj.delete()
#         # break
# del_log.close()

In [None]:
# Write name and id of every COMPLETED task whose name contains PNOC_WGS_MUTECT2.
project = 'kfdrc-harmonization/sd-bhjxbdqk-09'
tasks = api.tasks.query(project=project, status='COMPLETED').all()
# The with-block closes the file even if the task iteration raises.
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/pnoc_tasks.txt', 'w') as out_fh:
    for task in tasks:
        if re.search('PNOC_WGS_MUTECT2', task.name):
            out_fh.write(task.name + '\t' + task.id + '\n')

## PNOC WES Mutect2 Run

In [None]:
# Create one draft Mutect2 task per tumor/normal pair listed in the manifest.
project = 'kfdrc-harmonization/sd-m3dbxd12'
calling_list = 'Strexome_targets_intersect_sorted_padded100.GRCh38.bed.gz'
exome_flag = 'Y'
mode = 'gatk'
ref_objs = get_refs(api, project, exome_flag, calling_list, mode)

# Manifest is tab-separated; the first two columns are used as the pair.
tn_pairs = []
with open('/Users/brownm28/Documents/2019-Feb-27_cbttc_ngs_checkmate/pnoc_wes/tn_pairs_from_tasks.txt') as manifest:
    head = next(manifest)  # skip the header row
    for line in manifest:
        info = line.rstrip('\n').split('\t')
        tn_pairs.append([info[0], info[1]])

# Index the tagged CRAM files by their 'Kids First Biospecimen ID' metadata value.
files = api.files.query(project=project).all()
cram_file_objs = {}
for file_obj in files:
    if file_obj.name[-4:] == 'cram' and 'Reharmonization' in file_obj.tags:
        cram_file_objs[file_obj.metadata['Kids First Biospecimen ID']] = file_obj

app_name = project + '/kfdrc-mutect2-wf'
with open('/Users/brownm28/Documents/2019-Apr-24_pnoc_wes_mutect2/pnoc_wes_tasks.txt', 'w') as out_fh:
    for pair in tn_pairs:
        # Fresh copy per task: aliasing ref_objs would carry the previous
        # task's output_basename into the next task's inputs.
        inputs = dict(ref_objs)
        inputs['input_tumor_name'] = pair[0]
        inputs['input_normal_name'] = pair[1]
        inputs['input_tumor_aligned'] = cram_file_objs[pair[0]]
        inputs['input_normal_aligned'] = cram_file_objs[pair[1]]
        task_name = 'PNOC_WES_MUTECT2_SOMATIC: ' + pair[0] + ' ' + pair[1]
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
        # Set the basename on the task itself so task.save() persists it.
        task.inputs['output_basename'] = task.id
        task.save()
        out_fh.write(task_name + '\t' + task.id + '\n')


In [None]:
## manual run tasks

In [None]:
# Start every task from the task list that is still in DRAFT status.
with open('/Users/brownm28/Documents/2019-Apr-24_pnoc_wes_mutect2/pnoc_wes_tasks.txt') as task_list:
    for task_info in task_list:
        if not task_info.strip():
            # A blank line would fail the two-column unpacking below.
            continue
        (tname, tid) = task_info.rstrip('\n').split('\t')
        task = api.tasks.get(tid)
        if task.status == 'DRAFT':
            try:
                task.run()
            except Exception as e:
                # Best effort: report and continue with the next task.
                sys.stderr.write('Could not run task becaause of ' + str(e) + '\n')

In [None]:
### compare task ID with output IDs

In [None]:
# Print task name, task id and the name of the 'mutect2_vep_vcf' output for
# each COMPLETED task whose name begins with the prefix.
prefix = 'PNOC_WES_MUTECT2_SOMATIC'
project = 'kfdrc-harmonization/sd-m3dbxd12'
tasks = api.tasks.query(status='COMPLETED', project=project).all()
for task in tasks:
    if not re.match(prefix, task.name):
        continue
    print('\t'.join([task.name, task.id, task.outputs['mutect2_vep_vcf'].name]))

## PNOC WGS Single Genotype Run

In [None]:
def get_refs(api):
    """Look up the reference files for the single-genotype workflow.

    Each file is queried by name in the notebook-level ``project``; the first
    match is stored under its workflow input name.

    NOTE(review): this redefines the ``get_refs`` declared earlier in the
    notebook with a different signature, so cells run after this one call
    this version.

    Args:
        api: Client exposing ``files.query(project=..., names=[...])``.

    Returns:
        dict mapping workflow input names to file objects.
    """
    # (workflow input name, reference file name), in lookup order.
    wanted = (
        ('axiomPoly_resource_vcf', 'Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz'),
        ('reference_dict', 'Homo_sapiens_assembly38.dict'),
        ('dbsnp_vcf', 'Homo_sapiens_assembly38.dbsnp138.vcf'),
        ('hapmap_resource_vcf', 'hapmap_3.3.hg38.vcf.gz'),
        ('mills_resource_vcf', 'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'),
        ('omni_resource_vcf', '1000G_omni2.5.hg38.vcf.gz'),
        ('one_thousand_genomes_resource_vcf', '1000G_phase1.snps.high_confidence.hg38.vcf.gz'),
        ('ref_fasta', 'Homo_sapiens_assembly38.fasta'),
        ('ref_tar_gz', 'hg38_snpeff.tgz'),
        ('unpadded_intervals_file', 'hg38.even.handcurated.20k.intervals'),
        ('wgs_evaluation_interval_list', 'wgs_evaluation_regions.hg38.interval_list'),
    )
    ref_dict = {}
    for input_name, file_name in wanted:
        ref_dict[input_name] = api.files.query(project=project, names=[file_name])[0]
    return ref_dict

In [None]:
# Create one draft single-genotype task per row of the gVCF manifest.
project = 'kfdrc-harmonization/sd-bhjxbdqk-09'
ref_obj = get_refs(api)
app_name = project + "/kf-single-genotype"
with open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/controlfreec/pnoc_snp_Call/gvcf-manifest.csv') as manifest:
    head = next(manifest)  # skip the header row
    for line in manifest:
        if not line.strip():
            # A blank row would fail the column lookups below.
            continue
        info = line.rstrip('\n').split(',')
        # Copy so the shared reference dict is never mutated.
        in_dict = dict(ref_obj)
        # Column 0 is passed to api.files.get; columns 6 and 12 form the task name.
        in_dict['input_vcfs'] = api.files.get(info[0])
        task_name = "SINGLE GENOTYPE GATK: " + info[6] + " " + info[12]
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        # The task id only exists after creation, so the basename is set afterwards.
        task.inputs['output_vcf_basename'] = task.id
        task.save()

## Rerun VCF2MAF

In [1]:
def get_vep_refs(api):
    """Build the inputs shared by every VCF2MAF re-run task.

    Reads the notebook-level ``project``, ``tool_name`` and ``strip_info``.

    Args:
        api: Client exposing ``files.query`` and ``files.get``.

    Returns:
        dict with keys ``'reference'``, ``'cache'``, ``'tool_name'`` and
        ``'strip_info'``.

    Raises:
        Exception: Any lookup failure is logged to stderr and re-raised, so
            the caller never receives ``None`` in place of the references.
    """
    try:
        ref_dict = {}
        ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
        #ref_dict['cache'] = api.files.query(project=project, names=['homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'])[0]
        # The cache is fetched by a fixed file id rather than by name.
        ref_dict['cache'] = api.files.get('5d701792e4b0950c45b0d798')
        ref_dict['tool_name'] = tool_name
        ref_dict['strip_info'] = strip_info
        return ref_dict
    except Exception as e:
        sys.stderr.write('Failed setting up refs: ' + str(e) + '\n')
        raise


In [2]:
def set_up_vep_task(line):
    """Create one draft VCF2MAF task from a comma-separated manifest row.

    Reads the notebook-level ``api``, ``project``, ``app_name``, ``ref_objs``,
    ``task_prefix`` and the column indices ``tid`` and ``nid``.

    Args:
        line: One manifest row. Column 0 is passed to ``api.files.get``;
            columns ``tid`` and ``nid`` supply the tumor and normal ids.

    Returns:
        None. Failures are logged to stderr and not raised, so one bad row
        does not stop the remaining rows.
    """
    try:
        info = line.rstrip('\n').split(',')
        # Copy so the shared reference dict is never mutated.
        in_dict = dict(ref_objs)
        in_dict['tumor_id'] = info[tid]
        in_dict['normal_id'] = info[nid]
        in_dict['input_vcf'] = api.files.get(info[0])
        task_name = task_prefix + " " + info[tid] + " " + info[nid]
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        # The task id only exists after creation, so the basename is set afterwards.
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        # A single newline-terminated write keeps each error on its own line
        # and says which row failed.
        sys.stderr.write('Failed setting up task for manifest line ' + repr(line) + ': ' + str(e) + '\n')
#         pdb.set_trace()
#         hold = 1

In [5]:
# Create draft VCF2MAF re-run tasks for every row of the Lancet WGS manifest.
project = 'kfdrc-harmonization/pbta-lancet-vardict-analysis'
app_name = project + '/temp-rerun-vcf2maf'
tool_name = 'lancet'
strip_info = "INFO/CSQ"
task_prefix = "VCF2MAF LANCET WGS PBTA RPT:"
ref_objs = get_vep_refs(api)
if ref_objs is None:
    # Without the references every worker would fail on the same missing value.
    raise RuntimeError('Reference inputs could not be set up; no tasks were created')
manifest_fn = '/Users/brownm28/Documents/2019-Nov-12_PBTA_v10/pbta_lancet_WGS-manifest.csv'
# Column positions of the tumor and normal ids, counted from the end of each row.
tid = -5
nid = -7
with open(manifest_fn) as manifest:
    head = next(manifest)
    header = head.rstrip('\n').split(',')
    # The executor's with-block waits for all submitted rows before the file closes.
    with concurrent.futures.ThreadPoolExecutor(8) as executor:
        results = {executor.submit(set_up_vep_task, line): line for line in manifest}
# for line in manifest:
#     set_up_vep_task(line)
