In [4]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
config = sbg.Config(profile='turbo')
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])

In [5]:
def get_refs(api, project):
    ref_dict = {}
    ref_dict['axiomPoly_resource_vcf'] = api.files.query(project=project, names=['Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz'])[0]
    ref_dict['dbsnp_vcf'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dbsnp138.vcf'])[0]
    ref_dict['hapmap_resource_vcf'] = api.files.query(project=project, names=['hapmap_3.3.hg38.vcf.gz'])[0]
    ref_dict['mills_resource_vcf'] = api.files.query(project=project, names=['Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'])[0]
    ref_dict['omni_resource_vcf'] = api.files.query(project=project, names=['1000G_omni2.5.hg38.vcf.gz'])[0]
    ref_dict['one_thousand_genomes_resource_vcf'] = api.files.query(project=project, names=['1000G_phase1.snps.high_confidence.hg38.vcf.gz'])[0]    
    ref_dict['ref_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['reference_dict'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dict'])[0]
    ref_dict['unpadded_intervals_file'] = api.files.query(project=project, names=['hg38.even.handcurated.20k.intervals'])[0]
    ref_dict['vep_cache'] = api.files.query(project=project, names=['homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'])[0]
    ref_dict['wgs_evaluation_interval_list'] = api.files.query(project=project, names=['wgs_evaluation_regions.hg38.interval_list'])[0]
    return ref_dict

In [8]:
project = 'gaonkark/cbttc-dev'
app_name = project + '/kfdrc-single-genotyping-workflow'
ref_objs = get_refs(api, project)
manifest = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/1559320280191-manifest.csv')
head  = next(manifest)
for line in manifest:
    info = line.rstrip('\n').split(',')
    bs_id = info[12]
    case_id = info[-2]
    file_id = info[0]
    gvcf = api.files.get(file_id)
    in_dict = {}
    for key in ref_objs:
        in_dict[key] = ref_objs[key]
    in_dict['input_gvcf'] = [gvcf]
    in_dict['input_id'] = bs_id
    task_name = 'PNOC_SINGELTON_GENOTYPE: ' + bs_id + ' ' + case_id
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    print (task_name + '\t' + task.id)

PNOC_SINGELTON_GENOTYPE: BS_Q7R8BT07 P-07	6f331e9d-35ab-4809-8f1f-768a5cfd8e40
PNOC_SINGELTON_GENOTYPE: BS_29YQSB5E P-18	af215068-cfc9-47e8-8ccd-16194e10c12c
PNOC_SINGELTON_GENOTYPE: BS_STZ2C71Q P-06	88d4315d-c943-46f8-a0b5-602a76f4e115
PNOC_SINGELTON_GENOTYPE: BS_1GWZCWVG P-04	d9f1033b-cab7-48d3-92fe-1b2bb900b3f7
PNOC_SINGELTON_GENOTYPE: BS_QPSQPDR8 P-08	2d2215aa-8385-441d-a151-ba58c7fab609
PNOC_SINGELTON_GENOTYPE: BS_6GS4XT7F P-01	a5887808-14c8-46c8-9b0d-8cb8ddd8f85f
PNOC_SINGELTON_GENOTYPE: BS_HJ7HYZ7N P-04	381441a9-3735-4478-a319-a5c720d8b296
PNOC_SINGELTON_GENOTYPE: BS_9TSKXKGH P-12	c589ac85-5f2a-492f-b64c-09f764ebba9f
PNOC_SINGELTON_GENOTYPE: BS_Z370T42N P-10	014566cf-799b-4c0a-a368-cc91b319d93b
PNOC_SINGELTON_GENOTYPE: BS_NY9MPC8F P-11	a63947a2-9b84-4a25-aac1-54818cceebe7
PNOC_SINGELTON_GENOTYPE: BS_BKCPNFZ5 P-14	8118812c-51e8-4076-9876-30bdbd659e15
PNOC_SINGELTON_GENOTYPE: BS_MVYA262V P-02	bc0b9de9-59f3-4db6-b646-c67a34bc4588
PNOC_SINGELTON_GENOTYPE: BS_3PNWA7WT P-07	821d3a88-5

### Run vep vcf2maf tool

In [32]:
def get_maf_refs(api, project):
    ref_dict = {}
    ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['cache'] = api.files.query(project=project, names=['homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'])[0]
    ref_dict['tool_name'] = 'gatk_vqsr'
    return ref_dict

In [33]:
def draft_tasks(line):
    try:
        info = line.rstrip('\n').split(',')
        bs_id = info[11]
        sample_id = info[8]
        file_id = info[0]
        input_vcf = api.files.get(file_id)
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        in_dict['input_vcf'] = input_vcf
        in_dict['tumor_id'] = bs_id
        task_name = 'CBTTC_VCF2MAF: ' + bs_id + ' ' + sample_id
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
        return (task_name + '\t' + task.id + '\n')
    except Exception as e:
        print (e)

In [34]:
project = 'gaonkark/cbttc-dev'
app_name = project + '/kfdrc-vep-single-sample-annotate-maf'
ref_objs = get_maf_refs(api, project)
manifest = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/yiran_vcf_manifest.csv')
head  = next(manifest)
x = 1
n = 50
out_fh = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/vcf2maf_tasks.txt','w')
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(draft_tasks, line): line for line in manifest}
    for result in concurrent.futures.as_completed(results):
        if x % n == 0:
            sys.stderr.write(str(x) + ' tasks created, ' + str(api.remaining) + ' api calls left\n')
        x += 1
        out_fh.write(result.result())
out_fh.close()
    

50 tasks created, 9724 api calls left
100 tasks created, 9542 api calls left
150 tasks created, 9346 api calls left
200 tasks created, 9148 api calls left
250 tasks created, 8950 api calls left
300 tasks created, 8758 api calls left
350 tasks created, 8562 api calls left
400 tasks created, 8362 api calls left
450 tasks created, 8138 api calls left
500 tasks created, 7950 api calls left
550 tasks created, 7757 api calls left
600 tasks created, 7563 api calls left
650 tasks created, 7343 api calls left
700 tasks created, 7166 api calls left
750 tasks created, 6952 api calls left
800 tasks created, 6750 api calls left
850 tasks created, 6555 api calls left


### Run all tasks in list

In [9]:
task_list = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/full_single_tasks.txt')
for task in task_list:
    (tname, tid) = task.rstrip('\n').split('\t')
    task_obj = api.tasks.get(tid)
    task_obj.run()

### tag outputs with input data

In [44]:
task_list = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/full_single_tasks.txt')
for task in task_list:
    (tname, tid) = task.rstrip('\n').split('\t')
    task_obj = api.tasks.get(tid)
    #pdb.set_trace()
    metadata = task_obj.inputs['input_gvcf'][0].metadata
    for outkey in task_obj.outputs:
        if type(task_obj.outputs[outkey]) is list:
            for sub_file in task_obj.outputs[outkey]:
                for key in metadata:
                    sub_file.metadata[key] = metadata[key]
                
        else:
            cur_file = api.files.get(task_obj.outputs[outkey].id)
        for key in metadata:
            cur_file.metadata[key] = metadata[key]
        cur_file.save()

In [47]:
def tag_task_outputs(task):
    (tname, tid) = task.rstrip('\n').split('\t')
    task_obj = api.tasks.get(tid)
    #pdb.set_trace()
    metadata = task_obj.inputs['input_vcf'].metadata
    for outkey in task_obj.outputs:
        if type(task_obj.outputs[outkey]) is list:
            for sub_file in task_obj.outputs[outkey]:
                for key in metadata:
                    sub_file.metadata[key] = metadata[key]
                
        else:
            cur_file = api.files.get(task_obj.outputs[outkey].id)
        for key in metadata:
            cur_file.metadata[key] = metadata[key]
        cur_file.save()

In [48]:
task_list = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/vcf2maf_tasks.txt')
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(tag_task_outputs, task): task for task in task_list}
    for result in concurrent.futures.as_completed(results):
        if x % n == 0:
            sys.stderr.write(str(x) + ' outputs tagged, ' + str(api.remaining) + ' api calls left\n')
        x += 1

1800 outputs tagged, 8871 api calls left
1850 outputs tagged, 8320 api calls left
1900 outputs tagged, 7738 api calls left
1950 outputs tagged, 7163 api calls left
2000 outputs tagged, 6655 api calls left
2050 outputs tagged, 6123 api calls left
2100 outputs tagged, 5562 api calls left
2150 outputs tagged, 4992 api calls left
2200 outputs tagged, 4433 api calls left
2250 outputs tagged, 3877 api calls left
2300 outputs tagged, 3324 api calls left
2350 outputs tagged, 2787 api calls left
2400 outputs tagged, 2254 api calls left


2450 outputs tagged, 1649 api calls left
2500 outputs tagged, 1103 api calls left
2550 outputs tagged, 583 api calls left
2600 outputs tagged, 21 api calls left
2650 outputs tagged, 9455 api calls left


### tag input files using task info

In [16]:
def tag_file(file_obj):
    try:
        task = api.tasks.get(file_obj.origin.task)
        metadata = task.inputs['input_vcfs'].metadata
        for key in metadata:
            file_obj.metadata[key] = metadata[key]
        file_obj.save()
    except Exception as e:
        print (e)
    

In [17]:
input_manifest = open('/Users/brownm28/Documents/2019-May-31_singleton_gvcf_run/yiran_vcf_manifest.csv')
file_ids = []
head = next(input_manifest)
for line in input_manifest:
    info = line.rstrip('\n').split(',')
    file_ids.append(info[0])
file_objs = []
max_j = 100
total = len(file_ids)
for i in range(0, total, max_j):
    uset = i + max_j
    sys.stderr.write('Processing ' + str(uset) + ' set\n')
    if uset > total:
        uset = total
    subset = api.files.bulk_get(files=file_ids[i:uset])
    for obj in subset:
        file_objs.append(obj.resource)
x = 1
n = 50
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(tag_file, cur_file): cur_file for cur_file in file_objs}
    for result in concurrent.futures.as_completed(results):
        if x % n == 0:
            sys.stderr.write(str(x) + ' outputs tagged, ' + str(api.remaining) + ' api calls left\n')
        x += 1


Processing 100 set
Processing 200 set
Processing 300 set
Processing 400 set
Processing 500 set
Processing 600 set
Processing 700 set
Processing 800 set
Processing 900 set
50 outputs tagged, 9701 api calls left
100 outputs tagged, 9521 api calls left
150 outputs tagged, 9334 api calls left
200 outputs tagged, 9120 api calls left
250 outputs tagged, 8939 api calls left
300 outputs tagged, 8745 api calls left
350 outputs tagged, 8545 api calls left
400 outputs tagged, 8334 api calls left
450 outputs tagged, 8130 api calls left
500 outputs tagged, 7923 api calls left
550 outputs tagged, 7733 api calls left
600 outputs tagged, 7533 api calls left
650 outputs tagged, 7289 api calls left
700 outputs tagged, 7126 api calls left
750 outputs tagged, 6910 api calls left
800 outputs tagged, 6737 api calls left
850 outputs tagged, 6534 api calls left


In [15]:
test = api.files.get('5cf14c33e4b0c5cd4120b057')
pdb.set_trace()
hold=1

--Return--
> <ipython-input-15-d9c811b8f9a0>(2)<module>()->None
-> pdb.set_trace()
(Pdb) p dir(test)
['FOLDER_TYPE', '_API', '_URL', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_api', '_data', '_dirty', '_fields', '_modified_data', '_old', '_query', 'bulk_delete', 'bulk_edit', 'bulk_get', 'bulk_update', 'content', 'copy', 'copy_to_folder', 'create_folder', 'created_on', 'deepcopy', 'delete', 'download', 'download_info', 'equals', 'field', 'get', 'href', 'id', 'is_folder', 'list_files', 'metadata', 'modified_on', 'move_to_folder', 'name', 'origin', 'parent', 'project', 'query', 'reload', 'save', 'size', 'storage', 'stream', 'tags', 'type', 'upload']
(Pdb) p test.origin
<FileOrigin: task=7eaf

BdbQuit: 