In [1]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Build the API client from the 'turbo' configuration profile and register the
# two imported error handlers (rate-limit and maintenance sleepers) on it.
config = sbg.Config(profile='turbo')
api = sbg.Api(
    config=config,
    error_handlers=[
        rate_limit_sleeper,
        maintenance_sleeper,
    ],
)

In [19]:
def get_refs(api, project, read_len):
    """Collect the reference inputs shared by every drafted task.

    Each file is looked up by name in ``project`` and the first hit of the
    query is used.

    Args:
        api: client exposing ``files.query(project=..., names=[...])``.
        project: project identifier passed through to every query.
        read_len: value stored unchanged under the ``'rlen'`` key.

    Returns:
        dict with keys ``'reference'``, ``'ref_chrs'``, ``'interval_list'``
        (first query hit for each file name) and ``'rlen'``.
    """
    def first_match(file_name):
        # First element of the query result for a single file name.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'reference': first_match('Homo_sapiens_assembly38.fasta'),
        'ref_chrs': first_match('GRCh38_everyChrs.tar.gz'),
        'interval_list': first_match('GRCh38.d1.vd1.fa.150mer.merged.bed'),
        'rlen': read_len,
    }

In [20]:
def draft_task(case_id):
    """Draft (without running) one BIC-SEQ2 task per tumor sample of a case.

    Reads the module-level ``cram_meta``, ``cram_objs``, ``ref_objs``, ``api``,
    ``project`` and ``app_name``. Every tumor biospecimen listed for the case is
    paired with the first normal biospecimen listed for the same case. After a
    task is created its id is stored as the ``output_basename`` input and the
    task is saved.

    Args:
        case_id: key into ``cram_meta`` identifying the case.

    Returns:
        str: one ``"<task name>\\t<task id>\\n"`` line per drafted task,
        concatenated in tumor iteration order.

    Raises:
        SystemExit: with code 1, after the error and the task (or case) being
        processed have been written to stderr.
    """
    # Bound before the try block so the error handler can always name what
    # failed, even when the failure happens before the first task name is built.
    task_name = 'case ' + str(case_id)
    try:
        tumors = cram_meta[case_id]['Tumor']
        normals = cram_meta[case_id]['Normal']
        # The normal sample does not depend on the tumor, so resolve it once.
        norm_bs_id = list(normals.keys())[0]
        norm_cram_id = normals[norm_bs_id]
        drafted = []
        for bs_id in tumors:
            task_name = 'BIC-SEQ2: ' + case_id + " " + bs_id + " " + norm_bs_id
            tum_cram_id = tumors[bs_id]
            # Fresh copy per task so the shared reference dict is never mutated.
            in_dict = dict(ref_objs)
            in_dict['input_tumor_align'] = cram_objs[tum_cram_id]
            in_dict['input_normal_align'] = cram_objs[norm_cram_id]
            task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
            # The task id is only known after creation, hence the second save.
            task.inputs['output_basename'] = task.id
            task.save()
            drafted.append('\t'.join((task_name, task.id)) + '\n')
        # Return after the loop so that every tumor sample gets a task, not
        # just the first one.
        return ''.join(drafted)
    except Exception as e:
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write('Failed to create task for ' + task_name + '\n')
        sys.exit(1)


In [21]:
project='kfdrc-harmonization/proteomics-bic-seq2'
read_len = 150
app_name = project + '/kfdrc-bic-seq2-workflow'

# 0-based manifest columns holding the case id, biospecimen id and sample type.
c_idx = 18
bs_idx = 12
s_idx = 10

# cram_meta[case_id][sample_type][biospecimen_id] -> file id (manifest column 0)
cram_meta = {}
file_id_list = []
with open('/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/1558118472415-manifest.csv') as manifest:
    head = next(manifest)
    for line in manifest:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        info = line.rstrip('\n').split(',')
        (case_id, bs_id, stype) = (info[c_idx], info[bs_idx], info[s_idx])
        file_id_list.append(info[0])
        cram_meta.setdefault(case_id, {}).setdefault(stype, {})[bs_id] = info[0]

# Fetch file objects in batches of max_j ids per bulk_get call; slicing past
# the end of the list simply yields the shorter final batch.
file_bulk = []
max_j = 100
total = len(file_id_list)
for i in range(0, total, max_j):
    file_bulk.extend(api.files.bulk_get(files=file_id_list[i:i + max_j]))
cram_objs = {file_res.resource.id: file_res.resource for file_res in file_bulk}

ref_objs = get_refs(api, project, read_len)
x = 1
n = 50
# The context manager guarantees the task list is flushed and closed even if
# drafting stops part-way through.
with open('/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/tasks_drafted.txt', 'w') as out_task:
    with concurrent.futures.ThreadPoolExecutor(8) as executor:
        results = {executor.submit(draft_task, case_id): case_id for case_id in cram_meta}
        for result in concurrent.futures.as_completed(results):
            out_task.write(result.result())
            # Progress report every n completed cases.
            if x % n == 0:
                sys.stderr.write(str(x) + ' task sets drafted, ' + str(api.remaining) + ' api calls left\n')
            x += 1


50 task sets drafted, 9820 api calls left
100 task sets drafted, 9677 api calls left
150 task sets drafted, 9527 api calls left


## Copy metadata from tumor cram to outputs

In [5]:
def tag_outputs(task):
    """Copy tumor-input metadata onto every output file of ``task``.

    All metadata keys of the ``input_tumor_align`` input are copied to each
    output except those listed in the module-level ``black_list``. The tumor
    and normal biospecimen ids are then stored under dedicated keys and the
    output file is saved. Any exception is printed and not re-raised.

    Args:
        task: task object exposing ``inputs`` and ``outputs`` mappings.
    """
    try:
        tumor_meta = task.inputs['input_tumor_align'].metadata
        normal_bs = task.inputs['input_normal_align'].metadata['Kids First Biospecimen ID']
        for out_key in task.outputs:
            out_file = task.outputs[out_key]
            for meta_key in tumor_meta:
                if meta_key in black_list:
                    continue
                out_file.metadata[meta_key] = tumor_meta[meta_key]
            out_file.metadata['Kids First Biospecimen ID Tumor'] = task.inputs['input_tumor_align'].metadata['Kids First Biospecimen ID']
            out_file.metadata['Kids First Biospecimen ID Normal'] = normal_bs
            out_file.save()
    except Exception as err:
        print(err)


In [6]:
project='kfdrc-harmonization/proteomics-bic-seq2'
prefix = "BIC-SEQ2:"
# Metadata keys that tag_outputs must not copy from the tumor input.
black_list = ['sample_id', 'aliquot_id', 'Kids First Biospecimen ID', 'sample_type']
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# Tag the outputs of every completed task using 8 worker threads; leaving the
# with-block waits for all submitted jobs to finish.
with concurrent.futures.ThreadPoolExecutor(8) as executor:
    results = {}
    for task in tasks:
        results[executor.submit(tag_outputs, task)] = task
        

## Assess failed tasks

In [25]:
project='kfdrc-harmonization/proteomics-bic-seq2'
log_dir = '/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/'
# .all() walks every result page; iterating the bare query result would only
# cover the first page of failed tasks (the COMPLETED query in this notebook
# already uses .all()).
tasks = api.tasks.query(project=project, status="FAILED").all()
for task in tasks:
    print (task.id)
    for job in task.get_execution_details().jobs:
        if job.status != 'FAILED':
            continue
        # Download the stderr log of each failed job, named
        # <task id>_<job name>.<log file name>.
        log_obj = api.files.get(id=job.logs['job.err.log'].id)
        log_obj.download(log_dir + task.id + '_' + job.name + '.' + log_obj.name)

77656683-f06e-425a-b7a4-29f5c31a4f24
7b5b391b-19a7-493b-a1d2-88bf48b10669
01a62537-ccb9-48f1-bf62-5c1161966388
c5ff6a45-5dff-40a2-9eca-aff1ee8b2f6f
fb332b06-f6b6-47a5-b5d2-b9ae29d9b730
c4f818f6-bda8-4d6d-9c52-8c21c72ffb4e
f060a561-fc9e-4273-838e-8859daa58772
5da3de1f-7fb2-446d-acae-e69e307b3441
fe7fc999-64a2-4d8f-ad8e-a38b8967495f
794d79db-f185-49f1-bc62-f5709255aa66
820c0ad2-5ed4-4e89-b509-efebd4d21115
b6a0968f-8294-40b5-9d58-d2a1b6c65499
7b6620da-bb63-43a8-874d-7daba2868ebe
95a13810-a92f-451c-9f6b-554645056b49
ecc59649-a797-4daa-bfa1-1b211fd3eda5
a510bb99-8d56-44e7-b987-2e8fdb71f2dd
2d33e964-a033-41a1-8bbb-f704c3f53c93
334456b7-bfec-4806-961a-1e05e66b61cd
c5ceb0af-fb5b-4045-80bb-715f82f89322
f7635469-eb56-45e3-aa6a-42ae5fad1502
08861d17-6e38-4aa1-b607-93cd0f748a8c
aca45a5c-dec0-41ab-9330-1c1fd3282b49
401bc480-1b5f-49db-b596-c955907fe1bc
5958b219-3452-4177-a6c6-6a9dbf042f76
6f3907dd-7ff4-4b0f-86e1-2286ab426799
39604bec-5d3a-43c7-8607-daee4a03fa47
dec382cf-95b3-4e38-bb0e-096a3d0c0be2
0

## Re-run failed tasks

In [5]:
def get_fa_refs(api, project):
    """Look up the two FASTA-related reference files in ``project``.

    Args:
        api: client exposing ``files.query(project=..., names=[...])``.
        project: project identifier passed through to every query.

    Returns:
        dict mapping ``'reference'`` and ``'ref_chrs'`` to the first query hit
        for the corresponding file name.
    """
    wanted = (
        ('reference', 'Homo_sapiens_assembly38.fasta'),
        ('ref_chrs', 'GRCh38_everyChrs.tar.gz'),
    )
    return {
        input_key: api.files.query(project=project, names=[file_name])[0]
        for input_key, file_name in wanted
    }

In [6]:
def get_invtl_refs(api, project):
    """Look up the interval BED files in ``project``, keyed by a string label.

    Args:
        api: client exposing ``files.query(project=..., names=[...])``.
        project: project identifier passed through to every query.

    Returns:
        dict mapping the strings ``'150'`` and ``'100'`` to the first query hit
        for the corresponding BED file name.
    """
    bed_by_label = (
        ('150', 'GRCh38.d1.vd1.fa.150mer.merged.bed'),
        ('100', 'hg38_100bp_gem_mm2_mappability.merged.bed'),
    )
    return {
        label: api.files.query(project=project, names=[bed_name])[0]
        for label, bed_name in bed_by_label
    }

In [7]:
project='kfdrc-harmonization/proteomics-bic-seq2'
ref_fa = get_fa_refs(api, project)
ref_intvl = get_invtl_refs(api, project)
app_name = project + '/kfdrc-bic-seq2-workflow'
# Tab-separated summary: column 0 = task name, column 1 = id of the old task,
# columns 7 and 8 = tumor and normal values used as t_rlen / n_rlen.
# The context manager closes the file once all tasks are re-drafted.
with open ('/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/rlen_fail_re-run/task_summary.txt') as task_list:
    for line in task_list:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        info = line.rstrip('\n').split('\t')
        task_name = info[0]
        old_task_id = info[1]
        old_task = api.tasks.get(old_task_id)
        t_rlen = info[7]
        n_rlen = info[8]
        # Start from a copy of the shared FASTA references.
        in_dict = dict(ref_fa)
        in_dict['t_rlen'] = int(t_rlen)
        in_dict['n_rlen'] = int(n_rlen)
        # ref_intvl is keyed by the string form of the value ('150' / '100').
        in_dict['t_interval_list'] = ref_intvl[t_rlen]
        in_dict['n_interval_list'] = ref_intvl[n_rlen]
        # Reuse the alignment inputs of the original task.
        in_dict['input_tumor_align'] = old_task.inputs['input_tumor_align']
        in_dict['input_normal_align'] = old_task.inputs['input_normal_align']
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        # The task id is only known after creation, hence the second save.
        task.inputs['output_basename'] = task.id
        task.save()
        print ('\t'.join((task_name, task.id)))


BIC-SEQ2: C28659 BS_HW4MYBZ2 BS_8DTDM1MK	39ba4d07-2a3c-4202-a02f-dad4540936ab
BIC-SEQ2: C82041 BS_ZX1YPQ88 BS_F4V6RWJ9	d288d55c-1bcc-4d34-9847-5f101297bd1b
BIC-SEQ2: C33825 BS_16FT8V4B BS_APFT88Y3	9f2ac3a6-4a6d-4f63-83d9-009a91fe7925
BIC-SEQ2: C39606 BS_17AXPP1Y BS_4KXBM5J8	2f5d2ec4-c082-4d03-8960-dd17f302d333
BIC-SEQ2: C71094 BS_S7S6YEHA BS_ZDSXV5N1	d943e27e-60c3-4709-ac59-75a1303481e8
BIC-SEQ2: C65559 BS_K14VJ1E3 BS_6PFQJMM3	518e4efc-c505-403a-8b94-5c5c2aefe850
BIC-SEQ2: C70971 BS_MWXDJFWW BS_WCQW99V3	dad8ef56-dd7d-443f-82a2-86883682f09a
BIC-SEQ2: C29151 BS_XSWF2MR1 BS_4RT8H37A	3f9a8e6c-d546-47e3-bf4e-dc3ea49b2f40
BIC-SEQ2: C41451 BS_5Z4XQC9X BS_JNFPEM6F	6000b8f1-3be5-4309-98c6-3dfa3a673728


In [8]:
# Start every re-drafted task listed as "<task name>\t<task id>" per line.
# The context manager closes the file when the loop is done.
with open('/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/rlen_fail_re-run/rerun-tasks.txt') as task_file:
    for line in task_file:
        if not line.strip():
            # A blank line would otherwise break the two-field unpacking below.
            continue
        (tname, tid) = line.rstrip('\n').split('\t')
        task = api.tasks.get(tid)
        task.run()

### Get matched Control-FREEC calls

In [None]:
project = 'kfdrc-harmonization/sd-bhjxbdqk-08'
# For every manifest row, look up the file tagged CONTROLFREEC whose tumor
# biospecimen metadata matches column 14 and print "<file id>\t<file name>".
with open('/Users/brownm28/Documents/2019-May-17_proteomics_bicseq2/completed_results.csv') as manifest:
    head = next(manifest)
    for line in manifest:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        info = line.rstrip('\n').split(',')
        flist = api.files.query(project=project, metadata = {'Kids First Biospecimen ID Tumor': info[14]}, tags = ['CONTROLFREEC'] )
        try:
            matched = flist[0]
        except IndexError:
            # No matching file: report it and keep going instead of aborting.
            sys.stderr.write('No CONTROLFREEC file found for ' + info[14] + '\n')
            continue
        print(matched.id + '\t' + matched.name)
