In [1]:
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import pdb
import concurrent.futures
from requests import request
# Build the client configuration from the profile named 'turbo'
# (presumably an entry in the local sevenbridges credentials file — confirm).
config = sbg.Config(profile='turbo')
# Single API client shared by every cell below. The two error handlers are
# imported from sevenbridges.http.error_handlers; by their names they appear to
# wait and retry on rate-limit / maintenance responses — TODO confirm.
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])

### Process tasks and tag files

In [17]:
def tag_file(fobj, r_dict, task):
    """Copy metadata values onto a file object in memory.

    Every key/value pair in ``r_dict`` is assigned to ``fobj.metadata``.  When
    the module-level ``somatic_flag`` equals 1, the value of
    ``task.inputs[norm_key]`` (``norm_key`` is also module-level) is stored
    under 'Kids First Biospecimen ID Normal' as well.

    The file is not saved here; ``process_task`` pushes the changes with a
    bulk update afterwards.

    Args:
        fobj: file object exposing ``metadata`` (item assignment) and ``name``.
        r_dict: mapping of metadata field name to value for this file.
        task: task object exposing ``inputs`` (indexable by ``norm_key``).

    Raises:
        SystemExit: with code 1 after the failure has been written to the
            module-level ``log_file``.
    """
    try:
        for field, value in r_dict.items():
            fobj.metadata[field] = value
        # Somatic runs also record which normal sample the tumor was paired with.
        if somatic_flag == 1:
            normal_id = task.inputs[norm_key]
            fobj.metadata['Kids First Biospecimen ID Normal'] = normal_id
    except Exception as err:
        message = 'Could not tag ' + fobj.name + ', got error ' + str(err) + '\n'
        log_file.write(message)
        log_file.flush()
        sys.exit(1)
            

In [18]:
def process_task(api, r_dict, task, run_pre, id_pre, log_file):
    """Tag every output file of one task with the metadata for its sample.

    The task is handled only when its name matches both ``run_pre`` and
    ``id_pre``; the text matched by ``id_pre`` is used as the key into
    ``r_dict``.  Output file ids are collected (single files and lists of
    files), fetched with one bulk request, tagged in memory via ``tag_file``
    and written back with one bulk update.

    Args:
        api: client exposing ``files.bulk_get`` and ``files.bulk_update``.
        r_dict: mapping of sample id -> {metadata field: value}.
        task: task object exposing ``id``, ``name`` and ``outputs``.
        run_pre: regex a task name must match to be processed.
        id_pre: regex whose match in the task name is the sample id.
        log_file: writable text stream for progress and error messages.

    Raises:
        SystemExit: after any exception has been written to ``log_file``.
    """
    try:
        # Search each pattern once and reuse the id match.
        id_parse = re.search(id_pre, task.name)
        if not (re.search(run_pre, task.name) and id_parse):
            return
        id_value = id_parse.group(0)
        log_file.write('Tagging files from ' + task.id + ' ' + task.name + '\n')
        log_file.flush()

        id_list = []
        for output in task.outputs:
            value = task.outputs[output]
            if value is None:
                log_file.write('No file for key ' + output + ', skipping' + '\n')
                continue
            if isinstance(value, list):
                # Defensive: skip empty slots inside a list output.
                id_list.extend(subfile.id for subfile in value if subfile is not None)
            else:
                id_list.append(value.id)

        if not id_list:
            # Nothing to fetch or update; avoid sending empty bulk requests.
            log_file.write('No output files to tag for ' + task.id + ', skipping bulk update\n')
            log_file.flush()
            return

        task_files = api.files.bulk_get(id_list)
        fobj_list = []
        for task_file in task_files:
            tag_file(task_file.resource, r_dict[id_value], task)
            fobj_list.append(task_file.resource)
        api.files.bulk_update(fobj_list)
    except Exception as e:
        log_file.write('Error processing task ' + task.id + ' ' + task.name + ' with error: ' + str(e) + '\n')
        log_file.flush()
        sys.exit()
    

### Initialize metadata dict

In [22]:
# Variables that may need editing for each run.
# NOTE: both file paths below are absolute and machine-specific.
metadata = open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/metadata_tagging/pnoc_wes_metadata.txt')
project = 'kfdrc-harmonization/sd-m3dbxd12'
# Regex a task name must match to be tagged.
run_pre = 'PNOC_WES_MUTECT2_SOMATIC'
# Regex that extracts the sample id from the task name. Raw string so that \w
# reaches the regex engine instead of being parsed as an (invalid) string escape.
id_pre = r'BS_\w+'
# Task input key holding the normal sample id (read by tag_file).
norm_key = 'input_normal_name'
# parse sample ID rnaseq style - with 7316 only (set to 1), or as-is in DS (set to 0)
samp_flag = 0
# if somatic, will grab normal id from task
somatic_flag = 1
# Opened in append mode so repeated runs accumulate in one log.
log_file = open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/metadata_tagging/pnoc_wes_tagging.log', 'a')

# Table column header -> metadata field name (RNA-style tagging).
rna_tblhead2sbg_dict = {
    'BS_ID': 'Kids First Biospecimen ID',
    'PT_ID': 'Kids First Participant ID',
    'external_aliquot_id': 'aliquot_id',
    'source_text_tissue_type': 'sample_type',
    'source_text_tumor_descriptor': 'Tumor Descriptor',
    'composition': 'Composition',
    'external_sample_id': 'sample_id',
    'external_id': 'case_id',
    'gender': 'gender',
    'ethnicity': 'ethnicity',
    'race': 'race',
    'source_text_diagnosis': 'disease_type',
    'source_text_tumor_location': 'primary_site',
    'age_at_event_days': 'age_at_diagnosis',
    'platform': 'platform',
    'experiment_strategy': 'experimental_strategy',
}

# SOMATIC CALLER MODE
# Table column header -> metadata field name; BS_ID is recorded as the tumor id.
somatic_tblhead2sbg_dict = {
    'BS_ID': 'Kids First Biospecimen ID Tumor',
    'PT_ID': 'Kids First Participant ID',
    'external_aliquot_id': 'aliquot_id',
    'source_text_tissue_type': 'sample_type',
    'source_text_tumor_descriptor': 'Tumor Descriptor',
    'composition': 'Composition',
    'external_sample_id': 'sample_id',
    'external_id': 'case_id',
    'gender': 'gender',
    'ethnicity': 'ethnicity',
    'race': 'race',
    'source_text_diagnosis': 'disease_type',
    'source_text_tumor_location': 'primary_site',
    'age_at_event_days': 'age_at_diagnosis',
}

# Mapping actually used by the parsing code below.
tblhead2sbg_dict = somatic_tblhead2sbg_dict

# Sample id (first table column) -> {metadata field: value}.
r_dict = {}
# Values applied to every sample; they override same-named table columns.
# NOTE(review): run_pre and the file paths refer to a WES run, yet
# experimental_strategy is fixed to 'WGS' here — confirm this is intended.
fixed_values = {'reference_genome': 'GRCh38', 'experimental_strategy': 'WGS', 'platform': 'Illumina' }
head = next(metadata)
header = head.rstrip('\n').split('\t')


def _earliest_age(raw_ages):
    """Return the smallest age in a ';'-separated string, or None if none is usable."""
    if raw_ages == 'None' or raw_ages == '':
        return None
    # Drop every placeholder, not just the first one, before converting.
    ages = [int(age) for age in raw_ages.split(';') if age not in ('None', '')]
    return min(ages) if ages else None


for line in metadata:
    # A blank (e.g. trailing) line would otherwise create an entry keyed ''.
    if not line.strip():
        continue
    info = line.rstrip('\n').split('\t')
    bs_id = info[0]
    r_dict[bs_id] = {}
    for i in range(0, len(header)):
        if header[i] in tblhead2sbg_dict:
            try:
                if header[i] == 'age_at_event_days':
                    r_dict[bs_id][tblhead2sbg_dict[header[i]]] = _earliest_age(info[i])
                elif header[i] == 'external_sample_id' and samp_flag == 1:
                    # Keep only the first two '-'-separated parts of the sample id.
                    samp_parts = info[i].split('-')
                    r_dict[bs_id][tblhead2sbg_dict[header[i]]] = samp_parts[0] + '-' + samp_parts[1]
                elif header[i] == 'source_text_diagnosis':
                    # Single quotes are stripped from the diagnosis text.
                    info[i] = info[i].replace("'", "")
                    r_dict[bs_id][tblhead2sbg_dict[header[i]]] = info[i]
                else:
                    r_dict[bs_id][tblhead2sbg_dict[header[i]]] = info[i]
            except Exception as e:
                # Report and move on to the next column; the field is left unset.
                sys.stderr.write('Error processing metadata ' + str(e) + '\n')
    # Applied once per sample, after the table columns, so fixed values win.
    r_dict[bs_id].update(fixed_values)

# The table has been fully consumed; release the file handle.
metadata.close()

# Fetch the project's COMPLETED tasks; process_task itself decides, by name,
# which of them get tagged.
tasks = api.tasks.query(project=project, status='COMPLETED').all()
# quick_test
# tasks = []
# tasks.append(api.tasks.get(id='19b9b64d-6fc8-46b6-b9dc-5b0dfbedf4f2'))
# tasks.append(api.tasks.get(id='612679ff-e6ba-425f-b0c7-8ef67ba09474'))
# for task in tasks:
#    process_task(api, r_dict, task, run_pre, id_pre, log_file)

# Submit one job per task to a 4-worker pool; leaving the `with` block waits
# for all submitted jobs to finish.
with concurrent.futures.ThreadPoolExecutor(4) as executor:
    results = {}
    for task in tasks:
        future = executor.submit(process_task, api, r_dict, task, run_pre, id_pre, log_file)
        results[future] = task

# Mark the end of the run in the log and release the handle.
log_file.write('Done!\n')
log_file.flush()
log_file.close()

## clear out errant metadata

In [9]:
def untag_file(fobj):
    """Set every existing metadata field on a file object to None (in memory).

    The file is not saved here; ``deprocess_task`` pushes the change with a
    bulk update afterwards.

    Args:
        fobj: file object exposing ``metadata`` (iterable of keys, item
            assignment) and ``name``.

    Raises:
        SystemExit: with code 1 after the failure has been written to the
            module-level ``log_file``.
    """
    try:
        # Iterate over a snapshot of the keys so assignment cannot interfere
        # with iteration, whatever container backs ``metadata``.
        for key in list(fobj.metadata):
            fobj.metadata[key] = None
    except Exception as e:
        # Message says "untag" so log lines from this path are distinguishable
        # from tag_file failures.
        log_file.write('Could not untag ' + fobj.name + ', got error ' + str(e) + '\n')
        log_file.flush()
        sys.exit(1)

In [10]:
def deprocess_task(api, task, run_pre, id_pre, log_file):
    """Clear the metadata on every output file of one task.

    The task is handled only when its name matches both ``run_pre`` and
    ``id_pre``.  Output file ids are collected (single files and lists of
    files), fetched with one bulk request, blanked in memory via
    ``untag_file`` and written back with one bulk update.

    Args:
        api: client exposing ``files.bulk_get`` and ``files.bulk_update``.
        task: task object exposing ``id``, ``name`` and ``outputs``.
        run_pre: regex a task name must match to be processed.
        id_pre: second regex the task name must also match.
        log_file: writable text stream for progress and error messages.

    Raises:
        SystemExit: after any exception has been written to ``log_file``.
    """
    try:
        if not (re.search(run_pre, task.name) and re.search(id_pre, task.name)):
            return
        log_file.write('Untagging files from ' + task.id + ' ' + task.name + '\n')
        log_file.flush()

        id_list = []
        for output in task.outputs:
            value = task.outputs[output]
            if value is None:
                log_file.write('No file for key ' + output + ', skipping' + '\n')
                continue
            if isinstance(value, list):
                # Defensive: skip empty slots inside a list output.
                id_list.extend(subfile.id for subfile in value if subfile is not None)
            else:
                id_list.append(value.id)

        if not id_list:
            # Nothing to fetch or update; avoid sending empty bulk requests.
            log_file.write('No output files to untag for ' + task.id + ', skipping bulk update\n')
            log_file.flush()
            return

        task_files = api.files.bulk_get(id_list)
        fobj_list = []
        for task_file in task_files:
            untag_file(task_file.resource)
            fobj_list.append(task_file.resource)
        api.files.bulk_update(fobj_list)
    except Exception as e:
        log_file.write('Error processing task ' + task.id + ' ' + task.name + ' with error: ' + str(e) + '\n')
        log_file.flush()
        sys.exit()
    

In [11]:
# Destructive step: clears metadata on task output files. Require the exact
# confirmation word before doing anything.
check = input('This clears file metadata. Type YASS to continue: ')
if check != 'YASS':
    raise SystemExit()
project = 'kfdrc-harmonization/sd-bhjxbdqk-08'
#tasks = api.tasks.query(project=project, status='COMPLETED').all()
# quick_test
tasks = []
tasks.append(api.tasks.get(id='089b5063-561c-46e2-82c4-21d0ed283720'))
tasks.append(api.tasks.get(id='7c6e5751-2bd3-4f0d-ba77-57c7a8231b49'))
run_pre = 'CBTTC_MUTECT2_SOMATIC_RPT'
# Raw string so that \w reaches the regex engine instead of being parsed as an
# (invalid) string escape.
id_pre = r'BS_\w+'
# The context manager closes the log even if submitting or running jobs raises.
with open('/Users/brownm28/Documents/2019-Apr-9_mutect2_run/metadata_tagging/untag_log.txt', 'a') as log_file:
    # Leaving the executor block waits for all submitted jobs to finish.
    with concurrent.futures.ThreadPoolExecutor(16) as executor:
        results = {executor.submit(deprocess_task, api, task, run_pre, id_pre, log_file): task for task in tasks}
    log_file.write('Done!\n')
    log_file.flush()

YASS


In [2]:
# Print the name of every COMPLETED task in this project whose name contains
# 'RNAfusion-'.
project = 'kfdrc-harmonization/sd-bhjxbdqk-11'
tasks = api.tasks.query(project=project, status='COMPLETED')
for task in tasks:
    is_fusion_run = re.search('RNAfusion-', task.name) is not None
    if not is_fusion_run:
        continue
    print(task.name)

RNAfusion-FQ_INPUT: BS_F0JB4EAK C021_0003
RNAfusion-FQ_INPUT: BS_21ET39G7 C021_0006_progression
RNAfusion-FQ_INPUT: BS_49CJNZ06 C021_0011
RNAfusion-FQ_INPUT: BS_H97S5SQN C021_0032
RNAfusion-FQ_INPUT: BS_NGSG2KB6 C021_0012
RNAfusion-FQ_INPUT: BS_XM1AHBDJ C021_0017
RNAfusion-FQ_INPUT: BS_JQVAWTTM C021_0001
RNAfusion-FQ_INPUT: BS_7WM3MNZ0 C021_0029
RNAfusion-FQ_INPUT: BS_R7NTZR4C C021_0009
RNAfusion-FQ_INPUT: BS_W7MFJZ5A C021_0019
RNAfusion-FQ_INPUT: BS_XGDPK33A C021_0035
RNAfusion-FQ_INPUT: BS_QNNX91SM C021_0020
RNAfusion-FQ_INPUT: BS_YDEVMD24 C021_0005
RNAfusion-FQ_INPUT: BS_8ZY4GST0 C021_0004
RNAfusion-FQ_INPUT: BS_X4DD4KSZ C021_0030
RNAfusion-FQ_INPUT: BS_5VPM0F36 C021_0010
RNAfusion-FQ_INPUT: BS_ZF6BSFNF C021_0022
RNAfusion-FQ_INPUT: BS_2JP7RBMB C021_0027
RNAfusion-FQ_INPUT: BS_0ZA67BBC C021_0016
RNAfusion-FQ_INPUT: BS_1N7MQZGR C021_0005_progression
RNAfusion-FQ_INPUT: BS_R9B92M75 C021_0036
RNAfusion-FQ_INPUT: BS_NEVYM2FP C021_0031
RNAfusion-FQ_INPUT: BS_JB43XBCQ C021_0013
RNAfusion-

In [None]:
## Tag PNOC 0008 outputs using table

In [3]:
# Table layout used below: column 0 is the file id; columns from index
# `i_start` onward are metadata fields named by the header row. Columns
# between 0 and `i_start` are ignored.
i_start = 3
id_list = []
meta_dict = {}
# `with` guarantees the table file is closed once it has been read.
with open('/Users/brownm28/Documents/2019-May-7_PNOC008/annotated_metadata.txt') as meta_table:
    head = next(meta_table)
    header = head.rstrip('\n').split('\t')
    for line in meta_table:
        # A blank (e.g. trailing) line has no columns to index; skip it.
        if not line.strip():
            continue
        info = line.rstrip('\n').split('\t')
        fid = info[0]
        id_list.append(fid)
        # Indexing (rather than zip) keeps a short row a loud IndexError.
        meta_dict[fid] = {header[i]: info[i] for i in range(i_start, len(header), 1)}

# Fetch all listed files in one bulk request.
file_bulk = api.files.bulk_get(id_list)

# Apply each file's table values in memory, then push them in one bulk update.
f_objs = []
for f_bulk in file_bulk:
    cur_obj = f_bulk.resource
    cur_id = cur_obj.id
    for key in meta_dict[cur_id]:
        cur_obj.metadata[key] = meta_dict[cur_id][key]
    f_objs.append(cur_obj)
update = api.files.bulk_update(f_objs)

