In [27]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Project in which the consensus-calling tasks are created and later tagged.
project = 'kfdrc-harmonization/pbta-consensus-calls'
# Client configuration is read from the 'turbo' profile.
config = sbg.Config(profile='turbo')
# Both error handlers come from sevenbridges.http.error_handlers (imported above).
api = sbg.Api(
    error_handlers=[rate_limit_sleeper, maintenance_sleeper],
    config=config,
)

### Expand view (widen the notebook container)

In [31]:
# Widen the notebook's main container to 75% of the browser window.
# display and HTML are imported from IPython.display, their public location;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

## Copy outputs from other projects

In [11]:
def mt_cp_file(task):
    """Copy the selected outputs of one task into the destination project.

    Relies on three notebook globals: ``phrase`` (pattern searched for in the
    task name with ``re.search``), ``outs`` (output keys to copy) and ``dest``
    (project passed to each output's ``copy``).

    Returns:
        0 when the task name did not match or every copy call returned,
        1 when anything raised (the error is written to stderr).
    """
    status = 0
    try:
        matched = re.search(phrase, task.name)
        if matched:
            for out_key in outs:
                task.outputs[out_key].copy(project=dest)
    except Exception as err:
        sys.stderr.write(str(err) + "\nFailed to copy from task " + task.name + "\n")
        status = 1
    return status

In [16]:
# Settings read by mt_cp_file: task-name pattern, output keys and destination project.
phrase = 'Rerun_strelka_with_vcf2maf_v1.6.17'
outs = ['output_vcf', 'output_tbi']
# Alternative output keys kept for reference: ['annotated_vcf', 'annotated_tbi']
dest = 'kfdrc-harmonization/pbta-consensus-calls'
# Project whose completed tasks are scanned.
src = 'kfdrc-harmonization/sd-bhjxbdqk-08'
tasks = api.tasks.query(project=src, status="COMPLETED").all()
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(mt_cp_file, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if result.result() != 1:
            continue
        # mt_cp_file returns 1 when a copy raised; stop on the first such result.
        sys.stderr.write("Copy error, quitting\n")
        sys.exit(1)

## Run consensus calls

In [24]:
def setup_cc_task(tum_bs_id):
    """Create a draft consensus-calling task for one tumor biospecimen.

    Reads the notebook globals ``ref_dict`` (inputs shared by every task),
    ``vcf_dict`` (per-biospecimen participant ID, normal biospecimen ID and
    one VCF per tool), ``tools``, ``api``, ``project`` and ``app_name``.

    The task is created with ``run=False``; its ``output_basename`` input is
    then set to the new task's ID and the task is saved.

    On any exception the error is written to stderr and ``sys.exit(1)`` is
    called.
    """
    try:
        # Start from a shallow copy so the shared reference inputs are not modified.
        in_dict = dict(ref_dict)
        sample = vcf_dict[tum_bs_id]
        pt_id = sample['pt_id']
        norm_bs_id = sample['norm_bs_id']
        in_dict['input_tumor_name'] = tum_bs_id
        in_dict['input_normal_name'] = norm_bs_id
        # One '<tool>_vcf' input per caller.
        in_dict.update((tool + '_vcf', sample[tool]) for tool in tools)

        task_name = 'KFDRC PBTA CONSENSUS: ' + pt_id + " " + tum_bs_id + " " + norm_bs_id
        task = api.tasks.create(
            name=task_name,
            project=project,
            app=app_name,
            inputs=in_dict,
            run=False,
        )
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as err:
        sys.stderr.write(str(err) + " failed processing " + tum_bs_id + "\n")
        # NOTE(review): when this runs inside a ThreadPoolExecutor worker, the
        # SystemExit is stored on the future rather than ending the run; the
        # submitting code has to call result() on the future to see it.
        sys.exit(1)

In [25]:
def mt_get_vcfs(line):
    """Record the VCF described by one manifest line in ``vcf_dict``.

    ``line`` is one comma-separated manifest row: field 0 is used as the file
    ID, field 1 is searched for a tool name, and field ``b_idx`` holds the
    tumor biospecimen ID. Rows whose biospecimen ID is not in ``bs_ids`` are
    ignored.

    For a matching row, the first entry of ``tools`` found in field 1 decides
    the key under which the fetched file object is stored. The participant ID
    and normal biospecimen ID are taken from the metadata of the first file
    stored for that biospecimen.

    Reads the notebook globals ``b_idx``, ``bs_ids``, ``tools``, ``api`` and
    updates ``vcf_dict``. Any exception is reported on stderr and swallowed,
    so the function always returns None.
    """
    try:
        info = line.rstrip('\n').split(',')
        bs_id = info[b_idx]
        if bs_id not in bs_ids:
            return
        # This function runs in several threads at once. A separate
        # "not in" check followed by an assignment could let one worker replace
        # the entry another worker had just created; setdefault creates or
        # fetches the entry in a single dict call (atomic in CPython), so all
        # workers for the same biospecimen share one entry.
        entry = vcf_dict.setdefault(bs_id, {})
        for tool in tools:
            if re.search(tool, info[1]):
                file_obj = api.files.get(info[0])
                entry[tool] = file_obj
                if 'pt_id' not in entry:
                    entry['pt_id'] = file_obj.metadata['Kids First Participant ID']
                    entry['norm_bs_id'] = file_obj.metadata['Kids First Biospecimen ID Normal']
                # Only the first matching tool is recorded for a row.
                break
    except Exception as e:
        sys.stderr.write(str(e) + '\nCould not get file in for for data with info: ' + line)


In [26]:
app_name = project + "/kfdrc-consensus-calling"
# Both manifests are read from the same local directory.
manifest_dir = '/Users/brownm28/Documents/PORTAL_LOADS/CBTTC_COMPLETE/2020-Jan-21_KF_UPDATE/'

# Inputs shared by every consensus task (read by setup_cc_task).
ref_dict = {}
ref_dict['indexed_reference_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
ref_dict['vep_cache'] = api.files.query(project=project, names=['homo_sapiens_vep_93_GRCh38_convert_cache.tar.gz'])[0]
ref_dict['strip_info'] = "INFO/CSQ"
bs_dict = {}

# Collect the tumor, non-RNA-seq biospecimen IDs from the histologies table.
# A set is used because mt_get_vcfs tests membership once per manifest line.
bs_ids = set()
with open(manifest_dir + 'pbta-histologies.tsv') as openpbta_manifest:
    head = next(openpbta_manifest)
    header = head.rstrip('\n').split('\t')
    b_idx = header.index('Kids_First_Biospecimen_ID')
    e_idx = header.index('experimental_strategy')
    s_idx = header.index('sample_type')
    for line in openpbta_manifest:
        info = line.rstrip('\n').split('\t')
        if info[e_idx] != 'RNA-seq' and info[s_idx] == 'Tumor':
            bs_ids.add(info[b_idx])

vcf_dict = {}
tools = ['strelka2', 'mutect2', 'lancet', 'vardict']

# Phase 1: look up every VCF listed in the manifest (fills vcf_dict).
n = 100
with open(manifest_dir + 'pbta_all_somatic_vcf-manifest.csv') as vcf_manifest:
    head = next(vcf_manifest)
    header = head.rstrip('\n').split(',')
    # From here on b_idx is the tumor biospecimen column of the VCF manifest;
    # mt_get_vcfs reads it as a global.
    b_idx = header.index('Kids First Biospecimen ID Tumor')
    with concurrent.futures.ThreadPoolExecutor(16) as executor:
        results = {executor.submit(mt_get_vcfs, line): line for line in vcf_manifest}
        for i, result in enumerate(concurrent.futures.as_completed(results), start=1):
            # Retrieve the outcome so nothing raised in a worker is discarded.
            result.result()
            if i % n == 0:
                sys.stderr.write(str(i) + ' files processed\n')

# Phase 2: create one draft task per biospecimen found in phase 1.
n = 50
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(setup_cc_task, bs_id): bs_id for bs_id in vcf_dict}
    for i, result in enumerate(concurrent.futures.as_completed(results), start=1):
        # setup_cc_task signals failure by raising from the worker; without
        # result() that would stay hidden on the future and the cell would
        # appear to succeed.
        result.result()
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks set up\n')


100 files processed
200 files processed
300 files processed
400 files processed
500 files processed
600 files processed
700 files processed
800 files processed
900 files processed
1000 files processed
1100 files processed
1200 files processed
1300 files processed
1400 files processed
1500 files processed
1600 files processed
1700 files processed
1800 files processed
1900 files processed
2000 files processed
2100 files processed
2200 files processed
2300 files processed
2400 files processed
2500 files processed
2600 files processed
2700 files processed
2800 files processed
2900 files processed
3000 files processed
3100 files processed
3200 files processed
3300 files processed
3400 files processed
3500 files processed
3600 files processed
3700 files processed
3800 files processed
3900 files processed
50 tasks set up
100 tasks set up
150 tasks set up
200 tasks set up
250 tasks set up
300 tasks set up
350 tasks set up
400 tasks set up
450 tasks set up
500 tasks set up
550 tasks set up
600 

## Tag outputs

In [28]:
def mt_tag_outputs(task):
    """Copy the metadata of one input file onto every output file of a task.

    Only tasks whose name matches the global ``phrase`` are processed. The
    metadata comes from the input named by the global ``in_key``. Each output
    may be a single file or a list of files; every file is fetched again via
    ``api.files.get``, updated key by key and saved.

    Errors while handling one output key are printed and that key is
    abandoned; the remaining output keys are still processed.
    """
    if not re.search(phrase, task.name):
        return
    sys.stderr.write('Valid task found ' + task.name + '\n')
    metadata = task.inputs[in_key].metadata
    for out_key in task.outputs:
        try:
            value = task.outputs[out_key]
            # Treat a single output like a one-element list.
            targets = value if type(value) is list else [value]
            for target in targets:
                file_obj = api.files.get(target.id)
                for key in metadata:
                    file_obj.metadata[key] = metadata[key]
                file_obj.save()
        except Exception as err:
            print(err)
            print("Skipping " + task.name + " due to error")

In [29]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# task name search phrase
phrase = "KFDRC PBTA CONSENSUS"
# modify this to set which input file to use to tag the outputs with, may need to modify code if an array element
in_key = 'strelka2_vcf'
with concurrent.futures.ThreadPoolExecutor(8) as executor:
    results = {executor.submit(mt_tag_outputs, task): task for task in tasks}
    # mt_tag_outputs reads the input metadata outside its per-output try block,
    # so a task can still raise. Report such failures here instead of leaving
    # them unread on the futures; tagging of the other tasks continues.
    for result in concurrent.futures.as_completed(results):
        err = result.exception()
        if err is not None:
            sys.stderr.write(str(err) + "\nFailed to tag outputs for task " + results[result].name + "\n")

Valid task found KFDRC PBTA CONSENSUS: PT_VPEMAQBN BS_VXDGXQKZ BS_D48QXYW6
Valid task found KFDRC PBTA CONSENSUS: PT_NK8A49X5 BS_HEJ72V3F BS_668350EZ
Valid task found KFDRC PBTA CONSENSUS: PT_9GKVQ9QS BS_JRFVST47 BS_MVYA262V
Valid task found KFDRC PBTA CONSENSUS: PT_KBFM551M BS_QZRP3NSG BS_9H6Z0MEG
Valid task found KFDRC PBTA CONSENSUS: PT_WGVEF96B BS_4DQAQFQH BS_36YFSGDX
Valid task found KFDRC PBTA CONSENSUS: PT_KTRJ8TFY BS_AF5D41PD BS_SNRF1RKC
Valid task found KFDRC PBTA CONSENSUS: PT_KZ56XHJT BS_YHXMYDBN BS_3PNWA7WT
Valid task found KFDRC PBTA CONSENSUS: PT_KZ56XHJT BS_AK9BV52G BS_3PNWA7WT
Valid task found KFDRC PBTA CONSENSUS: PT_KZ56XHJT BS_1Q524P3B BS_3PNWA7WT
Valid task found KFDRC PBTA CONSENSUS: PT_MNSEJCDM BS_Y74XAFJX BS_29YQSB5E
Valid task found KFDRC PBTA CONSENSUS: PT_KZ56XHJT BS_D6STCMQS BS_3PNWA7WT
Valid task found KFDRC PBTA CONSENSUS: PT_MNSEJCDM BS_ZSH09N84 BS_29YQSB5E
Valid task found KFDRC PBTA CONSENSUS: PT_V1HNAC2Q BS_NNFDFAFM BS_E5RKHG41
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_32J909WM BS_JDMM2XG4 BS_ABDWNW1A
Valid task found KFDRC PBTA CONSENSUS: PT_CK41DW03 BS_3AERRZQT BS_4RV40GPD
Valid task found KFDRC PBTA CONSENSUS: PT_JW6FBEFK BS_3P96RT4E BS_6K3Y2WXF
Valid task found KFDRC PBTA CONSENSUS: PT_XP8AM1CH BS_1EQGGHHQ BS_938K7FSV
Valid task found KFDRC PBTA CONSENSUS: PT_2WVW55DA BS_R94EDP5H BS_M4KBSYZ2
Valid task found KFDRC PBTA CONSENSUS: PT_EPFX7V9H BS_NB509KST BS_Z9WDGDVR
Valid task found KFDRC PBTA CONSENSUS: PT_12JTZFC2 BS_P0KQFXNC BS_49Z8WJP2
Valid task found KFDRC PBTA CONSENSUS: PT_T93J9EGZ BS_3J4T2YYW BS_Y2Z06EDG
Valid task found KFDRC PBTA CONSENSUS: PT_HVZTF42R BS_53TV75NN BS_M28J7CGM
Valid task found KFDRC PBTA CONSENSUS: PT_82MX6J77 BS_P42V737Y BS_8782G07F
Valid task found KFDRC PBTA CONSENSUS: PT_ECBD1D5J BS_84AQCZE4 BS_ST9MMP22
Valid task found KFDRC PBTA CONSENSUS: PT_89XRZBSG BS_X1DSETVE BS_XHT3F34T
Valid task found KFDRC PBTA CONSENSUS: PT_SJEG0MKD BS_JJZPY3BG BS_Z5JH1FMC
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_XNTFM9XY BS_WD37M8SD BS_G473TPVG
Valid task found KFDRC PBTA CONSENSUS: PT_E79807KT BS_3A0CJMRH BS_N0X0S3JD
Valid task found KFDRC PBTA CONSENSUS: PT_YJBPEF4V BS_PC2YQ4F6 BS_MTDQBCQK
Valid task found KFDRC PBTA CONSENSUS: PT_9YMRNZHH BS_F8KR1QWE BS_5YCVM9WF
Valid task found KFDRC PBTA CONSENSUS: PT_EB89G70N BS_KB9GJDCS BS_8WTBCJT5
Valid task found KFDRC PBTA CONSENSUS: PT_Q25V9Y6M BS_9HDPJT21 BS_YVWRMH36
Valid task found KFDRC PBTA CONSENSUS: PT_DACV00WF BS_PVZJCXS1 BS_TQYACCGG
Valid task found KFDRC PBTA CONSENSUS: PT_R6WWH1QX BS_1AZ8YJSH BS_0Y1Y2WA4
Valid task found KFDRC PBTA CONSENSUS: PT_DH8A0SNQ BS_B5Z2XV03 BS_S7MPY3T4
Valid task found KFDRC PBTA CONSENSUS: PT_JYZA0PNV BS_8W0FNBWF BS_Q7NF95N0
Valid task found KFDRC PBTA CONSENSUS: PT_3RN24N9X BS_VW96PFKQ BS_AZG06JFG
Valid task found KFDRC PBTA CONSENSUS: PT_RJ1TJ2KH BS_6GN1FGTB BS_KBM2MTND
Valid task found KFDRC PBTA CONSENSUS: PT_SRWZDG3N BS_95RRF4NE BS_K7R4CV56
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_YFS3AWC0 BS_10V9SAG8 BS_TDN8NVYJ
Valid task found KFDRC PBTA CONSENSUS: PT_6BX46NVR BS_CFPWSZB5 BS_4RVAJBCR
Valid task found KFDRC PBTA CONSENSUS: PT_9PJR0ZK7 BS_A1DV9T7G BS_GJWAV3E5
Valid task found KFDRC PBTA CONSENSUS: PT_HEGTYPQJ BS_VKH9KYDB BS_TV12GQHS
Valid task found KFDRC PBTA CONSENSUS: PT_DTP4MMRA BS_NASADC3P BS_CEZVJC67
Valid task found KFDRC PBTA CONSENSUS: PT_4RJ2EATN BS_F7KYPE79 BS_4CVBQP6X
Valid task found KFDRC PBTA CONSENSUS: PT_2ECVKTTQ BS_DW1CYEXP BS_2EAATK5V
Valid task found KFDRC PBTA CONSENSUS: PT_VH6S71AC BS_8BD8WD38 BS_QEGZ6CYZ
Valid task found KFDRC PBTA CONSENSUS: PT_N8W26H19 BS_TZPJHKQR BS_251Z13NC
Valid task found KFDRC PBTA CONSENSUS: PT_5BWZA0NT BS_BFDEZK1C BS_85TADN40
Valid task found KFDRC PBTA CONSENSUS: PT_3AWKWXEV BS_CZRA594T BS_YKT1NAJQ
Valid task found KFDRC PBTA CONSENSUS: PT_HFQNKP5X BS_FBJ516WW BS_SDDK3Q44
Valid task found KFDRC PBTA CONSENSUS: PT_2YT37G8P BS_ZJZFT0AJ BS_9YHX7QFW
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_9DMMZ08D BS_FVYBGMG1 BS_XEY2VWR9
Valid task found KFDRC PBTA CONSENSUS: PT_3T3VGWC6 BS_NCGGFG6H BS_31E3HZPZ
Valid task found KFDRC PBTA CONSENSUS: PT_PNED3GG3 BS_WMEYJGDC BS_6458RHRT
Valid task found KFDRC PBTA CONSENSUS: PT_YKVK5HTK BS_3QDRWJCF BS_14G6WY88
Valid task found KFDRC PBTA CONSENSUS: PT_4XWYZYJS BS_VHD4ARSG BS_9E41M9S0
Valid task found KFDRC PBTA CONSENSUS: PT_BQ8BQ01J BS_0DKPGQWD BS_DMP1BV1H
Valid task found KFDRC PBTA CONSENSUS: PT_9HMVHQEH BS_H2K7XX4X BS_61C97RSA
Valid task found KFDRC PBTA CONSENSUS: PT_AW8WV14Y BS_YYAPSA5P BS_EQCAS6HV
Valid task found KFDRC PBTA CONSENSUS: PT_ZN1F7RMP BS_BR1X3R68 BS_BGW6PHHK
Valid task found KFDRC PBTA CONSENSUS: PT_8RB7TPS2 BS_ZS1QRMXS BS_J9M165VZ
Valid task found KFDRC PBTA CONSENSUS: PT_MV7Y44QQ BS_E0AAK0V0 BS_DW6P4EXB
Valid task found KFDRC PBTA CONSENSUS: PT_HN0AXSZ0 BS_Z34PQDYT BS_SSVBWYPN
Valid task found KFDRC PBTA CONSENSUS: PT_RQ21EBK9 BS_39Q80NT3 BS_D70Z2E3A
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_QPY0H50V BS_Z4S81HG1 BS_MKGT5WZD
Valid task found KFDRC PBTA CONSENSUS: PT_R10BSB9F BS_C5RDMCCS BS_618WFRXY
Valid task found KFDRC PBTA CONSENSUS: PT_DFQAH7RS BS_ZKFEMJZ8 BS_0V76MCTW
Valid task found KFDRC PBTA CONSENSUS: PT_HJMP6PH2 BS_9SVMVJ45 BS_0TTQDZ2K
Valid task found KFDRC PBTA CONSENSUS: PT_G8DRDA33 BS_32VQRFDS BS_DH8Z9WDK
Valid task found KFDRC PBTA CONSENSUS: PT_T5KEXCK7 BS_19EJ85F8 BS_SZYRB89G
Valid task found KFDRC PBTA CONSENSUS: PT_CGWPWSXM BS_9W6FK6X4 BS_N6XWSD15
Valid task found KFDRC PBTA CONSENSUS: PT_4159VCJY BS_HTGDSPQA BS_E7JTAP8Y
Valid task found KFDRC PBTA CONSENSUS: PT_1BV6ND1D BS_MZJJRDZF BS_RG2CTN65
Valid task found KFDRC PBTA CONSENSUS: PT_N32BYY8A BS_D6XHKZDZ BS_579DARY4
Valid task found KFDRC PBTA CONSENSUS: PT_DPVW4M1W BS_SQSH5G7A BS_GTY5J0S4
Valid task found KFDRC PBTA CONSENSUS: PT_1VQWQ0TC BS_QXZTRMWM BS_P65SHPNZ
Valid task found KFDRC PBTA CONSENSUS: PT_DA2TJAV1 BS_Y4ZX0YX7 BS_5N4ZSC43
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_PFA762TK BS_32JF8TPP BS_HM2ZG2J7
Valid task found KFDRC PBTA CONSENSUS: PT_76Q72M5T BS_DT92H1MJ BS_MN13H3T4
Valid task found KFDRC PBTA CONSENSUS: PT_BBHAD51D BS_WYF8NGSD BS_FW28DA07
Valid task found KFDRC PBTA CONSENSUS: PT_CB9PD1YP BS_WSFB4SA2 BS_Z8E250HZ
Valid task found KFDRC PBTA CONSENSUS: PT_9YNQJZ5G BS_C3X4AEK5 BS_BVRH3WGW
Valid task found KFDRC PBTA CONSENSUS: PT_5SX0GRW1 BS_KP0YNTAK BS_TP6GSAB6
Valid task found KFDRC PBTA CONSENSUS: PT_2FVTD0WR BS_8TFBNHT0 BS_C8EYP5BV
Valid task found KFDRC PBTA CONSENSUS: PT_DNAJYFZT BS_EJ1H9PZY BS_5PYJQBEA
Valid task found KFDRC PBTA CONSENSUS: PT_F822DBA0 BS_KPWRDRFP BS_874XHFP7
Valid task found KFDRC PBTA CONSENSUS: PT_3AWKWXEV BS_ERAWW3H7 BS_YKT1NAJQ
Valid task found KFDRC PBTA CONSENSUS: PT_E78NHRWQ BS_3DYRAZE7 BS_T5WVTGV8
Valid task found KFDRC PBTA CONSENSUS: PT_Z9QE51C3 BS_FNF1TCSW BS_0TX5YDGH
Valid task found KFDRC PBTA CONSENSUS: PT_BDNE55YT BS_Y43VC4VG BS_3E3VYSXT
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_N673SYWW BS_PMFR8NPN BS_PTYP388A
Valid task found KFDRC PBTA CONSENSUS: PT_TXFYRDP4 BS_5Z4XQC9X BS_JNFPEM6F
Valid task found KFDRC PBTA CONSENSUS: PT_6N1QWCXE BS_D2BZ25FW BS_M5M3BPDH
Valid task found KFDRC PBTA CONSENSUS: PT_AQWDQW27 BS_EDZCAHR2 BS_8R1DYXMB
Valid task found KFDRC PBTA CONSENSUS: PT_ND34A6PR BS_QMHND3QK BS_FNQMY128
Valid task found KFDRC PBTA CONSENSUS: PT_NPETR8RY BS_FSM39DXA BS_Q5ABAC4M
Valid task found KFDRC PBTA CONSENSUS: PT_1H2REHT2 BS_79SYEHY3 BS_0KKH9VKP
Valid task found KFDRC PBTA CONSENSUS: PT_C2VY5NM6 BS_N0Z6V2YG BS_FQ58DAZV
Valid task found KFDRC PBTA CONSENSUS: PT_SZJ7WZZW BS_WP9J88EB BS_FHDR7FJE
Valid task found KFDRC PBTA CONSENSUS: PT_E3S6H59E BS_T54TAE3R BS_C2AV5D4A
Valid task found KFDRC PBTA CONSENSUS: PT_S4YNE17X BS_CRKBDAYZ BS_HHJZ05XC
Valid task found KFDRC PBTA CONSENSUS: PT_6E8JYRXM BS_3R0KN6Z4 BS_TYCE05MW
Valid task found KFDRC PBTA CONSENSUS: PT_3V5GTCJR BS_QZTPJ8YW BS_63AK4SCF
Valid task found KFDRC PB

Valid task found KFDRC PBTA CONSENSUS: PT_GY0QSY4H BS_FWCKP4X1 BS_BG996C4D
Valid task found KFDRC PBTA CONSENSUS: PT_9S6WMQ92 BS_WY5KYSHJ BS_YVRSCEC6
Valid task found KFDRC PBTA CONSENSUS: PT_JN3S17B3 BS_1EYKDS0Z BS_1MMZN94J
Valid task found KFDRC PBTA CONSENSUS: PT_GY99ZJHV BS_17FVWMNV BS_61AK2H2N
Valid task found KFDRC PBTA CONSENSUS: PT_M5HDHQNQ BS_GKN0B47F BS_JGTBRH16
Valid task found KFDRC PBTA CONSENSUS: PT_CX58G6P6 BS_X6361T19 BS_GAPT45QB
Valid task found KFDRC PBTA CONSENSUS: PT_ZRQQC2S9 BS_PEFRDKDZ BS_QX8PTX5X
Valid task found KFDRC PBTA CONSENSUS: PT_A4KA01ZH BS_RCAZ62J7 BS_Z82320N1
Valid task found KFDRC PBTA CONSENSUS: PT_90W4HCNR BS_M1TDS9CT BS_GAFJBRM5
Valid task found KFDRC PBTA CONSENSUS: PT_NV584F7Q BS_X7QJCVJB BS_318P54W7
Valid task found KFDRC PBTA CONSENSUS: PT_17W3GJPT BS_ETTZM63Y BS_1XMBRB6K
Valid task found KFDRC PBTA CONSENSUS: PT_6PFFSB6D BS_4Z6F1HJZ BS_D4X1QMG0
Valid task found KFDRC PBTA CONSENSUS: PT_JP1FDKN9 BS_FF73TT6D BS_R333SVK1
Valid task found KFDRC PB

### Quick filter: keep only Cavatica manifest rows for OpenPBTA tumor biospecimens

In [30]:
# Both inputs are read from the same local directory.
manifest_dir = '/Users/brownm28/Documents/PORTAL_LOADS/CBTTC_COMPLETE/2020-Jan-21_KF_UPDATE/'

# Tumor biospecimen IDs from the histologies table; a set keeps the per-row
# membership test below cheap.
bs_ids = set()
with open(manifest_dir + 'pbta-histologies.tsv') as openpbta_manifest:
    head = next(openpbta_manifest)
    header = head.rstrip('\n').split('\t')
    b_idx = header.index('Kids_First_Biospecimen_ID')
    # e_idx is not used by the filter in this cell.
    e_idx = header.index('experimental_strategy')
    s_idx = header.index('sample_type')
    for line in openpbta_manifest:
        info = line.rstrip('\n').split('\t')
        if info[s_idx] == 'Tumor':
            bs_ids.add(info[b_idx])

# Context managers close both files even if a row raises part-way through.
with open(manifest_dir + '1a_cavatica_merged_manifest.csv') as cav_manifest, \
        open('filtered_cavatica_manifest.csv', 'w') as out:
    head = next(cav_manifest)
    out.write(head)
    for line in cav_manifest:
        info = line.rstrip('\n').split(',')
        # The biospecimen ID is taken from the last column of the manifest.
        if info[-1] in bs_ids:
            out.write(line)
        else:
            # Rejected rows are echoed to stderr so they can be reviewed.
            sys.stderr.write(line)


5c7db72ce4b0359d91c0d4e0,e335e487-c0bd-4548-a652-0d33d3f5d1d8.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_26RVYMMF
5c7dc069e4b0359d91c10b5c,589eca55-075e-4e73-a208-f42d44e10279.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_08HWDBX5
5c7dd517e4b0c5cd2e21e2db,9c1fb991-ceff-4d27-a46d-427cb946bbbf.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_0FRZD0VR
5c7dd6bae4b0c5cd2e21e7e1,c5f67e2e-795a-48e4-a1e6-3310de28191e.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_0DEVPYPE
5c7dd8dde4b0359d91c17f9c,4bb3eec9-229c-4f56-9ef8-5fc0e2f14ade.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_0MK8N9KZ
5c7dd92ae4b0c5cd2e21f041,a66f8fb0-b003-47a3-a93c-c66c22041770.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_02EBZYZ3
5c7ddf12e4b0c5cd2e220081,74640b56-5412-4bf1-aea2-10ebc0c6eab5.rsem.genes.results.gz,kfdrc-harmonization/sd-bhjxbdqk-06,,,BS_21ASRS3T
5c7de3bce4b0c5cd2e220a89,38d7a789-1989-4abb-a97b-7d7cb288f9d6.rsem.ge

## Split support files by cohort (CBTTC and PNOC003)

In [32]:
def process_file(bs_dict, bs_field, cohorts, file_in, file_outs, suffix):
    """Split a tab-separated file into one output per cohort.

    Args:
        bs_dict: maps a biospecimen ID to its cohort name.
        bs_field: header name of the column holding the biospecimen ID.
        cohorts: cohort names; the header line is written to the output of
            each of them.
        file_in: open, readable file whose first line is the header. It is
            closed before this function returns or raises.
        file_outs: maps ``cohort + suffix`` to an open, writable file.
        suffix: string appended to a cohort name to pick the output file.

    Raises:
        StopIteration: if ``file_in`` has no header line.
        ValueError: if ``bs_field`` is not in the header.
        KeyError: if a row's biospecimen ID is not in ``bs_dict`` or its
            cohort has no entry in ``file_outs``.
    """
    try:
        head = next(file_in)
        for cohort in cohorts:
            file_outs[cohort + suffix].write(head)
        b_idx = head.rstrip('\n').split('\t').index(bs_field)
        for line in file_in:
            bs_id = line.rstrip('\n').split('\t')[b_idx]
            # Route the unmodified row to the output of its cohort.
            file_outs[bs_dict[bs_id] + suffix].write(line)
    finally:
        # Close the input on every path, including the error cases above.
        file_in.close()

In [34]:
# All inputs and outputs of this cell live in the same local directory.
base_dir = '/Users/brownm28/Documents/PORTAL_LOADS/CBTTC_COMPLETE/2020-Jan-21_KF_UPDATE/'

# Map every biospecimen ID in the histologies table to its cohort.
bs_dict = {}
with open(base_dir + 'pbta-histologies.tsv') as openpbta_manifest:
    head = next(openpbta_manifest)
    header = head.rstrip('\n').split('\t')
    b_idx = header.index('Kids_First_Biospecimen_ID')
    c_idx = header.index('cohort')
    for line in openpbta_manifest:
        info = line.rstrip('\n').split('\t')
        bs_dict[info[b_idx]] = info[c_idx]

cohorts = ['CBTTC', 'PNOC003']
# One output file per (cohort, suffix) pair, named '<cohort><suffix>.txt'.
suffixes = ['_cav', '_tum_bs_ds_info', '_norm_bs_ds_info']
file_outs = {}

with open(base_dir + 'master_cav_manifest_task_list.txt') as cav_manifest, \
        open(base_dir + 'tum_bs_ds_info.txt') as tum_ds, \
        open(base_dir + 'norm_bs_ds_info.txt') as norm_ds:
    try:
        for cohort in cohorts:
            for suffix in suffixes:
                file_outs[cohort + suffix] = open(base_dir + cohort + suffix + '.txt', 'w')

        process_file(bs_dict, 'T/CL BS ID', cohorts, cav_manifest, file_outs, '_cav')
        process_file(bs_dict, 'BS_ID', cohorts, tum_ds, file_outs, '_tum_bs_ds_info')
        process_file(bs_dict, 'BS_ID', cohorts, norm_ds, file_outs, '_norm_bs_ds_info')
    finally:
        # Close (and flush) every output even if one of the splits raised.
        for key in file_outs:
            file_outs[key].close()
