## Setup imports and main url

In [52]:
import sys
from requests import request
import os
import math

url = 'https://kf-api-dataservice.kidsfirstdrc.org'

### Def to get single patient info by bs id

In [47]:
def parse_pt(url, bs_id):
    """Fetch a biospecimen and its participant and return demographic fields.

    Requests ``<url>/biospecimens/<bs_id>``, follows the participant link
    found under ``_links`` in that response, and requests the participant
    record as well.

    Args:
        url: Base URL of the data service (no trailing slash).
        bs_id: Biospecimen identifier appended to the biospecimen endpoint.

    Returns:
        Tuple ``(ethnicity, external_sample_id, gender, race, pt_id)``.
    """
    specimen = request('GET', url + '/biospecimens/' + bs_id).json()
    participant_path = specimen['_links']['participant']
    # The id is taken as the third '/'-separated piece of the link
    # (presumably '/participants/<id>' -- confirm against the API).
    participant_id = participant_path.split('/')[2]
    participant_resp = request('GET', url + participant_path)
    external_id = specimen['results']['external_sample_id']
    participant = participant_resp.json()['results']
    return (
        participant['ethnicity'],
        external_id,
        participant['gender'],
        participant['race'],
        participant_id,
    )
    

### Def to get dx data by biospec ID

In [48]:
def parse_dx(url, bs_id, dx):
    """Return age (in whole years) and tumor location for one diagnosis.

    Requests ``<url>/diagnoses?biospecimen_id=<bs_id>`` and scans the
    returned results for the first entry whose ``source_text_diagnosis``
    equals ``dx``.

    Args:
        url: Base URL of the data service (no trailing slash).
        bs_id: Biospecimen identifier used to filter diagnoses.
        dx: Diagnosis text that must match ``source_text_diagnosis`` exactly.

    Returns:
        ``(age, location)`` where ``age`` is the string form of
        ``floor(age_at_event_days / 365.25)`` and ``location`` is the
        ``source_text_tumor_location`` value; ``None`` when no diagnosis
        matches.

    Raises:
        SystemExit: with status 1 if the request, the JSON decoding or the
            field access fails; a message naming the link is written to
            stderr first.
    """
    dx_url = url + '/diagnoses?biospecimen_id=' + bs_id
    try:
        dx_data = request('GET', dx_url)
        for res in dx_data.json()['results']:
            if res['source_text_diagnosis'] == dx:
                age = str(math.floor(res['age_at_event_days'] / 365.25))
                location = res['source_text_tumor_location']
                return age, location
    except Exception:
        # ``except Exception`` (not a bare ``except``) so that Ctrl-C and
        # SystemExit are never swallowed and re-reported as a fetch failure.
        sys.stderr.write('Failed at getting data from link ' + dx_url + '\n')
        # sys.exit rather than the interactive-only ``exit`` helper.
        sys.exit(1)
    return None

### Def to initialize file outputs using a string to create output folder

In [49]:
def init_outputs(dx, pt_fh, samp_fh, pt_head, samp_head):
    """Create the output folder for ``dx`` and open its two clinical files.

    The folder named ``dx`` is created if it does not exist yet, so
    re-running against an existing output folder no longer fails. Inside it,
    ``data_clinical_patient.txt`` and ``data_clinical_sample.txt`` are opened
    for writing (truncating previous content) and their headers are written.

    Args:
        dx: Folder name; also the key under which the handles are stored.
        pt_fh: Dict of open patient-file handles; updated in place.
        samp_fh: Dict of open sample-file handles; updated in place.
        pt_head: Header text written to the patient file.
        samp_head: Header text written to the sample file.

    Returns:
        The same ``(pt_fh, samp_fh)`` dicts. The caller owns the handles and
        is responsible for closing them.
    """
    # exist_ok avoids FileExistsError when the folder is left over from an
    # earlier run.
    os.makedirs(dx, exist_ok=True)
    pt_fh[dx] = open(os.path.join(dx, 'data_clinical_patient.txt'), 'w')
    pt_fh[dx].write(pt_head)
    samp_fh[dx] = open(os.path.join(dx, 'data_clinical_sample.txt'), 'w')
    samp_fh[dx].write(samp_head)
    return (pt_fh, samp_fh)

### Def to process bs_id info

In [82]:
def process_bs_ids(url, bs_link, study_dict):
    """Collect tumor and normal biospecimen details for one participant.

    Requests ``url + bs_link + '&limit=100'`` and splits the returned
    biospecimens by ``source_text_tissue_type``: ``'Tumor'`` entries are
    treated as tumors, everything else as normals. For each tumor the linked
    diagnoses are requested too. Only the first response page is read (no
    pagination is followed here).

    Args:
        url: Base URL of the data service (no trailing slash).
        bs_link: Relative biospecimen link of the participant; it must
            already contain a query string because ``&limit=100`` is appended.
        study_dict: Mapping from source diagnosis text to the study-defined
            diagnosis label.

    Returns:
        A 10-tuple ``(tum_bs, tum_ext, assay, age, dx, cdx, loc, norm_bs,
        norm_ext, flag)``:

        * ``tum_bs`` / ``norm_bs``: lists of ``kf_id`` values.
        * ``tum_ext`` / ``norm_ext``: ``external_sample_id`` up to the first
          ``'.'``.
        * ``assay``: ``analyte_type`` per tumor.
        * ``age``: age in whole years per tumor as a string, or ``'NA'`` when
          it cannot be computed.
        * ``dx`` / ``cdx`` / ``loc``: dicts keyed by tumor ``kf_id`` holding
          lists of source diagnoses, mapped study diagnoses (``'MISSING'``
          when not in ``study_dict``) and tumor locations.
        * ``flag``: number of biospecimens in the response.
    """
    pt_bs_link = url + bs_link + '&limit=100'
    # Decode the response once instead of re-parsing it for every use.
    bs_results = request('GET', pt_bs_link).json()['results']
    tum_bs, tum_ext, assay, age = [], [], [], []
    # The diagnosis-related values are lists per tumor; the caller joins them.
    dx, cdx, loc = {}, {}, {}
    norm_bs, norm_ext = [], []
    for bs in bs_results:
        cur_bs_id = bs['kf_id']
        if bs['source_text_tissue_type'] != 'Tumor':
            norm_bs.append(cur_bs_id)
            norm_ext.append(bs['external_sample_id'].split('.')[0])
            if bs['analyte_type'] == 'RNA':
                sys.stderr.write('WARNING: normal sample ' + cur_bs_id + ' has RNA.  Rethink pairs\n')
            continue
        tum_bs.append(cur_bs_id)
        tum_ext.append(bs['external_sample_id'].split('.')[0])
        assay.append(bs['analyte_type'])
        try:
            age.append(str(math.floor(bs['age_at_event_days'] / 365.25)))
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing, null or non-finite age: report 'NA' without hiding
            # unrelated errors the way a bare ``except`` would.
            age.append('NA')
        dx_results = request('GET', url + bs['_links']['diagnoses']).json()['results']
        loc[cur_bs_id] = [d['source_text_tumor_location'] for d in dx_results]
        dx[cur_bs_id] = [d['source_text_diagnosis'] for d in dx_results]
        cdx[cur_bs_id] = [
            study_dict.get(d['source_text_diagnosis'], 'MISSING')
            for d in dx_results
        ]
    flag = len(bs_results)
    return tum_bs, tum_ext, assay, age, dx, cdx, loc, norm_bs, norm_ext, flag

### Def to iterate through array of patient result objects

In [86]:
def process_pt_batch(results_obj, study_dict, out_fh):
    """Write one staging-table row per participant that has biospecimens.

    For every participant record the biospecimen details are gathered with
    ``process_bs_ids`` (using the module-level ``url``). Participants with no
    biospecimens are reported on stderr and skipped.

    Args:
        results_obj: Iterable of participant result dicts.
        study_dict: Mapping from source diagnosis text to study diagnosis,
            passed through to ``process_bs_ids``.
        out_fh: Writable text handle receiving tab-separated rows.
    """
    for participant in results_obj:
        pt_id = participant['kf_id']
        ethnicity = participant['ethnicity']
        gender = participant['gender']
        race = participant['race']
        bs_summary = process_bs_ids(url, participant['_links']['biospecimens'], study_dict)
        (tumor_ids, tumor_ext_ids, assays, ages, dx_by_bs, cdx_by_bs,
         loc_by_bs, normal_ids, normal_ext_ids, n_specimens) = bs_summary
        if not n_specimens > 0:
            sys.stderr.write(pt_id + ' has 0 biospecimen data.  SKIPPING!\n')
            continue
        # Several values for one tumor are joined with ';', tumors with '|'.
        dx_fields = [';'.join(dx_by_bs[bs_id]) for bs_id in tumor_ids]
        cdx_fields = [';'.join(cdx_by_bs[bs_id]) for bs_id in tumor_ids]
        loc_fields = [';'.join(loc_by_bs[bs_id]) for bs_id in tumor_ids]
        columns = [
            pt_id,
            ethnicity,
            race,
            gender,
            ','.join(tumor_ids),
            ','.join(tumor_ext_ids),
            ','.join(assays),
            ','.join(ages),
            '|'.join(loc_fields),
            '|'.join(dx_fields),
            '|'.join(cdx_fields),
            ','.join(normal_ids),
            ','.join(normal_ext_ids),
        ]
        out_fh.write('\t'.join(columns) + '\n')

### Def to get all info on participants in a single study

In [66]:
def gather_data(url, study_id, study_dict, out_fh, lim=100, max_records=None):
    """Page through all participants of a study and write their table rows.

    Starts at ``<url>/participants?study_id=<study_id>&limit=<lim>`` and keeps
    following the ``next`` link under ``_links`` until a response has none.
    Every page of results is handed to ``process_pt_batch``.

    The original code carried a hard-coded test cap (stop once 40 records had
    been requested) which, with a page size of 100, always ended the run
    after the second page. The cap is now opt-in through ``max_records``.

    Args:
        url: Base URL of the data service (no trailing slash).
        study_id: Study identifier used to filter participants.
        study_dict: Mapping from source diagnosis text to study diagnosis,
            passed through to ``process_pt_batch``.
        out_fh: Writable text handle receiving the rows.
        lim: Page size requested from the service.
        max_records: Optional test-mode cap; paging stops after the page on
            which the number of requested records reaches this value.
            ``None`` (default) fetches every page.
    """
    link = url + '/participants?study_id=' + study_id + '&limit=' + str(lim)
    requested = 0
    first = True
    while link is not None:
        label = 'first' if first else 'next'
        sys.stderr.write('Processing ' + label + ' batch of ' + str(lim) + ' ' + link + '\n')
        page = request('GET', link).json()
        requested += lim
        next_path = page['_links'].get('next')
        if next_path:
            link = url + next_path + '&limit=' + str(lim)
        else:
            link = None
            if not first:
                sys.stderr.write('Last batch\n')
        if max_records is not None and link is not None and requested >= max_records:
            link = None
            sys.stderr.write('Last batch TEST MODE\n')
        process_pt_batch(page['results'], study_dict, out_fh)
        first = False

## Workflow that's mostly universal for initializing portal loading metadata files

In [87]:
# Work inside the portal metadata folder; relative output paths used by the
# workflows land here.
cwd = '/Users/brownm28/Documents/2018-Aug-15_cbttc_launch/portal_metadata/'
os.chdir(cwd)

# Build the diagnosis lookup from the tab-separated index file: column 0 is
# used as the key and column 1 as the value (source diagnosis text ->
# study-defined diagnosis, judging by how study_dict is used elsewhere).
study_dict = {}
with open(os.path.join(cwd, 'dx_index.txt')) as study_index:
    for line in study_index:
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise raise IndexError.
            continue
        info = line.rstrip('\n').split('\t')
        cid = info[1]
        study_dx = info[0]
        study_dict[study_dx] = cid

# Open output handles per diagnosis folder, filled by init_outputs.
pt_fh = {}
samp_fh = {}
# Five-line header of the patient clinical file.
pt_head = (
    '#Patient Identifier\tGENDER\tAGE\tTUMOR_SITE\tRACE\tETHNICITY\n'
    '#Patient identifier\tGender or sex of the patient\tAge at which the condition or disease was first diagnosed, in years\tTumor location\tracial demographic\tethnic demographic\n'
    '#STRING\tSTRING\tNUMBER\tSTRING\tSTRING\tSTRING\n'
    '#1\t1\t1\t1\t1\t1\n'
    'PATIENT_ID\tGENDER\tAGE\tTUMOR_SITE\tRACE\tETHNICITY\n'
)

sys.stderr.write(pt_head)
# IMPORTANT! will use external sample id as sample id, and bs id as a specimen id
samp_head = (
    '#Patient Identifier\tSample Identifier\tSPECIMEN_ID\tCANCER_TYPE\tCANCER_TYPE_DETAILED\tTUMOR_TISSUE_SITE\tTUMOR_TYPE\tMATCHED_NORMAL_SAMPLE_ID\tMATCHED_NORMAL_SPECIMEN_ID\n'
    '#Patient identifier\tSample Identifier using external_sample_id\tkfdrc tumor biopsecimen ID\tStudy-defined cancer type\tStudy-defined cancer type detail\ttumor tissue location\tprimary v metastatic tumor designation\tmatched normal external_sample_id\tkfdrc matched normal biospecimen ID\n'
    '#STRING\tSTRING\tSTRING\tSTRING\tSTRING\tSTRING\tSTRING\tSTRING\tSTRING\n'
    '#1\t1\t1\t1\t1\t1\t1\t1\t1\n'
    'PATIENT_ID\tSAMPLE_ID\tSPECIMEN_ID\tCANCER_TYPE\tCANCER_TYPE_DETAILED\tTUMOR_TISSUE_SITE\tTUMOR_TYPE\tMATCHED_NORMAL_SAMPLE_ID\tMATCHED_NORMAL_SPECIMEN_ID\n'
)

sys.stderr.write(samp_head)


#Patient Identifier	GENDER	AGE	TUMOR_SITE	RACE	ETHNICITY
#Patient identifier	Gender or sex of the patient	Age at which the condition or disease was first diagnosed, in years	Tumor location	racial demographic	ethnic demographic
#STRING	STRING	NUMBER	STRING	STRING	STRING
#1	1	1	1	1	1
PATIENT_ID	GENDER	AGE	TUMOR_SITE	RACE	ETHNICITY
#Patient Identifier	Sample Identifier	SPECIMEN_ID	CANCER_TYPE	CANCER_TYPE_DETAILED	TUMOR_TISSUE_SITE	TUMOR_TYPE	MATCHED_NORMAL_SAMPLE_ID	MATCHED_NORMAL_SPECIMEN_ID
#Patient identifier	Sample Identifier using external_sample_id	kfdrc tumor biopsecimen ID	Study-defined cancer type	Study-defined cancer type detail	tumor tissue location	primary v metastatic tumor designation	matched normal external_sample_id	kfdrc matched normal biospecimen ID
#STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING	STRING
#1	1	1	1	1	1	1	1	1
PATIENT_ID	SAMPLE_ID	SPECIMEN_ID	CANCER_TYPE	CANCER_TYPE_DETAILED	TUMOR_TISSUE_SITE	TUMOR_TYPE	MATCHED_NORMAL_SAMPLE_ID	MATCHED_NORMAL_SPECIME

## Top-down portal metadata workflow

In [88]:
# Build the staging table for one study: write the column header, then one
# row per participant that has biospecimens.
study_id = 'SD_BHJXBDQK'
tbl_head = 'Participant ID\tEthnicity\tRace\tGender\tTumor_bs_ids\tTumor_ext_ids\tassay_types\tage_list\tlocation_list\tdx_list\tbio_dx_list\tNormal_bs_ids\tNormal_ext_ids\n'
# ``with`` guarantees the table is flushed and closed even if a request or a
# record fails part-way through.
with open('portal_staging_table.txt', 'w') as out_fh:
    out_fh.write(tbl_head)
    gather_data(url, study_id, study_dict, out_fh)
sys.stderr.write('FIN!\n')

Processing first batch of 20 https://kf-api-dataservice.kidsfirstdrc.org/participants?study_id=SD_BHJXBDQK&limit=20
PT_9A2NW4XJ has 0 biospecimen data.  SKIPPING!
PT_FT333PXW has 0 biospecimen data.  SKIPPING!
PT_XJX2AA4N has 0 biospecimen data.  SKIPPING!
PT_T5QQQ1TA has 0 biospecimen data.  SKIPPING!
PT_F5Y27VCN has 0 biospecimen data.  SKIPPING!
PT_Y674TRHE has 0 biospecimen data.  SKIPPING!
PT_CNCJZ4JD has 0 biospecimen data.  SKIPPING!
PT_HYBNXPN8 has 0 biospecimen data.  SKIPPING!
PT_64C7QNBP has 0 biospecimen data.  SKIPPING!
PT_1FP0ZWQB has 0 biospecimen data.  SKIPPING!
PT_NCF47KJM has 0 biospecimen data.  SKIPPING!
PT_GSS468PJ has 0 biospecimen data.  SKIPPING!
PT_QD35M3XB has 0 biospecimen data.  SKIPPING!
PT_XEVP2MH2 has 0 biospecimen data.  SKIPPING!
PT_AA0J9GBF has 0 biospecimen data.  SKIPPING!
PT_QVCXSYEJ has 0 biospecimen data.  SKIPPING!
PT_B1M7TA7S has 0 biospecimen data.  SKIPPING!
PT_SC8E6AQX has 0 biospecimen data.  SKIPPING!
Processing next batch of 20 https://kf

## bottom-up metadata workflow using input table

In [51]:
# Bottom-up workflow: each input row describes a tumor/normal pair. From the
# way the row is indexed below, column 2 is the tumor biospecimen id,
# column 5 the normal biospecimen id and columns 7-9 hold up to three
# diagnoses, with '0' marking an unused slot.
pt_used = {}
try:
    with open('/Users/brownm28/Documents/2018-Aug-15_cbttc_launch/portal_metadata/atrt_test.txt') as master_table:
        for line in master_table:
            sys.stderr.write('Processing line ' + line)
            info = line.rstrip('\n').split('\t')
            for i in range(3):
                cur_dx = info[7 + i]
                if cur_dx == '0':
                    continue
                # check if disease-specific out files are open, initialize if not
                bio_dx = study_dict[cur_dx]
                if bio_dx not in pt_fh:
                    (pt_fh, samp_fh) = init_outputs(bio_dx, pt_fh, samp_fh, pt_head, samp_head)
                tum_bs_id = info[2]
                norm_bs_id = info[5]
                cancer_type = cur_dx
                tumor_type = 'primary'
                if cur_dx == 'Metastatic secondary tumors':
                    tumor_type = 'metastatic'
                (ethnicity, other_sample_id, gender, race, pt_id) = parse_pt(url, tum_bs_id)
                # The normal lookup overwrites the demographic fields; only
                # its external sample id is new information (assuming both
                # specimens belong to the same participant).
                (ethnicity, norm_other_sample_id, gender, race, pt_id) = parse_pt(url, norm_bs_id)
                try:
                    (age, location) = parse_dx(url, tum_bs_id, cur_dx)
                except Exception:
                    # Includes the TypeError raised when parse_dx finds no
                    # matching diagnosis and returns None.
                    sys.stderr.write('Could not process dx for ' + tum_bs_id + ' ' + cur_dx + '\n')
                    sys.exit(1)
                if pt_id not in pt_used:
                    # Write each patient once, even with several samples.
                    pt_fh[bio_dx].write('\t'.join((pt_id, gender, age, location, race, ethnicity)) + '\n')
                    pt_used[pt_id] = 1
                # Column order follows samp_head: PATIENT_ID, SAMPLE_ID
                # (external sample id), SPECIMEN_ID (biospecimen id),
                # CANCER_TYPE, CANCER_TYPE_DETAILED, TUMOR_TISSUE_SITE,
                # TUMOR_TYPE, MATCHED_NORMAL_SAMPLE_ID (external id),
                # MATCHED_NORMAL_SPECIMEN_ID (biospecimen id).
                samp_fh[bio_dx].write('\t'.join((
                    pt_id,
                    other_sample_id,
                    tum_bs_id,
                    cancer_type,
                    cancer_type,
                    location,
                    tumor_type,
                    norm_other_sample_id,
                    norm_bs_id,
                )) + '\n')
finally:
    # Close every per-diagnosis output, also when processing aborts early.
    for bio_dx in pt_fh:
        pt_fh[bio_dx].close()
        samp_fh[bio_dx].close()

Processing line C921639	7316-2276	BS_0FQKT8EY	0f3d9f4f-3d81-48ca-8780-9d5c7bdc8440.cram	7316-2276	BS_S2N3XTKS	83ea5d1e-02a8-4f68-ac5f-65e8a8e82d0e.cram	Atypical Teratoid Rhabdoid Tumor (ATRT)	0	0	cbttc-dna-launch-BS_0FQKT8EY_BS_S2N3XTKS	08296b5c-5a27-425d-ae2a-03c2ccbbf86b
Processing line C85854	7316-386	BS_GTAJFTTR	35aca629-94f5-4274-9269-71e604ac6cbd.cram	7316-386	BS_0PB4PTHT	fa8227bb-8c65-427f-a90b-99239112d2f1.cram	Atypical Teratoid Rhabdoid Tumor (ATRT)	0	0	cbttc-dna-launch-BS_GTAJFTTR_BS_0PB4PTHT	2618bbb5-78fe-40b1-8376-7401f6b363d1
Processing line C374412	7316-1771	BS_859AV1DB	194f12cd-a548-47f9-8603-83199f06d041.cram	7316-1771	BS_1DM8G1V8	c511fa28-6ef3-4051-864f-a98f1d5d41d7.cram	Atypical Teratoid Rhabdoid Tumor (ATRT)	0	0	cbttc-dna-launch-BS_859AV1DB_BS_1DM8G1V8	2f398eb5-a62f-4888-9565-9c68b25daf21
Processing line C886092	7316-2269	BS_ZAT0XJE5	2858e25a-2d27-4882-88fc-01844e8b6a44.cram	7316-2269	BS_GERET530	9706a97e-6eba-4c34-bdec-3d352da0a670.cram	Atypical Teratoid Rhabdoid Tu