In [14]:
import sys
from requests import request
import os
import math
import pdb

## Create master data sample and patient sheets

In [24]:
# Lookup tables filled from the audit input sheets read below.
dna_pairs = {}      # tumor bs id -> paired normal bs id (non-RNA rows of the cavatica sheet)
rna_data = {}       # RNA bs ids run on cavatica (the value is just a placeholder 0)
norm_samp_id = {}   # normal bs id -> column 7 of the normal sheet (used later as the normal sample ID)
dx_index = {}       # cbttc diagnosis -> cbio short name
blacklist = {}      # flagged ID -> reason it was flagged

# Every input and output lives under the same audit directory, so build each path from it.
audit_dir = '/Users/brownm28/Documents/2018-Aug-15_cbttc_launch/portal_metadata/check_and_balances/audit/cavatica-up_audit/'
out_dir = audit_dir + 'data_sheets/'
cav_fn = audit_dir + 'cbttc_cavatica_file_info.txt'
norm_fn = audit_dir + 'norm_bs_ds_info.txt'
tum_info = audit_dir + 'tum_bs_ds_info.txt'
dx_fn = audit_dir + 'dx_index.txt'

# gathering DNA somatic bs id pairs and RNA bs ids run on cavatica
with open(cav_fn) as cav_fh:
    # skip the header row; the default keeps an empty file from raising StopIteration
    next(cav_fh, None)
    for line in cav_fh:
        info = line.rstrip('\n').split('\t')
        bs_id = info[0]
        if info[4] == 'RNA':
            rna_data[bs_id] = 0
        elif bs_id not in dna_pairs:
            dna_pairs[bs_id] = info[1]
        else:
            # keep the first pairing that was seen and only record the conflict
            blacklist[bs_id] = 'Double norm'
            sys.stderr.write('WARN: tumor bs id ' + bs_id + ' already associated with a normal sample.  Skipping!\n')

# gathering norm sample IDS
with open(norm_fn) as norm_fh:
    next(norm_fh, None)
    for line in norm_fh:
        info = line.rstrip('\n').split('\t')
        norm_samp_id[info[0]] = info[6]

# create a cbttc dx-to-cbio index
with open(dx_fn) as dx_fh:
    next(dx_fh, None)
    for line in dx_fh:
        # each row must have exactly three tab-separated fields, otherwise this unpack raises ValueError
        (cbttc, cbio_short, cbio_long) = line.rstrip('\n').split('\t')
        dx_index[cbttc] = cbio_short

# Opened last and deliberately left open: the collation step that follows reads from this handle.
tum_fh = open(tum_info)
# has all info necessary to flatten by pt and sample, key hierarchy: dx -> pt_id -> pt_attrs, samples -> sample_id -> samp attrs
master_dict = {}
# IDs that fail certain criteria (e.g. a sample ID that is not unique per patient, per analyte type)
# are recorded in blacklist together with the reason

# collate tum info with DNA pair info and norm info by dx
# `with` guarantees the tumor sheet handle is closed, even if a row fails to parse
with tum_fh:
    # skip the header row; the default keeps an empty file from raising StopIteration
    next(tum_fh, None)
    sys.stderr.write('Collating information for data sheet\n')
    for line in tum_fh:
        info = line.rstrip('\n').split('\t')
        bs_id = info[0]
        pt_id = info[1]
        ana_type = info[3]
        samp_type = info[5]
        # drop everything after the first '.' of the external sample ID
        samp_id = info[6].split('.')[0]
        if samp_type == 'Derived Cell Line':
            samp_id += '-CL'
        age = ''
        if info[7] != 'NULL':
            # presumably an age in days converted to whole years -- confirm against the source sheet.
            # int() keeps the string free of a trailing '.0' where math.floor returns a float.
            age = str(int(math.floor(float(info[7]) / 365.25)))
        # columns 13 and 14 are ';'-separated lists indexed in parallel (dx and tumor location)
        cdx_list = info[12].split(';')
        loc_list = info[13].split(';')
        for i, cdx in enumerate(cdx_list):
            if cdx == '' or cdx not in dx_index:
                # NOTE(review): this reason string ends in a newline unlike the other blacklist
                # reasons; left unchanged because its consumer is not visible here
                blacklist[bs_id] = 'No dx\n'
                sys.stderr.write('WARN: biospecimen ' + bs_id + ' with dx ' + cdx + ' is invalid, skipping!\n')
                continue

            cur_dx = dx_index[cdx]
            dx_pts = master_dict.setdefault(cur_dx, {})
            if pt_id not in dx_pts:
                # patient-level attributes are taken from the first row seen for this patient/dx
                dx_pts[pt_id] = {
                    'age': age,
                    'gender': info[9],
                    'ethnicity': info[10],
                    'race': info[11],
                    'tumor_site': loc_list[i],
                    'samples': {},
                }
            cur_pt = dx_pts[pt_id]
            # keep the youngest known age for the patient
            if age != '' and (cur_pt['age'] == '' or int(age) < int(cur_pt['age'])):
                cur_pt['age'] = age

            cur_samp = cur_pt['samples'].setdefault(samp_id, {})
            if ana_type in cur_samp:
                blacklist[samp_id] = 'Double samp id'
                sys.stderr.write('WARN: Two biospecimens of the same analyte type share sample ID ' + samp_id + ', skipping!\n')
                continue

            if cdx == 'Metastatic secondary tumors':
                tumor_type = 'metastatic'
            else:
                tumor_type = 'primary'
            samp_attrs = cur_samp[ana_type] = {
                'specimen_id': bs_id,
                'tumor_site': loc_list[i],
                'cancer_type': cdx,
                'tumor_type': tumor_type,
                'sample_type': samp_type,
            }
            if ana_type == 'DNA':
                # raises KeyError when the tumor has no pairing in dna_pairs, or when its
                # normal bs id is absent from norm_samp_id
                norm_spec = dna_pairs[bs_id]
                norm_samp = norm_samp_id[norm_spec]
                samp_attrs['matched_norm_samp'] = norm_samp
                samp_attrs['matched_norm_spec'] = norm_spec

sys.stderr.write('Creating dx-specific data sheets\n')

# Each header is five tab-separated rows: display names, descriptions, data types,
# a row of 1s, and finally the column IDs. Rows are listed field by field and joined below.
_pt_head_rows = [
    ['#Patient Identifier', 'GENDER', 'AGE', 'TUMOR_SITE', 'RACE', 'ETHNICITY'],
    ['#Patient identifier',
     'Gender or sex of the patient',
     'Age at which the condition or disease was first diagnosed, in years',
     'Tumor location',
     'racial demographic',
     'ethnic demographic'],
    ['#STRING', 'STRING', 'NUMBER', 'STRING', 'STRING', 'STRING'],
    ['#1'] + ['1'] * 5,
    ['PATIENT_ID', 'GENDER', 'AGE', 'TUMOR_SITE', 'RACE', 'ETHNICITY'],
]
pt_head = ''.join(['\t'.join(_row) + '\n' for _row in _pt_head_rows])

# IMPORTANT! will use external sample id as sample id, and bs id as a specimen id
_samp_head_rows = [
    ['#Patient Identifier', 'Sample Identifier', 'SPECIMEN_ID', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED',
     'TUMOR_TISSUE_SITE', 'TUMOR_TYPE', 'SAMPLE_TYPE', 'MATCHED_NORMAL_SAMPLE_ID', 'MATCHED_NORMAL_SPECIMEN_ID'],
    ['#Patient identifier',
     'Sample Identifier using external_sample_id',
     'kfdrc tumor biopsecimen ID',
     'Study-defined cancer type',
     'Study-defined cancer type detail',
     'tumor tissue location',
     'primary v metastatic tumor designation',
     'patient tissue sample or cell line',
     'matched normal external_sample_id',
     'kfdrc matched normal biospecimen ID'],
    ['#STRING'] + ['STRING'] * 9,
    ['#1'] + ['1'] * 9,
    ['PATIENT_ID', 'SAMPLE_ID', 'SPECIMEN_ID', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED',
     'TUMOR_TISSUE_SITE', 'TUMOR_TYPE', 'SAMPLE_TYPE', 'MATCHED_NORMAL_SAMPLE_ID', 'MATCHED_NORMAL_SPECIMEN_ID'],
]
samp_head = ''.join(['\t'.join(_row) + '\n' for _row in _samp_head_rows])

# Write one directory per dx under out_dir, holding a sample sheet and a patient sheet.
for dx in master_dict:
    sys.stderr.write('Outputting results for ' + dx + '\n')
    dx_dir = os.path.join(out_dir, dx)
    # the directory is left over whenever this cell is re-run, and a bare os.mkdir would raise
    if not os.path.isdir(dx_dir):
        os.makedirs(dx_dir)
    samp_path = os.path.join(dx_dir, 'data_clinical_sample.txt')
    pt_path = os.path.join(dx_dir, 'data_clinical_patient.txt')
    # `with` closes both sheets even if a row fails part-way through
    with open(samp_path, 'w') as out_samp, open(pt_path, 'w') as out_pt:
        out_samp.write(samp_head)
        out_pt.write(pt_head)
        for pt_id in master_dict[dx]:
            cur_pt = master_dict[dx][pt_id]
            wrote_sample = False
            for samp_id in cur_pt['samples']:
                # NOTE(review): only sample IDs are checked here, while earlier steps also store
                # biospecimen IDs as blacklist keys ('Double norm', 'No dx'); those entries do not
                # filter anything in this loop -- confirm that is intended
                if samp_id in blacklist:
                    continue
                wrote_sample = True
                cur_samp = cur_pt['samples'][samp_id]
                norm_samp = ''
                norm_spec = ''
                cdx = ''
                loc = ''
                tumor_type = ''
                samp_type = ''
                bs_ids = []
                # the DNA record, when present, supplies the descriptive fields and the matched
                # normal; the RNA record is only used for them when there is no DNA record
                desc = None
                if 'DNA' in cur_samp:
                    desc = cur_samp['DNA']
                    bs_ids.append(desc['specimen_id'])
                    norm_samp = desc['matched_norm_samp']
                    norm_spec = desc['matched_norm_spec']
                if 'RNA' in cur_samp:
                    bs_ids.append(cur_samp['RNA']['specimen_id'])
                    if desc is None:
                        desc = cur_samp['RNA']
                if desc is not None:
                    cdx = desc['cancer_type']
                    loc = desc['tumor_site']
                    tumor_type = desc['tumor_type']
                    samp_type = desc['sample_type']
                # cdx is written twice: it fills both CANCER_TYPE and CANCER_TYPE_DETAILED
                out_samp.write('\t'.join((pt_id, samp_id, ';'.join(bs_ids), cdx, cdx, loc, tumor_type, samp_type, norm_samp, norm_spec)) + '\n')
            if wrote_sample:
                out_pt.write('\t'.join((pt_id, cur_pt['gender'], cur_pt['age'], cur_pt['tumor_site'], cur_pt['race'], cur_pt['ethnicity'])) + '\n')
            else:
                sys.stderr.write('WARN: ' + pt_id + ' skipped, all samples were in blacklist\n')
sys.stderr.write('Outputting complete, check files\n')
        

WARN: tumor bs id BS_H1K33JVK already associated with a normal sample.  Skipping!
WARN: tumor bs id BS_5Z4XQC9X already associated with a normal sample.  Skipping!
Collating information for data sheet


KeyError: 'phgg'