# Compare a study's genomic file names in the data service with the FASTQ and BAM file names on Cavatica

In [38]:
def process_bs_batch(url, bs_res, f_dict, b_dict):
    """Record the fastq/bam genomic file names linked to a batch of biospecimens.

    For every biospecimen in ``bs_res`` the biospecimen-genomic-file links are
    fetched, each linked genomic file is fetched in turn, and files whose
    ``file_format`` is ``'fastq'`` or ``'bam'`` are recorded.

    Args:
        url: Base URL that the relative ``_links`` paths are appended to.
        bs_res: List of biospecimen result dicts; each must provide ``kf_id``
            and ``_links['biospecimen_genomic_files']``.
        f_dict: Dict keyed by file name; updated in place (value set to 0).
        b_dict: Dict mapping biospecimen id -> list of file names; updated in
            place.

    Returns:
        The same ``(f_dict, b_dict)`` objects that were passed in.
    """
    if not bs_res:
        sys.stderr.write('WARN: above patient has 0 biospecimens!\n')
        return f_dict, b_dict

    for bs in bs_res:
        cur_bs_id = bs['kf_id']
        if cur_bs_id in b_dict:
            sys.stderr.write('ERROR! ' + cur_bs_id + ' already seen.  Call the po-lice!\n')
        else:
            b_dict[cur_bs_id] = []

        # NOTE(review): only a single page of up to 100 links is read here; a
        # 'next' link on this response (if the service returns one) is ignored.
        gf_link = url + bs['_links']['biospecimen_genomic_files'] + '&limit=100'
        gf_res = request('GET', gf_link).json()['results']
        if not gf_res:
            # Report the biospecimen id: concatenating the whole ``bs`` dict
            # into the message raised TypeError.
            sys.stderr.write('WARN: ' + cur_bs_id + ' has 0 associated genomic files\n')
            continue

        for cur_gf_res in gf_res:
            file_url = url + cur_gf_res['_links']['genomic_file']
            # Parse the response body once and reuse it for both fields.
            gf_info = request('GET', file_url).json()['results']
            fn = gf_info['file_name']
            f_type = gf_info['file_format']
            if f_type not in ('fastq', 'bam'):
                continue
            if fn in f_dict:
                sys.stderr.write('ERROR! ' + fn + ' already seen.  Call the fire dept!\n')
            # A duplicate is reported but still recorded.
            f_dict[fn] = 0
            b_dict[cur_bs_id].append(fn)
    return f_dict, b_dict
            
            

In [34]:
def process_pt_batch(url, results_obj, f_dict, b_dict):
    """Process every participant in one page of participant results.

    For each participant the biospecimen list is fetched and handed to
    ``process_bs_batch``, which fills the two dicts.

    Args:
        url: Base URL that the relative ``_links`` paths are appended to.
        results_obj: List of participant result dicts; each must provide
            ``kf_id`` and ``_links['biospecimens']``.
        f_dict: Dict keyed by genomic file name, passed through to
            ``process_bs_batch``.
        b_dict: Dict mapping biospecimen id -> list of file names, passed
            through to ``process_bs_batch``.

    Returns:
        ``(f_dict, b_dict)`` after all participants have been processed. For an
        empty ``results_obj`` the inputs are returned unchanged.
    """
    for res in results_obj:
        sys.stderr.write('Processing patient ' + res['kf_id'] + '\n')
        bs_link = url + res['_links']['biospecimens']
        bs_res = request('GET', bs_link)
        # Rebind the parameters themselves so the return below is always
        # defined, even when the loop body never runs.
        f_dict, b_dict = process_bs_batch(url, bs_res.json()['results'], f_dict, b_dict)
    return f_dict, b_dict

In [35]:
def luke_study_walker(url, study_id, lim=20, max_pt=40):
    """Walk the participants of a study page by page and collect file names.

    Participants are requested ``lim`` at a time, following the ``next`` link
    of each response, and every page is handed to ``process_pt_batch``.

    Args:
        url: Base URL of the data service.
        study_id: Study identifier used in the ``study_id`` query parameter.
        lim: Page size requested from the service (default 20).
        max_pt: "Test mode" cap. Paging stops once ``lim`` times the number of
            pages requested reaches this value. The default of 40 keeps the
            original behavior (at most two pages, so the walk is incomplete
            for larger studies); pass ``None`` to walk the whole study.

    Returns:
        ``(file_dict, bs_id_dict)``: the file-name dict and the
        biospecimen-id -> file-name-list dict filled by ``process_pt_batch``.
    """
    file_dict = {}
    bs_id_dict = {}

    init_link = url + '/participants?study_id=' + study_id + '&limit=' + str(lim)
    sys.stderr.write('Processing first batch of ' + str(lim) + ' ' + init_link + '\n')
    page = request('GET', init_link).json()
    links = page['_links']
    next_link = url + links['next'] + '&limit=' + str(lim) if 'next' in links else None
    # The dicts are updated in place, so the return value is not needed here.
    process_pt_batch(url, page['results'], file_dict, bs_id_dict)

    requested = lim  # participants requested so far (pages * lim)
    while next_link is not None:
        requested += lim
        sys.stderr.write('Processing next batch of ' + str(lim) + ' ' + next_link + '\n')
        page = request('GET', next_link).json()
        links = page['_links']
        if 'next' in links:
            next_link = url + links['next'] + '&limit=' + str(lim)
        else:
            next_link = None
            sys.stderr.write('Last batch\n')
        if max_pt is not None and requested >= max_pt:
            # Test-mode cut-off: stop paging even if more pages remain.
            next_link = None
            sys.stderr.write('Last batch TEST MODE\n')
        (file_dict, bs_id_dict) = process_pt_batch(url, page['results'], file_dict, bs_id_dict)
    return file_dict, bs_id_dict
    

In [57]:
def check_cavatica(project, api, ds_dict,
                   out_path='/Users/brownm28/Documents/2018-Sep-11_Rios-Wise/cavatica_file_list.txt'):
    """Cross-check bam/fastq file names in a Cavatica project against ``ds_dict``.

    Every project file whose name ends in ``.bam`` or whose second-to-last
    dot-separated part is ``fastq`` (e.g. ``x.fastq.gz``) is written to
    ``out_path``. Names found in ``ds_dict`` have their counter incremented;
    names that are missing are reported on stderr. Finally every ``ds_dict``
    entry whose counter is still below 2 is reported as absent from Cavatica.

    Args:
        project: Project identifier passed to ``api.files.query``.
        api: Client object exposing ``files.query(project=...).all()``, which
            yields objects with a ``name`` attribute.
        ds_dict: Dict of file name -> counter, updated in place.
            NOTE(review): the ``< 2`` check only works if matched entries
            arrive with a counter of 1; an entry arriving at 0 is reported as
            missing even when it is found in the project.
        out_path: File that the matching Cavatica file names are written to,
            one per line. Defaults to the original hard-coded location.
    """
    proj_files = api.files.query(project=project).all()
    # ``with`` closes the listing even if iteration or a write fails.
    with open(out_path, 'w') as cav_out:
        for f_obj in proj_files:
            fn = f_obj.name
            parts = fn.split('.')
            is_bam = parts[-1] == 'bam'
            # Guard the [-2] lookup: a name without any '.' has a single part
            # and would otherwise raise IndexError.
            is_fastq = len(parts) >= 2 and parts[-2] == 'fastq'
            if not (is_bam or is_fastq):
                continue
            cav_out.write(fn + '\n')
            if fn in ds_dict:
                ds_dict[fn] += 1
            else:
                sys.stderr.write('ERROR: ' + fn + ' in cavatica, but not dataservice. Call Magnum PI!\n')
    for fn in ds_dict:
        if ds_dict[fn] < 2:
            sys.stderr.write('ERROR: '+ fn + ' in dataservice, but not cavcatica\n')

## Imports

In [36]:
import sevenbridges as sbg
import sys
from requests import request
# Build the Seven Bridges API client used by check_cavatica from the profile
# named 'cavatica' (presumably defined in the local Seven Bridges credentials
# file -- confirm against the sevenbridges-python docs).
config = sbg.Config(profile='cavatica')
api = sbg.Api(config=config)

In [58]:
# Driver: walk the study in the data service, dump the biospecimen -> file
# listing, sanity-check the two dicts against each other, then compare the
# data service file names with the files present in the Cavatica project.

# No trailing slash: the paths appended to this base already start with '/',
# and a trailing slash here produced '//participants...' request URLs.
kf_url = 'https://kf-api-dataservice.kidsfirstdrc.org'
project = 'kfdrc-harmonization/sd-rm8afw0r'
study_id = 'SD_RM8AFW0R'
(fn_ds_dict, ds_bs_dict) = luke_study_walker(kf_url, study_id)
sys.stderr.write('Completed strolling through data service.  Check and outputting data service dict\n')
# ``with`` guarantees the listing is flushed and closed even if a write fails.
with open('/Users/brownm28/Documents/2018-Sep-11_Rios-Wise/data_service_gf_files.txt', 'w') as ds_out:
    for bs_id in ds_bs_dict:
        for fn in ds_bs_dict[bs_id]:
            if fn not in fn_ds_dict:
                sys.stderr.write('Missed adding ' + fn + ' in bs dict\n')
            else:
                # Mark the file as reachable from a biospecimen; check_cavatica
                # later bumps this to 2 when the file also exists on Cavatica.
                fn_ds_dict[fn] = 1
            ds_out.write(bs_id + '\t' + fn + '\n')
sys.stderr.write('Checking file dict\n')
for fn in fn_ds_dict:
    # A value still at 0 means no biospecimen entry listed this file.
    if fn_ds_dict[fn] == 0:
        sys.stderr.write('Missing adding ' + fn + ' in file dict\n')
sys.stderr.write('Searching cavatica and validating files\n')
check_cavatica(project, api, fn_ds_dict)
sys.stderr.write('FIN!\n')


Processing first batch of 20 https://kf-api-dataservice.kidsfirstdrc.org//participants?study_id=SD_RM8AFW0R&limit=20
Processing patient PT_VVP2BNC9
Processing patient PT_V92HV7HN
Processing patient PT_C31VEA3C
Processing patient PT_1X0DDYKF
Processing patient PT_P80QME5E
Processing patient PT_72YB2MRN
WARN: above patient has 0 biospecimens!
Processing patient PT_DFT82PKP
Processing patient PT_72G0CFQP
Processing patient PT_VZMKSQG7
Processing patient PT_5Q324KXA
Processing patient PT_Q0JDGATG
Processing patient PT_2FTTHKQ6
Processing patient PT_A6ZPXWVS
Processing patient PT_4JGBCKCA
Processing patient PT_95115C3T
Processing patient PT_VFX5WJN5
Processing patient PT_6JKM29AN
Processing patient PT_2V85FFTE
Processing patient PT_YCYGQFHV
Processing patient PT_QZQ0XC5K
Processing next batch of 20 https://kf-api-dataservice.kidsfirstdrc.org//participants?after=1536072029.309735&study_id=SD_RM8AFW0R&limit=20
Last batch TEST MODE
Processing patient PT_Q4CA5EFM
Processing patient PT_5RW5XM04


ERROR: HGNKMALXX_s6_1_GSLv3-7_70_SL243522.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HGNKMALXX_s6_2_GSLv3-7_70_SL243522.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HGNKMALXX_s7_1_GSLv3-7_71_SL243523.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HGNKMALXX_s7_2_GSLv3-7_71_SL243523.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HGNKMALXX_s8_1_GSLv3-7_72_SL243524.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HGNKMALXX_s8_2_GSLv3-7_72_SL243524.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HHFHJALXX_s3_1_GSLv3-7_43_SL248443.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HHFHJALXX_s3_2_GSLv3-7_43_SL248443.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HHFHJALXX_s4_1_GSLv3-7_44_SL248444.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HHFHJALXX_s4_2_GSLv3-7_44_SL248444.fastq.gz in cavatica, but not dataservice. Call M

ERROR: HJ5W2ALXX_s7_1_GSLv3-7_31_SL243483.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJ5W2ALXX_s7_2_GSLv3-7_31_SL243483.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJ5W2ALXX_s8_1_GSLv3-7_32_SL243484.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJ5W2ALXX_s8_2_GSLv3-7_32_SL243484.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJGNVALXX_s8_1_GSLv3-7_01_SL250490.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJGNVALXX_s8_2_GSLv3-7_01_SL250490.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJHGNALXX_s1_1_GSLv3-7_18_SL250507.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJHGNALXX_s1_2_GSLv3-7_18_SL250507.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJHGNALXX_s2_1_GSLv3-7_19_SL250508.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HJHGNALXX_s2_2_GSLv3-7_19_SL250508.fastq.gz in cavatica, but not dataservice. Call M

ERROR: HL33WALXX_s6_2_GSLv3-7_64_SL248464.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL33WALXX_s7_1_GSLv3-7_81_SL248481.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL33WALXX_s7_2_GSLv3-7_81_SL248481.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL33WALXX_s8_1_GSLv3-7_45_SL248445.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL33WALXX_s8_2_GSLv3-7_45_SL248445.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL3CCALXX_s1_1_GSLv3-7_08_SL250497.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL3CCALXX_s1_1_GSLv3-7_09_SL250498.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL3CCALXX_s1_1_GSLv3-7_90_SL250579.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL3CCALXX_s1_2_GSLv3-7_08_SL250497.fastq.gz in cavatica, but not dataservice. Call Magnum PI!
ERROR: HL3CCALXX_s1_2_GSLv3-7_09_SL250498.fastq.gz in cavatica, but not dataservice. Call M

ERROR: SL243491.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243492.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243493.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243494.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243495.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243496.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243497.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243498.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243499.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243500.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243501.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243502.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243503.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243504.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL243505.bam 

ERROR: SL250525.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250526.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250527.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250528.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250529.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250530.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250531.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250532.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250533.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250534.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250535.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250536.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250537.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250538.bam in cavatica, but not dataservice. Call Magnum PI!
ERROR: SL250539.bam 