# Run FQ Input alignment pipeline for Rios-Wise project

## Set up imports

In [1]:
#!/usr/bin/env python3
import sevenbridges as sbg
import sys
import re
from requests import request

config = sbg.Config(profile='cavatica')
api = sbg.Api(config=config)

### Get Read Group Info

In [2]:
def get_rg(url, bs_id, fq_list):
    # fq _list is a reference to dict by bs_id
    #HHNVFALXX_s4_1_GSLv3-7_76_SL248476.fastq.gz
    #HHNVFALXX_s4_2_GSLv3-7_76_SL248476.fastq.gz
    #'@RG\tID:HHNVFALXX_s4_GSLv3-7_76_SL248476\tLB:SL248476\tPL:ILLUMINA\tSM:BS_5QS5KRKF'
    rg_common = '\\tPL:ILLUMINA\\tSM:' + bs_id
    bs_url = url + '/biospecimens/' + bs_id
    bs_info = request('GET', bs_url)
    bs_gf_link = url + bs_info.json()['_links']['biospecimen_genomic_files']
    rel_gfs = request('GET', bs_gf_link)
    temp = {}
    for gf_obj in rel_gfs.json()['results']:
        gf_link = url + gf_obj['_links']['genomic_file']
        gf_obj = request('GET', gf_link)
        res = gf_obj.json()['results']
        fname_parts = res['file_name'].split('_')
        if res['file_format'] == 'fastq':
            f_root = '_'.join((fname_parts[0:2])) + '_' + '_'.join(fname_parts[3:-1]) + '_' + fname_parts[-1].split('.')[0]
            # validate that file root name matches ext ID in rg entry
            rg_link = url + gf_obj.json()['_links']['read_groups']
            rg_info = request('GET', rg_link)
            if rg_info.json()['results'][0]['external_id'] != f_root:
                sys.stderr.write('Ext ID ' + rg_info.json()['results'][0]['external_id'] + ' does not match predicted ' + f_root + ' for bs id ' + bs_id + ', rethink your life\n')
                exit(1)
            else:
                temp[f_root] = 1
                se_link = url + gf_obj.json()['_links']['sequencing_experiment']
                se_info = request('GET', se_link)
                lb = se_info.json()['results']['library_name']
                rg = '@RG\\tID:' + f_root + '\\tLB:' + lb + rg_common
                print ('\t'.join([bs_id, f_root, ','.join(fq_list[bs_id][f_root]['files']), rg]))
                fq_list[bs_id][f_root]['rg'] = rg
    return fq_list
            
            

### Create task

In [9]:
def setup_task(api, project, fq_dict, bs_id, indexed_reference_fasta, knownsites, reference_dict, wgs_coverage_interval_list):
    task_name = 'alignment-' + bs_id
    app_name = project + '/kfdrc-alignment-fqinput-cram-only-wf'
    
    inputs = {}
    inputs['files_R1'] = []
    inputs['files_R2'] = []
    inputs['rgs'] = []
    inputs['indexed_reference_fasta'] = indexed_reference_fasta
    inputs['knownsites'] = knownsites
    inputs['reference_dict'] = reference_dict
    inputs['wgs_coverage_interval_list'] = wgs_coverage_interval_list
    for fq_root in fq_dict:
        fq_dict[fq_root]['files'].sort()
        fq1 = fq_dict[fq_root]['files'][0]
        fq2 = fq_dict[fq_root]['files'][1]
        inputs['files_R1'].append(api.files.query(project=project, names=[fq1])[0])
        inputs['files_R2'].append(api.files.query(project=project, names=[fq2])[0])
        inputs['rgs'].append(fq_dict[fq_root]['rg'])

    #try:
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
    task.inputs['output_basename'] = task.id
    task.save()
    print ('Task name:' + task.name, 'Task ID:' + task.id)
    #except:
    #    print('Could not create task for ' + task_name + '!\n')
    

### Get refs

In [10]:
def get_refs(api):
    indexed_reference_fasta = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    knownsites = []
    ks_files = ['1000G_phase1.snps.high_confidence.hg38.vcf.gz', 'Homo_sapiens_assembly38.known_indels.vcf.gz', 'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz']
    for ks in ks_files:
        knownsites.append(api.files.query(project=project, names=[ks])[0])
    reference_dict = api.files.query(project=project, names=['Homo_sapiens_assembly38.dict'])[0]
    wgs_coverage_interval_list = api.files.query(project=project, names=['wgs_coverage_regions.hg38.interval_list'])[0]
    return indexed_reference_fasta, knownsites, reference_dict, wgs_coverage_interval_list

## Main setup

In [13]:
url = 'https://kf-api-dataservice.kidsfirstdrc.org'
bs_fn_pairs = open('/Users/brownm28/Documents/2018-Sep-11_Rios-Wise/build_task/test_in.txt')
bs_file_dict = {}
project = 'kfdrc-harmonization/sd-rm8afw0r'
(indexed_reference_fasta, knownsites, reference_dict, wgs_coverage_interval_list) = get_refs(api)
for pair in bs_fn_pairs:
    (bs_id, fq_name) = pair.rstrip('\n').split('\t')
    fname_parts = fq_name.split('_')
    f_root = '_'.join((fname_parts[0:2])) + '_' + '_'.join(fname_parts[3:-1]) + '_' + fname_parts[-1].split('.')[0]
    print(f_root)
    if bs_id not in bs_file_dict:
        bs_file_dict[bs_id] = {}
    if f_root not in bs_file_dict[bs_id]:
        bs_file_dict[bs_id][f_root] = {}
        bs_file_dict[bs_id][f_root]['files'] = []
    bs_file_dict[bs_id][f_root]['files'].append(fq_name)
bs_fn_pairs.close()

for bs_id in bs_file_dict:
    bs_file_dict = get_rg(url, bs_id, bs_file_dict)
    setup_task(api, project, bs_file_dict[bs_id], bs_id, indexed_reference_fasta, knownsites, reference_dict, wgs_coverage_interval_list)

    

HHT52ALXX_s8_GSLv3-7_91_SL248491
HHT52ALXX_s8_GSLv3-7_91_SL248491
HL5GNALXX_s3_GSLv3-7_91_SL248491
HL5GNALXX_s3_GSLv3-7_91_SL248491
HHNW2ALXX_s8_GSLv3-7_72_SL248472
HHNW2ALXX_s8_GSLv3-7_72_SL248472
HL5GNALXX_s3_GSLv3-7_72_SL248472
HL5GNALXX_s3_GSLv3-7_72_SL248472
BS_082CXWXG	HHT52ALXX_s8_GSLv3-7_91_SL248491	HHT52ALXX_s8_1_GSLv3-7_91_SL248491.fastq.gz,HHT52ALXX_s8_2_GSLv3-7_91_SL248491.fastq.gz	@RG\tID:HHT52ALXX_s8_GSLv3-7_91_SL248491\tLB:SL248491\tPL:ILLUMINA\tSM:BS_082CXWXG
BS_082CXWXG	HHT52ALXX_s8_GSLv3-7_91_SL248491	HHT52ALXX_s8_1_GSLv3-7_91_SL248491.fastq.gz,HHT52ALXX_s8_2_GSLv3-7_91_SL248491.fastq.gz	@RG\tID:HHT52ALXX_s8_GSLv3-7_91_SL248491\tLB:SL248491\tPL:ILLUMINA\tSM:BS_082CXWXG
BS_082CXWXG	HL5GNALXX_s3_GSLv3-7_91_SL248491	HL5GNALXX_s3_1_GSLv3-7_91_SL248491.fastq.gz,HL5GNALXX_s3_2_GSLv3-7_91_SL248491.fastq.gz	@RG\tID:HL5GNALXX_s3_GSLv3-7_91_SL248491\tLB:SL248491\tPL:ILLUMINA\tSM:BS_082CXWXG
BS_082CXWXG	HL5GNALXX_s3_GSLv3-7_91_SL248491	HL5GNALXX_s3_1_GSLv3-7_91_SL248491.fastq.gz