## Initialize api and imports

In [1]:
import sevenbridges as sbg
import sys
from requests import request

config = sbg.Config(profile='cavatica')
api = sbg.Api(config=config)

## Setup helper defs

In [8]:
def get_bs_id(url):
    pt_info = request('GET', url)
    return pt_info.json()['results'][0]['kf_id']


def build_ped_entry(url, fam_id, out):
    info = request('GET', url)
    patient_sex = 'unkown'
    ind_id = ''
    paternal_id = ''
    maternal_id = ''
    # currently assumed all probands have phenotype!!! can actually check iof needed
    phenotype = '2'
    sex = {'Male': '1', 'Female': '2'}
    # ped file output format:
    # fam_id\tindividual_id\tpaternal_id\tmaternal_id\tsex<0,1,2>\tphenotype<-9,0,1,2>
    for person in info.json()['results']:
        bs_url = 'http://localhost:1080' + person['_links']['biospecimens']
        bs_id = get_bs_id(bs_url)
        if person['is_proband'] == True:
            if person['gender'] in sex:
                patient_sex = sex[person['gender']]
            ind_id = bs_id
        elif person['gender'] == 'Female':
            maternal_id = bs_id
        else:
            paternal_id = bs_id
    new_ped = open(out, 'w')
    new_ped.write('\t'.join((fam_id, ind_id, paternal_id, maternal_id, patient_sex, phenotype)) + '\n')
    new_ped.close()

def create_task(fam_id, ped_out, api, vcf, project):
    task_name = 'refinement-' + fam_id
    app_name = project + '/gatk-genotype-refinement'
    inputs = {}
    inputs['vqsr_vcf'] = vcf
    inputs['snp_sites'] = api.files.query(project=project, names=['1000G_phase3_v4_20130502.sites.hg38.vcf'])[0]
    inputs['reference'] =  api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    inputs['ped'] =  api.files.query(project=project, names=[ped_out])[0]
    try:
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
        print (task.inputs['vqsr_vcf'].name, fam_id, task.id)
    except SbError:
        print('Could not create task for ' + task_name + '!\n')
    

## Initialize inputs
#### May want to switch to stdin in if running outside of notebook

In [3]:
project = 'brownm28/kf-genotype-refinement-workflow'
tag_search = 'Trio Joint Genotyping'

## Get vcf object list

In [4]:

files = api.files.query(project=project, tags=tag_search).all()
sys.stderr.write('Getting files for ' + project + '\n')
# dir(files)
vcf_list = []
i = 0
for file_obj in files:
    #    for tag in file_obj.tags:
    #        if tag == tag_search and file_obj.name[-7:] == '.vcf.gz':
    if file_obj.name[-7:] == '.vcf.gz':
        if file_obj.name in exclude_dict:
            sys.stderr.write('File ' + file_obj.name + ' in exclude list, skipping!\n')
        else:
            vcf_list.append(file_obj)
            sys.stderr.write('Found relevant file ' + file_obj.name + '\n')
    i += 1
sys.stderr.write('Searched ' + str(i) + ' files for desired criteria\n')

Getting files for brownm28/kf-genotype-refinement-workflow
Found relevant file 5077c39a-d456-437c-89ac-f3854d2575d4.vcf.gz
Found relevant file fc46610e-f873-4998-805c-b2cd76e56c00.vcf.gz
Found relevant file fcc6436b-5874-43d8-98a7-973c421ea96c.vcf.gz
Found relevant file fcd0830b-3fd9-4220-a83e-47d2659605eb.vcf.gz
Found relevant file fceead3c-2b03-46ee-b958-bd5648d1979a.vcf.gz
Found relevant file fe6cb487-f7de-43ba-8090-f68ae0d1fba0.vcf.gz


## Create .ped files and set up cavatica jobs.  Requires data service running

In [9]:
sys.stderr.write('Building and uploading .ped files to project\n')
for vcf in vcf_list:
    fam_id =  vcf.metadata['Kids First ID']
    url = 'http://localhost:1080/participants?family_id=' + fam_id
    ped_out = fam_id + '.ped'
    build_ped_entry(url, fam_id, ped_out)
    sys.stderr.write('Uploading ' + ped_out + ' to cavatica for project ' + project + '\n')
    api.files.upload(project=project, path=ped_out)
    create_task(fam_id, ped_out, api, vcf, project)
sys.stderr.write('Completed setting up tasks, check output logs\n')

Building and uploading .ped files to project
Uploading FM_KXC88W01.ped to cavatica for project brownm28/kf-genotype-refinement-workflow
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/sevenbridges/transfer/upload.py", line 344, in _initialize_upload
    self._URL['upload_init'], data=init_data, params=init_params
  File "/usr/local/lib/python3.6/site-packages/sevenbridges/http/client.py", line 226, in post
    data=data, append_base=append_base)
  File "/usr/local/lib/python3.6/site-packages/sevenbridges/decorators.py", line 134, in wrapper
    raise e
sevenbridges.errors.Conflict: Requested file already exists.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/site-packages/sevenbridg

5077c39a-d456-437c-89ac-f3854d2575d4.vcf.gz FM_KXC88W01 50c5047e-2e59-41aa-9b22-c2e8de103e89


Uploading FM_0RY3PYCS.ped to cavatica for project brownm28/kf-genotype-refinement-workflow


fc46610e-f873-4998-805c-b2cd76e56c00.vcf.gz FM_0RY3PYCS d05e868f-d644-49d9-9d76-5e350513c921


Uploading FM_DFW18WG8.ped to cavatica for project brownm28/kf-genotype-refinement-workflow


fcc6436b-5874-43d8-98a7-973c421ea96c.vcf.gz FM_DFW18WG8 5ffcff4f-8d12-4f30-aa60-1bdd72aface1


Uploading FM_FHTAKY86.ped to cavatica for project brownm28/kf-genotype-refinement-workflow


fcd0830b-3fd9-4220-a83e-47d2659605eb.vcf.gz FM_FHTAKY86 227a59ce-3b22-448b-889d-56040b898070


Uploading FM_35J5Y5NS.ped to cavatica for project brownm28/kf-genotype-refinement-workflow


fceead3c-2b03-46ee-b958-bd5648d1979a.vcf.gz FM_35J5Y5NS 4b8ea9f0-cbbe-41bd-a30a-59192863e288


Uploading FM_ZFMCW3G3.ped to cavatica for project brownm28/kf-genotype-refinement-workflow


fe6cb487-f7de-43ba-8090-f68ae0d1fba0.vcf.gz FM_ZFMCW3G3 1ebd67ac-5c78-415d-ab78-3ce6c2a8621d


In [11]:
project = 'kfdrc-harmonization/sd-9pyzahhe-09'
tid = 'b4f447ad-8e69-4c6a-86ef-ed8892a94cb3'
task = api.tasks.get(id=tid)
print (task.outputs['finalgathervcf'].size)
print (size in task.outputs['finalgathervcf'])

688674957


NameError: name 'size' is not defined