## NGS CHOP v Hakon

In [4]:
#!/usr/bin/env python3
import sys
import re
import sevenbridges as sbg
import pdb
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import concurrent.futures
# Seven Bridges project (owner/slug) that every cell below works against.
project = 'brownm28/chop-hakon-ngs-check'

# Build the API client from the 'turbo' profile. The two handlers are the
# same objects the original referenced through sbg.http.error_handlers; they
# are already imported by name at the top of this cell.
config = sbg.Config(profile='turbo')
api = sbg.Api(
    config=config,
    error_handlers=[rate_limit_sleeper, maintenance_sleeper],
)

## Draft BCF Calls for Hakon Samples

In [1]:
def get_ref_objs(api):
    """Fetch the fixed reference files used as inputs to the BCF-call app.

    Args:
        api: client object exposing ``files.get(id=...)``.

    Returns:
        dict mapping the input names ``chr_list``, ``reference_fasta`` and
        ``snp_bed`` to whatever ``api.files.get`` returns for each
        hard-coded file ID.
    """
    # (input name, platform file ID) pairs, looked up in this order.
    ref_file_ids = (
        ('chr_list', '5d5ef5fde4b0950c45aab99b'),
        ('reference_fasta', '5d5ef5fde4b0950c45aab99c'),
        ('snp_bed', '5d5ef5fde4b0950c45aab99d'),
    )
    return {name: api.files.get(id=file_id) for name, file_id in ref_file_ids}

In [2]:

# Create draft (not yet running) BCF-call tasks for the Hakon CRAMs listed in
# the manifest, in batches of up to max_j CRAMs per task, and record each
# task's name and ID in a tab-separated log file.

# The first column of every manifest row is the platform file ID; the header
# row is read separately and skipped.
with open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/hakon_cram-manifest.csv') as manifest:
    head = next(manifest)
    cram_list = [line.rstrip('\n').split(',')[0] for line in manifest]

# Resolve manifest IDs to file objects with one pass over the project's files.
# A set keeps each membership test O(1) instead of scanning the whole list.
cram_ids = set(cram_list)
files = api.files.query(project=project).all()
cram_obj_list = [fobj for fobj in files if fobj.id in cram_ids]

app_name = project + '/bcf-call'
ref_obj = get_ref_objs(api)
total = len(cram_obj_list)
ct = 1
max_j = 16  # maximum number of CRAMs submitted per task

# The context manager guarantees the log is flushed and closed even if a
# task-creation call raises part-way through.
with open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/hakon_bcf_tasks.txt', 'w') as bcf_tasks:
    for i in range(0, total, max_j):
        # Copy the reference inputs so the shared ref_obj dict is not mutated
        # from one batch to the next.
        inputs = dict(ref_obj)
        # Slicing already clamps at the end of the list, so the last batch
        # may simply be shorter than max_j.
        inputs['input_align'] = cram_obj_list[i:i + max_j]
        task_name = 'HAKON BCF CALLS: SET' + str(ct)
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
        task.save()
        bcf_tasks.write(task.name + '\t' + task.id + '\n')
        ct += 1

NameError: name 'api' is not defined

### Tag BCF Outputs

In [5]:
# Copy metadata from each task's input files onto the task's output files.
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# Pattern handed to re.search to pick tasks by name.
phrase = "HAKON BCF CALLS:"
# Name of the task input whose files supply the metadata. The code below
# indexes this input, so it must be a list; change the code if it is not.
in_key = 'input_align'
for task in tasks:
    if not re.search(phrase, task.name):
        continue
    sys.stderr.write('Valid task found ' + task.name + '\n')
    fset = task.inputs[in_key]
    for idx, in_file in enumerate(fset):
        metadata = in_file.metadata
        # The idx-th element of every output receives the idx-th input's
        # metadata.
        for out_key in task.outputs:
            try:
                file_obj = api.files.get(task.outputs[out_key][idx])
                for key in metadata:
                    file_obj.metadata[key] = metadata[key]
                file_obj.save()
            except Exception as e:
                # Only this one output element is skipped; the loops carry on
                # with the remaining outputs and inputs of the same task.
                print(e)
                print("Skipping " + task.name + " due to error")

Valid task found HAKON BCF CALLS: SET1
Valid task found HAKON BCF CALLS: SET2
Valid task found HAKON BCF CALLS: SET3
Valid task found HAKON BCF CALLS: SET4
Valid task found HAKON BCF CALLS: SET5
Valid task found HAKON BCF CALLS: SET6
Valid task found HAKON BCF CALLS: SET7
Valid task found HAKON BCF CALLS: SET8
Valid task found HAKON BCF CALLS: SET9
Valid task found HAKON BCF CALLS: SET10
Valid task found HAKON BCF CALLS: SET11
Valid task found HAKON BCF CALLS: SET12
Valid task found HAKON BCF CALLS: SET13
Valid task found HAKON BCF CALLS: SET14
Valid task found HAKON BCF CALLS: SET15
Valid task found HAKON BCF CALLS: SET16
Valid task found HAKON BCF CALLS: SET17
Valid task found HAKON BCF CALLS: SET18
Valid task found HAKON BCF CALLS: SET19
Valid task found HAKON BCF CALLS: SET20
Valid task found HAKON BCF CALLS: SET21
Valid task found HAKON BCF CALLS: SET22
Valid task found HAKON BCF CALLS: SET23
Valid task found HAKON BCF CALLS: SET24
Valid task found HAKON BCF CALLS: SET25
Valid tas

## Pair CHOP-HAKON matches

In [None]:
def populate_vcf(line):
    """Register the file described by one manifest row in a global VCF dict.

    The row is comma-separated: column 0 is passed to ``api.files.get``,
    column 2 is tested for a ``KF`` prefix, and column 12 is used as the
    dictionary key. Rows whose column 2 starts with ``KF`` go into the
    module-level ``hakon_vcfs``; all others go into ``chop_vcfs``.

    Args:
        line: one raw line of the manifest (a trailing newline is allowed).
    """
    columns = line.rstrip('\n').split(',')
    file_id, name, bs_id = columns[0], columns[2], columns[12]
    destination = hakon_vcfs if name.startswith("KF") else chop_vcfs
    destination[bs_id] = api.files.get(file_id)

In [None]:
# Load every VCF listed in the run manifest into chop_vcfs / hakon_vcfs (via
# populate_vcf, fanned out over 8 threads), then create a draft
# ngs-checkmate-wf task.
chop_vcfs = {}
hakon_vcfs = {}

with open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/checkmate_run-manifest.csv') as manifest:
    head = next(manifest)  # header row is skipped
    with concurrent.futures.ThreadPoolExecutor(8) as executor:
        results = {executor.submit(populate_vcf, entry): entry for entry in manifest}
        # A failed lookup inside a worker thread is otherwise invisible: the
        # exception stays in its future. Report each one so a missing VCF is
        # noticed before the task is created.
        for future in concurrent.futures.as_completed(results):
            err = future.exception()
            if err is not None:
                sys.stderr.write('Failed to load VCF for manifest row: '
                                 + results[future].rstrip('\n') + ' (' + str(err) + ')\n')

inputs = {}
inputs['snp_bed'] = api.files.get(id='5d0ba11de4b0359dd9909088')
inputs['input_vcf'] = []
inputs['output_basename'] = []
# NOTE(review): pt_pool is not defined anywhere in the visible notebook, and
# chop_vcfs / hakon_vcfs built above are never read. This loop looks carried
# over from a per-patient pooling notebook; confirm how CHOP and Hakon VCFs
# should be grouped before running this cell.
for pt_id in pt_pool:
    if len(pt_pool[pt_id]) <= 1:
        sys.stderr.write('Only one sample for ' + pt_id + " skipping!\n")
    else:
        inputs['input_vcf'].append(pt_pool[pt_id])
        inputs['output_basename'].append(pt_id + "_pool")
task_name = 'MARIS_CHECKMATE_PT_POOL'
app_name = project + '/ngs-checkmate-wf'
task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=inputs, run=False)
task.save()


In [None]:
## Parse results

In [18]:
def process_list(fname):
    """Return the selected manifest columns for one file.

    Reads the module-level ``file_meta`` (file name -> list of manifest
    columns) and ``hidx`` (indices of the columns to keep).

    Args:
        fname: file name used as the key into ``file_meta``.

    Returns:
        list with ``file_meta[fname][i]`` for every ``i`` in ``hidx``, in
        ``hidx`` order.

    Raises:
        KeyError: ``fname`` is not present in ``file_meta``.
        IndexError: the row for ``fname`` has fewer columns than ``hidx``
            requires.
    """
    try:
        row = file_meta[fname]
        return [row[i] for i in hidx]
    except (KeyError, IndexError) as e:
        # Say which file failed, then let the error propagate. Returning None
        # here would only make the caller fail later with an unrelated
        # TypeError.
        sys.stderr.write('Could not read manifest metadata for ' + str(fname)
                         + ': ' + repr(e) + '\n')
        raise


# Summarise CHOP/Hakon sample matches: for each line of the match file that
# pairs one KF-prefixed (Hakon) file with one non-KF (CHOP) file, write both
# biospecimen IDs, the selected manifest fields as "chop;hakon" pairs, and the
# second-to-last column of the match line (labelled "correlation").

# Manifest columns to carry into the summary.
fields = ("Composition", "Kids First Participant ID", "gender",
          "ethnicity", "race", "sample_id", "Kids First Biospecimen ID", "aliquot_id", "case_id")

with open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/checkmate_run-manifest.csv') as manifest:
    head = next(manifest)
    header = head.rstrip('\n').split(',')
    hidx = []  # manifest column indices of the wanted fields, in manifest order
    bidx = 0   # column index of the biospecimen ID (stays 0 if the column is absent)
    for i, column in enumerate(header):
        if column in fields:
            hidx.append(i)
            if column == 'Kids First Biospecimen ID':
                bidx = i
    # Column 1 of each manifest row is the key that match-file entries are
    # looked up by.
    file_meta = {}
    for line in manifest:
        info = line.rstrip('\n').split(',')
        file_meta[info[1]] = info

# Data columns are emitted in manifest order (hidx), so the header labels are
# taken from the manifest header as well. Joining `fields` directly would
# mislabel columns whenever the manifest order differs from the tuple order or
# a field is missing from the manifest.
summary_columns = [header[i] for i in hidx]

with open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/chop-hakon_match_summary.txt', 'w') as out, \
        open('/Users/brownm28/Documents/2019-Aug-22_cbttc_chop_hakon_checkmate/CHOP-HAKON_comparison_MATCH_ONLY.txt') as match_file:
    out.write("CHOP BS ID\tHakon BS ID\t" + '\t'.join(summary_columns) + '\tcorrelation\n')
    for line in match_file:
        info = line.rstrip('\n').split('\t')
        first_is_hakon = info[0][0:2] == 'KF'
        second_is_hakon = info[2][0:2] == 'KF'
        # Keep only cross-cohort pairs: exactly one side carries the KF prefix.
        if first_is_hakon == second_is_hakon:
            continue
        if first_is_hakon:
            hakon_name, chop_name = info[0], info[2]
        else:
            hakon_name, chop_name = info[2], info[0]
        hakon_info = process_list(hakon_name)
        chop_info = process_list(chop_name)
        row = [file_meta[chop_name][bidx], file_meta[hakon_name][bidx]]
        row.extend(chop_val + ";" + hakon_val
                   for chop_val, hakon_val in zip(chop_info, hakon_info))
        row.append(info[-2])
        out.write('\t'.join(row) + '\n')
        
