In [1]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
config = sbg.Config(profile='turbo')
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])
project = 'brownm28/mb-controlfreec-troubleshoot'

## Copy files

In [6]:
germ_origin = 'kfdrc-harmonization/sd-dypmehhf'
tumor_origin= 'kfdrc-harmonization/sd-dypmehhf-03/'
somatic_origin= 'kfdrc-harmonization/sd-dypmehhf-05'
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')

case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
for case_id in case_list:
    files = api.files.query(project=somatic_origin, metadata = {'case_id': case_id} )
    for file_obj in files:
        if re.search("p.value.txt$", file_obj.name):
            print("Found valid file " + file_obj.name + " " + file_obj.metadata["Kids First Participant ID"] + " " + file_obj.metadata["sample_type"])
            file_obj.copy(project=project)
            print ("Copied file over")


Found valid file ccc925f9-29ad-4f16-baec-fc955fe37782.CNVs.p.value.txt PT_9QQ37AWW Tumor
Copied file over
Found valid file e322d7c1-1654-481f-86ed-2d2f2c4518ed.CNVs.p.value.txt PT_1EQHANKW Tumor
Copied file over
Found valid file 6bb8fb90-4430-436a-8a8c-2a119f970e69.CNVs.p.value.txt PT_TAJJ9MYY Tumor
Copied file over
Found valid file 151fe94b-d483-41e3-aa0b-c98f52b193fb.CNVs.p.value.txt PT_BXYKW39H Tumor
Copied file over
Found valid file 5e1c7ea8-7032-4f37-80fe-04ce13d9cf4e.CNVs.p.value.txt PT_53M7K3JE Tumor
Copied file over
Found valid file a36f0080-c6a8-4dc8-bbce-f304802a6fad.CNVs.p.value.txt PT_1YAJEAMJ Tumor
Copied file over
Found valid file 60a9dcdc-586c-4281-be37-fbf81c317742.CNVs.p.value.txt PT_R2TRGY6N Tumor
Copied file over
Found valid file b8defbad-85da-4746-9b98-300ecbec8e41.CNVs.p.value.txt PT_KWRFGRER Tumor
Copied file over
Found valid file ec2b7437-801e-4946-9138-3948736c30f6.CNVs.p.value.txt PT_ASH4P45D Tumor
Copied file over
Found valid file bd65eb61-19e3-4b80-804d-7c51a

## HC Run

In [28]:
def get_refs(api):
    ref_dict = {}
    ref_dict['axiomPoly_resource_vcf'] = api.files.query(project=project, names=['Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz'])[0]
    ref_dict['reference_dict'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dict'])[0]
    ref_dict['dbsnp_vcf'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dbsnp138.vcf'])[0]
    ref_dict['hapmap_resource_vcf'] = api.files.query(project=project, names=['hapmap_3.3.hg38.vcf.gz'])[0]
    ref_dict['mills_resource_vcf'] = api.files.query(project=project, names=['Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'])[0]
    ref_dict['omni_resource_vcf'] = api.files.query(project=project, names=['1000G_omni2.5.hg38.vcf.gz'])[0]
    ref_dict['one_thousand_genomes_resource_vcf'] = api.files.query(project=project, names=['1000G_phase1.snps.high_confidence.hg38.vcf.gz'])[0]
    #ref_dict['ref_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['reference_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    #ref_dict['ref_tar_gz'] = api.files.query(project=project, names=['hg38_snpeff.tgz'])[0]
    ref_dict['unpadded_intervals_file'] = api.files.query(project=project, names=['hg38.even.handcurated.20k.intervals'])[0]
    ref_dict['wgs_evaluation_interval_list'] = api.files.query(project=project, names=['wgs_evaluation_regions.hg38.interval_list'])[0]
    ref_dict['snp_sites'] = api.files.query(project=project, names=['1000G_phase3_v4_20130502.sites.hg38.vcf'])[0]
    return ref_dict

In [29]:
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_gvcf_test-manifest.csv')
head = next(manifest)
ref_obj = get_refs(api)
# app_name = project + "/kf-single-genotype/0"
app_name = project + "/kfdrc-single-genotype-basic"
for line in manifest:
    info = line.rstrip('\n').split(',')
    in_dict = {}
    for key in ref_obj:
        in_dict[key] = ref_obj[key]
    in_dict['input_vcfs'] = [api.files.get(info[0])]
    task_name = "SINGLE GENOTYPE GATK: " + info[6] + " " + info[11] + " " + info[-2]
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings = {'use_memoization': True}, run=False)
    task.inputs['output_basename'] = task.id
    
    task.save()

### expand view

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

## ControlFreeC Run

In [13]:
def get_cf_refs(api):
    ref_dict = {}
    ref_dict['chr_len'] = api.files.query(project=project, names=['hs38_chr.len'])[0]
    ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['reference_fai'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta.fai'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS"'
    ref_dict['coeff_var'] = 0.05
    ref_dict['mate_orientation_control'] = "FR"
    ref_dict['mate_orientation_sample'] = "FR"
    ref_dict['ploidy'] = [2,3,4]
    ref_dict['threads'] = 16
    ref_dict['contamination_adjustment'] = True
    return ref_dict

### Test Project Run

In [33]:
app_name = project + '/kfdrc-controlfreec-wf'
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_crams-manifest.csv')
# case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_case_list.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
# b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_ballele-manifest.csv')
b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_vcf-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_cf_refs(api)
head = next(manifest)

bam_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    stype = info[9]
    sample_id = info[11]
    fid = info[0]
    if case_id not in bam_dict:
        bam_dict[case_id] = {}
    bam_dict[case_id][stype] = {}
    bam_dict[case_id][stype]['sid'] = sample_id
    bam_dict[case_id][stype]['file_obj'] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "XY", "Female": "XX"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id]:
            if stype == 'Normal':
                in_dict['input_normal'] = bam_dict[case_id][stype]['file_obj']
                normal = bam_dict[case_id][stype]['sid']
            else:
                in_dict['input_tumor'] = bam_dict[case_id][stype]['file_obj']
                tumor = bam_dict[case_id][stype]['sid']
                in_dict['sample_name'] = tumor
        in_dict['sex'] = sex_dict[case_id]
        in_dict['b_allele'] = b_dict[case_id]
        task_name = 'CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: ' + case_id + ' ' + tumor + ' ' + normal
        # pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id + "CONTAM_ADJUST"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error: " + case_id)

### Production Run on PNOC

In [9]:
project = 'kfdrc-harmonization/sd-bhjxbdqk-09'
app_name = project + '/kfdrc-controlfreec-wf'
manifest = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/cram-manifest.csv')
sex_prediction = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/2019-09-30-pbta-clin-update.tsv')
b_allele = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/germline_calls-manifest.csv')
tasks = api.tasks.query(project=project, status="COMPLETED").all()
prefix = "cbttc-dna-somatic"
suffix = "Reharmonization"
tn_pairs = []
bs_ids = []
for task in tasks:
    if re.search(prefix, task.name) and re.search(suffix, task.name):
        sys.stderr.write("Valid task found: " + task.name + "\n")
        tumor_id = task.inputs['tumor_id']
        normal_id = task.inputs['normal_id']
        tn_pairs.append([tumor_id, normal_id])
        bs_ids.append(tumor_id)
        if normal_id not in bs_ids:
            bs_ids.append(normal_id)

ref_objs = get_cf_refs(api)
head = next(manifest)

cram_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    sample_id = info[12]
    fid = info[0]
    cram_dict[sample_id] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "XY", "Female": "XX"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    bs_id = info[1]
    pred_sex = info[-4]
    ds_sex = info[7]
    if bs_id in bs_ids and info[3] != "Tumor":
        if pred_sex != "NA" and pred_sex == ds_sex:
            sex_dict[bs_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    bs_id = info[11]
    b_dict[bs_id] = api.files.get(info[0])
for pair in tn_pairs:
    (tumor, normal) = (pair[0], pair[1])
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        in_dict['input_normal'] = cram_dict[normal]
        in_dict['input_tumor'] = cram_dict[tumor]
        in_dict['sample_name'] = tumor
        in_dict['sex'] = sex_dict[normal]
        in_dict['b_allele'] = b_dict[normal]
        task_name = 'CONTROLFREEC REHARMONIZATION: ' + tumor + ' ' + normal
        # pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error: " + tumor)

Valid task found: cbttc-dna-somatic-BS_CGXTFM67_BS_6GS4XT7F-Reharmonization
Valid task found: cbttc-dna-somatic-BS_3Z40EZHD_BS_MVYA262V-Reharmonization
Valid task found: cbttc-dna-somatic-BS_JRFVST47_BS_MVYA262V-Reharmonization
Valid task found: cbttc-dna-somatic-BS_J8EK6RNF_BS_HJ7HYZ7N-Reharmonization
Valid task found: cbttc-dna-somatic-BS_M5FM63EB_BS_9H6Z0MEG-Reharmonization
Valid task found: cbttc-dna-somatic-BS_M0B42FPR_BS_9H6Z0MEG-Reharmonization
Valid task found: cbttc-dna-somatic-BS_9P4NDTKJ_BS_9H6Z0MEG-Reharmonization
Valid task found: cbttc-dna-somatic-BS_YZD4SSMA_BS_E5RKHG41-Reharmonization
Valid task found: cbttc-dna-somatic-BS_NNFDFAFM_BS_E5RKHG41-Reharmonization
Valid task found: cbttc-dna-somatic-BS_1MME7FBS_BS_STZ2C71Q-Reharmonization
Valid task found: cbttc-dna-somatic-BS_AHAXPFG3_BS_668350EZ-Reharmonization
Valid task found: cbttc-dna-somatic-BS_HEJ72V3F_BS_668350EZ-Reharmonization
Valid task found: cbttc-dna-somatic-BS_H8NWA41N_BS_Q7R8BT07-Reharmonization
Valid task f

### Test run on PNOC Subset

In [16]:
project = 'brownm28/kfdrc-purity-ploidy-dev'
app_name = project + '/kfdrc-controlfreec-wf'
manifest = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/dev_project_crams-manifest.csv')
sex_prediction = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/2019-09-30-pbta-clin-update.tsv')
b_allele = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/dev_project_germline-manifest.csv')
pair_file = open('/Users/brownm28/Documents/2019-Oct-7_pnoc_cfree/curley_bs_id_pairs.txt')
tn_pairs = []
bs_ids = []
head = next(pair_file)

for line in pair_file:
    (tumor_id, normal_id) = line.rstrip('\n').split('\t')
    tn_pairs.append([tumor_id, normal_id])
    bs_ids.append(tumor_id)
    if normal_id not in bs_ids:
        bs_ids.append(normal_id)

ref_objs = get_cf_refs(api)
head = next(manifest)

cram_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    sample_id = info[12]
    fid = info[0]
    cram_dict[sample_id] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "XY", "Female": "XX"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    bs_id = info[1]
    pred_sex = info[-4]
    ds_sex = info[7]
    if bs_id in bs_ids and info[3] != "Tumor":
        if pred_sex != "NA" and pred_sex == ds_sex:
            sex_dict[bs_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    bs_id = info[11]
    b_dict[bs_id] = api.files.get(info[0])
for pair in tn_pairs:
    (tumor, normal) = (pair[0], pair[1])
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        in_dict['input_normal'] = cram_dict[normal]
        in_dict['input_tumor'] = cram_dict[tumor]
        in_dict['sample_name'] = tumor
        in_dict['sex'] = sex_dict[normal]
        in_dict['b_allele'] = b_dict[normal]
        task_name = 'CFREEC PNOC CONTAM ADJ: ' + tumor + ' ' + normal
        # pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id + '.pnoc_contam_adj'
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error: " + tumor)

## CNVkit Run

In [45]:
def get_ckit_refs(api):
    ref_dict = {}
    ref_dict['annotation_file'] = api.files.query(project=project, names=['refFlat_HG38.txt'])[0]
    ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS"'
    ref_dict['wgs_mode'] = 'Y'
    ref_dict['threads'] = 36
    return ref_dict

In [47]:
app_name = project + '/kfdrc-cnvkit-batch-wf'
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_crams-manifest.csv')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
# case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_case_list.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_ballele-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_ckit_refs(api)
head = next(manifest)
bam_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    stype = info[9]
    sample_id = info[11]
    fid = info[0]
    if case_id not in bam_dict:
        bam_dict[case_id] = {}
    bam_dict[case_id][stype] = {}
    bam_dict[case_id][stype]['sid'] = sample_id
    bam_dict[case_id][stype]['file_obj'] = api.files.get(fid)
    
sex_dict = {}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = pred_sex
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id]:
            if stype == 'Normal':
                in_dict['input_control'] = bam_dict[case_id][stype]['file_obj']
                normal = bam_dict[case_id][stype]['sid']
            else:
                in_dict['input_sample'] = bam_dict[case_id][stype]['file_obj']
                tumor = bam_dict[case_id][stype]['sid']
                in_dict['tumor_sample_name'] = tumor
        in_dict['sex'] = sex_dict[case_id]
        in_dict['b_allele_vcf'] = b_dict[case_id]
        task_name = 'CNVKIT FIRST PASS RERUN: ' + case_id + ' ' + tumor + ' ' + normal
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings={'use_memoization': True}, run=False)
        task.inputs['output_basename'] = task.id + "_FIRST_PASS"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

### tag outputs

In [109]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# task name search phrase
phrase = "PureCN CNVKIT SEG INPUT:"
# modify this to set which input file to use to tag the outputs with, may need to modify code if an array element
in_key = 'input_seg_file'
for task in tasks:
    if re.search(phrase, task.name):
        sys.stderr.write('Valid task found ' + task.name + '\n')
        metadata = task.inputs[in_key].metadata
        for out_key in task.outputs:
            try:
                if type(task.outputs[out_key]) is not list:
                    file_obj = api.files.get(task.outputs[out_key].id)
                    for key in metadata:
                        file_obj.metadata[key] = metadata[key]
                    file_obj.save()
                else:
                    for output in task.outputs[out_key]:
                        file_obj = api.files.get(output.id)
                        for key in metadata:
                            file_obj.metadata[key] = metadata[key]
                        file_obj.save()
            except Exception as e:
                print(e)
                print("Skipping " + task.name + " due to error")

Valid task found PureCN CNVKIT SEG INPUT: PASTGD03 BS_W1R54A2M


'NoneType' object has no attribute 'id'
Skipping PureCN CNVKIT SEG INPUT: PASTGD03 BS_W1R54A2M due to error


Valid task found PureCN CNVKIT SEG INPUT: PASXRJ03 BS_KXRFQF5N


'NoneType' object has no attribute 'id'
Skipping PureCN CNVKIT SEG INPUT: PASXRJ03 BS_KXRFQF5N due to error


Valid task found PureCN CNVKIT SEG INPUT: PASUTC03 BS_B80Z459E
Valid task found PureCN CNVKIT SEG INPUT: PASFKX03 BS_2X9EVKZ0


'NoneType' object has no attribute 'id'
Skipping PureCN CNVKIT SEG INPUT: PASUTC03 BS_B80Z459E due to error
Requested file does not exist.
Skipping PureCN CNVKIT SEG INPUT: PASUTC03 BS_B80Z459E due to error
Requested file does not exist.
Skipping PureCN CNVKIT SEG INPUT: PASUTC03 BS_B80Z459E due to error
'NoneType' object has no attribute 'id'
Skipping PureCN CNVKIT SEG INPUT: PASFKX03 BS_2X9EVKZ0 due to error
Requested file does not exist.
Skipping PureCN CNVKIT SEG INPUT: PASFKX03 BS_2X9EVKZ0 due to error
Requested file does not exist.
Skipping PureCN CNVKIT SEG INPUT: PASFKX03 BS_2X9EVKZ0 due to error


Valid task found PureCN CNVKIT SEG INPUT: PARUTJ03 BS_K2K5YSDS
Valid task found PureCN CNVKIT SEG INPUT: PASCWD03 BS_DWYR5CTE
Valid task found PureCN CNVKIT SEG INPUT: PASGRL03 BS_EQE447QB
Valid task found PureCN CNVKIT SEG INPUT: PASMET03 BS_RA5HNMDP
Valid task found PureCN CNVKIT SEG INPUT: PASUTC03 BS_B80Z459E
Valid task found PureCN CNVKIT SEG INPUT: PARTRP03 BS_4RX1AAVV
Valid task found PureCN CNVKIT SEG INPUT: PASFKX03 BS_2X9EVKZ0
Valid task found PureCN CNVKIT SEG INPUT: PASPGB03 BS_ACCE0MEA
Valid task found PureCN CNVKIT SEG INPUT: PASUMG03 BS_WSK7MH3C
Valid task found PureCN CNVKIT SEG INPUT: PASVCK03 BS_KQHSSRW3


### tag batch outputs

In [3]:
batch_id = '818abb02-b61d-4313-a451-27c5d27cd4e7'
batch_task = api.tasks.get(batch_id)
for task in batch_task.get_batch_children():
    # pdb.set_trace()
    sys.stderr.write('Valid task found ' + task.name + '\n')
    metadata = task.inputs['input_vcf'].metadata
    for out_key in task.outputs:
        try:
            file_obj = api.files.get(task.outputs[out_key].id)
            for key in metadata:
                file_obj.metadata[key] = metadata[key]
            file_obj.save()
        except Exception as e:
            print(e)
            print("Skipping " + task.name + " due to error")

Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 0558259d-187f-4556-8304-806049897818.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 2a6cd08a-09cc-4ee5-8690-a6c2abbc2253.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 08bdf6d1-26b4-4be6-b268-7851a98d6e30.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 2456132b-684c-490f-9d79-c5da945e6338.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 7231ef87-c951-4d7b-88ab-bb596a439135.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 348391d8-1dc2-4280-a59d-1122f3219c7f.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: ccb1acd3-059b-4641-838a-d38255c8d221.postCGP.Gfiltered.vcf.gz
Valid task found bcftools-filter-vcf run - 09-30-19 14:16:32: file: 491538c3-12f7-455c-aef

## Run PureCN

In [112]:
def get_pcn_refs(api):
    ref_dict = {}
    ref_dict['purecn_gc_ref'] = api.files.query(project=project, names=['hg38_PureCN_150bp_gc_file.txt'])[0]
    ref_dict['dbsnp_vcf'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dbsnp138.vcf.gz'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")'
    ref_dict['genome_version'] = 'hg38'
    ref_dict['cores'] = 8
    ref_dict['max_segments'] = 2000
    return ref_dict

In [113]:
app_name = project + '/kfdrc-purecn-wf'
seg_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cnvkit_seg-manifest.csv')
# seg_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cfree_unadjusted_seg-manifest.csv')
# case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_case_list.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
somatic_germline = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_vardict_vcfs-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_pcn_refs(api)
head = next(seg_manifest)
seg_dict = {}
for line in seg_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    sample_id = info[11]
    fid = info[0]
    seg_dict[case_id] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "M", "Female": "F"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(somatic_germline)
for line in somatic_germline:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        seg_obj = seg_dict[case_id]
        tumor_sample_id = seg_obj.metadata['Kids First Biospecimen ID']
        in_dict['input_seg_file'] = seg_obj
        in_dict['tumor_sample_id'] = tumor_sample_id
        in_dict['sex'] = sex_dict[case_id]
        in_dict['somatic_germline_vcf'] = b_dict[case_id]
        task_name = 'PureCN CNVKIT SEG 2K: ' + case_id + ' ' + tumor_sample_id
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings={'use_memoization': True}, run=False)
        task.inputs['output_basename'] = task.id + "_CNVKIT_2KSEG"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

## Run Theta2

In [61]:
app_name = project + '/cnvkit-theta2-wf'
cns_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cns_segment-manifest.csv')
cnn_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cnn_cnvkit-manifest.csv')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
somatic_germline = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_vardict_vcfs-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
head = next(cns_manifest)
cns_dict = {}
for line in cns_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    fid = info[0]
    cns_dict[case_id] = api.files.get(fid)
head = next(cnn_manifest)
cnn_dict = {}
for line in cnn_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    fid = info[0]
    cnn_dict[case_id] = api.files.get(fid)
b_dict = {}
tum_id_list = {}
norm_id_list = {}
head = next(somatic_germline)
for line in somatic_germline:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    norm_id = info[15]
    tum_id = info[18]
    b_dict[case_id] = api.files.get(info[0])
    tum_id_list[case_id] = tum_id
    norm_id_list[case_id] = norm_id
for case_id in case_list:
    try:
        in_dict = {}
        in_dict['include_expression'] = 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")'
        in_dict['tumor_cns'] = cns_dict[case_id]
        in_dict['reference_cnn'] = cnn_dict[case_id]
        in_dict['tumor_ID'] = tum_id_list[case_id]
        in_dict['normal_ID'] = norm_id_list[case_id]
        in_dict['paired_vcf'] = b_dict[case_id]
        task_name = 'Theta2 Run: ' + case_id + ' ' + tumor_sample_id
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

## Gather pngs

In [79]:
maris_project = 'kfdrc-harmonization/sd-dypmehhf-05'
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
qual_tags = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/JR_qual_assign.txt')
find_ext = 'NO_CONTAM_ADJUST.controlfreec.ratio.png$'
name_ext = "_contam_unadjusted.png"
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
#pdb.set_trace()
qual_dict = {}
for line in qual_tags:
    (case_id, qual) = line.rstrip('\n').split('\t')
    qual_dict[case_id] = qual
qual_tags.close()

for case_id in case_list:
    files = api.files.query(project=project, metadata = {'case_id': case_id} )
    for file_obj in files:
        if re.search(find_ext, file_obj.name):
            print("Found valid file " + file_obj.name + " " + file_obj.metadata["Kids First Participant ID"] + " " + file_obj.metadata["sample_type"])
            file_obj.download('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/controlfreec/comparisons/pngs/' + case_id + "_" + qual_dict[case_id] + name_ext)


Found valid file 64ad5224-55a2-4b87-bc01-b4038a917749NO_CONTAM_ADJUST.controlfreec.ratio.png PT_9QQ37AWW Tumor
Found valid file e41f128c-0e82-41dd-82c8-436601cbb9ceNO_CONTAM_ADJUST.controlfreec.ratio.png PT_1EQHANKW Tumor
Found valid file 189e15ee-3d00-4840-b5c7-e222aba63b3cNO_CONTAM_ADJUST.controlfreec.ratio.png PT_TAJJ9MYY Tumor
Found valid file fa7d814d-3912-49a3-82fa-8839955cdf0dNO_CONTAM_ADJUST.controlfreec.ratio.png PT_BXYKW39H Tumor
Found valid file 173cb67f-3f85-4191-8e31-f1ba1cc45073NO_CONTAM_ADJUST.controlfreec.ratio.png PT_53M7K3JE Tumor
Found valid file a1af172e-2f84-433c-9601-92497d9d9ecdNO_CONTAM_ADJUST.controlfreec.ratio.png PT_1YAJEAMJ Tumor
Found valid file 7e0ebe74-52dd-41f1-a711-9d302bb6a157NO_CONTAM_ADJUST.controlfreec.ratio.png PT_R2TRGY6N Tumor
Found valid file ba4fade5-c88b-4087-9dc7-f6e3e11d61aeNO_CONTAM_ADJUST.controlfreec.ratio.png PT_KWRFGRER Tumor
Found valid file 08e87f40-a06d-46ab-93c0-73c31c245c98NO_CONTAM_ADJUST.controlfreec.ratio.png PT_ASH4P45D Tumor
F

## Gather and parse ControlFreeC info

In [83]:
# maris_project = 'kfdrc-harmonization/sd-dypmehhf-05'
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
qual_tags = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/JR_qual_assign.txt')
find_ext = 'CONTAM_ADJUST.controlfreec.info.txt$'
# name_ext = "_contam_unadjusted.png"
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
#pdb.set_trace()
qual_dict = {}
for line in qual_tags:
    (case_id, qual) = line.rstrip('\n').split('\t')
    qual_dict[case_id] = qual
qual_tags.close()
out = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/controlfreec/comparisons/cf_purity_ploidy_out.txt', 'w')
out.write("Case ID\tContam Adjust\tWindow Size\tPloidy\tPurity\n")
for case_id in case_list:
    files = api.files.query(project=project, metadata = {'case_id': case_id} )
    for file_obj in files:
        if re.search(find_ext, file_obj.name):
            print("Found valid file " + file_obj.name + " " + file_obj.metadata["Kids First Participant ID"] + " " + file_obj.metadata["sample_type"])
            data = file_obj.content().split("\n")
            rtype = file_obj.tags[0]
            # 7, 10, 11
            (field, window) = data[7].split('\t')
            (field, ploidy) = data[10].split('\t')
            (field, purity) = data[11].split('\t')
            out.write("\t".join((case_id, rtype, window, ploidy, purity)) + "\n")
out.close()
            


Found valid file 3bfc3ac3-537b-4286-9d9a-845c95d0ba1dCONTAM_ADJUST.controlfreec.info.txt PT_9QQ37AWW Tumor
Found valid file 64ad5224-55a2-4b87-bc01-b4038a917749NO_CONTAM_ADJUST.controlfreec.info.txt PT_9QQ37AWW Tumor
Found valid file e41f128c-0e82-41dd-82c8-436601cbb9ceNO_CONTAM_ADJUST.controlfreec.info.txt PT_1EQHANKW Tumor
Found valid file ebc12216-22e6-41f7-b6e1-f6da22cc0900CONTAM_ADJUST.controlfreec.info.txt PT_1EQHANKW Tumor
Found valid file 189e15ee-3d00-4840-b5c7-e222aba63b3cNO_CONTAM_ADJUST.controlfreec.info.txt PT_TAJJ9MYY Tumor
Found valid file 3bfc8300-0a04-4c90-b6d9-1bc9cc693dacCONTAM_ADJUST.controlfreec.info.txt PT_TAJJ9MYY Tumor
Found valid file a410a847-0ac0-45f7-b971-f7196c0d7725CONTAM_ADJUST.controlfreec.info.txt PT_BXYKW39H Tumor
Found valid file fa7d814d-3912-49a3-82fa-8839955cdf0dNO_CONTAM_ADJUST.controlfreec.info.txt PT_BXYKW39H Tumor
Found valid file 173cb67f-3f85-4191-8e31-f1ba1cc45073NO_CONTAM_ADJUST.controlfreec.info.txt PT_53M7K3JE Tumor
Found valid file 929f5

## Gather and Parse theta2

In [89]:
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
qual_tags = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/JR_qual_assign.txt')
find_ext = '.BEST.results$'
# name_ext = "_contam_unadjusted.png"
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
#pdb.set_trace()
qual_dict = {}
for line in qual_tags:
    (case_id, qual) = line.rstrip('\n').split('\t')
    qual_dict[case_id] = qual
qual_tags.close()
out = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/controlfreec/comparisons/theta2_purity_out.txt', 'w')
out.write("Case ID\tPurity\n")
for case_id in case_list:
    files = api.files.query(project=project, metadata = {'case_id': case_id} )
    for file_obj in files:
        if re.search(find_ext, file_obj.name):
            print("Found valid file " + file_obj.name + " " + file_obj.metadata["Kids First Participant ID"] + " " + file_obj.metadata["sample_type"])
            data = file_obj.content().split("\n")
            purity = data[1].split('\t')[1].split(',')
            n2 = round(float(purity[1]),2)
            if len(purity) > 2:
                n3 = round(float(purity[2]),2)
                purity = str(n2) + " + " + str(n3) + " = " + str(round(n2 + n3,2))
            else:
                purity = str(n2)
            out.write("\t".join((case_id, purity)) + "\n")
out.close()


Found valid file 16364a01-9d3e-4f3b-b5f5-94503226a853.BEST.results PT_9QQ37AWW Tumor
Found valid file 6675d5a6-9836-4e6f-80f2-be217ab73161.BEST.results PT_1EQHANKW Tumor
Found valid file dd6fa34b-66be-4b7e-b92a-175a5f197bb8.BEST.results PT_TAJJ9MYY Tumor
Found valid file b2ca5cec-352a-4698-9bfe-c34126b9cace.BEST.results PT_BXYKW39H Tumor
Found valid file c95e86a5-10e0-46f7-87d6-19b33b51ced0.BEST.results PT_53M7K3JE Tumor
Found valid file 6aab8d01-ca9a-4a04-b93c-b94f44a2a994.BEST.results PT_1YAJEAMJ Tumor
Found valid file 64d9cc2c-646d-4dc4-89a1-d690ddfe5ffd.BEST.results PT_R2TRGY6N Tumor
Found valid file 003daa8a-b59f-488c-8606-64754899e990.BEST.results PT_KWRFGRER Tumor
Found valid file 5b30afc1-1638-4115-bf24-4d6aae519060.BEST.results PT_ASH4P45D Tumor
Found valid file 505f4a0b-8266-4a42-893b-08e12d7a0aa9.BEST.results PT_3WF5J3PZ Tumor
Found valid file e1b44d7b-83f4-475e-90d0-a55e1196c376.BEST.results PT_JF62ZBX8 Tumor
Found valid file 8b529177-3b35-4f44-adc9-bc3c4edd1613.BEST.result

## Get insert size metrics of crams

In [106]:
def rounded_str(val):
    return str(round(float(val), 2))

In [108]:
germ_origin = 'kfdrc-harmonization/sd-dypmehhf'
tumor_origin= 'kfdrc-harmonization/sd-dypmehhf-03/'

case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
qual_tags = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/JR_qual_assign.txt')

qual_dict = {}
for line in qual_tags:
    (case_id, qual) = line.rstrip('\n').split('\t')
    qual_dict[case_id] = qual
qual_tags.close()

case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
out = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/insert_size_metrics.txt', 'w')
out.write("Case ID\tSample Type\tJR Assess\tPT ID\tBS ID\tMedian Insert Size\tMean Insert Size\tStd Dev Insert Size\tMean Coverage\tStd Dev Coverage\n")
res_dict = {}
row = 7
for case_id in case_list:
    res_dict[case_id] = {}
    stype = "Tumor"
    res_dict[case_id][stype] = {}
    tumor_files = api.files.query(project=tumor_origin, metadata = {'case_id': case_id} )
    for file_obj in tumor_files:
        if re.search(".cram$", file_obj.name):
            # pdb.set_trace()
            ids = file_obj.metadata["Kids First Participant ID"] + "\t" + file_obj.metadata["Kids First Biospecimen ID"]
            res_dict[case_id][stype]['ids'] = ids
            task = api.tasks.get(file_obj.name.split('.')[0])
            wgs = api.files.get(task.outputs['wgs_metrics'].id)
            wgs_data = wgs.content().split('\n')
            wgs_info = wgs_data[row].split('\t')
            res_dict[case_id][stype]['xcov'] = rounded_str(wgs_info[1])
            res_dict[case_id][stype]['scov'] = rounded_str(wgs_info[2])

            for out_file in task.outputs['aggregation_metrics']:
                if re.search(".insert_size_metrics$", out_file.name):
                    ins = api.files.get(out_file.id)
                    ins_data = ins.content().split('\n')
                    ins_info = ins_data[row].split('\t')
                    res_dict[case_id][stype]['mins'] = rounded_str(ins_info[0])
                    res_dict[case_id][stype]['xins'] = rounded_str(ins_info[5])
                    res_dict[case_id][stype]['sins'] = rounded_str(ins_info[6])
    stype = "Normal"
    res_dict[case_id][stype] = {}
    normal_files = api.files.query(project=germ_origin, metadata = {'case_id': case_id} )
    for file_obj in normal_files:
        if re.search(".cram$", file_obj.name):
            ids = file_obj.metadata["Kids First Participant ID"] + "\t" + file_obj.metadata["Kids First Biospecimen ID"]
            res_dict[case_id][stype]['ids'] = ids
            task = api.tasks.get(file_obj.name.split('.')[0])
            wgs = api.files.get(task.outputs['wgs_metrics'].id)
            wgs_data = wgs.content().split('\n')
            wgs_info = wgs_data[row].split('\t')
            res_dict[case_id][stype]['xcov'] = rounded_str(wgs_info[1])
            res_dict[case_id][stype]['scov'] = rounded_str(wgs_info[2])

            for out_file in task.outputs['aggregation_metrics']:
                if re.search(".insert_size_metrics$", out_file.name):
                    ins = api.files.get(out_file.id)
                    ins_data = ins.content().split('\n')
                    ins_info = ins_data[row].split('\t')
                    res_dict[case_id][stype]['mins'] = rounded_str(ins_info[0])
                    res_dict[case_id][stype]['xins'] = rounded_str(ins_info[5])
                    res_dict[case_id][stype]['sins'] = rounded_str(ins_info[6])
for case_id in case_list:
    for stype in res_dict[case_id]:
        cur = res_dict[case_id][stype]
        try:
            out.write("\t".join([case_id, stype, qual_dict[case_id], cur['ids'], cur['mins'], cur['xins'], cur['sins'], cur['xcov'], cur['scov']]) + '\n')
        except Exception as e:
            print (e)
            pdb.set_trace()
            hold =1
            
out.close()

## Compare Array to CFREE

In [None]:
import sys
seg_file = open(sys.argv[1])
qual_tags = open(sys.argv[2])
manifest = open(sys.argv[3])
out_pre = sys.argv[4]
flag = int(sys.argv[5])
qual_dict = {}
for line in qual_tags:
    (case_id, qual) = line.rstrip('\n').split('\t')
    qual_dict[case_id.replace("03","")] = qual
qual_tags.close()

array_calls = {}
loss_lrr = -0.18
gain_lrr = 0.3
seg_out = open("seg_w_annot.seg", "w")
seg_out.write("Sample\tChromosome\tStart\tEnd\tNum_markers\tSeg_CN\tMB_call")
for line in seg_file:
    (sample, chrom, start, end, p_ct, lrr) = line.rstrip('\n').split('\t')
    status = "neutral"
    if sample not in array_calls:
        array_calls[sample] = {}
    if chrom not in array_calls[sample]:
            array_calls[sample][chrom] = []
    if (float(lrr) <= loss_lrr or float(lrr) >= gain_lrr):
        status = "loss"
        if float(lrr) > 0:
            status = "gain"
    array_calls[sample][chrom].append([int(start) + 1, int(end), status])
    seg_out.write(line.rstrip('\n') + "\t" + status + "\n")
seg_file.close()

out_annot = open(out_pre + "cfree_annotated_calls.txt", "w")
if flag == 0:
    out_annot.write("Case ID\tQual Tag\tAnnot\tchr\tstart\tend\tcopy number\tstatus\tgenotype\tuncertainty\tWilcoxonRankSumTestPvalue\tKolmogorovSmirnovPvalue\n")
else:
    out_annot.write("Case ID\tQual Tag\tAnnot\tchr\tstart\tend\tcopy number\tstatus\tWilcoxonRankSumTestPvalue\tKolmogorovSmirnovPvalue\n")
head = next(manifest)
for metadata in manifest:
    finfo = metadata.rstrip('\n').split(',')
    case_id = finfo[-2].replace("03","")
    flag = 0
    pval_fh = open(finfo[1])
    head = next(pval_fh)
    header = head.split('\t')

    for data in pval_fh:
        out_annot.write(case_id + "\t" + qual_dict[case_id])
        cnv = data.split('\t')
        (chrom, start, end, status) = (cnv[0], cnv[1], cnv[2], cnv[4])
        if case_id in array_calls and chrom in array_calls[case_id]:
            f = 0
            for coords in array_calls[case_id][chrom]:
                if int(start) <= coords[1] and int(end) >= coords[0]:
                    f = 1
                    if status == coords[2]:
                        out_annot.write("\tTP\t" + data)
                    else:
                        out_annot.write("\tMM\t" + data)
                    break
            if f == 0:
                out_annot.write("\tFP\t" + data)
        else:
            out_annot.write("\tFP\t" + data)

        

## Length Filter Ratio Files

In [7]:
import sys
pval_concat_file = open(sys.argv[1])
info_manifest = open(sys.argv[2])
ratio_manifest = open(sys.argv[3])
len_min = int(sys.argv[4])
out_pre = sys.argv[5]
head = next(pval_concat_file)
coords = {}
failed = {}
for line in pval_concat_file:
    info = line.rstrip('\n').split('\t')
    (case_id, chrom, start, end) = (info[0], info[3], int(info[4]), int(info[5]))
    case_id += "03"
    if case_id not in coords:
        coords[case_id] = {}
        failed[case_id] = 0
    if chrom not in coords[case_id]:
        coords[case_id][chrom] = []
    if end - start >= len_min or chrom == "Y":
        coords[case_id][chrom].append([start, end])
    else:
        failed[case_id] += 1
ploidy_dict = {}
head = next(info_manifest)
ploidy_fh = open(out_pre + "_cfree_ploidy.txt", "w")
for line in info_manifest:
    metadata = line.rstrip('\n').split(',')
    case_id = metadata[-2]
    cur = open(metadata[1])
    for i in range(0, 10, 1):
        skip = next(cur)
    ploidy_value = next(cur)
    val = ploidy_value.rstrip('\n').split('\t')[1]
    ploidy_dict[case_id] = val
    ploidy_fh.write(case_id + "\t" + val + "\n")
    cur.close()
sys.stderr.write("Case ID\tSource\tFailed GT " + str(len_min) + " CT\n")
for case_id in failed:
    sys.stderr.write(case_id + "\tP Val File\t" + str(failed[case_id]) + '\n')

head = next(ratio_manifest)
draw_cmd = open(out_pre + "_" + str(len_min) + "_cfree_draw.sh", "w")
for line in ratio_manifest:
    metadata = line.rstrip('\n').split(',')
    case_id = metadata[-2]
    new_ratio_fname = out_pre + "_" + case_id + "_" + str(len_min) + ".ratio.txt"
    filt_ratio = open(new_ratio_fname, "w")
    draw_cmd.write("cat makeGraph.R | R --slave --args " + ploidy_dict[case_id] + " " + new_ratio_fname + "\n")
    cur = open(metadata[1])
    head = next(cur)
    filt_ratio.write(head)
    neutral = "1\t1\t" + ploidy_dict[case_id] + "\t-1\t2\t-\t-1"
    fail_ct = 0
    for data in cur:
        datum = data.rstrip('\n').split('\t')
        (chrom, pos) = (datum[0], datum[1])
        flag = 0
        if case_id in coords and chrom in coords[case_id]:
            for entry in coords[case_id][chrom]:
                if int(pos) >= entry[0] and int(pos) <= entry[1] or chrom == "Y":
                    flag = 1
                    filt_ratio.write(data)
                    break
                # end early in searching beyond end of coordinate
                elif int(pos) > entry[1]:
                    break
            if flag == 0:
                filt_ratio.write("\t".join([chrom, pos, neutral]) + '\n')
                fail_ct += 1
        else:
            filt_ratio.write("\t".join([chrom, pos, neutral]) + '\n')
            fail_ct += 1
    filt_ratio.close()
    sys.stderr.write(case_id + "\t Ratio file\t" + str(fail_ct) + '\n')
draw_cmd.close()

SyntaxError: invalid syntax (<ipython-input-7-4d00e633637a>, line 48)

## Add GUI Tag to outputs

In [12]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# task name search phrase
phrase = "SINGLE GENOTYPE GATK"
tags = ['GATK', 'GERMLINE', 'SNPEFF']
# modify this to set which input file to use to tag the outputs with, may need to modify code if an array element
out_keys = ['snpeff_vcf']
for task in tasks:
    if re.search(phrase, task.name):
        sys.stderr.write('Valid task found ' + task.name + '\n')
        for out_key in out_keys:
            try:
                if type(task.outputs[out_key]) is not list:
                    file_obj = api.files.get(task.outputs[out_key].id)
                    file_obj.tags=tags
                    file_obj.save()
                else:
                    for output in task.outputs[out_key]:
                        file_obj = api.files.get(output.id)
                        for key in metadata:
                            file_obj.metadata[key] = metadata[key]
                        file_obj.save()
            except Exception as e:
                print(e)
                print("Skipping " + task.name + " due to error")

Valid task found SINGLE GENOTYPE GATK: PT_NK8A49X5 BS_668350EZ
Valid task found SINGLE GENOTYPE GATK: PT_MNSEJCDM BS_29YQSB5E
Valid task found SINGLE GENOTYPE GATK: PT_KBFM551M BS_1GWZCWVG
Valid task found SINGLE GENOTYPE GATK: PT_M23Q0DC3 BS_6GS4XT7F
Valid task found SINGLE GENOTYPE GATK: PT_M9XXJ4GR BS_9TSKXKGH
Valid task found SINGLE GENOTYPE GATK: PT_KBFM551M BS_9H6Z0MEG
Valid task found SINGLE GENOTYPE GATK: PT_HGM20MW7 BS_NY9MPC8F
Valid task found SINGLE GENOTYPE GATK: PT_1E3E6GMF BS_BKCPNFZ5
Valid task found SINGLE GENOTYPE GATK: PT_0MXPTTM3 BS_Z370T42N
Valid task found SINGLE GENOTYPE GATK: PT_KZ56XHJT BS_3PNWA7WT
Valid task found SINGLE GENOTYPE GATK: PT_KBFM551M BS_HJ7HYZ7N
Valid task found SINGLE GENOTYPE GATK: PT_KZ56XHJT BS_Q7R8BT07
Valid task found SINGLE GENOTYPE GATK: PT_V1HNAC2Q BS_E5RKHG41
Valid task found SINGLE GENOTYPE GATK: PT_QA9WJ679 BS_QPSQPDR8
Valid task found SINGLE GENOTYPE GATK: PT_KTRJ8TFY BS_SNRF1RKC
Valid task found SINGLE GENOTYPE GATK: PT_WGVEF96B BS_3

## Combined Controlfreec/CNKkit Run

In [12]:
def get_refs(api):
    ref_dict = {}
    ref_dict['annotation_file'] = api.files.query(project=project, names=['refFlat_HG38.txt'])[0]
    ref_dict['indexed_reference_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['combined_include_expression'] = 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")'
    ref_dict['wgs_mode'] = 'Y'
    ref_dict['threads'] = 16
    ref_dict['chr_len'] = api.files.query(project=project, names=['hs38_chr.len'])[0]
    ref_dict['reference_fai'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta.fai'])[0]
    ref_dict['coeff_var'] = 0.05
    ref_dict['mate_orientation_control'] = "FR"
    ref_dict['mate_orientation_sample'] = "FR"
    ref_dict['ploidy'] = [2,3,4]
    ref_dict['contamination_adjustment'] = False

    return ref_dict

In [13]:
app_name = project + '/kfdrc-combined-somatic-wgs-cnv-wf'
paired_vcf = api.files.get('5d83e9aee4b0e4c53a247ec8')
b_allele = api.files.get('5db0b8c5e4b0950c6e16e760')
ref_objs = get_refs(api)
in_dict = {}
for key in ref_objs:
    in_dict[key] = ref_objs[key]
in_dict['paired_vcf'] = paired_vcf
in_dict['b_allele'] = b_allele
in_dict['cfree_sex'] = "XY"
in_dict['cnvkit_sex'] = "Male"
in_dict["input_normal_name"] = "BS_2TZNPK1V"
in_dict["input_tumor_name"] = "BS_R3WB0PP7"
in_dict['input_tumor_aligned'] = api.files.get('5d83e922e4b0e4c53a247c38')
in_dict['input_normal_aligned'] = api.files.get('5d83e922e4b0e4c53a247c35')
in_dict['output_basename'] = 'COMBINED_CNV_SUBSET_TEST'
task_name = "COMBINED_CNV_SUBSET_TEST"
task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
task.save()

<Task: id=e3ec7d2c-c6f1-4798-85c3-c9f795d243d0>