In [64]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
config = sbg.Config(profile='turbo')
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper], advance_access=True)
project = 'brownm28/mb-controlfreec-troubleshoot'



## Copy files

In [53]:
germ_origin = 'kfdrc-harmonization/sd-dypmehhf'
tumor_origin= 'kfdrc-harmonization/sd-dypmehhf-03/'
somatic_origin= 'kfdrc-harmonization/sd-dypmehhf-05'
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')

case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
for case_id in case_list:
    files = api.files.query(project=somatic_origin, metadata = {'case_id': case_id} )
    for file_obj in files:
        if re.search("vardict.merged.vcf.gz$|vardict.merged.vcf.gz.tbi$", file_obj.name):
            print("Found valid file " + file_obj.name + " " + file_obj.metadata["Kids First Participant ID"] + " " + file_obj.metadata["sample_type"])
            file_obj.copy(project=project)
            print ("Copied file over")


Found valid file a04d1bd7-3451-40ec-a99c-1d5d13bf8a39.vardict.merged.vcf.gz PT_9QQ37AWW Tumor
Copied file over
Found valid file a04d1bd7-3451-40ec-a99c-1d5d13bf8a39.vardict.merged.vcf.gz.tbi PT_9QQ37AWW Tumor
Copied file over
Found valid file d8ccc539-5463-4217-937e-1c958dadabcd.vardict.merged.vcf.gz PT_1EQHANKW Tumor
Copied file over
Found valid file d8ccc539-5463-4217-937e-1c958dadabcd.vardict.merged.vcf.gz.tbi PT_1EQHANKW Tumor
Copied file over
Found valid file b3bccc8a-261f-4be5-978c-34c5e1075177.vardict.merged.vcf.gz PT_TAJJ9MYY Tumor
Copied file over
Found valid file b3bccc8a-261f-4be5-978c-34c5e1075177.vardict.merged.vcf.gz.tbi PT_TAJJ9MYY Tumor
Copied file over
Found valid file 48aa6252-e617-49de-a9b7-7d1af74de326.vardict.merged.vcf.gz PT_BXYKW39H Tumor
Copied file over
Found valid file 48aa6252-e617-49de-a9b7-7d1af74de326.vardict.merged.vcf.gz.tbi PT_BXYKW39H Tumor
Copied file over
Found valid file 27e72734-c076-423f-a196-6a17e524989f.vardict.merged.vcf.gz PT_53M7K3JE Tumor
Co

## HC Run

In [28]:
def get_refs(api):
    ref_dict = {}
    ref_dict['axiomPoly_resource_vcf'] = api.files.query(project=project, names=['Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz'])[0]
    ref_dict['reference_dict'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dict'])[0]
    ref_dict['dbsnp_vcf'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dbsnp138.vcf'])[0]
    ref_dict['hapmap_resource_vcf'] = api.files.query(project=project, names=['hapmap_3.3.hg38.vcf.gz'])[0]
    ref_dict['mills_resource_vcf'] = api.files.query(project=project, names=['Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'])[0]
    ref_dict['omni_resource_vcf'] = api.files.query(project=project, names=['1000G_omni2.5.hg38.vcf.gz'])[0]
    ref_dict['one_thousand_genomes_resource_vcf'] = api.files.query(project=project, names=['1000G_phase1.snps.high_confidence.hg38.vcf.gz'])[0]
    #ref_dict['ref_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['reference_fasta'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    #ref_dict['ref_tar_gz'] = api.files.query(project=project, names=['hg38_snpeff.tgz'])[0]
    ref_dict['unpadded_intervals_file'] = api.files.query(project=project, names=['hg38.even.handcurated.20k.intervals'])[0]
    ref_dict['wgs_evaluation_interval_list'] = api.files.query(project=project, names=['wgs_evaluation_regions.hg38.interval_list'])[0]
    ref_dict['snp_sites'] = api.files.query(project=project, names=['1000G_phase3_v4_20130502.sites.hg38.vcf'])[0]
    return ref_dict

In [29]:
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_gvcf_test-manifest.csv')
head = next(manifest)
ref_obj = get_refs(api)
# app_name = project + "/kf-single-genotype/0"
app_name = project + "/kfdrc-single-genotype-basic"
for line in manifest:
    info = line.rstrip('\n').split(',')
    in_dict = {}
    for key in ref_obj:
        in_dict[key] = ref_obj[key]
    in_dict['input_vcfs'] = [api.files.get(info[0])]
    task_name = "SINGLE GENOTYPE GATK: " + info[6] + " " + info[11] + " " + info[-2]
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings = {'use_memoization': True}, run=False)
    task.inputs['output_basename'] = task.id
    
    task.save()

### expand view

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

## ControlFreeC Run

In [32]:
def get_cf_refs(api):
    ref_dict = {}
    ref_dict['chr_len'] = api.files.query(project=project, names=['hs38_chr.len'])[0]
    ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['reference_fai'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta.fai'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS"'
    ref_dict['coeff_var'] = 0.05
    ref_dict['mate_orientation_control'] = "FR"
    ref_dict['mate_orientation_sample'] = "FR"
    ref_dict['ploidy'] = [2,3,4]
    ref_dict['threads'] = 16
    ref_dict['contamination_adjustment'] = "TRUE"
    return ref_dict

In [33]:
app_name = project + '/kfdrc-controlfreec-wf'
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_crams-manifest.csv')
# case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_case_list.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
# b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_ballele-manifest.csv')
b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_vcf-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_cf_refs(api)
head = next(manifest)

bam_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    stype = info[9]
    sample_id = info[11]
    fid = info[0]
    if case_id not in bam_dict:
        bam_dict[case_id] = {}
    bam_dict[case_id][stype] = {}
    bam_dict[case_id][stype]['sid'] = sample_id
    bam_dict[case_id][stype]['file_obj'] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "XY", "Female": "XX"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id]:
            if stype == 'Normal':
                in_dict['input_normal'] = bam_dict[case_id][stype]['file_obj']
                normal = bam_dict[case_id][stype]['sid']
            else:
                in_dict['input_tumor'] = bam_dict[case_id][stype]['file_obj']
                tumor = bam_dict[case_id][stype]['sid']
                in_dict['sample_name'] = tumor
        in_dict['sex'] = sex_dict[case_id]
        in_dict['b_allele'] = b_dict[case_id]
        task_name = 'CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: ' + case_id + ' ' + tumor + ' ' + normal
        # pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id + "CONTAM_ADJUST"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error: " + case_id)

## CNVkit Run

In [45]:
def get_ckit_refs(api):
    ref_dict = {}
    ref_dict['annotation_file'] = api.files.query(project=project, names=['refFlat_HG38.txt'])[0]
    ref_dict['reference'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.fasta'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS"'
    ref_dict['wgs_mode'] = 'Y'
    ref_dict['threads'] = 36
    return ref_dict

In [47]:
app_name = project + '/kfdrc-cnvkit-batch-wf'
manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_crams-manifest.csv')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
# case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/missed_case_list.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
b_allele = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_ballele-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_ckit_refs(api)
head = next(manifest)
bam_dict = {}
for line in manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    stype = info[9]
    sample_id = info[11]
    fid = info[0]
    if case_id not in bam_dict:
        bam_dict[case_id] = {}
    bam_dict[case_id][stype] = {}
    bam_dict[case_id][stype]['sid'] = sample_id
    bam_dict[case_id][stype]['file_obj'] = api.files.get(fid)
    
sex_dict = {}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = pred_sex
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(b_allele)
for line in b_allele:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        tumor = ''
        normal = ''
        for stype in bam_dict[case_id]:
            if stype == 'Normal':
                in_dict['input_control'] = bam_dict[case_id][stype]['file_obj']
                normal = bam_dict[case_id][stype]['sid']
            else:
                in_dict['input_sample'] = bam_dict[case_id][stype]['file_obj']
                tumor = bam_dict[case_id][stype]['sid']
                in_dict['tumor_sample_name'] = tumor
        in_dict['sex'] = sex_dict[case_id]
        in_dict['b_allele_vcf'] = b_dict[case_id]
        task_name = 'CNVKIT FIRST PASS RERUN: ' + case_id + ' ' + tumor + ' ' + normal
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings={'use_memoization': True}, run=False)
        task.inputs['output_basename'] = task.id + "_FIRST_PASS"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

### tag outputs

In [51]:
tasks = api.tasks.query(project=project, status="COMPLETED").all()
# task name search phrase
phrase = "CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4"
# modify this to set which input file to use to tag the outputs with, may need to modify code if an array element
in_key = 'input_tumor'
for task in tasks:
    if re.search(phrase, task.name):
        sys.stderr.write('Valid task found ' + task.name + '\n')
        metadata = task.inputs[in_key].metadata
        for out_key in task.outputs:
            try:
                if type(task.outputs[out_key]) is not list:
                    file_obj = api.files.get(task.outputs[out_key].id)
                    for key in metadata:
                        file_obj.metadata[key] = metadata[key]
                    file_obj.save()
                else:
                    for output in task.outputs[out_key]:
                        file_obj = api.files.get(output.id)
                        for key in metadata:
                            file_obj.metadata[key] = metadata[key]
                        file_obj.save()
            except Exception as e:
                print(e)
                print("Skipping " + task.name + " due to error")

Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PARUTJ03 BS_K2K5YSDS BS_E88G2GGG
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASCWD03 BS_DWYR5CTE BS_WJWC3WV7
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASTGD03 BS_W1R54A2M BS_3JR1MGPE
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASXRJ03 BS_KXRFQF5N BS_C12CV8CF
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASMET03 BS_RA5HNMDP BS_NVCBHA84
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASUTC03 BS_B80Z459E BS_2M61TD7H
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PARTRP03 BS_4RX1AAVV BS_0PQA0GGY
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASFKX03 BS_2X9EVKZ0 BS_PEGDA8G2
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASPGB03 BS_ACCE0MEA BS_DBV7S78S
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASUMG03 BS_WSK7MH3C BS_RD4H0EJ2
Valid task found CONTROLFREEC NO CONTAM ADJUST PLOIDY 2-4: PASVCK03 BS_KQHSSRW3 

## Run PureCN

In [65]:
def get_pcn_refs(api):
    ref_dict = {}
    ref_dict['purecn_gc_ref'] = api.files.query(project=project, names=['hg38_PureCN_150bp_gc_file.txt'])[0]
    ref_dict['dbsnp_vcf'] = api.files.query(project=project, names=['Homo_sapiens_assembly38.dbsnp138.vcf.gz'])[0]
    ref_dict['include_expression'] = 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")'
    ref_dict['genome_version'] = 'hg38'
    ref_dict['cores'] = 16
    return ref_dict

In [66]:
app_name = project + '/kfdrc-purecn-wf'
seg_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cnvkit_seg-manifest.csv')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
sex_prediction = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_sex_info_w_case_id.txt')
somatic_germline = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_vardict_vcfs-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
ref_objs = get_pcn_refs(api)
head = next(seg_manifest)
seg_dict = {}
for line in seg_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    sample_id = info[11]
    fid = info[0]
    seg_dict[case_id] = api.files.get(fid)
    
sex_dict = {}
s_trans = {"Male": "M", "Female": "F"}
head = next(sex_prediction)
for line in sex_prediction:
    info = line.rstrip('\n').split('\t')
    case_id = info[4]
    pred_sex = info[-1]
    ds_sex = info[1]
    if case_id in case_list:
        if pred_sex != "Unknown" and pred_sex == ds_sex:
            sex_dict[case_id] = s_trans[pred_sex]
        else:
            sys.stderr.write("Warn, prediction for " + case_id + " was inconclusive.  Default to reported sex\n")
            sex_dict[case_id] = s_trans[ds_sex]
b_dict = {}
head = next(somatic_germline)
for line in somatic_germline:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    b_dict[case_id] = api.files.get(info[0])
for case_id in case_list:
    try:
        in_dict = {}
        for key in ref_objs:
            in_dict[key] = ref_objs[key]
        seg_obj = seg_dict[case_id]
        tumor_sample_id = seg_obj.metadata['Kids First Biospecimen ID']
        in_dict['input_seg_file'] = seg_obj
        in_dict['tumor_sample_id'] = tumor_sample_id
        in_dict['sex'] = sex_dict[case_id]
        in_dict['somatic_germline_vcf'] = b_dict[case_id]
        task_name = 'PureCN CNVKIT SEG INPUT: ' + case_id + ' ' + tumor_sample_id
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, execution_settings={'use_memoization': True}, run=False)
        task.inputs['output_basename'] = task.id + "_CNVKIT_SEG"
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

## Run Theta2

In [61]:
app_name = project + '/cnvkit-theta2-wf'
cns_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cns_segment-manifest.csv')
cnn_manifest = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/cnn_cnvkit-manifest.csv')
case_id_list = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_test_set.txt')
somatic_germline = open('/Users/brownm28/Documents/2019-Aug-23_cnv_ts/benchmark_run/maris_vardict_vcfs-manifest.csv')
case_list = []
for line in case_id_list:
    case_list.append(line.rstrip('\n'))
case_id_list.close()
head = next(cns_manifest)
cns_dict = {}
for line in cns_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    fid = info[0]
    cns_dict[case_id] = api.files.get(fid)
head = next(cnn_manifest)
cnn_dict = {}
for line in cnn_manifest:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    fid = info[0]
    cnn_dict[case_id] = api.files.get(fid)
b_dict = {}
tum_id_list = {}
norm_id_list = {}
head = next(somatic_germline)
for line in somatic_germline:
    info = line.rstrip('\n').split(',')
    case_id = info[-2]
    norm_id = info[15]
    tum_id = info[18]
    b_dict[case_id] = api.files.get(info[0])
    tum_id_list[case_id] = tum_id
    norm_id_list[case_id] = norm_id
for case_id in case_list:
    try:
        in_dict = {}
        in_dict['include_expression'] = 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")'
        in_dict['tumor_cns'] = cns_dict[case_id]
        in_dict['reference_cnn'] = cnn_dict[case_id]
        in_dict['tumor_ID'] = tum_id_list[case_id]
        in_dict['normal_ID'] = norm_id_list[case_id]
        in_dict['paired_vcf'] = b_dict[case_id]
        task_name = 'Theta2 Run: ' + case_id + ' ' + tumor_sample_id
        #pdb.set_trace()
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=in_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        print (e)
        print ("Skipping due to error:" + case_id)

In [67]:
task = api.tasks.get('799d3c7f-73dc-4624-904e-e7aec4e98999')
pdb.set_trace()
hold =1 

--Return--
> <ipython-input-67-a4844147d0b8>(2)<module>()->None
-> pdb.set_trace()
(Pdb) dir(task)
(Pdb) p task.stats
*** AttributeError: 'Task' object has no attribute 'stats'
(Pdb) p task.status
'COMPLETED'
(Pdb) p task.inputs
{'output_basename': '003daa8a-b59f-488c-8606-64754899e990', 'min_frac': 0.01, 'include_expression': 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")', 'tumor_cns': <File: id=5d8aca66e4b0950c40282304>, 'tumor_ID': 'BS_4RX1AAVV', 'normal_ID': 'BS_0PQA0GGY', 'paired_vcf': <File: id=5d8ba611e4b097679c333f34>, 'reference_cnn': <File: id=5d8aca66e4b0950c40282312>}
(Pdb) p dir(task.inputs['reference_cnn'])
['FOLDER_TYPE', '_API', '_URL', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__

BdbQuit: 