# CNV Benchmarking
This notebook is mostly for file copying to set up benchmarking tasks

In [1]:
#!/usr/bin/env python3
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper

import sys
import re
import os
import pdb
from requests import request
from datetime import datetime
# Build the Seven Bridges client configuration from the local profile named 'turbo'.
config = sbg.Config(profile='turbo')
# Shared API client used by the cells below; the two imported handlers are
# installed as error handlers (rate limiting and maintenance windows).
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])

## Get crams and create a manifest

In [2]:
def cp_obj(obj, new_proj, cp_path):
    """
    Copy a file object into another project and move the copy into a folder.

    obj: file object exposing ``name`` and ``copy(project=..., name=...)``.
    new_proj: destination project passed to ``copy``.
    cp_path: destination folder passed as ``parent`` to ``move_to_folder``.

    Returns whatever ``move_to_folder`` returns for the copied file.
    """
    file_name = obj.name
    # Step 1: copy into the destination project under the same name.
    project_copy = obj.copy(project=new_proj, name=file_name)
    # Step 2: relocate the copy into the target folder, again keeping the name.
    return project_copy.move_to_folder(parent=cp_path, name=file_name)

In [13]:
def find_file(fname, project):
    """
    Look up a file by exact name in a project via the shared ``api`` client.

    Returns the first match of the name query. If the query or the indexing
    raises for any reason, a message is written to stderr and the integer 1
    is returned instead of a file object (callers must check for that).
    """
    try:
        matches = api.files.query(project=project, names=[fname])
        return matches[0]
    except Exception:
        sys.stderr.write("Could not find " + fname + "\n")
        return 1

In [None]:
def build_str_array(tum_id, tum_obj, norm_id, norm_name, norm_info, cp_project, cp_folder_obj):
    """
    Copy the tumor file and assemble one manifest row as a list of strings.

    The row is: tumor ID, copied tumor file id, copied tumor file name,
    normal ID, normal file id (looked up in ``norm_info`` by ``norm_name``),
    normal file name. A missing ``norm_name`` key raises KeyError.
    """
    copied_tum = cp_obj(tum_obj, cp_project, cp_folder_obj)
    # Normals were copied earlier, so their id comes from the name -> id lookup.
    return [
        tum_id,
        copied_tum.id,
        copied_tum.name,
        norm_id,
        norm_info[norm_name],
        norm_name,
    ]


In [None]:
def get_norm_info(cp_folder_obj):
    """
    Build a ``{file name: file id}`` lookup for every file in a folder.

    cp_folder_obj: folder object whose ``list_files()`` returns a paged
        collection exposing ``total``, ``len()`` and ``next_page()``.

    Returns a dict covering all pages. The listing is fetched once, and paging
    continues until ``total`` entries have been seen, so a final page holding
    a single file is included.
    """
    page = cp_folder_obj.list_files()
    total = page.total
    norm_dict = {}
    seen = 0
    while True:
        for obj in page:
            norm_dict[obj.name] = obj.id
        seen += len(page)
        # Stop once everything is seen; an empty page also stops the loop so a
        # stale ``total`` cannot cause endless paging.
        if seen >= total or len(page) == 0:
            break
        page = page.next_page()
    return norm_dict


In [None]:
# Copy tumor CRAM/CRAI files for the test set into the benchmarking folder and
# record tumor/normal file pairs in cramifest.tsv (two rows per sample: CRAM, then CRAI).
src_project = 'kfdrc-harmonization/sd-dypmehhf-05'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_path = 'somatic-benchmarking/maris_array_compare'
cp_folder_obj = api.files.get('61562156be6b86435184baae')

# bs_ids maps column 0 of the test-set manifest to column 1; only the keys are
# used below, to select tasks by tumor name.
bs_ids = {}
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/kfnbl_paired_snp_array_intersect_histologies_test_set.txt') as manifest:
    head = next(manifest)  # skip the header row
    for line in manifest:
        info = line.rstrip('\n').split('\t')
        bs_ids[info[0]] = info[1]

tasks = api.tasks.query(project=src_project, status="COMPLETED").all()
# Normal files already live in the destination folder; look them up by name.
norm_info = get_norm_info(cp_folder_obj)

# The context manager closes (and flushes) the manifest even if a copy fails part-way.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cramifest.tsv', 'w') as cramifest:
    cramifest.write('tum_bs_id\ttumor_file_id\ttumor_file_name\tnorm_bs_id\tnormal_file_id\tnormal_file_name\n')
    for task in tasks:
        if re.search("Maris-dna-somatic", task.name) and task.inputs['input_tumor_name'] in bs_ids:
            tum_id = task.inputs['input_tumor_name']
            norm_id = task.inputs['input_normal_name']
            tum_cram = task.inputs['input_tumor_aligned']
            # norm files were already copied, just going to find them instead
            norm_cram_name = task.inputs['input_normal_aligned'].name
            print_array = build_str_array(tum_id, tum_cram, norm_id, norm_cram_name, norm_info, cp_project, cp_folder_obj)
            cramifest.write("\t".join(print_array) + "\n")
            # NOTE: find_file returns 1 when the index is missing, which makes
            # the copy below raise.
            tum_crai = find_file(tum_cram.name + '.crai', src_project)
            norm_crai_name = norm_cram_name + '.crai'
            print_array = build_str_array(tum_id, tum_crai, norm_id, norm_crai_name, norm_info, cp_project, cp_folder_obj)
            cramifest.write("\t".join(print_array) + "\n")
            sys.stderr.write('Copied cram and crai for ' + tum_id + ' ' + norm_id + '\n')
        

## Copy over b allele files

In [None]:
# Copy each normal sample's germline VCF (and its .tbi index) into the
# benchmarking folder and record the copies in ballele_manifest.tsv
# (two rows per VCF: the VCF, then its index).

# Escaped and anchored so only the VCF itself matches; an unanchored pattern
# also matches '<name>.vcf.gz.tbi', which would then trigger a lookup for
# '<name>.vcf.gz.tbi.tbi'.
pattern = r'\.postCGP\.Gfiltered\.vcf\.gz$'
norm_bs_ids = {}
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cramifest.tsv') as cram_manifest:
    head = next(cram_manifest)
    header = head.rstrip('\n').split('\t')
    n_idx = header.index('norm_bs_id')
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        # dict used as an ordered set of normal biospecimen IDs
        norm_bs_ids[info[n_idx]] = 0
src_project = 'kfdrc-harmonization/sd-dypmehhf-05'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_path = 'somatic-benchmarking/maris_array_compare'
cp_folder_obj = api.files.get('61562156be6b86435184baae')

with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv', 'w') as b_allele_manifest:
    b_allele_manifest.write('bs_id\tfile_id\tfile_name\n')
    for bs_id in norm_bs_ids:
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID': bs_id} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                b_allele_manifest.write("\t".join([bs_id, copied.id, copied.name]) + "\n")
                # The index is looked up in the source project, so use the
                # source file's name rather than the copy's.
                tbi_name = obj.name + '.tbi'
                # NOTE: find_file returns 1 when the index is missing, which
                # makes the copy below raise.
                tbi = find_file(tbi_name, src_project)
                cp_tbi = cp_obj(tbi, cp_project, cp_folder_obj)
                b_allele_manifest.write("\t".join([bs_id, cp_tbi.id, cp_tbi.name]) + "\n")
                sys.stderr.write('Copied vcf and tbi for ' + bs_id + '\n')

## Task set up

In [None]:
def get_refs(project):
    """
    Collect the reference inputs for the GATK CNV workflow from ``project``.

    Each file-valued input is the first hit of a by-name query against the
    shared ``api`` client (IndexError if a name has no match). The boolean
    ``run_funcotatesegments`` flag is added last. Returns the dict keyed by
    workflow input name.
    """
    ref_file_names = {
        'count_panel_of_normals': 'marris_male_pon.cnv.pon.hdf5',
        'funcotator_data_sources_tgz': 'funcotator_dataSources.v1.6.20190124s.tar.gz',
        'input_exclude_interval_list': 'somatic-hg38_CNV_and_centromere_blacklist.hg38liftover.list',
        'input_interval_list': 'wgs_calling_regions.hg38.interval_list',
        'reference_dict': 'Homo_sapiens_assembly38.dict',
        'reference_fasta': 'Homo_sapiens_assembly38.fasta',
    }
    ref_objs = {}
    for input_name, file_name in ref_file_names.items():
        ref_objs[input_name] = api.files.query(project=project, names=[file_name])[0]
    ref_objs['run_funcotatesegments'] = True
    return ref_objs
    

In [None]:
def draft_tasks(project, input_list, ref_dict, app_name):
    """
    Create (without running) one GATK CNV somatic-pair task.

    The task inputs are ``ref_dict`` overlaid with ``input_list`` (sample
    inputs win on key clashes). The task name is prefixed with the participant
    ID read from the tumor input's metadata. Returns None.
    """
    merged_inputs = {ref_key: ref_dict[ref_key] for ref_key in ref_dict}
    participant = input_list['input_aligned_reads_tumor'].metadata['Kids First Participant ID']
    merged_inputs.update(input_list)
    api.tasks.create(
        name=participant + " KFDRC GATK CNV Somatic Pair Workflow",
        project=project,
        app=app_name,
        inputs=merged_inputs,
        run=False,
    )



In [None]:
# Draft one GATK CNV task per male tumor sample listed in cramifest.tsv, adding
# the matching b-allele VCF from ballele_manifest.tsv as 'common_sites'.
project = 'danmiller/kf-gatk-cnv-dev'
app_name = project + '/kf_cnv_somatic_pair_wf'
ref_dict = get_refs(project)
inputs = {}
norm_ids = {}  # normal bs_id -> tumor bs_id, filled for the selected samples only

# Both manifests hold two rows per sample (data file, then index); only the
# first row of each pair is used here. `with` ensures the handles are closed.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cramifest.tsv') as cram_manifest:
    head = next(cram_manifest)
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        tum_cram = api.files.get(info[1])
        # for male-only cohort
        if tum_cram.metadata['gender'] == 'Male':
            inputs[info[0]] = {}
            norm_ids[info[3]] = info[0]
            inputs[info[0]]['input_aligned_reads_tumor'] = tum_cram
            # second-to-last column is the normal file id
            inputs[info[0]]['input_aligned_reads_normal'] = api.files.get(info[-2])
        skip = next(cram_manifest)  # discard the index row

with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv') as b_allele_manifest:
    head = next(b_allele_manifest)
    for line in b_allele_manifest:
        info = line.rstrip('\n').split('\t')
        if info[0] in norm_ids:
            inputs[norm_ids[info[0]]]['common_sites'] = api.files.get(info[-2])
        skip = next(b_allele_manifest)  # discard the .tbi row

for bs_id in inputs:
    draft_tasks(project, inputs[bs_id], ref_dict, app_name)


## Run Draft Tasks

In [None]:
# Run every draft task in the project whose name matches `pattern`.
project = 'danmiller/kf-gatk-cnv-dev'
# pattern = "\w+ KFDRC GATK CNV Somatic Pair Workflow"
pattern = "KFDRC CNVkit Theta2"
# .all() walks every result page (a bare query only yields the first page).
# The list is materialized up front because running a task removes it from the
# DRAFT result set, which could shift later pages while iterating lazily.
tasks = list(api.tasks.query(project=project, status = "DRAFT").all())
for task in tasks:
    if re.search(pattern, task.name):
        task.run()

# Set up CNVkit run

## Copy over cnn files for cnvkit

In [None]:
# Copy the CNVkit reference .cnn for each normal sample listed in
# ballele_manifest.tsv and record the copies in cnn_manifest.tsv (one row per sample).
src_project = "kfdrc-harmonization/sd-dypmehhf-ad-hoc-caller-rerun"
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_path = 'somatic-benchmarking/maris_array_compare'
cp_folder_obj = api.files.get('61562156be6b86435184baae')
pattern = "_cnvkit_reference.cnn"

# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv') as b_allele_manifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cnn_manifest.tsv', 'w') as cnn_manifest:
    cnn_manifest.write('bs_id\tfile_id\tfile_name\n')
    head = next(b_allele_manifest)
    for line in b_allele_manifest:
        info = line.rstrip('\n').split('\t')
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID Normal': info[0]} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                cnn_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                sys.stderr.write('Copied CNN file for ' + info[0] + '\n')
                break  # one reference per sample is enough
        # the b-allele manifest has two rows per sample; discard the .tbi row
        skip = next(b_allele_manifest)

## Copy over vardict files

In [None]:
# Copy the merged VarDict VCF (and its .tbi index) for each normal sample in
# ballele_manifest.tsv and record the copies in vardict_manifest.tsv
# (two rows per sample: the VCF, then its index).
src_project = "kfdrc-harmonization/sd-dypmehhf-ad-hoc-caller-rerun"
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_path = 'somatic-benchmarking/maris_array_compare'
cp_folder_obj = api.files.get('61562156be6b86435184baae')
# Escaped and anchored so the '.tbi' index can never be picked up as the VCF
# (the unanchored pattern matches both and the loop stops at the first hit).
pattern = r"\.vardict_somatic\.merged\.vcf\.gz$"

# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv') as b_allele_manifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/vardict_manifest.tsv', 'w') as vardict_manifest:
    vardict_manifest.write('bs_id\tfile_id\tfile_name\n')
    head = next(b_allele_manifest)
    for line in b_allele_manifest:
        info = line.rstrip('\n').split('\t')
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID Normal': info[0]} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                vardict_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                # The index is looked up in the source project, so use the
                # source file's name rather than the copy's.
                tbi_name = obj.name + '.tbi'
                # NOTE: find_file returns 1 when the index is missing, which
                # makes the copy below raise.
                tbi_obj = find_file(tbi_name, src_project)
                copied = cp_obj(tbi_obj, cp_project, cp_folder_obj)
                vardict_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")

                sys.stderr.write('Copied vardict file for ' + info[0] + '\n')
                break
        # the b-allele manifest has two rows per sample; discard the .tbi row
        skip = next(b_allele_manifest)


## Draft tasks for CNVkit WF

In [None]:
def get_cnvkit_refs(project):
    """
    Collect the fixed inputs for the CNVkit/Theta2 workflow from ``project``.

    File-valued entries are the first hit of a by-name query against the
    shared ``api`` client (IndexError if a name has no match); the remaining
    entries are literal option values. Returns the dict keyed by workflow
    input name.
    """
    def first_match(file_name):
        # First hit of an exact-name query in the given project.
        return api.files.query(project=project, names=[file_name])[0]

    return {
        'cnvkit_annotation_file': first_match('refFlat_HG38.txt'),
        'reference_fai': first_match('Homo_sapiens_assembly38.fasta.fai'),
        'cnvkit_wgs_mode': 'Y',
        'reference_dict': first_match('Homo_sapiens_assembly38.dict'),
        'reference_fasta': first_match('Homo_sapiens_assembly38.fasta'),
        'i_flag': 'N',
        'wgs_or_wxs': 'WGS',
        'combined_include_expression': 'FILTER="PASS" && (INFO/STATUS="Germline" | INFO/STATUS="StrongSomatic")',
    }


In [None]:
def draft_cnvkit_tasks(project, input_list, ref_dict, app_name):
    """
    Create (without running) one CNVkit/Theta2 task.

    The task inputs are ``ref_dict`` overlaid with ``input_list``. The task
    name is suffixed with the participant ID read from the tumor input's
    metadata. After creation, ``output_basename`` is set to the new task's id
    and the task is saved. Returns None.
    """
    merged_inputs = {ref_key: ref_dict[ref_key] for ref_key in ref_dict}
    participant = input_list['input_tumor_aligned'].metadata['Kids First Participant ID']
    merged_inputs.update(input_list)
    draft = api.tasks.create(
        name="KFDRC CNVkit Theta2: " + participant,
        project=project,
        app=app_name,
        inputs=merged_inputs,
        run=False,
    )
    # The basename can only be filled in once the task id exists.
    draft.inputs['output_basename'] = draft.id
    draft.save()


In [None]:
# Draft one CNVkit/Theta2 task per tumor sample in cramifest.tsv, attaching the
# b-allele VCF + index, CNVkit reference .cnn and VarDict VCF from their manifests.

# Defined here so the cell does not depend on `project` left over from an earlier cell.
project = 'danmiller/kf-gatk-cnv-dev'
app_name = project + '/kfdrc-production-cnvkit-theta2-wf'
ref_dict = get_cnvkit_refs(project)
inputs = {}
norm_ids = {}  # normal bs_id -> tumor bs_id

# Each manifest is opened with `with` so its handle is closed after use.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cramifest.tsv') as cram_manifest:
    head = next(cram_manifest)
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        tum_cram = api.files.get(info[1])
        inputs[info[0]] = {}
        norm_ids[info[3]] = info[0]
        inputs[info[0]]['input_normal_name'] = info[3]
        inputs[info[0]]['input_tumor_name'] = info[0]
        inputs[info[0]]['input_tumor_aligned'] = tum_cram
        # second-to-last column is the normal file id
        inputs[info[0]]['input_normal_aligned'] = api.files.get(info[-2])
        if tum_cram.metadata['gender'] == 'Male':
            inputs[info[0]]['cnvkit_sex'] = 'y'
        else:
            inputs[info[0]]['cnvkit_sex'] = 'x'
        skip = next(cram_manifest)  # discard the .crai row
# process b allele inputs
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv') as b_allele_manifest:
    head = next(b_allele_manifest)
    for line in b_allele_manifest:
        info = line.rstrip('\n').split('\t')
        if info[0] in norm_ids:
            inputs[norm_ids[info[0]]]['b_allele'] = api.files.get(info[-2])
        # rows come in pairs: the VCF row is followed by its index row
        index = next(b_allele_manifest)
        info = index.rstrip('\n').split('\t')
        if info[0] in norm_ids:
            inputs[norm_ids[info[0]]]['b_allele_index'] = api.files.get(info[-2])
# process cnn inputs
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/cnn_manifest.tsv') as cnn_manifest:
    head = next(cnn_manifest)
    for line in cnn_manifest:
        info = line.rstrip('\n').split('\t')
        if info[0] in norm_ids:
            inputs[norm_ids[info[0]]]['cnvkit_cnn'] = api.files.get(info[-2])
# process vardict inputs
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/vardict_manifest.tsv') as vardict_manifest:
    head = next(vardict_manifest)
    for line in vardict_manifest:
        info = line.rstrip('\n').split('\t')
        if info[0] in norm_ids:
            inputs[norm_ids[info[0]]]['vardict_prepass_vcf'] = api.files.get(info[-2])
        skip = next(vardict_manifest)  # discard the .tbi row

for bs_id in inputs:
    draft_cnvkit_tasks(project, inputs[bs_id], ref_dict, app_name)


## Copy over controlfreec results

In [None]:
# Copy the ControlFreeC p-value CNV table for each normal sample listed in
# ballele_manifest.tsv and record the copies in controlfreec_value_manifest.tsv.
pattern = ".controlfreec.CNVs.p.value.txt"
src_project = 'kfdrc-harmonization/sd-dypmehhf-05'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_folder_obj = api.files.get('5e9753d8e4b0efd83ca85b24')

# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/ballele_manifest.tsv') as b_allele_manifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/controlfreec_value_manifest.tsv', 'w') as cfree_manifest:
    cfree_manifest.write('bs_id\tfile_id\tfile_name\n')
    head = next(b_allele_manifest)
    for line in b_allele_manifest:
        info = line.rstrip('\n').split('\t')
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID Normal': info[0]} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                cfree_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                sys.stderr.write('Copied cfree file for ' + info[0] + '\n')
                break
        # the b-allele manifest has two rows per sample; discard the .tbi row
        skip = next(b_allele_manifest)



### Add metadata and tag NEW outputs by task

In [5]:
def add_meta_to_outputs(task, tags, phrase, in_key, tn_dict):
    """
    Copy metadata from one task input onto the task's output files.

    task: task object exposing ``name``, ``id``, ``inputs`` and ``outputs``.
    tags: list of tags to set on each updated file (skipped when falsy).
    phrase: regex searched in ``task.name``; non-matching tasks are ignored.
    in_key: name of the input whose metadata is the template.
    tn_dict: optional ``{input name: metadata key}`` map; for each entry the
        input's 'Kids First Biospecimen ID' is stored under that metadata key.

    Only output files with no metadata yet are modified. Returns 0 when the
    task was processed and None when it was ignored. Any unexpected error is
    printed and the interpreter is asked to exit with status 1.
    """
    def _annotate(file_obj, file_meta):
        # Leave files that already carry metadata untouched.
        if file_obj.metadata is None or len(file_obj.metadata) == 0:
            file_obj.metadata = file_meta
            if tags:
                file_obj.tags = tags
            file_obj.save()

    try:
        if in_key not in task.inputs or not re.search(phrase, task.name):
            return None

        template = task.inputs[in_key].metadata
        metadata = {meta_key: template[meta_key] for meta_key in template}
        # Add special KF IDs by tumor/normal if defined
        if tn_dict:
            for input_name in tn_dict:
                metadata[tn_dict[input_name]] = task.inputs[input_name].metadata['Kids First Biospecimen ID']

        for out_name in task.outputs:
            produced = task.outputs[out_name]
            if isinstance(produced, list):
                # File-array output: a failed fetch abandons the rest of this array.
                for out_file in produced:
                    try:
                        out_file_obj = api.files.get(out_file.id)
                    except Exception as e:
                        print (e)
                        print ("Error getting file output for " + task.name + " " + task.id + " skipping!")
                        break
                    _annotate(out_file_obj, metadata)
            else:
                # Single output: any failure (fetch or save) is printed and skipped.
                try:
                    _annotate(api.files.get(produced.id), metadata)
                except Exception as e:
                    print(e)

        return 0
    except Exception as e:
        print (e)
        print ("Got an error processing " + task.name + " " + task.id)
        exit(1)


In [7]:
# Annotate and tag the outputs of every completed GATK CNV task in the project.
project = 'danmiller/kf-gatk-cnv-dev'
pattern = 'KFDRC GATK CNV'
in_key = 'input_aligned_reads_tumor'
tags = ['GATK', 'WGS']
# Task input name -> metadata key that receives that input's biospecimen ID.
tn_dict = {
    in_key: 'Kids First Biospecimen ID Tumor',
    'input_aligned_reads_normal': 'Kids First Biospecimen ID Normal',
}
tasks = api.tasks.query(project=project, status="COMPLETED").all()
for task in tasks:
    add_meta_to_outputs(task, tags, pattern, in_key, tn_dict)

# WXS Processing

## Copy over WXS crams

In [11]:
# Copy the WXS tumor/normal CRAM + CRAI files named in the test-set manifest
# into the benchmarking folder and record them in wxs_cramifest.tsv
# (two rows per pair: CRAMs, then CRAIs).
src_project = 'd3b-bixu-ops/open-target-target-somatic-mutations-wxs-tumor'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_path = 'somatic-benchmarking/maris_array_compare'
cp_folder_obj = api.files.get('61562156be6b86435184baae')


def _copy_cram_and_index(bs_id, from_project, to_project, to_folder):
    """
    Copy the first CRAM and the first CRAI tagged with ``bs_id``.

    Returns a dict with up to two keys, 'cram' and 'crai', each holding
    "<source file id>\\t<file name>". Files are tracked by kind, so a second
    CRAM cannot stand in for a missing CRAI.
    """
    found = {}
    candidates = api.files.query(project=from_project, metadata={'Kids First Biospecimen ID': bs_id}).all()
    for file_obj in candidates:
        if re.search("cram$", file_obj.name):
            kind = 'cram'
        elif re.search("crai$", file_obj.name):
            kind = 'crai'
        else:
            continue
        if kind in found:
            continue
        # NOTE(review): this records the SOURCE file id, not the id of the copy
        # (the WGS cramifest records the copy's id) - confirm this is intended.
        found[kind] = file_obj.id + "\t" + file_obj.name
        cp_obj(file_obj, to_project, to_folder)
        if len(found) == 2:
            break
    return found


# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_cramifest.tsv', 'w') as cramifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/CNV_SNP_Array_testset.txt') as manifest:
    cramifest.write('tum_bs_id\ttumor_file_id\ttumor_file_name\tnorm_bs_id\tnormal_file_id\tnormal_file_name\n')
    head = next(manifest)
    for line in manifest:
        info = line.rstrip('\n').split('\t')
        (tum_id, norm_id) = (info[0], info[1])
        info_dict = {}
        complete = True
        for bs_id in (tum_id, norm_id):
            info_dict[bs_id] = _copy_cram_and_index(bs_id, src_project, cp_project, cp_folder_obj)
            if len(info_dict[bs_id]) != 2:
                sys.stderr.write('Only found ' + str(len(info_dict[bs_id])) + ' for ' + bs_id + '\n')
                complete = False
        if not complete:
            # Report and move on instead of dropping into the debugger; an
            # incomplete pair cannot be written as a full manifest row.
            continue

        cramifest.write("\t".join([tum_id, info_dict[tum_id]['cram'], norm_id, info_dict[norm_id]['cram']]) + "\n")
        cramifest.write("\t".join([tum_id, info_dict[tum_id]['crai'], norm_id, info_dict[norm_id]['crai']]) + "\n")

        sys.stderr.write('Copied cram and crai for ' + tum_id + ' ' + norm_id + '\n')


Copied cram and crai for TARGET-30-PAKXDZ-01A-01W TARGET-30-PAKXDZ-10A-01W
Copied cram and crai for TARGET-30-PANUKV-01A-01W TARGET-30-PANUKV-10A-01W
Copied cram and crai for TARGET-30-PAMMWD-01A-01W TARGET-30-PAMMWD-10A-01W
Copied cram and crai for TARGET-30-PAMMXF-01A-01W TARGET-30-PAMMXF-10A-01W
Copied cram and crai for TARGET-30-PAKHHB-01A-01W TARGET-30-PAKHHB-10A-01W
Copied cram and crai for TARGET-30-PATDXG-01A-01D TARGET-30-PATDXG-10A-01D
Copied cram and crai for TARGET-30-PANIPC-06A-01W TARGET-30-PANIPC-10A-01W
Copied cram and crai for TARGET-30-PATDWN-01A-01D TARGET-30-PATDWN-10A-01D
Copied cram and crai for TARGET-30-PAHYWC-01A-01W TARGET-30-PAHYWC-10A-01W
Copied cram and crai for TARGET-30-PASWFB-01A-01D TARGET-30-PASWFB-10A-01D
Copied cram and crai for TARGET-30-PAMDAL-01A-01W TARGET-30-PAMDAL-10A-01W
Copied cram and crai for TARGET-30-PAPSMC-01A-01W TARGET-30-PAPSMC-10A-01W
Copied cram and crai for TARGET-30-PAMUTD-01A-01W TARGET-30-PAMUTD-10A-01W
Copied cram and crai for 

## Copy over wxs b allele files

In [16]:
# Copy each WXS normal sample's germline VCF (and its .tbi index) into the
# benchmarking folder and record the copies in wxs_ballele_manifest.tsv
# (two rows per sample: the VCF, then its index).
pattern = '.postCGP.Gfiltered.vcf.gz$'
norm_bs_ids = {}
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_cramifest.tsv') as cram_manifest:
    head = next(cram_manifest)
    header = head.rstrip('\n').split('\t')
    n_idx = header.index('norm_bs_id')
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        # dict used as an ordered set of normal biospecimen IDs
        norm_bs_ids[info[n_idx]] = 0
src_project = 'd3b-bixu-ops/open-target-target-somatic-mutations-wxs-tumor'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_folder_obj = api.files.get('61562156be6b86435184baae')

# `with` closes the manifest even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_ballele_manifest.tsv', 'w') as b_allele_manifest:
    b_allele_manifest.write('bs_id\tfile_id\tfile_name\n')
    for bs_id in norm_bs_ids:
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID': bs_id} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                b_allele_manifest.write("\t".join([bs_id, copied.id, copied.name]) + "\n")
                # The index is looked up in the source project, so use the
                # source file's name rather than the copy's.
                tbi_name = obj.name + '.tbi'
                # NOTE: find_file returns 1 when the index is missing, which
                # makes the copy below raise.
                tbi = find_file(tbi_name, src_project)
                cp_tbi = cp_obj(tbi, cp_project, cp_folder_obj)
                b_allele_manifest.write("\t".join([bs_id, cp_tbi.id, cp_tbi.name]) + "\n")
                sys.stderr.write('Copied vcf and tbi for ' + bs_id + '\n')
                break

Copied vcf and tbi for TARGET-30-PAKXDZ-10A-01W
Copied vcf and tbi for TARGET-30-PANUKV-10A-01W
Copied vcf and tbi for TARGET-30-PAMMWD-10A-01W
Copied vcf and tbi for TARGET-30-PAMMXF-10A-01W
Copied vcf and tbi for TARGET-30-PAKHHB-10A-01W
Copied vcf and tbi for TARGET-30-PATDXG-10A-01D
Copied vcf and tbi for TARGET-30-PANIPC-10A-01W
Copied vcf and tbi for TARGET-30-PATDWN-10A-01D
Copied vcf and tbi for TARGET-30-PAHYWC-10A-01W
Copied vcf and tbi for TARGET-30-PASWFB-10A-01D
Copied vcf and tbi for TARGET-30-PAMDAL-10A-01W
Copied vcf and tbi for TARGET-30-PAPSMC-10A-01W
Copied vcf and tbi for TARGET-30-PAMUTD-10A-01W
Copied vcf and tbi for TARGET-30-PAMBAC-14A-01W
Copied vcf and tbi for TARGET-30-PAPCTS-10A-01D
Copied vcf and tbi for TARGET-30-PAKHCF-10A-01W
Copied vcf and tbi for TARGET-30-PAMYCE-10A-01W
Copied vcf and tbi for TARGET-30-PALWVJ-10A-01W
Copied vcf and tbi for TARGET-30-PAKFUY-10A-01W
Copied vcf and tbi for TARGET-30-PAIXIF-10A-01W
Copied vcf and tbi for TARGET-30-PASMNT-

## Copy over processed files
CNVkit and ControlFreec Results already exist

### Copy ControlFreeC Files

In [17]:
# Copy the ControlFreeC p-value CNV table for each WXS tumor sample listed in
# wxs_cramifest.tsv and record the copies in wxs_controlfreec_value_manifest.tsv.
pattern = ".controlfreec.CNVs.p.value.txt"
src_project = 'd3b-bixu-ops/open-target-target-somatic-mutations-wxs-tumor'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_folder_obj = api.files.get('5e9753d8e4b0efd83ca85b24')

# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_cramifest.tsv') as cram_manifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_controlfreec_value_manifest.tsv', 'w') as cfree_manifest:
    cfree_manifest.write('bs_id\tfile_id\tfile_name\n')
    head = next(cram_manifest)
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        # .all() walks every result page rather than only the first one.
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID Tumor': info[0]} ).all()
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                cfree_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                sys.stderr.write('Copied cfree file for ' + info[0] + '\n')
                break
        # the cram manifest has two rows per sample; discard the .crai row
        skip = next(cram_manifest)


Copied cfree file for TARGET-30-PAKXDZ-01A-01W
Copied cfree file for TARGET-30-PANUKV-01A-01W
Copied cfree file for TARGET-30-PAMMWD-01A-01W
Copied cfree file for TARGET-30-PAMMXF-01A-01W
Copied cfree file for TARGET-30-PAKHHB-01A-01W
Copied cfree file for TARGET-30-PATDXG-01A-01D
Copied cfree file for TARGET-30-PANIPC-06A-01W
Copied cfree file for TARGET-30-PATDWN-01A-01D
Copied cfree file for TARGET-30-PAHYWC-01A-01W
Copied cfree file for TARGET-30-PASWFB-01A-01D
Copied cfree file for TARGET-30-PAMDAL-01A-01W
Copied cfree file for TARGET-30-PAPSMC-01A-01W
Copied cfree file for TARGET-30-PAMUTD-01A-01W
Copied cfree file for TARGET-30-PAMBAC-01A-01W
Copied cfree file for TARGET-30-PAPCTS-01A-01D
Copied cfree file for TARGET-30-PAKHCF-01A-01W
Copied cfree file for TARGET-30-PAMYCE-01A-01W
Copied cfree file for TARGET-30-PALWVJ-01A-01W
Copied cfree file for TARGET-30-PAKFUY-01A-01W
Copied cfree file for TARGET-30-PAIXIF-01A-01W
Copied cfree file for TARGET-30-PASMNT-01A-01D
Copied cfree 

### Copy CNVkit Files

In [20]:
# Copy CNVkit results for each WXS tumor sample listed in wxs_cramifest.tsv,
# preferring the Theta2 total .cns and falling back on the CNVkit .call.cns;
# the copies are recorded in wxs_cnvkit_value_manifest.tsv.
pattern = ".theta2.total.cns"
alt = ".call.cns"
src_project = 'd3b-bixu-ops/open-target-target-somatic-mutations-wxs-tumor'
cp_project = 'danmiller/kf-gatk-cnv-dev'
cp_folder_obj = api.files.get('5e9753d8e4b0efd83ca85b24')

# `with` closes both manifests even if a copy fails part-way through.
with open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_cramifest.tsv') as cram_manifest, \
        open('/Users/brownm28/Documents/2021-Sep-30_CNV_benchmarking/wxs_cnvkit_value_manifest.tsv', 'w') as cnvkit_manifest:
    cnvkit_manifest.write('bs_id\tfile_id\tfile_name\n')
    head = next(cram_manifest)
    for line in cram_manifest:
        info = line.rstrip('\n').split('\t')
        related = api.files.query(project=src_project, metadata = {'Kids First Biospecimen ID Tumor': info[0]} ).all()
        found = 0
        alt_obj = None
        for obj in related:
            if re.search(pattern, obj.name):
                copied = cp_obj(obj, cp_project, cp_folder_obj)
                cnvkit_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                sys.stderr.write('Copied cnvkit file for ' + info[0] + '\n')
                found = 1
                break
            elif re.search(alt, obj.name):
                # remember the fallback in case no Theta2 file turns up
                alt_obj = obj
        if not found:
            if alt_obj is None:
                # Neither file exists: report and move on instead of passing
                # None to cp_obj, which would raise.
                sys.stderr.write('Neither Theta2 nor CNVkit call results found for ' + info[0] + '. Skipping\n')
            else:
                sys.stderr.write('Theta2 results not found for ' + info[0] + '. Falling back on CNVkit results\n')
                copied = cp_obj(alt_obj, cp_project, cp_folder_obj)
                cnvkit_manifest.write("\t".join([info[0], copied.id, copied.name]) + "\n")
                sys.stderr.write('Copied cnvkit file for ' + info[0] + '\n')

        # the cram manifest has two rows per sample; discard the .crai row
        skip = next(cram_manifest)


Copied cnvkit file for TARGET-30-PAKXDZ-01A-01W
Copied cnvkit file for TARGET-30-PANUKV-01A-01W
Copied cnvkit file for TARGET-30-PAMMWD-01A-01W
Copied cnvkit file for TARGET-30-PAMMXF-01A-01W
Copied cnvkit file for TARGET-30-PAKHHB-01A-01W
Copied cnvkit file for TARGET-30-PATDXG-01A-01D
Copied cnvkit file for TARGET-30-PANIPC-06A-01W
Copied cnvkit file for TARGET-30-PATDWN-01A-01D
Copied cnvkit file for TARGET-30-PAHYWC-01A-01W
Copied cnvkit file for TARGET-30-PASWFB-01A-01D
Copied cnvkit file for TARGET-30-PAMDAL-01A-01W
Copied cnvkit file for TARGET-30-PAPSMC-01A-01W
Copied cnvkit file for TARGET-30-PAMUTD-01A-01W
Copied cnvkit file for TARGET-30-PAMBAC-01A-01W
Copied cnvkit file for TARGET-30-PAPCTS-01A-01D
Copied cnvkit file for TARGET-30-PAKHCF-01A-01W
Copied cnvkit file for TARGET-30-PAMYCE-01A-01W
Copied cnvkit file for TARGET-30-PALWVJ-01A-01W
Copied cnvkit file for TARGET-30-PAKFUY-01A-01W
Copied cnvkit file for TARGET-30-PAIXIF-01A-01W
Copied cnvkit file for TARGET-30-PASMNT-