# Normalize, Annotate
This is a notebook to run the patch workflow to bring existing VCFs to spec

In [1]:
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
config = sbg.Config(profile='turbo')
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])

In [None]:
def get_refs(api):
    ref_dict = {}
    ref_dict['bcftools_annot_vcf'] = api.files.query(project=project, names=["af-only-gnomad.hg38.vcf.gz"])[0]
    ref_dict['bcftools_public_filter'] = 'FILTER="PASS"|INFO/HotSpotAllele=1'
    ref_dict['bcftools_annot_columns'] = "INFO/AF"
    ref_dict['disable_hotspot_annotation'] = False
    ref_dict['gatk_filter_name'] = ["NORM_DP_LOW", "GNOMAD_AF_HIGH"]
    ref_dict['vep_cache'] = api.files.query(project=project, names=["homo_sapiens_vep_93_GRCh38.tar.gz"])[0]
    ref_dict['genomic_hotspots'] = [api.files.query(project=project, names=["tert.bed"])[0]]
    ref_dict['protein_snv_hotspots'] = [api.files.query(project=project, names=["protein_snv_cancer_hotspots_v2.tsv"])[0]]
    ref_dict['protein_indel_hotspots'] = [api.files.query(project=project, names=["protein_indel_cancer_hotspots_v2.tsv"])[0]]
    ref_dict['indexed_reference_fasta'] = api.files.query(project=project, names=["Homo_sapiens_assembly38.fasta"])[0]
    return ref_dict

In [None]:
def draft_task(ref_dict, tool_name, retain_info, vcf_obj, short_name, tumor_id, normal_id, project):
    try:
        input_dict = {}
        for key in ref_dict:
            input_dict[key] = ref_dict[key]
        # pdb.set_trace()
        if tool_name == "strelka2_somatic":
            input_dict['add_common_fields'] = True
        else:
            input_dict['add_common_fields'] = False
        input_dict['input_vcf'] = vcf_obj
        input_dict['gatk_filter_expression'] = ["vc.getGenotype('" + normal_id + "').getDP() <= 7","AF > 0.001"]
        input_dict['tool_name'] = tool_name
        input_dict['retain_info'] = retain_info
        input_dict['input_tumor_name'] = tumor_id
        input_dict['input_normal_name'] = normal_id
        task_name = "KFDRC NORM ANNOT PATCH RPT: " + short_name + " " + tool_name + " " + tumor_id + " " + normal_id
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=input_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        print(e)
        print("had a problem drafting a task for " + tumor_id)
    

In [None]:
def parse_manifest(entry):
    try:
        info = entry.rstrip('\n').split(',')
        in_vcf = api.files.get(info[0])
        tool_name = ''
        # use keys to search file name for key word, then standardize tool name
        for key in tool_name_dict:
            if re.search(key, info[1]):
                tool_name = tool_name_dict[key]
                break
        retain_info = retain_info_dict[tool_name]
        draft_task(ref_dict, tool_name, retain_info, in_vcf, short_name, info[t_idx], info[n_idx], project)
    except Exception as e:
        print(e)
        print("had a problem with parsing manifest for " + entry)
        exit(1)

In [None]:
# comment out project, vcf_input and short_name vars not used
pnoc_project = "kfdrc-harmonization/sd-8y99qzjj-ad-hoc-caller-rerun"
pnoc_vcf_input = open('/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/vardict_rpt/pnoc_vardict_vcfs-manifest.csv')
pnoc_short_name = "PNOC"

# cbtn_project = "kfdrc-harmonization/sd-bhjxbdqk-ad-hoc-caller-rerun"
# cbtn_vcf_input = open('/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/vardict_rpt/cbtn_vardict_vcfs-manifest.csv')
# cbtn_short_name = "CBTN"

# tcga_project = "kfdrc-harmonization/openpbta-tcga-ad-hoc-caller-rerun"
# tcga_vcf_input = open('/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/tcga_input_vcf-manifest.csv')
# tcga_short_name = "TCGA"

# set these vars to the ones uncommented and to be used in the process
project = pnoc_project
vcf_input = pnoc_vcf_input
short_name = pnoc_short_name

app_name = project + "/kfdrc-norm-annot-wf"
tool_name_dict = {'strelka': 'strelka2_somatic', 'mutect': 'mutect2_somatic',
                  'lancet': 'lancet_somatic', 'vardict': 'vardict_somatic'}
# use previous result to set helpful info output for maf fields
retain_info_dict = {'strelka2_somatic': 'MQ,MQ0,QSI,HotSpotAllele',
                   'mutect2_somatic': 'MBQ,TLOD,HotSpotAllele',
                   'lancet_somatic':'MS,FETS,HotSpotAllele',
                   'vardict_somatic': 'MSI,MSILEN,SOR,SSF,HotSpotAllele'}
ref_dict = get_refs(api)
head = next(vcf_input)
header = head.rstrip('\n').split(',')
# get tumor and normal ID positions - commented because TCGA is slightly different
t_idx = header.index('Kids First Biospecimen ID Tumor')
n_idx = header.index('Kids First Biospecimen ID Normal')
# t_idx = header.index('bam_tumor_id')
# n_idx = header.index('bam_normal_id')
i = 1
n = 100

with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(parse_manifest, line): line for line in vcf_input}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks created\n')
        i += 1

# for line in vcf_input:
#     parse_manifest(line)



## Survey tasks from TCGA project

In [None]:
def parse_task(task):
    try:
        if 'input_tumor_aligned' in task.inputs:
            return [task.id, task.name, str(task.end_time.date()), str(task.end_time.time()),
                    task.inputs["input_tumor_aligned"].name,
                    task.inputs["input_tumor_aligned"].tags[0], task.app]
        else:
            return None
    except Exception as e:
        print (e)
        sys.exit(1)
    

In [None]:
project = "cavatica/openpbta-tcga"
tasks = api.tasks.query(project=project, status="COMPLETED").all()
out = open("/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/tcga_tasks.txt", "w")
out.write("\t".join(["task id", "task name", "date", "time", "tumor bam", "bam tag", "app"]) + "\n")
i = 1
n = 100
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(parse_task, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks processed\n')
        i += 1
        if result.result() is not None:
            out.write("\t".join(result.result()) + "\n")
out.close()


# for task in tasks:
#     info = parse_task(task)
#     if info is not None:
#         print("\t".join(info))
#         pdb.set_trace()
#         hold=1


### Review TCGA metadata

In [None]:
def parse_march_task(task):
    try:
        if 'input_tumor_aligned' in task.inputs and task.end_time.month == 3:
            return [task.id, task.name,
                    task.inputs["input_tumor_aligned"].id,
                    task.inputs["input_tumor_aligned"].name,
                   task.inputs["input_tumor_aligned"].metadata['case_id'],
                   task.inputs["input_tumor_aligned"].metadata['sample_id'],
                   task.inputs["input_tumor_aligned"].metadata['aliquot_id'],
                    task.inputs["input_tumor_name"],
                    task.inputs["input_normal_aligned"].id,
                    task.inputs["input_normal_aligned"].name,
                    task.inputs["input_normal_aligned"].metadata['case_id'],
                   task.inputs["input_normal_aligned"].metadata['sample_id'],
                   task.inputs["input_normal_aligned"].metadata['aliquot_id'],
                    task.inputs["input_normal_name"]
                   ]
        else:
            return None
    except Exception as e:
        print (e)
        sys.exit(1)
    

In [None]:
project = "cavatica/openpbta-tcga"
tasks = api.tasks.query(project=project, status="COMPLETED").all()
out = open("/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/tcga_bam_info.txt", "w")
out.write("\t".join(["task id", "task name", "tumor bam id",
                     "tumor bam name", "tumor bam case id",
                     "tumor bam sample id", "tumor bam aliquot id",
                     "input_tumor_name", "normal bam id",
                     "normal bam name", "normal bam case id",
                     "normal bam sample id", "normal bam aliquot id",
                     "input_normal_name"
                    ]) + "\n")
i = 1
n = 100
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(parse_march_task, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks processed\n')
        i += 1
        if result.result() is not None:
            out.write("\t".join(result.result()) + "\n")
out.close()


# for task in tasks:
#     info = parse_march_task(task)
#     if info is not None:
#         print("\t".join(info))
#         pdb.set_trace()
#         hold=1


### Patch metadata to March outputs

In [None]:
def parse_march_meta(task):
    try:
        if 'input_tumor_aligned' in task.inputs and task.end_time.month == 3:
            metadata = {}
            for key in task.inputs["input_tumor_aligned"].metadata:
                metadata[key] = task.inputs["input_tumor_aligned"].metadata[key]
            metadata['bam_normal_id'] = task.inputs["input_normal_name"]
            metadata['bam_tumor_id'] = task.inputs["input_tumor_name"]
            for key in task.outputs:
                try:
                    out_file_obj = api.files.get(task.outputs[key].id)
                except Exception as e:
                    print (e)
                    print ("Error getting file output for " + task.name + " " + task.id + " skipping!")
                    break
                # if out_file_obj.metadata is None:
                out_file_obj.metadata = metadata
                out_file_obj.save()
        else:
            return None
    except Exception as e:
        print (e)
        print ("Got an error processing " + task.name + " " + task.id)

        # sys.exit(1)


In [None]:
project = "cavatica/openpbta-tcga"
tasks = api.tasks.query(project=project, status="COMPLETED").all()
i = 1
n = 100
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(parse_march_meta, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks processed\n')
        i += 1


### Add helpful tag for output processing

In [None]:
def parse_march_tag(task):
    try:
        if 'input_tumor_aligned' in task.inputs and task.end_time.month == 3:
            for key in task.outputs:
                # pdb.set_trace()
                if re.search("vep_vcf", key) or re.search("vep_tbi", key):
                    try:
                        out_file_obj = api.files.get(task.outputs[key].id)
                    except Exception as e:
                        print (e)
                        print ("Error getting file output for " + task.name + " " + task.id + " skipping!")
                        break
                    if out_file_obj.tags == []:
                        out_file_obj.tags = ["PASS", "TCGA", "SOMATIC"]
                        out_file_obj.save()
        else:
            return None
    except Exception as e:
        print (e)
        print ("Got an error processing " + task.name + " " + task.id)


In [None]:
project = "cavatica/openpbta-tcga"
tasks = api.tasks.query(project=project, status="COMPLETED").all()
i = 1
n = 100
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(parse_march_tag, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks processed\n')
        i += 1
# for task in tasks:
#     parse_march_tag(task)

### Add metadata and tag NEW outputs

In [4]:
def add_meta_to_outputs(task):
    try:
        if in_key in task.inputs and re.search(phrase, task.name):
            metadata = {}
            if tag_flag:
                tags = task.inputs["tool_name"].split("_")
                tags.append("NORM")
            for key in task.inputs[in_key].metadata:
                metadata[key] = task.inputs[in_key].metadata[key]
            # outputs for these tasks are file arrays
            for key in task.outputs:
                if isinstance(task.outputs[key], list):
                    for out_file in task.outputs[key]:
                        try:
                            out_file_obj = api.files.get(out_file.id)
                        except Exception as e:
                            print (e)
                            print ("Error getting file output for " + task.name + " " + task.id + " skipping!")
                            break
                        if out_file_obj.metadata is None or len(out_file_obj.metadata) == 0:
                            out_file_obj.metadata = metadata
                            if tag_flag:
                                out_file_obj.tags = tags
                            out_file_obj.save()
                else:
                    try:
                        out_file_obj = api.files.get(task.outputs[key].id)
                        if out_file_obj.metadata is None or len(out_file_obj.metadata) == 0:
                            out_file_obj.metadata = metadata
                            if tag_flag:
                                out_file_obj.tags = tags
                            out_file_obj.save()
                    except Exception as e:
                        print(e)
                    
            return 0
        else:
            return None
    except Exception as e:
        print (e)
        print ("Got an error processing " + task.name + " " + task.id)
        exit(1)


In [None]:
project = "kfdrc-harmonization/sd-bhjxbdqk-ad-hoc-caller-rerun"
tasks = api.tasks.query(project=project, status="COMPLETED").all()
i = 1
n = 100
phrase = "PATCH VCF2MAF RPT"
in_key = "strelka2_protected_vcf"
tag_flag = False
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(add_meta_to_outputs, task): task for task in tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks processed\n')
        if result.result() is not None:
            i += 1

# task = api.tasks.get("13f0bfe4-fd53-402f-9cc2-30d1caa6d1cd")
# add_meta_to_outputs(task)


# Re-run MAF Conversion

In [None]:
def get_vcf2maf_refs(api):
    ref_dict = {}
    ref_dict['reference'] = api.files.query(project=project, names=["Homo_sapiens_assembly38.fasta"])[0]
    return ref_dict

In [None]:
def draft_patch_task(t_id):
    try:
        input_dict = {}
        for key in ref_dict:
            input_dict[key] = ref_dict[key]
        tumor_id = in_dict[t_id]['tumor_id']
        normal_id = in_dict[t_id]['normal_id']
        for key in in_dict[t_id]:
            input_dict[key] = in_dict[t_id][key]
        task_name = "PATCH VCF2MAF RPT: " + short_name + " " + tumor_id + " " + normal_id
        task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=input_dict, run=False)
        task.inputs['output_basename'] = task.id
        task.save()
    except Exception as e:
        print(e)
        print("had a problem drafting a task for " + old_task_id)
        exit(1)
    

In [None]:
def squash_manifest(entry):
    # Compress old inputs into single entry
    try:
        info = entry.rstrip('\n').split(',')
        tumor_id = info[t_idx]
        if tumor_id not in in_dict:
            in_dict[tumor_id] = {}
            in_dict[tumor_id]['tumor_id'] = info[t_idx]
            in_dict[tumor_id]['normal_id'] = info[n_idx]
        tool_name = ''
        # use keys to search file name for key word, then standardize tool name
        for key in tool_name_dict:
            if re.search(key, info[1]):
                tool_name = key
                break

        if re.search('public', info[1]):
            in_dict[tumor_id][tool_name + '_public_vcf'] = api.files.get(info[0])
        else:
            in_dict[tumor_id][tool_name + '_protected_vcf'] = api.files.get(info[0])
    except Exception as e:
        print(e)
        print ("Could not process " + entry)
        exit(1)


In [None]:
# comment out project, vcf_input and short_name vars not used

pnoc_project = "kfdrc-harmonization/sd-8y99qzjj-ad-hoc-caller-rerun"
pnoc_vcf_input = '/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/VCF2MAF_RPT/pnoc_vcf-manifest.csv'
pnoc_short_name = "PNOC"

# cbtn_project = "kfdrc-harmonization/sd-bhjxbdqk-ad-hoc-caller-rerun"
# cbtn_vcf_input = '/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/VCF2MAF_RPT/cbtn_vcf-manifest.csv'
# cbtn_short_name = "CBTN"

pbta_app = "/pbta-vcf2maf-patch"

# tcga_project = "kfdrc-harmonization/openpbta-tcga-ad-hoc-caller-rerun"
# tcga_vcf_input = '/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/VCF2MAF_RPT/tcga_vcfs-manifest.csv'
# tcga_short_name = "TCGA"
# tcga_app = "/tcga-vcf2maf-patch"

# set these vars to the ones uncommented and to be used in the process
project = pnoc_project
vcf_input = open(pnoc_vcf_input)
short_name = pnoc_short_name

app_name = project + pbta_app
tool_name_dict = {'strelka2': 'strelka2_somatic', 'mutect2': 'mutect2_somatic',
                  'lancet': 'lancet_somatic', 'vardict': 'vardict_somatic'}
ref_dict = get_vcf2maf_refs(api)
head = next(vcf_input)
header = head.rstrip('\n').split(',')
# get tumor and normal ID positions - commented because TCGA is slightly different
t_idx = header.index('Kids First Biospecimen ID Tumor')
n_idx = header.index('Kids First Biospecimen ID Normal')
# t_idx = header.index('bam_tumor_id')
# n_idx = header.index('bam_normal_id')
in_dict = {}
i = 1
n = 100

with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(squash_manifest, entry): entry for entry in vcf_input}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' input files processed\n')
        i += 1

# for entry in vcf_input:
#     squash_manifest(entry)
i=1
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(draft_patch_task, t_id): t_id for t_id in in_dict}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks created\n')
        i += 1

# for task_id in in_dict:
#     draft_patch_task(task_id)


### Get all drafted tasks and an input file

In [None]:
project = "kfdrc-harmonization/sd-bhjxbdqk-ad-hoc-caller-rerun"
tasks = api.tasks.query(project=project, status="DRAFT").all()
out = open("/Users/brownm28/Documents/2021-Apr-20_norm_annot_patch/MAF_RPT/cbtn_drafted_tasks.txt", "w")
out.write("task id\ttask_name\tpub_input_name\tpub_input_id\toutput_basename\n")
for task in tasks:
    if re.search("VCF2MAF", task.name):
        bname = task.inputs['output_basename']
        if bname is None:
            bname = "MISSING!!!"
        out.write("\t".join([task.id, task.name, task.inputs['public_vcf'].name, task.inputs['public_vcf'].id, bname]) + "\n")
out.close()

### Run-a-paloooza

In [None]:
def runzilla(task):
    try:
        task.run()
    except Exception as e:
        print (e)
        print ("Could not run task " + task.name + " " + task.id)

In [None]:
tcga_project = "kfdrc-harmonization/openpbta-tcga-ad-hoc-caller-rerun"
cbtn_project = "kfdrc-harmonization/sd-bhjxbdqk-ad-hoc-caller-rerun"
pnoc_project = "kfdrc-harmonization/sd-8y99qzjj-ad-hoc-caller-rerun"

project = pnoc_project
draft_tasks = api.tasks.query(project=project, status="DRAFT").all()
i = 1
n = 100

with concurrent.futures.ThreadPoolExecutor(16) as executor:
    results = {executor.submit(runzilla, task): task for task in draft_tasks}
    for result in concurrent.futures.as_completed(results):
        if i % n == 0:
            sys.stderr.write(str(i) + ' tasks started\n')
        i += 1


## DELME SCRATCH

In [5]:
task = api.tasks.get("20f6e0f5-2096-4fec-8bbe-99c9cf75eb64")
phrase = "BS_K2G05P1M"
tag_flag = False
in_key = "input_tumor_aligned"
add_meta_to_outputs(task)


'NoneType' object has no attribute 'id'


0