Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional columns when annotating genomic change #208

Merged
merged 1 commit into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
70 changes: 48 additions & 22 deletions AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ def setsampleidsfileterfile(f):
DESCRIPTION_HEADERS = ['GENE_SUMMARY', 'VARIANT_SUMMARY', 'TUMOR_TYPE_SUMMARY', 'DIAGNOSTIC_SUMMARY',
'PROGNOSTIC_SUMMARY', 'MUTATION_EFFECT_DESCRIPTION']

# Extra output columns emitted only when annotating by genomic change / HGVSg.
# get_oncokb_annotation_column_headers() inserts them right after the ANNOTATED
# column, and process_oncokb_annotation() fills them from the `query` object of
# the annotation (hugoSymbol, alteration, consequence), in this order.
ONCOKB_ANNOTATION_HEADERS_GC = ["ONCOKB_HUGO_SYMBOL", "ONCOKB_PROTEIN_CHANGE", "ONCOKB_CONSEQUENCE"]

UNKNOWN = 'UNKNOWN'


Expand Down Expand Up @@ -486,6 +488,14 @@ def get_reference_genome_from_row(row_reference_genome, default_reference_genome
return reference_genome


def append_headers(outf, newncols, include_descriptions, genomic_change_annotation):
    """Write the OncoKB annotation column headers and terminate the header row.

    The header names come from ``get_oncokb_annotation_column_headers`` and are
    written tab-separated, followed by a newline. No leading separator is
    written here; anything needed between earlier columns and these headers
    must already be in ``outf``.

    :param outf: writable text stream receiving the header row.
    :param newncols: number of output columns counted so far.
    :param include_descriptions: forwarded to the header builder.
    :param genomic_change_annotation: forwarded to the header builder; selects
        the genomic-change specific columns.
    :return: ``newncols`` increased by the number of headers written.
    """
    annotation_columns = get_oncokb_annotation_column_headers(
        include_descriptions, genomic_change_annotation
    )

    outf.write("\t".join(annotation_columns))
    total_columns = newncols + len(annotation_columns)

    # The annotation headers are the last columns of the header row.
    outf.write("\n")
    return total_columns

def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerType, cancerTypeMap,
annotatehotspots, user_input_query_type, default_reference_genome, include_descriptions):
if annotatehotspots:
Expand All @@ -510,30 +520,28 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
outf.write("\tIS-A-3D-HOTSPOT")
newncols += 2

oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)

outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
newncols += len(oncokb_annotation_headers)

outf.write("\n")

query_type = resolve_query_type(user_input_query_type, headers)
if (query_type == QueryType.HGVSP_SHORT):
newncols = append_headers(outf, newncols, include_descriptions, False)
process_alteration(reader, outf, headers, [HGVSP_SHORT_HEADER, ALTERATION_HEADER], ncols, newncols,
defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.HGVSP):
newncols = append_headers(outf, newncols, include_descriptions, False)
process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols,
defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.HGVSG):
newncols = append_headers(outf, newncols, include_descriptions, True)
process_hvsg(reader, outf, headers, [HGVSG_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.GENOMIC_CHANGE):
newncols = append_headers(outf, newncols, include_descriptions, True)
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap,
annotatehotspots, default_reference_genome, include_descriptions)

Expand All @@ -549,10 +557,17 @@ def get_cell_content(row, index, return_empty_string=False):
return None


def get_oncokb_annotation_column_headers(include_descriptions):
headers = [ANNOTATED_HEADER, GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT",
"MUTATION_EFFECT_CITATIONS",
"ONCOGENIC"]
def get_oncokb_annotation_column_headers(include_descriptions, genomic_change_annotation):
headers = [ANNOTATED_HEADER]
if genomic_change_annotation:
headers.extend(ONCOKB_ANNOTATION_HEADERS_GC)

headers.extend([GENE_IN_ONCOKB_HEADER,
VARIANT_IN_ONCOKB_HEADER,
"MUTATION_EFFECT",
"MUTATION_EFFECT_CITATIONS",
"ONCOGENIC"])

for level in sorted(levels):
headers.append(level)
headers.append("HIGHEST_LEVEL")
Expand Down Expand Up @@ -764,9 +779,9 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol

if hgvsg is None:
if annotatehotspots:
default_cols = [['', '', GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]]
default_cols = [['', '', 'False']]
else:
default_cols = [[GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]]
default_cols = [['False']]
append_annotation_to_file(outf, ncols + nannotationcols, [row],
default_cols)
else:
Expand Down Expand Up @@ -816,7 +831,7 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy
return

outf.write(headers['^-$'])
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")
Expand Down Expand Up @@ -886,7 +901,7 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa
return

outf.write(headers['^-$'])
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")
Expand Down Expand Up @@ -1007,7 +1022,7 @@ def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap
rows.append([sample, cancer_type, hugo, cna_type])
queries.append(CNAQuery(hugo, cna_type, cancer_type))

headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers(include_descriptions)
headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write('\t'.join(headers))
outf.write('\n')
return headers, rows, queries
Expand All @@ -1017,7 +1032,7 @@ def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTy
with open(cna_data_file, DEFAULT_READ_FILE_MODE) as infile:
reader = csv.reader(infile, delimiter='\t')
headers = readheaders(reader)
row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers(include_descriptions)
row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers(include_descriptions, False)

i = 0
rows = []
Expand Down Expand Up @@ -1586,7 +1601,7 @@ def pull_protein_change_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1614,7 +1629,7 @@ def pull_hgvsg_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, True, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1642,7 +1657,7 @@ def pull_genomic_change_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, True, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1672,7 +1687,7 @@ def pull_cna_info(queries, include_descriptions):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot=False))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False))
return processed_annotation


Expand Down Expand Up @@ -1706,11 +1721,11 @@ def pull_structural_variant_info(queries, include_descriptions):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot=False))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False))
return processed_annotation


def process_oncokb_annotation(annotation, include_descriptions, annotate_hotspot):
def process_oncokb_annotation(annotation, include_descriptions, genomic_change_annotation, annotate_hotspot):
if annotation is None:
return ['False']

Expand Down Expand Up @@ -1803,6 +1818,17 @@ def process_oncokb_annotation(annotation, include_descriptions, annotate_hotspot
ret.append(_3dhotspot)

ret.append('True')

if genomic_change_annotation:
query_hugo_symbol = annotation['query']['hugoSymbol']
ret.append('' if query_hugo_symbol is None else query_hugo_symbol)

query_alteration = annotation['query']['alteration']
ret.append('' if query_alteration is None else query_alteration)

query_consequence = annotation['query']['consequence']
ret.append('' if query_consequence is None else query_consequence)

ret.append(oncokbdata[GENE_IN_ONCOKB_HEADER])
ret.append(oncokbdata[VARIANT_IN_ONCOKB_HEADER])
ret.append(oncokbdata['mutation_effect'])
Expand Down
53 changes: 28 additions & 25 deletions test_Annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import logging

from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS
from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS, ONCOKB_ANNOTATION_HEADERS_GC
from AnnotatorCore import pull_genomic_change_info
from AnnotatorCore import pull_protein_change_info
from AnnotatorCore import pull_structural_variant_info
Expand Down Expand Up @@ -34,7 +34,10 @@
UNKNOWN = 'Unknown'
# Expected length of one annotation row when neither descriptions nor the
# genomic-change specific columns are present.
NUMBER_OF_ANNOTATION_COLUMNS = 27
# Number of extra columns added when descriptions are requested.
NUMBER_OF_DESCRIPTION_COLUMNS = len(DESCRIPTION_HEADERS)
# Number of extra columns added for genomic-change / HGVSg annotation; also used
# by the tests as the index offset for columns that follow them.
NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS = len(ONCOKB_ANNOTATION_HEADERS_GC)
NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS
# Row lengths expected from the genomic-change / HGVSg annotation paths.
NUMBER_OF_GC_ANNOTATION_COLUMNS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS
NUMBER_OF_GC_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_GC_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS


def fake_gene_one_query_suite(annotations, include_descriptions):
Expand Down Expand Up @@ -163,22 +166,22 @@ def test_check_hgvsg():
assert len(annotations) == 3

annotation = annotations[0]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[1]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[2]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == ''
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''


@pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
Expand All @@ -196,22 +199,22 @@ def test_check_genomic_change():
assert len(annotations) == 3

annotation = annotations[0]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[1]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[2]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == ''
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''


@pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
Expand Down