Skip to content

Commit

Permalink
Merge pull request #208 from zhx828/add-oncokb-annotation-columns
Browse files Browse the repository at this point in the history
Add additional columns when annotating genomic change
  • Loading branch information
zhx828 committed Nov 2, 2023
2 parents 61c7c7d + a70d928 commit ea512c7
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 47 deletions.
70 changes: 48 additions & 22 deletions AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ def setsampleidsfileterfile(f):
# Free-text description column names.
# NOTE(review): presumably emitted only when include_descriptions is set --
# confirm against get_oncokb_annotation_column_headers.
DESCRIPTION_HEADERS = ['GENE_SUMMARY', 'VARIANT_SUMMARY', 'TUMOR_TYPE_SUMMARY', 'DIAGNOSTIC_SUMMARY',
'PROGNOSTIC_SUMMARY', 'MUTATION_EFFECT_DESCRIPTION']

# Extra columns added only for genomic-change style annotations (HGVSg and
# genomic change query types). get_oncokb_annotation_column_headers inserts
# them right after the ANNOTATED column, and process_oncokb_annotation fills
# them from the 'hugoSymbol', 'alteration' and 'consequence' fields of the
# annotation's 'query' object, in that order.
ONCOKB_ANNOTATION_HEADERS_GC = ["ONCOKB_HUGO_SYMBOL", "ONCOKB_PROTEIN_CHANGE", "ONCOKB_CONSEQUENCE"]

UNKNOWN = 'UNKNOWN'


Expand Down Expand Up @@ -486,6 +488,14 @@ def get_reference_genome_from_row(row_reference_genome, default_reference_genome
return reference_genome


def append_headers(outf, newncols, include_descriptions, genomic_change_annotation):
    """Finish the output header row with the OncoKB annotation column names.

    The annotation headers are written tab-joined and followed by a newline.
    No leading tab is written here, so any separator between the columns
    already on the row and these ones has to be emitted by the caller.

    :param outf: writable text stream currently positioned on the header row.
    :param newncols: number of annotation columns already added to the row.
    :param include_descriptions: forwarded unchanged to
        ``get_oncokb_annotation_column_headers``.
    :param genomic_change_annotation: forwarded unchanged to
        ``get_oncokb_annotation_column_headers``; selects the header set
        used for genomic-change / HGVSg queries.
    :return: ``newncols`` increased by the number of headers written.
    """
    annotation_columns = get_oncokb_annotation_column_headers(include_descriptions, genomic_change_annotation)
    header_line = "\t".join(annotation_columns)
    outf.write(header_line)
    # Terminate the header row; data rows follow on subsequent lines.
    outf.write("\n")
    return newncols + len(annotation_columns)

def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerType, cancerTypeMap,
annotatehotspots, user_input_query_type, default_reference_genome, include_descriptions):
if annotatehotspots:
Expand All @@ -510,30 +520,28 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
outf.write("\tIS-A-3D-HOTSPOT")
newncols += 2

oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)

outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
newncols += len(oncokb_annotation_headers)

outf.write("\n")

query_type = resolve_query_type(user_input_query_type, headers)
if (query_type == QueryType.HGVSP_SHORT):
newncols = append_headers(outf, newncols, include_descriptions, False)
process_alteration(reader, outf, headers, [HGVSP_SHORT_HEADER, ALTERATION_HEADER], ncols, newncols,
defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.HGVSP):
newncols = append_headers(outf, newncols, include_descriptions, False)
process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols,
defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.HGVSG):
newncols = append_headers(outf, newncols, include_descriptions, True)
process_hvsg(reader, outf, headers, [HGVSG_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
cancerTypeMap, annotatehotspots, default_reference_genome, include_descriptions)

if (query_type == QueryType.GENOMIC_CHANGE):
newncols = append_headers(outf, newncols, include_descriptions, True)
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap,
annotatehotspots, default_reference_genome, include_descriptions)

Expand All @@ -549,10 +557,17 @@ def get_cell_content(row, index, return_empty_string=False):
return None


def get_oncokb_annotation_column_headers(include_descriptions):
headers = [ANNOTATED_HEADER, GENE_IN_ONCOKB_HEADER, VARIANT_IN_ONCOKB_HEADER, "MUTATION_EFFECT",
"MUTATION_EFFECT_CITATIONS",
"ONCOGENIC"]
def get_oncokb_annotation_column_headers(include_descriptions, genomic_change_annotation):
headers = [ANNOTATED_HEADER]
if genomic_change_annotation:
headers.extend(ONCOKB_ANNOTATION_HEADERS_GC)

headers.extend([GENE_IN_ONCOKB_HEADER,
VARIANT_IN_ONCOKB_HEADER,
"MUTATION_EFFECT",
"MUTATION_EFFECT_CITATIONS",
"ONCOGENIC"])

for level in sorted(levels):
headers.append(level)
headers.append("HIGHEST_LEVEL")
Expand Down Expand Up @@ -764,9 +779,9 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol

if hgvsg is None:
if annotatehotspots:
default_cols = [['', '', GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]]
default_cols = [['', '', 'False']]
else:
default_cols = [[GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]]
default_cols = [['False']]
append_annotation_to_file(outf, ncols + nannotationcols, [row],
default_cols)
else:
Expand Down Expand Up @@ -816,7 +831,7 @@ def process_fusion(svdata, outfile, previousoutfile, defaultCancerType, cancerTy
return

outf.write(headers['^-$'])
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")
Expand Down Expand Up @@ -886,7 +901,7 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa
return

outf.write(headers['^-$'])
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions)
oncokb_annotation_headers = get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")
Expand Down Expand Up @@ -1007,7 +1022,7 @@ def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap
rows.append([sample, cancer_type, hugo, cna_type])
queries.append(CNAQuery(hugo, cna_type, cancer_type))

headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers(include_descriptions)
headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers(include_descriptions, False)
outf.write('\t'.join(headers))
outf.write('\n')
return headers, rows, queries
Expand All @@ -1017,7 +1032,7 @@ def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTy
with open(cna_data_file, DEFAULT_READ_FILE_MODE) as infile:
reader = csv.reader(infile, delimiter='\t')
headers = readheaders(reader)
row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers(include_descriptions)
row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers(include_descriptions, False)

i = 0
rows = []
Expand Down Expand Up @@ -1586,7 +1601,7 @@ def pull_protein_change_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1614,7 +1629,7 @@ def pull_hgvsg_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, True, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1642,7 +1657,7 @@ def pull_genomic_change_info(queries, include_descriptions, annotate_hotspot):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, True, annotate_hotspot))
return processed_annotation


Expand Down Expand Up @@ -1672,7 +1687,7 @@ def pull_cna_info(queries, include_descriptions):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot=False))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False))
return processed_annotation


Expand Down Expand Up @@ -1706,11 +1721,11 @@ def pull_structural_variant_info(queries, include_descriptions):

processed_annotation = []
for query_annotation in annotation:
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, annotate_hotspot=False))
processed_annotation.append(process_oncokb_annotation(query_annotation, include_descriptions, False, annotate_hotspot=False))
return processed_annotation


def process_oncokb_annotation(annotation, include_descriptions, annotate_hotspot):
def process_oncokb_annotation(annotation, include_descriptions, genomic_change_annotation, annotate_hotspot):
if annotation is None:
return ['False']

Expand Down Expand Up @@ -1803,6 +1818,17 @@ def process_oncokb_annotation(annotation, include_descriptions, annotate_hotspot
ret.append(_3dhotspot)

ret.append('True')

if genomic_change_annotation:
query_hugo_symbol = annotation['query']['hugoSymbol']
ret.append('' if query_hugo_symbol is None else query_hugo_symbol)

query_alteration = annotation['query']['alteration']
ret.append('' if query_alteration is None else query_alteration)

query_consequence = annotation['query']['consequence']
ret.append('' if query_consequence is None else query_consequence)

ret.append(oncokbdata[GENE_IN_ONCOKB_HEADER])
ret.append(oncokbdata[VARIANT_IN_ONCOKB_HEADER])
ret.append(oncokbdata['mutation_effect'])
Expand Down
53 changes: 28 additions & 25 deletions test_Annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import logging

from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS
from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS, ONCOKB_ANNOTATION_HEADERS_GC
from AnnotatorCore import pull_genomic_change_info
from AnnotatorCore import pull_protein_change_info
from AnnotatorCore import pull_structural_variant_info
Expand Down Expand Up @@ -34,7 +34,10 @@
UNKNOWN = 'Unknown'
# Expected length of an annotation row that has neither description columns
# nor the genomic-change (GC) specific columns.
NUMBER_OF_ANNOTATION_COLUMNS = 27
NUMBER_OF_DESCRIPTION_COLUMNS = len(DESCRIPTION_HEADERS)
# Number of extra columns present in HGVSg / genomic-change annotations; the
# tests also add it as an offset to the *_INDEX constants when reading such rows.
NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS = len(ONCOKB_ANNOTATION_HEADERS_GC)
NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS
# Row widths for HGVSg / genomic-change annotations, without and with descriptions.
NUMBER_OF_GC_ANNOTATION_COLUMNS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS
NUMBER_OF_GC_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_GC_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS


def fake_gene_one_query_suite(annotations, include_descriptions):
Expand Down Expand Up @@ -163,22 +166,22 @@ def test_check_hgvsg():
assert len(annotations) == 3

annotation = annotations[0]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[1]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[2]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == ''
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''


@pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
Expand All @@ -196,22 +199,22 @@ def test_check_genomic_change():
assert len(annotations) == 3

annotation = annotations[0]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[1]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'

annotation = annotations[2]
assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX] == ''
assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''


@pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
Expand Down

0 comments on commit ea512c7

Please sign in to comment.