Skip to content

Commit

Permalink
Support individual CNA file format
Browse files Browse the repository at this point in the history
The following format will be supported

| Tumor_Sample_Barcode | Hugo_Symbol | Copy_Number_Alteration |
|----------------------|-------------|------------------------|
| TCGA-05-4417-01      | MET         | 2                      |
| TCGA-05-4417-01      | ERBB2       | 2                      |
  • Loading branch information
zhx828 committed Apr 7, 2022
1 parent f22fe53 commit d5fddf3
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 55 deletions.
179 changes: 130 additions & 49 deletions AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,29 @@ def setsampleidsfileterfile(f):
'vIII deletion': ['any']
}

# Canonical names of the copy number alterations handled by the annotator.
CNA_DELETION_TXT = 'Deletion'
CNA_LOSS_TXT = 'Loss'
CNA_GAIN_TXT = 'Gain'
CNA_AMPLIFICATION_TXT = 'Amplification'

# Every alteration name that may appear verbatim in an input file.
CNAS = [CNA_DELETION_TXT, CNA_LOSS_TXT, CNA_GAIN_TXT, CNA_AMPLIFICATION_TXT]

# Discrete GISTIC codes (kept as text, exactly as read from the file) mapped
# to the canonical alteration name. A code that is not listed here has no
# alteration name.
GISTIC_CNA_MAP = {
    '-2': CNA_DELETION_TXT,
    '-1.5': CNA_DELETION_TXT,
    '-1': CNA_LOSS_TXT,
    '1': CNA_GAIN_TXT,
    '2': CNA_AMPLIFICATION_TXT,
}

# Supported layouts of the CNA input file.
CNA_FILE_FORMAT_GISTIC = 'gistic'
CNA_FILE_FORMAT_INDIVIDUAL = 'individual'
# NOTE(review): "CND" looks like a typo for "CNA"; the name is kept because
# other code refers to it.
CND_FILE_FORMAT = [
    CNA_FILE_FORMAT_GISTIC,
    CNA_FILE_FORMAT_INDIVIDUAL,
]

# column headers
HUGO_HEADERS = ['HUGO_SYMBOL', 'HUGO_GENE_SYMBOL', 'GENE']
Expand All @@ -145,6 +168,9 @@ def setsampleidsfileterfile(f):
GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]

# columns for copy number alteration
CNA_HEADER = ['COPY_NUMBER_ALTERATION', 'CNA', 'GISTIC']

# columns for structural variant annotation
SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
Expand Down Expand Up @@ -811,28 +837,22 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa
outf.close()


def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, annotate_gain_loss=False):
CNA_AMPLIFICATION_TXT = 'Amplification'
CNA_DELETION_TXT = 'Deletion'
CNA_LOSS_TXT = 'Loss'
CNA_GAIN_TXT = 'Gain'

cnaEventMap = {
"-2": CNA_DELETION_TXT,
"-1.5": CNA_DELETION_TXT,
"2": CNA_AMPLIFICATION_TXT
}
def get_cna(cell_value, annotate_gain_loss=False):
    """Translate a raw copy-number cell into a canonical alteration name.

    The cell may hold either a code that is a key of ``GISTIC_CNA_MAP``
    (for example ``"2"`` or ``"-1.5"``) or one of the names listed in
    ``CNAS`` written in any letter case. Leading and trailing whitespace is
    ignored so that padded cells are still recognised.

    :param cell_value: text read from the CNA column, or ``None``.
    :param annotate_gain_loss: when false (the default) Gain and Loss are not
        reported and ``None`` is returned for them.
    :return: one of the ``CNA_*_TXT`` constants, or ``None`` when the cell is
        empty, unrecognised, or a Gain/Loss that is filtered out.
    """
    if cell_value is None:
        return None

    # Without this, " 2" or "Amplification " would miss both lookups below
    # and the alteration would be silently dropped.
    value = cell_value.strip()
    if not value:
        return None

    cna = GISTIC_CNA_MAP.get(value)
    if cna is None:
        # Not a numeric code: fall back to a case-insensitive name match.
        wanted = value.upper()
        cna = next((name for name in CNAS if name.upper() == wanted), None)

    if not annotate_gain_loss and cna in (CNA_GAIN_TXT, CNA_LOSS_TXT):
        return None
    return cna

if annotate_gain_loss:
cnaEventMap.update({
"-1": CNA_LOSS_TXT,
"1": CNA_GAIN_TXT
})

if os.path.isfile(previousoutfile):
cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
outf = open(outfile, 'w+', 1000)
with open(cnafile, 'rU') as infile:
def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss):
with open(gistic_data_file, 'rU') as infile:
reader = csv.reader(infile, delimiter='\t')
headers = readheaders(reader)
samples = []
Expand All @@ -848,15 +868,6 @@ def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, c
"Cancer type for all samples should be defined for a more accurate result\nsamples in cna file: %s\n" % (
samples))

outf.write('SAMPLE_ID\tCANCER_TYPE\tHUGO_SYMBOL\tALTERATION')
ncols = 4

oncokb_annotation_headers = get_oncokb_annotation_column_headers()
outf.write("\t")
outf.write("\t".join(oncokb_annotation_headers))
outf.write("\n")
ncols += len(oncokb_annotation_headers)

i = 0
rows = []
queries = []
Expand All @@ -875,31 +886,98 @@ def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, c
if len(row) <= headers[rawsample]:
log.warning('No CNA specified for ' + row[0] + ' ' + rawsample)
continue
cna = row[headers[rawsample]]
if cna in cnaEventMap:
cna_type = cnaEventMap[cna]
if cna_type is not None:
cancertype = defaultCancerType
sample = rawsample
cna_type = get_cna(row[headers[rawsample]], annotate_gain_loss)
if cna_type is not None:
cancer_type = defaultCancerType
sample = rawsample

if sampleidsfilter and sample not in sampleidsfilter:
continue
if sampleidsfilter and sample not in sampleidsfilter:
continue

if sample in cancerTypeMap:
cancertype = cancerTypeMap[sample]
if sample in cancerTypeMap:
cancer_type = cancerTypeMap[sample]

rows.append([sample, cancertype, hugo, cna_type])
queries.append(CNAQuery(hugo, cna_type, cancertype))
rows.append([sample, cancer_type, hugo, cna_type])
queries.append(CNAQuery(hugo, cna_type, cancer_type))

if len(queries) == POST_QUERIES_THRESHOLD:
annotations = pull_cna_info(queries)
append_annotation_to_file(outf, ncols, rows, annotations)
rows = []
queries = []
headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers()
outf.write('\t'.join(headers))
outf.write('\n')
return headers, rows, queries

if len(queries) > 0:
annotations = pull_cna_info(queries)
append_annotation_to_file(outf, ncols, rows, annotations)

def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss):
    """Read a one-alteration-per-row CNA file and collect the rows to annotate.

    The output header (input columns followed by the OncoKB annotation
    columns) is written to ``outf`` immediately. Rows that have both a gene
    and a recognised alteration are returned together with their queries.
    Rows missing either are written to ``outf`` unchanged and a warning is
    logged. Rows whose sample is excluded by ``sampleidsfilter`` are dropped.

    :param outf: writable text file object receiving the header and the
        rows that cannot be annotated.
    :param cna_data_file: path of the tab-separated input file.
    :param defaultCancerType: cancer type passed to
        ``get_tumor_type_from_row`` as the default.
    :param cancerTypeMap: sample-to-cancer-type mapping passed to
        ``get_tumor_type_from_row``.
    :param annotate_gain_loss: forwarded to ``get_cna``.
    :return: tuple ``(row_headers, rows, queries)``; ``rows`` and ``queries``
        are parallel lists.
    """
    def cell(row, index):
        # Value of the column at `index`, or None when the column is absent
        # from the header (negative index) or the row is too short.
        return row[index] if 0 <= index < len(row) else None

    rows = []
    queries = []

    # NOTE(review): the 'U' mode flag is rejected by Python 3.11+. It is kept
    # to match the other readers in this file - confirm which interpreter
    # versions must be supported.
    with open(cna_data_file, 'rU') as infile:
        reader = csv.reader(infile, delimiter='\t')
        headers = readheaders(reader)
        # presumably headers['^-$'] is the raw header line kept by readheaders
        row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers()

        outf.write('\t'.join(row_headers))
        outf.write('\n')

        # The column positions only depend on the header, so resolve them
        # once instead of once per row.
        isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
        ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
        icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
        icna = geIndexOfHeader(headers, CNA_HEADER)

        for i, row in enumerate(reader, 1):
            hugo = cell(row, ihugo)
            # A missing CNA column must not fall through to row[-1], which
            # would silently read the last cell of the row instead.
            cna_type = get_cna(cell(row, icna), annotate_gain_loss)
            sample = cell(row, isample)
            cancer_type = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)

            if sampleidsfilter and sample not in sampleidsfilter:
                continue

            if hugo and cna_type:
                rows.append(row)
                queries.append(CNAQuery(hugo, cna_type, cancer_type))
                continue

            # Nothing to annotate: pass the row through unchanged.
            outf.write('\t'.join(row))
            outf.write('\n')
            if not hugo:
                log.warning("Gene is not specified for row " + str(row))
            if not cna_type:
                log.warning("CNA is not specified for row " + str(row))

    return row_headers, rows, queries


def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, annotate_gain_loss=False,
                     cna_format=CNA_FILE_FORMAT_GISTIC):
    """Annotate a CNA file and write the annotated rows to ``outfile``.

    :param cnafile: path of the input CNA file.
    :param outfile: path of the file to create with the annotated rows.
    :param previousoutfile: path of an earlier result; when it exists it is
        handed to ``cacheannotated`` before anything else happens.
    :param defaultCancerType: cancer type used when a sample has none.
    :param cancerTypeMap: sample-to-cancer-type mapping.
    :param annotate_gain_loss: forwarded to the file reader.
    :param cna_format: one of the values in ``CND_FILE_FORMAT``. Any other
        value logs an error and returns without creating ``outfile``.
    :return: ``None``.
    """
    if os.path.isfile(previousoutfile):
        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)

    if not cna_format or cna_format not in CND_FILE_FORMAT:
        log.error('The CNA file format is not supported, only gistic or individual can be used ')
        return

    if cna_format == CNA_FILE_FORMAT_GISTIC:
        read_cna_file = process_gistic_data
    else:
        read_cna_file = process_individual_cna_file

    # The context manager closes the output even if reading the input or
    # calling the annotation service raises.
    with open(outfile, 'w+', 1000) as outf:
        headers, rows, queries = read_cna_file(outf, cnafile, defaultCancerType, cancerTypeMap,
                                               annotate_gain_loss)
        ncols = len(headers)

        # Send the queries in batches of POST_QUERIES_THRESHOLD. Stepping
        # through fixed offsets avoids re-copying the remaining lists on
        # every iteration.
        for start in range(0, len(rows), POST_QUERIES_THRESHOLD):
            end = start + POST_QUERIES_THRESHOLD
            log.info(end)
            annotations = pull_cna_info(queries[start:end])
            append_annotation_to_file(outf, ncols, rows[start:end], annotations)

Expand Down Expand Up @@ -1526,6 +1604,9 @@ def __init__(self, hugo, cnatype, cancertype):
self.copyNameAlterationType = cnatype.upper()
self.tumorType = cancertype

def __str__(self):
    """Return gene symbol, alteration type and tumor type joined by tabs."""
    fields = (self.gene.hugoSymbol, self.copyNameAlterationType, self.tumorType)
    # str.join raises TypeError on None; the gene symbol and tumor type are
    # stored as given by the caller, so render a missing one as an empty field.
    return "\t".join('' if field is None else str(field) for field in fields)

class StructuralVariantQuery:
def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):

Expand Down
9 changes: 5 additions & 4 deletions CnaAnnotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
def main(argv):
if argv.help:
log.info('\n'
'CnaAnnotator.py -i <input CNA file> -o <output CNA file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss]\n'
' Input CNA file should follow the GISTIC output (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1)\n'
'CnaAnnotator.py -i <input CNA file> -o <output CNA file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss] [-f CNA file formt, gistic or individual]\n'
' Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n'
' Essential clinical columns:\n'
' SAMPLE_ID: sample ID\n'
' Cancer type will be assigned based on the following priority:\n'
Expand All @@ -35,8 +35,8 @@ def main(argv):
readCancerTypes(argv.input_clinical_file, cancertypemap)

log.info('annotating %s ...' % argv.input_file)
processcnagisticdata(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
cancertypemap, argv.annotate_gain_loss)
process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
cancertypemap, argv.annotate_gain_loss, argv.cna_file_format.lower())

log.info('done!')

Expand All @@ -53,6 +53,7 @@ def main(argv):
parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
parser.add_argument('-z', dest='annotate_gain_loss', action="store_true", default=False)
parser.add_argument('-f', dest='cna_file_format', default=CNA_FILE_FORMAT_GISTIC)
parser.set_defaults(func=main)

args = parser.parse_args()
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ We recommend processing VCF files by [vcf2maf](https://github.com/mskcc/vcf2maf/
You can still use MAF format to annotate atypical alterations, such as MSI-H, TMB-H, EGFR vIII. Please see more examples [HERE](data/example_atypical_alterations.txt).

### Copy Number Alteration
We use GISTIC 2.0 format. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data.
We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. Please see examples [HERE](data/example_cna.txt).

You can also list copy number alterations individually by specifying `-f individual`. Please see examples [HERE](data/example_individual_cna.txt).

Get more details on the command line using `python CnaAnnotator.py -h`.

Expand All @@ -54,7 +56,7 @@ All structural variants with two different gene partners, they will be considere
Get more details on the command line using `python StructuralVariantAnnotator.py -h`.

### Clinical Data (Combine MAF+CNA+Fusion)
You can comebine all annotation on sample/patient level using the clinical data annotator.
You can combine all annotations at the sample/patient level using the clinical data annotator.

Get more details on the command line using `python ClinicalDataAnnotator.py -h`.

Expand Down
13 changes: 13 additions & 0 deletions data/example_individual_cna.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Tumor_Sample_Barcode Hugo_Symbol Copy_Number_Alteration
TCGA-05-4417-01 MET 2
TCGA-05-4417-01 ERBB2 2
TCGA-05-4417-01 CDK4 -2
TCGA-05-4417-01 CDK4 -1
TCGA-02-0033-01 MET 2
TCGA-02-0033-01 ERBB2 1
TCGA-02-0033-01 CDK4 2
TCGA-02-0033-01 CDK4 2
TCGA-05-4417-01 MET Amplification
TCGA-05-4417-01 CDK4 Deletion
TCGA-05-4417-01 CDK4 Loss
TCGA-05-4417-01 ERBB2 Gain
4 changes: 4 additions & 0 deletions example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ OSV="data/example_sv.oncokb.txt"
ICNA="data/example_cna.txt"
OCNA="data/example_cna.oncokb.txt"

IICNA="data/example_individual_cna.txt"
OICNA="data/example_individual_cna.oncokb.txt"

IC="data/example_clinical.txt"
OC="data/example_clinical.oncokb.txt"

Expand All @@ -43,6 +46,7 @@ python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN
python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$TOKEN" -f "individual"
python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
python GenerateReadMe.py -o "$README"
38 changes: 38 additions & 0 deletions test_AnnotatorCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,41 @@ def test_resolve_query_type():
assert resolve_query_type(QueryType.HGVSG, [HGVSP_SHORT_HEADER])
with pytest.raises(Exception):
assert resolve_query_type(QueryType.GENOMIC_CHANGE, [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER])


def test_cna():
    """Check get_cna for each way of passing annotate_gain_loss."""

    def check(expectations, *flag):
        # `flag` is empty (use the default), (False,) or (True,).
        for cell_value, expected in expectations:
            result = get_cna(cell_value, *flag)
            if expected is None:
                assert result is None
            else:
                assert result == expected

    unresolved = [(None, None), ('', None), ('test', None)]

    # Gain/Loss are suppressed unless annotate_gain_loss is true.
    names_without_gain_loss = [
        ('Amplification', CNA_AMPLIFICATION_TXT),
        ('Gain', None),
        ('Deletion', CNA_DELETION_TXT),
        ('Loss', None),
    ]
    codes_without_gain_loss = [
        ('2', CNA_AMPLIFICATION_TXT),
        ('1', None),
        ('-2', CNA_DELETION_TXT),
        ('-1.5', CNA_DELETION_TXT),
        ('-1', None),
        ('0', None),
    ]
    names_with_gain_loss = [
        ('Amplification', CNA_AMPLIFICATION_TXT),
        ('Gain', CNA_GAIN_TXT),
        ('Deletion', CNA_DELETION_TXT),
        ('Loss', CNA_LOSS_TXT),
    ]
    codes_with_gain_loss = [
        ('2', CNA_AMPLIFICATION_TXT),
        ('1', CNA_GAIN_TXT),
        ('-2', CNA_DELETION_TXT),
        ('-1.5', CNA_DELETION_TXT),
        ('-1', CNA_LOSS_TXT),
        ('0', None),
    ]

    # Default argument.
    check(unresolved + names_without_gain_loss + codes_without_gain_loss)

    # Explicit annotate_gain_loss=False.
    check(unresolved + names_without_gain_loss, False)

    # Explicit annotate_gain_loss=True.
    check(unresolved + names_with_gain_loss + codes_with_gain_loss, True)

0 comments on commit d5fddf3

Please sign in to comment.