Support individual CNA file format

The following format will be supported:

| Tumor_Sample_Barcode | Hugo_Symbol | Copy_Number_Alteration |
|----------------------|-------------|------------------------|
| TCGA-05-4417-01      | MET         | 2                      |
| TCGA-05-4417-01      | ERBB2       | 2                      |
oncokb · Apr 7, 2022 · d5fddf3 · d5fddf3
1 parent f22fe53
commit d5fddf3
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 55 deletions.
diff --git a/AnnotatorCore.py b/AnnotatorCore.py
@@ -119,6 +119,29 @@ def setsampleidsfileterfile(f):
     'vIII deletion': ['any']
 }
 
+CNA_AMPLIFICATION_TXT = 'Amplification'
+CNA_DELETION_TXT = 'Deletion'
+CNA_LOSS_TXT = 'Loss'
+CNA_GAIN_TXT = 'Gain'
+
+CNAS = [
+    CNA_DELETION_TXT,
+    CNA_LOSS_TXT,
+    CNA_GAIN_TXT,
+    CNA_AMPLIFICATION_TXT,
+]
+
+GISTIC_CNA_MAP = {
+    "-2": CNA_DELETION_TXT,
+    "-1.5": CNA_DELETION_TXT,
+    "-1": CNA_LOSS_TXT,
+    "1": CNA_GAIN_TXT,
+    "2": CNA_AMPLIFICATION_TXT
+}
+
+CNA_FILE_FORMAT_GISTIC = 'gistic'
+CNA_FILE_FORMAT_INDIVIDUAL = 'individual'
+CND_FILE_FORMAT = [CNA_FILE_FORMAT_GISTIC, CNA_FILE_FORMAT_INDIVIDUAL]
 
 # column headers
 HUGO_HEADERS = ['HUGO_SYMBOL', 'HUGO_GENE_SYMBOL', 'GENE']
@@ -145,6 +168,9 @@ def setsampleidsfileterfile(f):
 GC_VAR_ALLELE_2_HEADER = 'TUMOR_SEQ_ALLELE2'
 GENOMIC_CHANGE_HEADERS = [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, GC_VAR_ALLELE_2_HEADER]
 
+# columns for copy number alteration
+CNA_HEADER = ['COPY_NUMBER_ALTERATION', 'CNA', 'GISTIC']
+
 # columns for structural variant annotation
 SV_GENEA_HEADER = ['SITE1_GENE', 'GENEA', 'GENE1']
 SV_GENEB_HEADER = ['SITE2_GENE', 'GENEB', 'GENE2']
@@ -811,28 +837,22 @@ def process_sv(svdata, outfile, previousoutfile, defaultCancerType, cancerTypeMa
     outf.close()
 
 
-def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, annotate_gain_loss=False):
-    CNA_AMPLIFICATION_TXT = 'Amplification'
-    CNA_DELETION_TXT = 'Deletion'
-    CNA_LOSS_TXT = 'Loss'
-    CNA_GAIN_TXT = 'Gain'
-
-    cnaEventMap = {
-        "-2": CNA_DELETION_TXT,
-        "-1.5": CNA_DELETION_TXT,
-        "2": CNA_AMPLIFICATION_TXT
-    }
+def get_cna(cell_value, annotate_gain_loss=False):
+    cna = None
+    if cell_value is not None and cell_value != '':
+        if cell_value in GISTIC_CNA_MAP:
+            cna = GISTIC_CNA_MAP[cell_value]
+        else:
+            for default_cna in CNAS:
+                if cell_value.upper() == default_cna.upper():
+                    cna = default_cna
+    if not annotate_gain_loss and cna is not None and cna.upper() in [CNA_GAIN_TXT.upper(), CNA_LOSS_TXT.upper()]:
+        cna = None
+    return cna
 
-    if annotate_gain_loss:
-        cnaEventMap.update({
-            "-1": CNA_LOSS_TXT,
-            "1": CNA_GAIN_TXT
-        })
 
-    if os.path.isfile(previousoutfile):
-        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
-    outf = open(outfile, 'w+', 1000)
-    with open(cnafile, 'rU') as infile:
+def process_gistic_data(outf, gistic_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss):
+    with open(gistic_data_file, 'rU') as infile:
         reader = csv.reader(infile, delimiter='\t')
         headers = readheaders(reader)
         samples = []
@@ -848,15 +868,6 @@ def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, c
                 "Cancer type for all samples should be defined for a more accurate result\nsamples in cna file: %s\n" % (
                     samples))
 
-        outf.write('SAMPLE_ID\tCANCER_TYPE\tHUGO_SYMBOL\tALTERATION')
-        ncols = 4
-
-        oncokb_annotation_headers = get_oncokb_annotation_column_headers()
-        outf.write("\t")
-        outf.write("\t".join(oncokb_annotation_headers))
-        outf.write("\n")
-        ncols += len(oncokb_annotation_headers)
-
         i = 0
         rows = []
         queries = []
@@ -875,31 +886,98 @@ def processcnagisticdata(cnafile, outfile, previousoutfile, defaultCancerType, c
                     if len(row) <= headers[rawsample]:
                         log.warning('No CNA specified for ' + row[0] + ' ' + rawsample)
                         continue
-                    cna = row[headers[rawsample]]
-                    if cna in cnaEventMap:
-                        cna_type = cnaEventMap[cna]
-                        if cna_type is not None:
-                            cancertype = defaultCancerType
-                            sample = rawsample
+                    cna_type = get_cna(row[headers[rawsample]], annotate_gain_loss)
+                    if cna_type is not None:
+                        cancer_type = defaultCancerType
+                        sample = rawsample
 
-                            if sampleidsfilter and sample not in sampleidsfilter:
-                                continue
+                        if sampleidsfilter and sample not in sampleidsfilter:
+                            continue
 
-                            if sample in cancerTypeMap:
-                                cancertype = cancerTypeMap[sample]
+                        if sample in cancerTypeMap:
+                            cancer_type = cancerTypeMap[sample]
 
-                            rows.append([sample, cancertype, hugo, cna_type])
-                            queries.append(CNAQuery(hugo, cna_type, cancertype))
+                        rows.append([sample, cancer_type, hugo, cna_type])
+                        queries.append(CNAQuery(hugo, cna_type, cancer_type))
 
-                            if len(queries) == POST_QUERIES_THRESHOLD:
-                                annotations = pull_cna_info(queries)
-                                append_annotation_to_file(outf, ncols, rows, annotations)
-                                rows = []
-                                queries = []
+        headers = ['SAMPLE_ID', 'CANCER_TYPE', 'HUGO_SYMBOL', 'ALTERATION'] + get_oncokb_annotation_column_headers()
+        outf.write('\t'.join(headers))
+        outf.write('\n')
+        return headers, rows, queries
 
-        if len(queries) > 0:
-            annotations = pull_cna_info(queries)
-            append_annotation_to_file(outf, ncols, rows, annotations)
+
+def process_individual_cna_file(outf, cna_data_file, defaultCancerType, cancerTypeMap, annotate_gain_loss):
+    with open(cna_data_file, 'rU') as infile:
+        reader = csv.reader(infile, delimiter='\t')
+        headers = readheaders(reader)
+        row_headers = headers['^-$'].split('\t') + get_oncokb_annotation_column_headers()
+
+        i = 0
+        rows = []
+        queries = []
+
+        outf.write('\t'.join(row_headers))
+        outf.write('\n')
+
+        for row in reader:
+            i = i + 1
+            isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
+            ihugo = geIndexOfHeader(headers, HUGO_HEADERS)
+            icancertype = geIndexOfHeader(headers, CANCER_TYPE_HEADERS)
+            icna = geIndexOfHeader(headers, CNA_HEADER)
+
+            hugo = row[ihugo] if ihugo >= 0 else None
+            cna_type = get_cna(row[icna], annotate_gain_loss)
+            sample = row[isample] if isample >= 0 else None
+            cancer_type = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
+
+            if sampleidsfilter and sample not in sampleidsfilter:
+                continue
+
+            if hugo and cna_type:
+                rows.append(row)
+                queries.append(CNAQuery(hugo, cna_type, cancer_type))
+            else:
+                outf.write('\t'.join(row))
+                outf.write('\n')
+                if not hugo:
+                    log.warning("Gene is not specified for row " + str(row))
+                if not cna_type:
+                    log.warning("CNA is not specified for row " + str(row))
+        return row_headers, rows, queries
+
+
+def process_cna_data(cnafile, outfile, previousoutfile, defaultCancerType, cancerTypeMap, annotate_gain_loss=False,
+                     cna_format=CNA_FILE_FORMAT_GISTIC):
+    if os.path.isfile(previousoutfile):
+        cacheannotated(previousoutfile, defaultCancerType, cancerTypeMap)
+
+    if not cna_format or cna_format not in CND_FILE_FORMAT:
+        log.error('The CNA file format is not supported, only gistic or individual can be used ')
+        return
+
+    outf = open(outfile, 'w+', 1000)
+
+    headers = []
+    rows = []
+    queries = []
+    if cna_format == CNA_FILE_FORMAT_GISTIC:
+        headers, rows, queries = process_gistic_data(outf, cnafile, defaultCancerType, cancerTypeMap,
+                                                     annotate_gain_loss)
+    else:
+        headers, rows, queries = process_individual_cna_file(outf, cnafile, defaultCancerType, cancerTypeMap,
+                                                       annotate_gain_loss)
+
+    ncols = len(headers)
+
+    i = 0
+    while len(rows) > 0:
+        i += POST_QUERIES_THRESHOLD
+        log.info(i)
+        rows_sec, rows = rows[:POST_QUERIES_THRESHOLD], rows[POST_QUERIES_THRESHOLD:]
+        queries_sec, queries = queries[:POST_QUERIES_THRESHOLD], queries[POST_QUERIES_THRESHOLD:]
+        annotations = pull_cna_info(queries_sec)
+        append_annotation_to_file(outf, ncols, rows_sec, annotations)
 
     outf.close()
 
@@ -1526,6 +1604,9 @@ def __init__(self, hugo, cnatype, cancertype):
         self.copyNameAlterationType = cnatype.upper()
         self.tumorType = cancertype
 
+    def __str__(self):
+        return "\t".join([self.gene.hugoSymbol, self.copyNameAlterationType, self.tumorType])
+
 class StructuralVariantQuery:
     def __init__(self, hugoA, hugoB, structural_variant_type, cancertype):
 

diff --git a/CnaAnnotator.py b/CnaAnnotator.py
@@ -10,8 +10,8 @@
 def main(argv):
     if argv.help:
         log.info('\n'
-        'CnaAnnotator.py -i <input CNA file> -o <output CNA file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss]\n'
-        '  Input CNA file should follow the GISTIC output (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1)\n'
+        'CnaAnnotator.py -i <input CNA file> -o <output CNA file> [-p previous results] [-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb_api_bear_token] [-z annotate_gain_loss] [-f CNA file formt, gistic or individual]\n'
+        '  Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n'
         '  Essential clinical columns:\n'
         '    SAMPLE_ID: sample ID\n'
         '  Cancer type will be assigned based on the following priority:\n'
@@ -35,8 +35,8 @@ def main(argv):
         readCancerTypes(argv.input_clinical_file, cancertypemap)
 
     log.info('annotating %s ...' % argv.input_file)
-    processcnagisticdata(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
-                         cancertypemap, argv.annotate_gain_loss)
+    process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
+                         cancertypemap, argv.annotate_gain_loss, argv.cna_file_format.lower())
 
     log.info('done!')
 
@@ -53,6 +53,7 @@ def main(argv):
     parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
     parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
     parser.add_argument('-z', dest='annotate_gain_loss', action="store_true", default=False)
+    parser.add_argument('-f', dest='cna_file_format', default=CNA_FILE_FORMAT_GISTIC)
     parser.set_defaults(func=main)
 
     args = parser.parse_args()

diff --git a/README.md b/README.md
@@ -33,7 +33,9 @@ We recommend processing VCF files by [vcf2maf](https://github.com/mskcc/vcf2maf/
 You can still use MAF format to annotate atypical alterations, such as MSI-H, TMB-H, EGFR vIII. Please see more examples [HERE](data/example_atypical_alterations.txt).  
 
 ### Copy Number Alteration
-We use GISTIC 2.0 format. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. 
+We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. Please see examples [HERE](data/example_cna.txt).
+
+You can also list copy number alterations individually by specifying `-f individual`. Please see examples [HERE](data/example_individual_cna.txt).
 
 Get more details on the command line using `python CnaAnnotator.py -h`.  
 
@@ -54,7 +56,7 @@ All structural variants with two different gene partners, they will be considere
 Get more details on the command line using `python StructuralVariantAnnotator.py -h`.
 
 ### Clinical Data (Combine MAF+CNA+Fusion)
-You can comebine all annotation on sample/patient level using the clinical data annotator.  
+You can combine all annotation on sample/patient level using the clinical data annotator.  
 
 Get more details on the command line using `python ClinicalDataAnnotator.py -h`.  
 

diff --git a/data/example_individual_cna.txt b/data/example_individual_cna.txt
@@ -0,0 +1,13 @@
+Tumor_Sample_Barcode	Hugo_Symbol	Copy_Number_Alteration
+TCGA-05-4417-01	MET	2
+TCGA-05-4417-01	ERBB2	2
+TCGA-05-4417-01	CDK4	-2
+TCGA-05-4417-01	CDK4	-1
+TCGA-02-0033-01	MET	2
+TCGA-02-0033-01	ERBB2	1
+TCGA-02-0033-01	CDK4	2
+TCGA-02-0033-01	CDK4	2
+TCGA-05-4417-01	MET	Amplification
+TCGA-05-4417-01	CDK4	Deletion
+TCGA-05-4417-01	CDK4	Loss
+TCGA-05-4417-01	ERBB2	Gain
diff --git a/example.sh b/example.sh
@@ -22,6 +22,9 @@ OSV="data/example_sv.oncokb.txt"
 ICNA="data/example_cna.txt"
 OCNA="data/example_cna.oncokb.txt"
 
+IICNA="data/example_individual_cna.txt"
+OICNA="data/example_individual_cna.oncokb.txt"
+
 IC="data/example_clinical.txt"
 OC="data/example_clinical.oncokb.txt"
 
@@ -43,6 +46,7 @@ python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN
 python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
 python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
 python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
+python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$TOKEN" -f "individual"
 python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
 python OncoKBPlots.py -i "$OC" -o "$OCPDF" -c ONCOTREE_CODE #-n 10
 python GenerateReadMe.py -o "$README"
diff --git a/test_AnnotatorCore.py b/test_AnnotatorCore.py
@@ -102,3 +102,41 @@ def test_resolve_query_type():
         assert resolve_query_type(QueryType.HGVSG, [HGVSP_SHORT_HEADER])
     with pytest.raises(Exception):
         assert resolve_query_type(QueryType.GENOMIC_CHANGE, [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER])
+
+
+def test_cna():
+    assert get_cna(None) is None
+    assert get_cna('') is None
+    assert get_cna('test') is None
+    assert get_cna('Amplification') == CNA_AMPLIFICATION_TXT
+    assert get_cna('Gain') is None
+    assert get_cna('Deletion') == CNA_DELETION_TXT
+    assert get_cna('Loss') is None
+    assert get_cna('2') == CNA_AMPLIFICATION_TXT
+    assert get_cna('1') is None
+    assert get_cna('-2') == CNA_DELETION_TXT
+    assert get_cna('-1.5') == CNA_DELETION_TXT
+    assert get_cna('-1') is None
+    assert get_cna('0') is None
+
+    assert get_cna(None, False) is None
+    assert get_cna('', False) is None
+    assert get_cna('test', False) is None
+    assert get_cna('Amplification', False) == CNA_AMPLIFICATION_TXT
+    assert get_cna('Gain', False) is None
+    assert get_cna('Deletion', False) == CNA_DELETION_TXT
+    assert get_cna('Loss', False) is None
+
+    assert get_cna(None, True) is None
+    assert get_cna('', True) is None
+    assert get_cna('test', True) is None
+    assert get_cna('Amplification', True) == CNA_AMPLIFICATION_TXT
+    assert get_cna('Gain', True) == CNA_GAIN_TXT
+    assert get_cna('Deletion', True) == CNA_DELETION_TXT
+    assert get_cna('Loss', True) == CNA_LOSS_TXT
+    assert get_cna('2', True) == CNA_AMPLIFICATION_TXT
+    assert get_cna('1', True) == CNA_GAIN_TXT
+    assert get_cna('-2', True) == CNA_DELETION_TXT
+    assert get_cna('-1.5', True) == CNA_DELETION_TXT
+    assert get_cna('-1', True) == CNA_LOSS_TXT
+    assert get_cna('0', True) is None