# Data Structuring and Pruning

In [35]:
# Load datasets

import json
import pathlib
import importlib

import viccdb
%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


In [16]:
# reload module and load data
importlib.reload(viccdb)
vdb = viccdb.ViccDb().select(lambda x: x['source'] != 'oncokb' or 'clinical' in x['raw'])
vdb.report_groups()

Loaded 22617 associations
brca: 5717 (30.4% of total)
cgi: 1073 (5.7% of total)
civic: 3340 (17.8% of total)
jax: 5754 (30.6% of total)
molecularmatch: 2085 (11.1% of total)
oncokb: 247 (1.3% of total)
pmkb: 600 (3.2% of total)
18816 total associations


In [23]:
# Associations with more than 1 feature
x = vdb.select(lambda x: len(x['features']) > 1)
x.report_groups(vdb)

cgi: 231 (21.5% of superset)
civic: 1 (0.0% of superset)
jax: 1797 (31.2% of superset)
molecularmatch: 22 (1.1% of superset)
oncokb: 35 (14.2% of superset)
Total: 2086 (11.1% of superset)


In [32]:
x.associations_by_source['civic'][0]['features'] # Multi-featured shouldn't exist

[{'alt': 'None',
  'chromosome': '6',
  'description': 'HLA-C COPY-NEUTRAL LOSS OF HETEROZYGOSITY',
  'end': 29716826,
  'entrez_id': 3107,
  'geneSymbol': 'HLA',
  'name': 'COPY-NEUTRAL LOSS OF HETEROZYGOSITY',
  'provenance': ['http://mygene.info/v3/query?q=HLA&fields=genomic_pos_hg19'],
  'provenance_rule': 'is_fusion_donor',
  'ref': 'None',
  'referenceName': 'GRCh37',
  'start': 29694378},
 {'alt': 'None',
  'chromosome': 'None',
  'description': 'HLA-C COPY-NEUTRAL LOSS OF HETEROZYGOSITY',
  'end': None,
  'entrez_id': 3107,
  'geneSymbol': 'C',
  'name': 'COPY-NEUTRAL LOSS OF HETEROZYGOSITY',
  'ref': 'None',
  'referenceName': 'None',
  'start': None}]

In [34]:
# for now, omit brca from analysis
core_vdb = vdb.select(lambda x: x['source'] != 'brca')
core_vdb.report_groups(vdb)

cgi: 1073 (100.0% of superset)
civic: 3340 (100.0% of superset)
jax: 5754 (100.0% of superset)
molecularmatch: 2085 (100.0% of superset)
oncokb: 247 (100.0% of superset)
pmkb: 600 (100.0% of superset)
Total: 13099 (69.6% of superset)


## Feature coordinate filtering

What follows is a detailed look at associations without start and end coordinates after normalization, and a set of regular expression filters to separate out these associations into chunks that can be annotated with gene- or exon-level coordinates, as appropriate.

In [54]:
# Associations with coordinate features
coord_featured = core_vdb.select(lambda x: any(filter(lambda y: 'start' in y and 'end' in y, x['features'])))
coord_featured.report_groups(core_vdb)

cgi: 1069 (99.6% of superset)
civic: 3340 (100.0% of superset)
jax: 5744 (99.8% of superset)
molecularmatch: 2068 (99.2% of superset)
oncokb: 4041 (99.8% of superset)
pmkb: 600 (100.0% of superset)
Total: 16862 (99.8% of superset)


In [23]:
import re

def feature_filter(re_obj, associations):
    # report matches and return non-matches
    found = list(filter(lambda x: re_obj.search(x['feature_names']) is not None, associations))
    not_found = list(filter(lambda x: re_obj.search(x['feature_names']) is None, associations))
    report_groups(found)
    return(not_found)

amp_re = re.compile(r'(amplification)|(loss)|(amp)', re.IGNORECASE)
remainder = feature_filter(amp_re, no_partial_coord_featured_with_feature_names)

cgi: 125 (8.7%)
jax: 533 (9.3%)
molecularmatch: 135 (6.5%)
oncokb: 92 (2.3%)
Total: 885 (5.1%)


In [24]:
fusion_re = re.compile(r'(\w{2,}-\w{2,})|(fusion)', re.IGNORECASE)
r2 = feature_filter(fusion_re, remainder)

cgi: 117 (8.2%)
jax: 554 (9.6%)
molecularmatch: 89 (4.3%)
oncokb: 229 (5.7%)
Total: 989 (5.7%)


In [25]:
ppm_re = re.compile(r'\w+(:| )[a-z]\d+[a-z]?(fs\*?)?$', re.IGNORECASE)
r3 = feature_filter(ppm_re, r2)

jax: 460 (8.0%)
molecularmatch: 8 (0.4%)
oncokb: 1350 (33.3%)
Total: 1818 (10.4%)


In [26]:
indel_re = re.compile(r'\w+(:| )\w+(ins\w+)|(del($|ins\w+))|(dup$)')
r4 = feature_filter(indel_re, r3)

jax: 208 (3.6%)
oncokb: 215 (5.3%)
Total: 423 (2.4%)


In [27]:
bucket_re = re.compile(r'[A-Z0-9]+( (in)?act)?( oncogenic)? mut((ant)|(ation))?$')
r5 = feature_filter(bucket_re, r4)

cgi: 183 (12.8%)
jax: 931 (16.2%)
molecularmatch: 8 (0.4%)
Total: 1122 (6.4%)


In [28]:
exon_re = re.compile(r'exon', re.IGNORECASE)
r6 = feature_filter(exon_re, r5)

cgi: 36 (2.5%)
jax: 88 (1.5%)
oncokb: 27 (0.7%)
Total: 151 (0.9%)


In [29]:
expression_re = re.compile(r'(exp)|(^\w+ (pos(itive)?)|(neg(ative)?)|(biallelic inactivation)$)|(truncating)|(deletion)', re.IGNORECASE)
r7 = feature_filter(expression_re, r6)

cgi: 172 (12.0%)
jax: 407 (7.1%)
molecularmatch: 1 (0.0%)
oncokb: 226 (5.6%)
Total: 806 (4.6%)


In [30]:
report_groups(r7)

cgi: 92 (6.4%)
jax: 74 (1.3%)
molecularmatch: 24 (1.2%)
oncokb: 79 (2.0%)
Total: 269 (1.5%)


In [31]:
get_feature_names([x for x in r7 if x['source'] == 'cgi'])

['ALK inframe insertion (1151T)',
 'BRAF (V600D,V600K,V600M,V600G,V600R)',
 'DPYD splice donor variant',
 'DPYD splice donor variant',
 'EGFR (S720)',
 'EGFR (L718)',
 'EGFR (L792)',
 'BRAF (V600R)',
 'BRAF (V600E,V600K)',
 'BRAF (V600E,V600D,V600K,V600M,V600G,V600R)',
 'BRAF (V600E,V600K)',
 'DPYD splice donor variant',
 'FGFR2 (V565I,M536I,M538I,I548V,N550,E566G,L618M,K660E)',
 'FGFR2 (M536I,M538I,I548V,N550,E566G,L618M,K660E)',
 'FLT3 (D835)',
 'FLT3 (F691,D835,N676,Y842)',
 'GNAQ (Q209)',
 'GNAQ (Q209)',
 'GNAS (R201)',
 'KIT (D816)',
 'MAP2K1 (P124)',
 'MET kinase domain mutation',
 'TERT promoters core',
 'ESR1 (E380Q,537,538,L536,P535H)',
 'IDH1 (R132)',
 'KIT wildtype',
 'KRAS (G12)',
 'MPL (W515F)',
 'TPMT splice acceptor variant',
 'BRAF (V600)',
 'BRAF (V600E,V600K)',
 'BRAF (V600)',
 'BRAF (V600)',
 'BRAF (V600E,V600D,V600K,V600M,V600G,V600R)',
 'BTK (C481)',
 'CSF3R frameshift variant (D771),frameshift variant (S783)',
 'ERBB2 inframe insertion (A775YVMA),inframe insertion

# Knowledgebase Comparison

## Genes

In [70]:
# Reload Vicc module and data
importlib.reload(viccdb)
vdb = viccdb.ViccDb().select(lambda x: x['source'] != 'oncokb' or 'clinical' in x['raw'])
core_vdb = vdb.select(lambda x: x['source'] != 'brca')
core_vdb.report_groups(vdb)

Loaded 22617 associations
cgi: 1073 (100.0% of superset)
civic: 3340 (100.0% of superset)
jax: 5754 (100.0% of superset)
molecularmatch: 2085 (100.0% of superset)
oncokb: 247 (100.0% of superset)
pmkb: 600 (100.0% of superset)
Total: 13099 (69.6% of superset)


In [77]:
import re
pmid_re = re.compile(r'https?://.*pubmed/(\d+)$')
def pmids(association):
    try:
        evidence = association['association']['evidence']
    except TypeError as e:
        print (association)
        raise e
    for e in evidence:
        publications = e['info']['publications']
        for p in publications:
            m = pmid_re.match(p)
            if m:
                return m[1]
            elif p.startswith('https://www.nccn.org'):
                pass
            else:
                raise NotImplementedError(e)

In [78]:
core_vdb.plot_overlap(pmids)

NotImplementedError: {'info': {'publications': ['']}, 'evidenceType': {'sourceName': 'molecularmatch'}, 'description': 'EGFR L858R confers sensitivity to Cetuximab in patients with Neoplasm of colorectum'}

In [42]:
core_vdb.associations[0]['association']['evidence']

[{'description': 'KRAS Q61 confers resistance to Gefitinib in patients with Neoplasm of lung',
  'evidenceType': {'sourceName': 'molecularmatch'},
  'info': {'publications': ['https://www.ncbi.nlm.nih.gov/pubmed/15696205',
    'https://www.ncbi.nlm.nih.gov/pubmed/20921461']}}]

In [42]:
# Genes observed in all knowledgebases
all_set_genes = set.intersection(*(s.values()))
all_set_genes

{'ABL1',
 'ALK',
 'APC',
 'BRAF',
 'CDKN2A',
 'CTNNB1',
 'EGFR',
 'ERBB2',
 'FGFR2',
 'FGFR3',
 'FLT3',
 'KIT',
 'KRAS',
 'MET',
 'NOTCH1',
 'NRAS',
 'PDGFRA',
 'PIK3CA',
 'PTEN',
 'STK11',
 'TP53'}

In [44]:
# Total genes
all_genes = set.union(*(s.values()))
len(all_genes)

494

In [59]:
# >=3x coverage
count = Counter()
for source in s:
    count.update(s[source])
high_coverage_genes = [gene for gene in count if count[gene] > 3]
len(high_coverage_genes)

82

In [61]:
sorted(high_coverage_genes)

['ABL1',
 'AKT1',
 'AKT2',
 'AKT3',
 'ALK',
 'APC',
 'AR',
 'ATM',
 'AURKA',
 'BCOR',
 'BRAF',
 'BRCA1',
 'BRCA2',
 'CBL',
 'CCND1',
 'CCND2',
 'CCND3',
 'CDH1',
 'CDK4',
 'CDK6',
 'CDKN2A',
 'CDKN2B',
 'CSF1R',
 'CTNNB1',
 'DDR2',
 'DNMT3A',
 'EGFR',
 'ERBB2',
 'ERBB3',
 'ERBB4',
 'ESR1',
 'EZH2',
 'FBXW7',
 'FGF3',
 'FGFR1',
 'FGFR2',
 'FGFR3',
 'FLT3',
 'FOXL2',
 'GNA11',
 'GNAQ',
 'GNAS',
 'HRAS',
 'IDH1',
 'IDH2',
 'IGF1R',
 'JAK2',
 'JAK3',
 'KDR',
 'KIT',
 'KRAS',
 'MAP2K1',
 'MET',
 'MLH1',
 'MPL',
 'MSH6',
 'MTOR',
 'MYD88',
 'NOTCH1',
 'NPM1',
 'NRAS',
 'NTRK1',
 'PDGFRA',
 'PIK3CA',
 'PTCH1',
 'PTEN',
 'RB1',
 'RET',
 'RICTOR',
 'ROS1',
 'SF3B1',
 'SMAD4',
 'SMARCB1',
 'SMO',
 'SRSF2',
 'STAG2',
 'STK11',
 'TP53',
 'TSC2',
 'U2AF1',
 'VEGFA',
 'VHL']