In [61]:
from elasticsearch import Elasticsearch
from collections import Counter
import pandas as pd

In [15]:
# Create the Elasticsearch client with the library's default connection
# settings (no hosts are passed), then echo it as the cell output.
es = Elasticsearch()
es

In [33]:
# Single match_all request against the "g2p" index, asking for at most 10000 hits.
# NOTE(review): `res` is reassigned by the later `res = collect_all_results()`
# cell, so this result looks superseded — confirm whether this cell is still needed.
res = es.search(index="g2p", size=10000, body={"query": {"match_all": {}}})

In [94]:
def collect_all_results(client=None, index="g2p", page_size=10000, scroll='1m'):
    """Fetch every document of an index by paging through the scroll API.

    Runs a ``match_all`` search and then keeps requesting scroll pages,
    appending each page's hits onto the ``['hits']['hits']`` list of the
    first response.

    Args:
        client: Object exposing ``search`` and ``scroll`` the way the
            Elasticsearch client does. Defaults to the notebook-level ``es``.
        index: Name of the index to read.
        page_size: Number of hits requested per page.
        scroll: Keep-alive of the scroll context; it is sent again with every
            page request so the context is renewed while paging.

    Returns:
        The first search response (a dict) whose ``['hits']['hits']`` list
        contains the hits of all pages.
    """
    if client is None:
        client = es  # notebook-level client created in an earlier cell

    query = {"query": {"match_all": {}}}
    res = client.search(index=index, size=page_size, body=query, scroll=scroll)
    scroll_id = res.get('_scroll_id')
    i = 1
    while scroll_id is not None:
        # Keyword arguments on purpose: the position of `scroll_id` in the
        # client's `scroll` signature is not the same in every client version.
        r = client.scroll(scroll_id=scroll_id, scroll=scroll)
        page = r['hits']['hits']
        if not page:
            # An empty page marks the end of the result set. A response may
            # still carry a `_scroll_id` here, so the id alone cannot be used
            # as the stop condition.
            break
        print('Processing scroll {}'.format(i))
        res['hits']['hits'] += page
        # Always continue with the most recently returned scroll id.
        scroll_id = r.get('_scroll_id')
        i += 1
    return res
        

In [95]:
# Collect the hits of the first search page and of all scroll pages into one response dict.
res = collect_all_results()

Processing scroll 1


In [132]:
# Sanity check: the number of collected hits must equal the total reported in the response.
# NOTE(review): this compares against `hits.total` as a plain integer; confirm the
# server version in use reports it that way rather than as a nested object.
assert len(res['hits']['hits']) == res['hits']['total']
len(res['hits']['hits'])

15686

In [102]:
def count_attribute(result, attribute):
    """Tally how often each value of ``attribute`` occurs among the hits.

    Args:
        result: Search response dict; the hits are read from
            ``result['hits']['hits']`` and each hit's fields from ``'_source'``.
        attribute: Key looked up in every hit's ``'_source'`` dict.

    Returns:
        A ``Counter`` mapping each observed attribute value to its number of hits.

    Raises:
        KeyError: If a hit's ``'_source'`` has no ``attribute`` key.
    """
    tally = Counter()
    for hit in result['hits']['hits']:
        tally[hit['_source'][attribute]] += 1
    return tally

def count_attribute_by_group(result, attribute, group):
    """Tally the values of ``attribute`` separately for each value of ``group``.

    The hits are walked once; every hit increments the counter belonging to
    its group value, instead of re-scanning all hits once per group.

    Args:
        result: Search response dict; the hits are read from
            ``result['hits']['hits']`` and each hit's fields from ``'_source'``.
        attribute: ``'_source'`` key whose values are counted.
        group: ``'_source'`` key whose values partition the hits.

    Returns:
        A dict mapping each group value (in order of first appearance) to a
        ``Counter`` of attribute values seen within that group.

    Raises:
        KeyError: If a hit's ``'_source'`` lacks ``group`` or ``attribute``.
    """
    d = dict()
    for hit in result['hits']['hits']:
        source = hit['_source']
        # Create the per-group counter the first time a group value is seen.
        d.setdefault(source[group], Counter())[source[attribute]] += 1
    return d

In [103]:
# Number of hits per value of the 'source' field.
count_attribute(res, 'source')

Counter({'cgi': 1429, 'civic': 2406, 'jax': 10872, 'oncokb': 373, 'pmkb': 606})

In [104]:
# Number of hits per value of the 'gene' field, across all sources.
count_attribute(res,'gene')

Counter({'+': 32,
         '11': 5,
         '14': 88,
         '19': 82,
         '20': 18,
         'A289T': 4,
         'ABCB1': 5,
         'ABCC10': 1,
         'ABCC3': 1,
         'ABCG2': 1,
         'ABL1': 387,
         'ACTA1': 1,
         'AGR2': 1,
         'AKT1': 70,
         'AKT2': 3,
         'AKT3': 4,
         'ALCAM': 1,
         'ALDH1A2': 3,
         'ALK': 353,
         'APC': 140,
         'AR': 20,
         'AR;AR': 1,
         'ARAF': 8,
         'AREG': 7,
         'ARID1A': 2,
         'ARID1A;ANXA1': 2,
         'ASNS': 1,
         'ASXL1': 14,
         'ATM': 68,
         'ATR': 4,
         'ATRX': 8,
         'AURKA': 8,
         'AXL': 1,
         'B2M': 2,
         'B4GALT1': 1,
         'BAP1': 12,
         'BCL': 1,
         'BCL2': 4,
         'BCL2L11': 2,
         'BCL6': 1,
         'BCOR': 4,
         'BCORL1': 2,
         'BIRC3': 1,
         'BIRC5': 2,
         'BIRC7': 1,
         'BRAF': 1168,
         'BRAF;EGFR': 1,
         'BRCA1': 96,


In [105]:
# Per-source tallies: maps each 'source' value to a Counter of its 'gene' values.
d = count_attribute_by_group(res, 'gene', 'source')

CGI reports some multi-gene genotypes in its "gene" field (e.g. 'ARID1A;ANXA1', from ARID1A amplification + ANXA1 overexpression).

JAX has some unclear "gene" values (e.g. '11', '14', '+', 'act', 'exon14', 'mut', 'over'), as well as gene fusions (e.g. 'EML4-ALK').


In [133]:
# Display the per-source gene tallies built by count_attribute_by_group.
d

{'cgi': Counter({'ABL1': 143,
          'AKT1': 8,
          'AKT2': 1,
          'AKT3': 1,
          'ALK': 44,
          'APC': 1,
          'AR': 13,
          'AR;AR': 1,
          'ARAF': 3,
          'AREG': 1,
          'ARID1A': 2,
          'ARID1A;ANXA1': 2,
          'ATM': 11,
          'ATR': 3,
          'AURKA': 1,
          'B2M': 1,
          'BAP1': 6,
          'BCL': 1,
          'BCL2': 1,
          'BCL6': 1,
          'BCOR': 1,
          'BRAF': 59,
          'BRAF;EGFR': 1,
          'BRCA1': 11,
          'BRCA2': 11,
          'BTK': 2,
          'C15orf55': 1,
          'CA9': 1,
          'CBL': 4,
          'CCND1': 3,
          'CCND2': 2,
          'CCND3': 1,
          'CCNE1': 1,
          'CD274': 4,
          'CD69': 1,
          'CDH1': 1,
          'CDK12': 2,
          'CDK4': 2,
          'CDK4;RB1': 1,
          'CDK6': 3,
          'CDKN1A': 2,
          'CDKN1B': 2,
          'CDKN2A': 6,
          'CDKN2B': 2,
          'CDKN2C': 2,
        

In [130]:
def overlap(group_counts):
    """Print how many keys of each group also occur in the other groups.

    For every group, one line is printed per other group giving the number
    and percentage of this group's keys that the other group shares, followed
    by one "overall" line comparing against the union of all other groups.

    Args:
        group_counts: Mapping of group name to a collection of keys (for
            example a ``Counter``); only the keys of each collection are used.

    Returns:
        None. The report is written to standard output.
    """
    def percent(part, whole):
        # A group without any keys has nothing to overlap; report 0% instead
        # of dividing by zero.
        return part / whole * 100 if whole else 0.0

    sets = {g: set(group_counts[g]) for g in group_counts}
    for g, own in sets.items():
        size = len(own)
        others = set()  # union of the keys of every other group
        for s, other in sets.items():
            if s == g:
                continue
            shared = own & other
            others |= other
            print('{} has {}/{} ({:.2f}%) overlap with {}'.format(
                g, len(shared), size, percent(len(shared), size), s))
        shared = own & others
        print('{} has {}/{} ({:.2f}%) overlap overall'.format(
            g, len(shared), size, percent(len(shared), size)))

In [131]:
# Report, for each source, how many of its distinct 'gene' values also appear in the other sources.
overlap(d)

cgi has 56/198 (28.28%) overlap with jax
cgi has 120/198 (60.61%) overlap with civic
cgi has 42/198 (21.21%) overlap with oncokb
cgi has 37/198 (18.69%) overlap with pmkb
cgi has 124/198 (62.63%) overlap overall
jax has 56/116 (48.28%) overlap with cgi
jax has 71/116 (61.21%) overlap with civic
jax has 28/116 (24.14%) overlap with oncokb
jax has 37/116 (31.90%) overlap with pmkb
jax has 74/116 (63.79%) overlap overall
civic has 120/289 (41.52%) overlap with cgi
civic has 71/289 (24.57%) overlap with jax
civic has 42/289 (14.53%) overlap with oncokb
civic has 38/289 (13.15%) overlap with pmkb
civic has 138/289 (47.75%) overlap overall
oncokb has 42/43 (97.67%) overlap with cgi
oncokb has 28/43 (65.12%) overlap with jax
oncokb has 42/43 (97.67%) overlap with civic
oncokb has 20/43 (46.51%) overlap with pmkb
oncokb has 42/43 (97.67%) overlap overall
pmkb has 37/42 (88.10%) overlap with cgi
pmkb has 37/42 (88.10%) overlap with jax
pmkb has 38/42 (90.48%) overlap with civic
pmkb has 20/42 (

Overlap of associations (drug + gene + phenotype)