In [2]:
from elasticsearch import Elasticsearch
# Shared client; get_doc_counts reads this module-level name when it searches.
# NOTE(review): no connection arguments are passed, so the client library's
# defaults apply — confirm that points at the intended cluster.
es = Elasticsearch()

In [1]:
import sys
# Add the parent directory to the import path; the later
# `from lib.obo import read_ontology` presumably relies on this — TODO confirm
# that `lib/` lives one level up.
# NOTE(review): this cell ran as In [1] but is placed after the In [2] cell.
sys.path.append('../')

In [35]:
!pwd

/home/ubuntu/python-notebooks/database


In [3]:
from lib.obo import read_ontology
# Load the tissue ontology from its OBO file. The recorded output of this cell
# reports a DiGraph with 5659 nodes and 6430 edges.
tissue_ontology = read_ontology('../data/geo-annotation/brenda-tissue-ontology.obo')

Read obo graph
Name: 
Type: DiGraph
Number of nodes: 5659
Number of edges: 6430
Average in degree:   1.1362
Average out degree:   1.1362


In [7]:
def get_doc_counts(index, field, filter_=None, size=8000):
    """Count documents per distinct value of ``field`` in an Elasticsearch index.

    Issues a ``terms`` aggregation through the module-level ``es`` client and
    returns only the per-bucket counts; no documents are fetched.

    Args:
        index: Name of the index to search.
        field: Field whose values are bucketed.
        filter_: Optional Elasticsearch filter clause (a dict). When truthy,
            only documents matching it are counted; ``None`` or an empty dict
            means "no filter".
        size: Maximum number of buckets requested from the aggregation. It
            should be at least the number of distinct values expected,
            otherwise the result is truncated to the top ``size`` terms.
            Defaults to 8000, the value that used to be hard-coded here.

    Returns:
        dict mapping each bucket key to its ``doc_count``.
    """
    body = {
        # Top-level size 0: only the aggregation is wanted, not the hits.
        "size": 0,
        "aggs": {
            "nodes": {
                "terms": {
                    "field": field,
                    "size": size
                }
            }
        }
    }

    if filter_:
        # NOTE(review): the "filtered" query belongs to older Elasticsearch
        # releases; newer clusters expect {"bool": {"filter": ...}} instead.
        # Confirm the cluster version before changing this.
        body['query'] = {
            "filtered": {
                "filter": filter_
            }
        }

    res = es.search(index=index, body=body)
    return {
        bucket['key']: bucket['doc_count']
        for bucket in res['aggregations']['nodes']['buckets']
    }

In [10]:
# Per-tissue document counts, restricted to documents whose organism term is
# "Homo sapiens".
tissue_counts = get_doc_counts(
    index='samples_dev',
    field='annotation.tissue_id',
    filter_={"term": {"organism": "Homo sapiens"}},
)

In [14]:
# Graph helpers; build_tree calls nx.descendants on the ontology graph.
import networkx as nx

In [48]:
from itertools import chain
def build_tree(ontology, root_node, index, field,  filter_=None):
    """Build a nested dict tree of rolled-up document counts under ``root_node``.

    Per-term counts are fetched once with
    ``get_doc_counts(index, field, filter_)``. Each node's ``count`` is the sum
    of the counts of the node itself and of every descendant of that node in
    ``ontology.graph``. A node whose total is zero is still emitted, but its
    children are not expanded.

    Args:
        ontology: Object exposing ``graph`` (passed to ``nx.descendants`` and
            queried with ``successors``) and ``name(node_id)``.
        root_node: Id of the node to start from.
        index: Elasticsearch index name, forwarded to ``get_doc_counts``.
        field: Field holding the node ids, forwarded to ``get_doc_counts``.
        filter_: Optional filter clause, forwarded to ``get_doc_counts``.

    Returns:
        dict with keys ``'id'``, ``'name'``, ``'count'`` and ``'children'``,
        where ``'children'`` is a list of dicts of the same shape.
    """
    counts = get_doc_counts(index, field, filter_)
    graph = ontology.graph
    subtree_totals = {}

    def subtree_total(node_id):
        # A node with several parents is visited once per parent. Cache the
        # total so the descendant search runs once per distinct node.
        if node_id not in subtree_totals:
            descs = nx.descendants(graph, node_id)
            subtree_totals[node_id] = sum(
                counts[nid]
                for nid in chain(descs, [node_id])
                if nid in counts
            )
        return subtree_totals[node_id]

    def build_node(node_id):
        total_count = subtree_total(node_id)

        if total_count:
            children = [
                build_node(child_node_id)
                for child_node_id in graph.successors(node_id)
            ]
        else:
            # Nothing below a zero-total node has any documents; stop here.
            children = []

        # A fresh dict per visit, so a shared node is never aliased between
        # its parents.
        return {
            'id': node_id,
            'name': ontology.name(node_id),
            'count': total_count,
            'children': children
        }

    return build_node(root_node)

In [49]:
# Tissue tree starting at node 'BTO:0000000', counting only documents that
# have an annotation.age field.
n = build_tree(
    tissue_ontology,
    root_node='BTO:0000000',
    index='samples_dev',
    field='annotation.tissue_id',
    filter_={"exists": {"field": "annotation.age"}},
)

In [50]:
# Rolled-up document count at the root: the root node plus all its descendants.
n['count']

60950

In [51]:
# Nested child subtrees of the root. Zero-count nodes appear with an empty
# 'children' list because build_tree does not expand them.
# NOTE(review): this displays the entire tree — consider slicing before sharing.
n['children']

[{'children': [], 'count': 0, 'id': 'BTO:0001494', 'name': 'fungus'},
 {'children': [{'children': [],
    'count': 0,
    'id': 'BTO:0003809',
    'name': 'soil'},
   {'children': [{'children': [],
      'count': 0,
      'id': 'BTO:0000311',
      'name': 'culture filtrate'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0001982',
      'name': 'chemostat culture'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0000152',
      'name': 'infected cell'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0004503',
      'name': 'submerged culture'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0005517',
      'name': 'lyophilized cell'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0001091',
      'name': 'axenic culture'},
     {'children': [],
      'count': 0,
      'id': 'BTO:0003916',
      'name': 'plant culture'},
     {'children': [], 'count': 0, 'id': 'BTO:0000682', 'name': 'koji culture'},
     {'children': [],
      'count':

In [34]:
import json
# Length, in characters, of the tree serialised as JSON.
# NOTE(review): this cell's execution count (34) predates the cells that define
# build_tree (48) and n (49), so the recorded value may belong to an earlier n.
len(json.dumps(n))

56133