In [2]:
import load_env
from elasticsearch import AsyncElasticsearch
from src.models.annotation_model import Annotation, AnnotationStats, Frequency, Bucket, Evidence
from src.config.settings import settings
from src.config.es import  es


In [7]:

async def get_annotations_stats():
    """Fetch term-label frequency statistics for the annotations index.

    Issues a ``match_all`` search with ``size=0`` against
    ``settings.PANGO_ANNOTATIONS_INDEX`` requesting two ``terms`` aggregations
    (``term_frequency`` and ``aspect_frequency``), then converts the buckets of
    the ``term_frequency`` aggregation into the response models.

    Returns:
        AnnotationStats: stats whose ``term_frequency`` holds one ``Bucket``
        per bucket returned for the ``term_frequency`` aggregation.

    Raises:
        KeyError: if the response contains no ``term_frequency`` aggregation.
    """
    search_query = {
        "match_all": {},
    }
    aggs = {
        "term_frequency": {
            "terms": {
                "field": "term.label.keyword"
            }
        },
        # NOTE(review): this aggregation is requested but not returned to the
        # caller; only a `term_frequency` field is visible on AnnotationStats
        # here. Expose it once the model has a matching field, or drop it.
        "aspect_frequency": {
            "terms": {
                "field": "aspect.keyword",
                "order": {"_count": "desc"}
            }
        }
    }
    resp = await es.search(
        index=settings.PANGO_ANNOTATIONS_INDEX,
        filter_path='took,hits.total.value,aggregations',
        query=search_query,
        aggs=aggs,
        size=0,
    )

    # Select the aggregation by name. Iterating over every aggregation and
    # rebuilding the result each time returned whichever aggregation came last
    # (possibly the aspect buckets) under the `term_frequency` label, and left
    # the result unbound when no aggregation was present.
    term_buckets = resp['aggregations']['term_frequency']['buckets']
    buckets = [Bucket(**bucket) for bucket in term_buckets]

    return AnnotationStats(term_frequency=Frequency(buckets=buckets))

# Run the stats query and keep the parsed result.
res = await get_annotations_stats()


# Bare expression so the cell shows the value of `res`.
res

AnnotationStats(term_frequency=Frequency(buckets=[Bucket(key='nucleus', doc_count=2727), Bucket(key='cytoplasm', doc_count=2334), Bucket(key='plasma membrane', doc_count=1546), Bucket(key='regulation of transcription by RNA polymerase II', doc_count=1315), Bucket(key='RNA polymerase II cis-regulatory region sequence-specific DNA binding', doc_count=1079), Bucket(key='extracellular space', doc_count=1064), Bucket(key='DNA-binding transcription factor activity, RNA polymerase II-specific', doc_count=1052), Bucket(key='cytosol', doc_count=812), Bucket(key='integral component of plasma membrane', doc_count=785), Bucket(key='mitochondrion', doc_count=524)]))

In [3]:
async def get_annotations():
    """Fetch up to 20 annotations from the annotations index.

    Runs a ``match_all`` search against ``settings.PANGO_ANNOTATIONS_INDEX``
    and builds one ``Annotation`` per hit from the hit's ``_id`` and the
    fields of its ``_source`` document.

    Returns:
        list[Annotation]: one model per hit, in the order Elasticsearch
        returned them.

    Raises:
        KeyError: if the response carries no ``hits.hits`` entry.
    """
    resp = await es.search(
        index=settings.PANGO_ANNOTATIONS_INDEX,
        # `_id` has to be listed explicitly, otherwise response filtering
        # strips it from every hit. The `_source` path is written plainly; the
        # previous `**...**` wrapping looked like stray markup.
        filter_path='took,hits.hits._id,hits.hits._score,hits.hits._source',
        query={"match_all": {}},
        size=20,
    )

    # The id belongs to each individual hit; the response itself has no
    # top-level `hit` key.
    results = [Annotation(id=hit['_id'], **hit['_source']) for hit in resp['hits']['hits']]

    return results

# Run the annotation query and keep the list of parsed Annotation models.
results = await get_annotations()
# Bare expression so the cell shows the value of `results`.
results

[Annotation(gene='UniProtKB:Q7L0Q8', gene_symbol='RHOU', gene_name='Rho-related GTP-binding protein RhoU', term={'id': 'GO:0032488', 'label': 'Cdc42 protein signal transduction', 'aspect': 'biological_process', 'is_goslim': False}, slim_terms=[{'id': 'GO:0023052', 'label': 'signaling', 'aspect': 'biological_process', 'is_goslim': True}], qualifier='involved_in', evidence=[Evidence(with_gene_id='RGD:71043', references=['PMID:21423166'])], group='GO_Central'),
 Annotation(gene='UniProtKB:Q7L0Q8', gene_symbol='RHOU', gene_name='Rho-related GTP-binding protein RhoU', term={'id': 'GO:0006897', 'label': 'endocytosis', 'aspect': 'biological_process', 'is_goslim': False}, slim_terms=[{'id': 'GO:0016192', 'label': 'vesicle-mediated transport', 'aspect': 'biological_process', 'is_goslim': True}], qualifier='involved_in', evidence=[Evidence(with_gene_id='UniProtKB:P60953', references=['PMID:26465210']), Evidence(with_gene_id='MGI:MGI:106211', references=['PMID:24792215'])], group='GO_Central'),
 