In [6]:
import time
import json as json
import re
import sys
import os
from collections import defaultdict
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
import pandas as pd

In [4]:
# Name and sizing of the index created by this cell.
INDEX_NAME = "mesh_cvd"
NUMBER_SHARDS = 1    # keep this at one when not running on a cluster
NUMBER_REPLICAS = 0  # no replica copies

# Index definition: shard/replica settings plus a mapping that declares
# `id` and `name` as analysed text fields.
request_body = {
    "settings": {
        "number_of_shards": NUMBER_SHARDS,
        "number_of_replicas": NUMBER_REPLICAS,
    },
    "mappings": {
        "properties": {
            "id": {"type": "text"},
            "name": {"type": "text"},
        }
    },
}

es = Elasticsearch()

# Drop any existing index with the same name so this cell can be re-run
# and always starts from an empty index.
# The index is passed by keyword: recent elasticsearch-py releases no longer
# accept it as a positional argument.
if es.indices.exists(index=INDEX_NAME):
    res = es.indices.delete(index=INDEX_NAME)
    print("Deleting index %s , Response: %s" % (INDEX_NAME, res))

res = es.indices.create(index=INDEX_NAME, body=request_body)
print("Create index %s , Response: %s" % (INDEX_NAME, res))

Deleting index mesh_cvd , Response: {'acknowledged': True}
Create index mesh_cvd , Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'mesh_cvd'}


### Load CVD data

In [33]:
# Read the CVD table and convert it straight to a list of per-row dicts,
# then show how many records were loaded.
cvd = pd.read_csv('./cvd.csv').to_dict(orient='records')
len(cvd)

651

### Make index

In [20]:
logFilePath = "log_cvd.txt"

INDEX_NAME = "mesh_cvd"

es = Elasticsearch()

bulk_size = 50  # number of documents sent in each bulk request

ic = 0  # number of full-size bulk requests sent
ir = 0  # number of trailing (partial) bulk requests sent


def _send_bulk(client, actions, log, cnt, start):
    """Send one bulk request, log progress, and report per-document failures.

    Parameters
    ----------
    client : Elasticsearch client used to issue the request.
    actions : list alternating action dicts and document dicts.
    log : open, writable text file that receives the progress lines.
    cnt : number of documents processed so far (used in the log line only).
    start : ``time.time()`` value taken when indexing began.

    Returns
    -------
    float
        Seconds elapsed since ``start``, measured after the request returned.
    """
    resp = client.bulk(index=INDEX_NAME, body=actions, request_timeout=500)
    # Take the timestamp after the request so the logged time includes it.
    elapsed = time.time() - start
    log.write("bulk indexing... %s, escaped time %s (seconds) \n"
              % (cnt, elapsed))

    # The bulk API does not raise when individual documents are rejected;
    # it sets `errors` and reports each failure under `items`.
    if resp["errors"]:
        failures = [
            entry["index"] for entry in resp["items"]
            if "error" in entry.get("index", {})
        ]
        first_error = failures[0]["error"] if failures else None
        message = ("bulk indexing... %s, %s document(s) failed, first error: %s"
                   % (cnt, len(failures), first_error))
        log.write(message + " \n")
        print(message)
    return elapsed


with open(logFilePath, "w") as fout:
    start = time.time()
    bulk_data = []  # pending action/document pairs for the next bulk request

    cnt = 0
    for cnt, item in enumerate(cvd, start=1):  # each item is a single document
        data_dict = {"id": item["id"], "name": item["name"]}

        # Each document is preceded by its action line; the record id is
        # reused as the Elasticsearch document id.
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_id": data_dict["id"]
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

        # Flush once a full batch has accumulated.
        if cnt % bulk_size == 0:
            ic += 1
            elapsed = _send_bulk(es, bulk_data, fout, cnt, start)

            # Only echo every 50th batch to keep notebook output short.
            if ic % 50 == 0:
                print(" i bulk indexing... %s, escaped time %s (seconds) "
                      % (cnt, elapsed))

            bulk_data = []

    # Index whatever is left over after the last full batch.
    # (No console echo here: there is at most one trailing batch.)
    if bulk_data:
        ir += 1
        _send_bulk(es, bulk_data, fout, cnt, start)
        bulk_data = []

    end = time.time()
    fout.write("Finish  meta-data indexing. Total escaped time %s (seconds) \n"
               % (end - start))
    print("Finish meta-data indexing. Total escaped time %s (seconds) "
          % (end - start))

# Make the freshly indexed documents visible to searches immediately instead
# of waiting for the index's periodic refresh.
es.indices.refresh(index=INDEX_NAME)

Finish meta-data indexing. Total escaped time 0.16572880744934082 (seconds) 


### Do the search

In [25]:
# Inspect `item`, the loop variable left over from the indexing loop
# (it still holds the last record that was processed).
item

{'id': 'C14.907.952.880', 'name': 'Postthrombotic Syndrome'}

In [30]:
# Client with a 300-second timeout for the search requests.
es = Elasticsearch(timeout=300)

# Hit counter and result accumulator used by the hit-collection cell.
k = 0
Data = []

entity = 'cardiovascular'

# Phrase match of `entity` against the `name` field of the mesh_cvd index.
phrase_query = Q("match_phrase", name=entity)
s = (
    Search(using=es, index='mesh_cvd')
    .params(request_timeout=300)
    .query(phrase_query)
)

print(type(s))
print(s)

<class 'elasticsearch_dsl.search.Search'>
<elasticsearch_dsl.search.Search object at 0x1162f9588>


In [31]:
# Collect every matching document as an {"ID", "name"} record.
# The accumulators are reset here so re-running this cell does not append
# duplicate rows to a list left over from a previous run.
k = 0
Data = []

for hit in s.scan():
    Data.append({"ID": str(hit.id),
                 "name": str(hit.name)})

    # Count each hit inside the loop and report progress every 10 hits.
    k = k + 1
    if k % 10 == 0:
        print(k, 'entity counted!')

In [32]:
# Display the records collected from the search hits.
Data

[{'ID': 'C14', 'name': 'Cardiovascular Diseases'},
 {'ID': 'C14.240', 'name': 'Cardiovascular Abnormalities'},
 {'ID': 'C14.260', 'name': 'Cardiovascular Infections'},
 {'ID': 'C14.260.500', 'name': 'Syphilis, Cardiovascular'},
 {'ID': 'C14.260.750', 'name': 'Tuberculosis, Cardiovascular'},
 {'ID': 'C14.583', 'name': 'Pregnancy Complications, Cardiovascular'}]