In [1]:
import time
import json as json
import re
import sys
import os
from collections import defaultdict
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

### Initialize the Index

In [2]:
# Create a fresh Elasticsearch index for the documents.
# WARNING: re-running this cell deletes any existing index with the same name.
INDEX_NAME = "cvd_icd"
NUMBER_SHARDS = 1  # keep this at one when there is no cluster
NUMBER_REPLICAS = 0

# Index settings plus the field mapping: `pmid` is stored as an exact-match
# keyword, while `title`, `abstract` and `mesh` are analyzed full-text fields.
request_body = {
    "settings": {
        "number_of_shards": NUMBER_SHARDS,
        "number_of_replicas": NUMBER_REPLICAS,
    },
    "mappings": {
        "properties": {
            "pmid": {"type": "keyword"},
            "title": {"type": "text"},
            "abstract": {"type": "text"},
            "mesh": {"type": "text"},
        }
    },
}

es = Elasticsearch()

# Pass the index name by keyword: newer elasticsearch-py clients accept API
# parameters by keyword only, and the keyword form also works on older ones.
if es.indices.exists(index=INDEX_NAME):
    res = es.indices.delete(index=INDEX_NAME)
    print("Deleting index %s , Response: %s" % (INDEX_NAME, res))

res = es.indices.create(index=INDEX_NAME, body=request_body)
print("Create index %s , Response: %s" % (INDEX_NAME, res))

Deleting index cvd_icd , Response: {'acknowledged': True}
Create index cvd_icd , Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'cvd_icd'}


------------------

### Populate the Index

In [3]:
# Load the whole document collection into memory.
# JSON is UTF-8 encoded, so decode it explicitly instead of relying on the
# platform's default encoding (which can fail on non-ASCII text).
with open("data/allcvd.json", "r", encoding="utf-8") as f2:
    DATA = json.load(f2)

In [4]:
# Sanity check: number of top-level entries loaded into DATA.
len(DATA)

2241397

--------

In [5]:
# Bulk-index every entry of DATA into Elasticsearch, writing per-request
# progress to a log file and printing a progress line every 5000 requests.
logFilePath = "log.txt"

INDEX_NAME = "cvd_icd"

es = Elasticsearch()

ic = 0  # number of full-size bulk requests sent from inside the loop
ir = 0  # number of trailing (partial) bulk requests sent after the loop
failed_docs = 0  # documents rejected by Elasticsearch across all requests


def _send_bulk(client, actions, log, doc_count, started_at):
    """Send one bulk request and log its progress and any rejected documents.

    The Bulk API reports per-document failures inside the response body
    (``errors`` / ``items``) rather than by raising, so the response has to be
    inspected or failed documents are lost silently.

    Args:
        client: Elasticsearch client used to send the request.
        actions: Alternating action/source dicts making up the bulk body.
        log: Open, writable text file receiving progress and failure lines.
        doc_count: Number of documents handed to the indexer so far.
        started_at: ``time.time()`` value taken when indexing began.

    Returns:
        Tuple ``(elapsed_seconds, rejected_count)`` for this request.
    """
    response = client.bulk(index=INDEX_NAME, body=actions, request_timeout=500)
    # Timestamp after the request so the logged time covers this batch too.
    elapsed = time.time() - started_at
    log.write("bulk indexing... %s, escaped time %s (seconds) \n"
              % (doc_count, elapsed))

    rejected = 0
    if response["errors"]:
        for entry in response["items"]:
            outcome = entry.get("index", {})
            if "error" in outcome:
                rejected += 1
                log.write("FAILED to index _id %s: %s \n"
                          % (outcome.get("_id"), outcome["error"]))
    return elapsed, rejected


with open(logFilePath, "w") as fout:
    start = time.time()
    bulk_size = 50  # number of documents sent in each bulk request
    bulk_data = []  # pending action/source pairs for the next request

    cnt = 0  # stays 0 (and defined) when DATA is empty
    for cnt, item in enumerate(DATA, start=1):  # each item is a single document
        # Action line: index the document under its pmid so re-runs overwrite
        # rather than duplicate.
        bulk_data.append({
            "index": {
                "_index": INDEX_NAME,
                "_id": item["pmid"],
            }
        })
        # Source line: the fields declared in the index mapping.
        bulk_data.append({
            "pmid": item["pmid"],
            "title": item["title"],
            "abstract": item["abstract"],
            "mesh": item["mesh"],
        })

        # Flush once a full batch has accumulated.
        if cnt % bulk_size == 0:
            ic += 1
            elapsed, rejected = _send_bulk(es, bulk_data, fout, cnt, start)
            failed_docs += rejected

            if ic % 5000 == 0:
                print(" i bulk indexing... %s, escaped time %s (seconds) "
                      % (cnt, elapsed))

            bulk_data = []

    # Index whatever is left over from the last, partial batch.
    if bulk_data:
        ir += 1
        _, rejected = _send_bulk(es, bulk_data, fout, cnt, start)
        failed_docs += rejected
        bulk_data = []

    end = time.time()
    fout.write("Finish  meta-data indexing. Total escaped time %s (seconds) \n"
               % (end - start))
    print("Finish meta-data indexing. Total escaped time %s (seconds) "
          % (end - start))

    # Only emitted on failure, so a clean run produces the same output as before.
    if failed_docs:
        fout.write("WARNING: %s documents were rejected during bulk indexing \n"
                   % failed_docs)
        print("WARNING: %s documents were rejected during bulk indexing; see %s"
              % (failed_docs, logFilePath))

 i bulk indexing... 250000, escaped time 103.31597971916199 (seconds) 
 i bulk indexing... 500000, escaped time 208.79483938217163 (seconds) 
 i bulk indexing... 750000, escaped time 316.67739605903625 (seconds) 
 i bulk indexing... 1000000, escaped time 425.4537658691406 (seconds) 
 i bulk indexing... 1250000, escaped time 534.4646880626678 (seconds) 
 i bulk indexing... 1500000, escaped time 645.7043840885162 (seconds) 
 i bulk indexing... 1750000, escaped time 759.2484619617462 (seconds) 
 i bulk indexing... 2000000, escaped time 872.7480363845825 (seconds) 
Finish meta-data indexing. Total escaped time 981.8086285591125 (seconds) 
