In [1]:
import time
import json as json
import re
import sys
import os
from collections import defaultdict
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
import pandas as pd

In [2]:
INDEX_NAME = "reactomepathway"
NUMBER_SHARDS = 1  # keep this at one when not running a cluster
NUMBER_REPLICAS = 0

# Index definition: shard/replica settings plus a mapping that stores the three
# document fields (id, pathway, species) as analysed text.
request_body = {
    "settings": {
        "number_of_shards": NUMBER_SHARDS,
        "number_of_replicas": NUMBER_REPLICAS,
    },
    "mappings": {
        "properties": {
            "id": {
                "type": "text"
            },
            "pathway": {
                "type": "text"
            },
            "species": {
                "type": "text"
            },
        }
    },
}

es = Elasticsearch()

# Start from a clean slate: drop the index if it already exists so that
# re-running this cell does not fail on create.
if es.indices.exists(index=INDEX_NAME):
    res = es.indices.delete(index=INDEX_NAME)
    print("Deleting index %s , Response: %s" % (INDEX_NAME, res))

res = es.indices.create(index=INDEX_NAME, body=request_body)
print("Create index %s , Response: %s" % (INDEX_NAME, res))

Create index reactomepathway , Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'reactomepathway'}


In [12]:
# Read the tab-separated pathway file (it has no header row, so columns are
# labelled 0, 1, 2, ...) and turn it into a list with one dict per row.
reactome = (
    pd.read_table('./ReactomePathways.txt', sep='\t', header=None)
    .to_dict(orient='records')
)

In [13]:
logFilePath = "log_reactome.txt"

INDEX_NAME = "reactomepathway"

es = Elasticsearch()

ic = 0  # number of full-size bulk requests sent
ir = 0  # number of trailing partial bulk requests sent (0 or 1)


def _send_bulk(client, actions, log_file, doc_count, started_at):
    """Send one bulk request, log progress, and record rejected documents.

    Args:
        client: Elasticsearch client used to issue the request.
        actions: flat list alternating action-metadata dicts and document dicts.
        log_file: open text file that receives the progress and failure lines.
        doc_count: number of documents processed so far (used in the log line).
        started_at: ``time.time()`` value taken when indexing began.

    Returns:
        A ``(timestamp, rejected)`` tuple: the timestamp used for the progress
        line and the number of items the response reported as failed.
    """
    # The timestamp is taken before the request is sent, so the logged
    # duration does not include the time spent in this request.
    stamp = time.time()
    response = client.bulk(index=INDEX_NAME, body=actions, request_timeout=500)
    log_file.write("bulk indexing... %s, escaped time %s (seconds) \n"
                   % (doc_count, stamp - started_at))

    # The bulk endpoint reports per-document failures in the response body
    # instead of raising, so they have to be inspected explicitly.
    rejected = 0
    if response["errors"]:
        for entry in response["items"]:
            outcome = entry.get("index", {})
            if "error" in outcome:
                rejected += 1
                log_file.write("bulk item failed: _id=%s, status=%s, error=%s \n"
                               % (outcome.get("_id"), outcome.get("status"),
                                  outcome["error"]))
    return stamp, rejected


with open(logFilePath, "w") as fout:
    start = time.time()
    bulk_size = 50  # number of documents sent per bulk request
    bulk_data = []  # pending action/document pairs for the next request
    failed_docs = 0  # documents rejected by Elasticsearch across all requests

    cnt = 0
    for cnt, item in enumerate(reactome, start=1):  # each item is one document
        # Columns 0/1/2 of the input row become the id, pathway and species fields.
        data_dict = {
            "id": item[0],
            "pathway": item[1],
            "species": item[2],
        }

        # A bulk body alternates an action line with the document it applies to.
        bulk_data.append({
            "index": {
                "_index": INDEX_NAME,
                "_id": data_dict["id"]
            }
        })
        bulk_data.append(data_dict)

        # Flush once a full batch has accumulated.
        if cnt % bulk_size == 0:
            ic += 1
            tmp, rejected = _send_bulk(es, bulk_data, fout, cnt, start)
            failed_docs += rejected

            # Only echo every 50th batch to the notebook to keep output short.
            if ic % 50 == 0:
                print(" i bulk indexing... %s, escaped time %s (seconds) "
                      % (cnt, tmp - start))

            bulk_data = []

    # Index whatever is left over after the last full batch.
    if bulk_data:
        ir += 1
        tmp, rejected = _send_bulk(es, bulk_data, fout, cnt, start)
        failed_docs += rejected
        bulk_data = []

    end = time.time()
    fout.write("Finish  meta-data indexing. Total escaped time %s (seconds) \n"
               % (end - start))
    print("Finish meta-data indexing. Total escaped time %s (seconds) "
          % (end - start))

    # Surface rejected documents instead of letting them pass unnoticed.
    if failed_docs:
        fout.write("WARNING: %s document(s) failed to index \n" % failed_docs)
        print("WARNING: %s document(s) failed to index; see %s"
              % (failed_docs, logFilePath))

 i bulk indexing... 2500, escaped time 0.797307014465332 (seconds) 
 i bulk indexing... 5000, escaped time 1.460188865661621 (seconds) 
 i bulk indexing... 7500, escaped time 2.0389161109924316 (seconds) 
 i bulk indexing... 10000, escaped time 2.6908888816833496 (seconds) 
 i bulk indexing... 12500, escaped time 3.2648189067840576 (seconds) 
 i bulk indexing... 15000, escaped time 3.84694504737854 (seconds) 
 i bulk indexing... 17500, escaped time 4.58211088180542 (seconds) 
 i bulk indexing... 20000, escaped time 5.245260953903198 (seconds) 
Finish meta-data indexing. Total escaped time 5.601640939712524 (seconds) 
