In [1]:
from datetime import datetime
import time
import pandas as pd
import helpers as hp
from elasticsearch.helpers import scan

def RunQuery(idx, time_from, time_to):
    """Collect the `_source` of every hit in `idx` within a timestamp window.

    The query keeps documents whose `timestamp` field satisfies
    ``time_from <= timestamp < time_to`` and streams them through the
    elasticsearch `scan` helper using the shared client `hp.es`.

    Args:
        idx: Name of the index to query.
        time_from: Inclusive lower bound for `timestamp` ("gte").
        time_to: Exclusive upper bound for `timestamp` ("lt").

    Returns:
        A list with the `_source` entry of each hit, in the order yielded
        by `scan`.
    """
    timestamp_window = {
        "gte": time_from,
        "lt": time_to
    }

    # NOTE(review): "size": 0 is carried over unchanged; presumably the scan
    # helper's own page size takes effect instead -- confirm before relying on it.
    query = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"range": {"timestamp": timestamp_window}}
                ]
            }
        }
    }

    hits = scan(hp.es, index=idx, query=query)

    return [hit['_source'] for hit in hits]

In [2]:
def ProcessDataInChunks(idx, dateFrom, dateTo, chunks):
    """Query `idx` one time chunk at a time and collect the processed records.

    `hp.GetTimeRanges(dateFrom, dateTo, chunks)` supplies a sequence of time
    boundaries; each pair of consecutive boundaries is passed to `RunQuery`
    and the hits are handed to `hp.ProcessHosts`. Progress and summary
    statistics are printed along the way.

    Args:
        idx: Name of the index to query.
        dateFrom: Start of the period, forwarded to `hp.GetTimeRanges`.
        dateTo: End of the period, forwarded to `hp.GetTimeRanges`.
        chunks: Chunk count, forwarded to `hp.GetTimeRanges`.

    Returns:
        A list with the processed records of all chunks, in chunk order.
    """
    start = time.time()
    print('>>> Main process start:', time.strftime("%H:%M:%S", time.localtime()))

    time_range = list(hp.GetTimeRanges(dateFrom, dateTo, chunks))

    # Called once per host field over the whole period before any chunk is
    # processed. NOTE(review): presumably this primes the host lookup used by
    # hp.ProcessHosts -- confirm in helpers.
    for field in ['src_host', 'dest_host']:
        hp.GetIdxUniqueHosts(idx, field, time_range[0], time_range[-1])

    data = []
    # Walk consecutive boundary pairs: (t0, t1), (t1, t2), ...
    for curr_t, next_t in zip(time_range, time_range[1:]):
        results = RunQuery(idx, curr_t, next_t)
        prdata = hp.ProcessHosts(data=results, saveUnresolved=True)

        n_before = len(results)
        n_after = len(prdata)
        # A chunk with no documents must not abort the whole run with a
        # ZeroDivisionError; report a 0 % reduction for it instead.
        if n_before:
            reduced_pct = round(((n_before - n_after) / n_before) * 100)
        else:
            reduced_pct = 0

        print('before:', n_before, 'after:', n_after, 'reduced by', reduced_pct, '%')

        data.extend(prdata)

    print('Number of active hosts: total(',len(hp.hosts),') - unresolved(',len(hp.unresolved),') = ',len(hp.hosts) - len(hp.unresolved))
    print(">>> Overall elapsed = %ss" % (int(time.time() - start)))

    return data
    
    
# Run the chunked processing on 'ps_packetloss' for 2020-03-23 10:00-13:00
# with chunks=3 (the recorded run below took about 17 minutes).
p_data = ProcessDataInChunks(
    idx='ps_packetloss',
    dateFrom='2020-03-23 10:00',
    dateTo='2020-03-23 13:00',
    chunks=3,
)

>>> Main process start: 09:51:10
Start: 09:51:33
Next 100000 items
Next 200000 items
Time elapsed = 343s
before: 220558 after: 185832 reduced by 16 %
Start: 09:57:34
Next 100000 items
Next 200000 items
Time elapsed = 321s
before: 221328 after: 186590 reduced by 16 %
Start: 10:03:13
Next 100000 items
Next 200000 items
Time elapsed = 310s
before: 222803 after: 188435 reduced by 15 %
Number of active hosts: total( 315 ) - unresolved( 190 ) =  125
>>> Overall elapsed = 1034s


In [5]:
# Build a DataFrame from the processed records returned by ProcessDataInChunks.
dfr = pd.DataFrame(p_data)