In [None]:
import pandas as pd, numpy as np, matplotlib.dates as dates, os, sys, pickle, re, glob
from matplotlib import pyplot as plt
import json
import calendar
from datetime import datetime, timedelta
import time
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
from ipywidgets import interact, IntSlider
if "./src" not in sys.path:
    sys.path.insert(0, "./src")
from src import bcm_utils

In [None]:
# --- Input: json dump that is parsed line by line further down ---
json_path = "Z:Desktop/json_data/webmonitor-es-bril-dipanalyzermon_old2018.json"
# Alternative ("-short") input file, currently disabled:
#json_path = "C:/Users/pkicsiny/Desktop/TSC_CERN/BLM_study/json_data/webmonitor-es-bril-dipanalyzermon_old2018-short.json"

# --- Output directories (created if they do not exist yet) ---
out_dir = "Z:Desktop/json_data/json_pickles"
plot_dir = "Z:Desktop/json_data/json_plots"
for directory in (out_dir, plot_dir):
    os.makedirs(directory, exist_ok=True)

# --- Global variables shared by the cells below ---

# Zero-padded month labels "04" ... "10"; they key every per-month dict.
months = ["{:02d}".format(month_number) for month_number in range(4, 11)]
# Column names for channels 24 and 48, running sum 9 (built by bcm_utils).
cols = bcm_utils.get_column_names([24, 48], [9])

# One interval per month: first instant of the month up to the first
# instant of the following month.
month_start_template = "2018-{}-01T00:00:00Z"
start_dates = [month_start_template.format(m) for m in months]
end_dates = [month_start_template.format(str(int(m) + 1).zfill(2)) for m in months]

# --- json read parameters ---
block_size = 30000  # number of lines handed over per block
plot_freq = 300     # the progress plot is refreshed every plot_freq lines

In [None]:
def get_json_data(json_line, result_dict):
    """Add one decoded json record to the DataFrame of the month it belongs to.

    Parameters
    ----------
    json_line : dict
        One decoded json document. The function reads
        ``json_line["_source"]["timestamp"]`` and elements 23 and 47 of
        ``json_line["_source"]["RunningSum9"]``.
    result_dict : dict
        Maps every month label of the global ``months`` to a DataFrame.
        The entry of each matching month is replaced by a frame that holds
        one additional row, indexed by the raw timestamp value.

    Returns
    -------
    None
        ``result_dict`` is updated in place. A record whose timestamp lies
        outside every [start, end) interval is ignored.

    Notes
    -----
    Uses the globals ``months``, ``start_dates``, ``end_dates`` and ``cols``.
    """
    source = json_line['_source']
    ts = source['timestamp']
    # Parse the record timestamp once instead of twice per month interval.
    stamp = pd.Timestamp(ts)
    for m, start_date, end_date in zip(months, start_dates, end_dates):
        if pd.Timestamp(start_date) <= stamp < pd.Timestamp(end_date):
            new_row = pd.DataFrame(
                [[source["RunningSum9"][i] for i in [23, 47]]],
                columns=cols,
                index=[ts])
            current = result_dict[m]
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement. An empty frame is simply replaced, which
            # also avoids concatenating empty entries.
            result_dict[m] = new_row if current.empty else pd.concat([current, new_row])

                
def read_large_file(file_handler, block_size=200):
    """Lazily yield the lines of ``file_handler`` in lists of ``block_size``.

    Parameters
    ----------
    file_handler : iterable
        Anything that yields lines when iterated (e.g. an open text file).
    block_size : int
        Number of lines per yielded list. The last list may be shorter.

    Yields
    ------
    list
        The next block of lines. Before each block is yielded, the time
        elapsed since the first line of that block was received is printed.
    """
    chunk = []
    t_first_line = None  # None means: the timer restarts with the next line
    for row in file_handler:
        if t_first_line is None:
            t_first_line = time.time()
        chunk.append(row)
        if len(chunk) != block_size:
            continue
        # Block is full: report the timing, hand it over and start afresh.
        print("Block loaded in {} seconds".format(time.time() - t_first_line))
        yield chunk
        chunk = []
        t_first_line = None
    # Left-over lines that did not fill a complete block.
    if chunk:
        print("Block loaded in {} seconds".format(time.time() - t_first_line))
        yield chunk
        

def plot_read_progress(ax, start, idx, current_block_idx, plot_dir):
    """Add one point to the read-progress plot and save the figure as PNG.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives the new point; its parent figure is the one saved.
    start : float
        ``time.time()`` value taken when processing of the block started.
    idx : int
        Index of the current row inside the block (x coordinate).
    current_block_idx : int
        Zero-based block index; shown one-based in the title and file name.
    plot_dir : str
        Directory for the PNG; created if it does not exist.

    Returns
    -------
    None
        The file ``<plot_dir>/json_block_<n>.png`` is overwritten on every call.
    """
    loop = time.time()
    os.makedirs(plot_dir, exist_ok=True)
    ax.scatter(idx, loop - start, c="b")
    ax.set_xlabel("json row index", fontsize=19)
    ax.set_ylabel("time elapsed [s]", fontsize=19)
    block_number = current_block_idx + 1
    # Title and save through the Axes that was passed in rather than through
    # pyplot's "current" figure, which is not necessarily the figure of `ax`.
    ax.set_title("json block # {}".format(block_number), fontsize=20)
    ax.figure.savefig(plot_dir+"/json_block_{}.png".format(block_number))
    
def utc_to_local(utc_dt):
    """Convert a datetime holding UTC wall-clock fields to local time.

    Parameters
    ----------
    utc_dt : datetime.datetime
        Datetime whose fields are read as UTC (expected to be naive).

    Returns
    -------
    datetime.datetime
        Naive datetime in the local time zone of the machine, carrying the
        same microseconds as ``utc_dt``.
    """
    # timetuple() drops the microseconds, so the epoch value is an exact
    # integer and no floating point precision is lost.
    epoch_seconds = calendar.timegm(utc_dt.timetuple())
    local_without_micros = datetime.fromtimestamp(epoch_seconds)
    assert utc_dt.resolution >= timedelta(microseconds=1)
    # Put the sub-second part back.
    micros = utc_dt.microsecond
    return local_without_micros.replace(microsecond=micros)

# 1) Read json

In [None]:
# 1st version: lazy read of the json file in blocks of lines.
# For every block the records are sorted into per-month DataFrames, a
# read-progress plot is written, and one pickle per month is saved.

# Number of blocks to process before stopping.
max_blocks = 10

with open(json_path) as file_handler:

    # Get the data block by block; this has some additional latency.
    for current_block_idx, block in enumerate(read_large_file(file_handler, block_size=block_size)):

        # Break condition
        if current_block_idx == max_blocks:
            break

        print("Reading block: # {}".format(current_block_idx + 1))

        # Fresh per-month frames for the data of this block
        result_dict = {m: pd.DataFrame() for m in months}

        # Progress plot for this block
        fig, ax = plt.subplots(figsize=(20, 6))
        try:
            ax.grid(True)
            ax.tick_params(axis="both", labelsize=13)

            # Start time counter
            start = time.time()

            # Loop over the lines of the block
            for idx, raw_line in enumerate(block):

                # json.loads raises on blank lines; skip them instead of
                # losing the whole block.
                if not raw_line.strip():
                    continue

                # Read single line
                json_line = json.loads(raw_line)

                # Append row data to the proper dataframe
                get_json_data(json_line, result_dict)

                # Make plot on performance, reading speed is around 1/300 for
                # all ch, 1/500 for blm channels only
                if idx % plot_freq == 0:
                    plot_read_progress(ax, start, idx, current_block_idx, plot_dir)

            # Save data at each block limit (plain loop: the calls are made
            # for their side effect only)
            print("Saving block # {}".format(current_block_idx + 1))
            for m in months:
                result_dict[m].to_pickle(out_dir+"/month_{}_batch_{}".format(m, current_block_idx + 1))
        finally:
            # Release the figure even if parsing or saving failed.
            plt.close(fig)

# The `with` statement closes the file; no explicit close() is needed.

# 2) Read parsed json blocks

In [None]:
# Directory that holds the per-block pickles
json_blocks_path = "Z:Desktop/json_data/json_pickles"
pickle_files = os.listdir(json_blocks_path)

# Group the file names by the month label embedded as "_<MM>_" and start
# every month with an empty result frame.
pickle_file_dict = {}
result_df = {}
for m in months:
    month_tag = "_{}_".format(m)
    pickle_file_dict[m] = [pf for pf in pickle_files if month_tag in pf]
    result_df[m] = pd.DataFrame()

In [None]:
# Merge the per-block pickles of every month into one time-sorted DataFrame
# and save it as "pickle_<MM>" in out_dir.
for m in months:
    print("Month {}".format(m))
    try:
        month_blocks = []
        for idx, pf in enumerate(pickle_file_dict[m]):
            print("Reading json block # {}".format(idx+1))
            # NOTE: pickle.load can execute arbitrary code; only read files
            # that were produced by this notebook.
            with open(json_blocks_path+"/"+pf, "rb") as f:
                json_block = pickle.load(f)
            if isinstance(json_block, pd.DataFrame):
                # Block files written with DataFrame.to_pickle already have
                # the final layout.
                df_block = json_block
            else:
                # Otherwise expect an iterable of (timestamp, ch24, ch48) tuples.
                df_block = pd.DataFrame([tup[1:] for tup in json_block],
                                        columns=["CH24RS9", "CH48RS9"],
                                        index=[tup[0] for tup in json_block])
            # Blocks without data for this month add nothing.
            if not df_block.empty:
                month_blocks.append(df_block)
        # One concat per month instead of repeated DataFrame.append (removed
        # in pandas 2.0); re-running the cell no longer duplicates rows.
        result_df[m] = pd.concat(month_blocks) if month_blocks else pd.DataFrame()
        result_df[m] = result_df[m].sort_index()
        print("Saving pickle for month {} to {}".format(m, out_dir))
        result_df[m].to_pickle(out_dir+"/pickle_{}".format(m))
    except Exception as err:
        # Keep going with the next month, but report why this one was skipped.
        print("Skipping month {}: {!r}".format(m, err))
        continue

In [None]:


# Plot the monthly signal of channels 24 and 48 and save one PNG per month.
# `no_duplicates_df` keeps the value of the last iteration (the last entry of
# `months`) and is read again by the comparison cells further down.

# Make sure the target directory of savefig exists.
os.makedirs("../json_plots", exist_ok=True)

for m in months:
    print("Processing month {}".format(m))
    month_df = result_df[m]

    # DataFrame.drop_duplicates() compares column values only and ignores the
    # index, so readings that repeat at a different timestamp would be lost.
    # Include the timestamp in the comparison instead.
    is_repeated_record = month_df.reset_index().duplicated().values
    no_duplicates_df = month_df.loc[~is_repeated_record]

    no_duplicates_df = no_duplicates_df/bcm_utils.γ[9]/bcm_utils.β
    # NOTE(review): if the index holds ISO strings ending in "Z", np.datetime64
    # parses the time zone through a code path NumPy documents as deprecated.
    no_duplicates_df["new_index"] = [np.datetime64(dt) for dt in no_duplicates_df.index.values]
    no_duplicates_df = no_duplicates_df.set_index("new_index", drop=True)

    fig, ax = plt.subplots(figsize=(20, 6))
    ax.grid(True)

    # Both channels share the same time axis.
    x = [pd.Timestamp(t) for t in no_duplicates_df.index.values]
    for ch in [24, 48]:
        # one column of dataframe
        ch_fn_df = no_duplicates_df['CH' + str(ch) + 'RS9']
        ax.plot(x, ch_fn_df.values, label="CH{}: {}".format(ch, bcm_utils.DETECTOR_NAMES[ch][0]), linewidth=1)

    # Plot formatting (once per figure). Locator and formatter are created per
    # figure because a locator instance should not be shared between axes.
    hours = dates.HourLocator(interval=48)
    h_fmt = dates.DateFormatter('%m/%d')
    ax.xaxis.set_major_locator(hours)
    ax.xaxis.set_major_formatter(h_fmt)
    ax.tick_params(axis="both", labelsize=13)
    ax.set_title('Month {} CH 24, 48, RS 10'.format(m), fontsize=19)
    ax.set_ylabel('Signal current (A)', fontsize=19)
    ax.set_xlabel('Time (UTC)', fontsize=19)
    ax.set_yscale("log")
    ax.legend(loc="best", prop={'size': 13})
    fig.savefig("../json_plots/2018_{}_rs10.png".format(m))

# 3) Compare with brildata

In [None]:
# Load the brildata reference table from its pickle. The cells below access
# its index and its "CH24RS10" / "CH48RS10" columns.
# NOTE(review): the file name suggests October 2018 data — confirm.
brildata = pd.read_pickle("../pickle_data/201810_brildata_rs10")

In [None]:
# Keep only the rows whose timestamp also occurs in the brildata index.
# `no_duplicates_df` is the frame left over from the last iteration of the
# monthly plotting loop.
shared_timestamps = no_duplicates_df.index.isin(brildata.index)
elasticsearch = no_duplicates_df[shared_timestamps]

In [None]:
# CH24/CH48 ratio of both data sources, restricted to the first `numrows` rows
numrows = 200000
brildata_head = brildata.iloc[:numrows]
elasticsearch_head = elasticsearch.iloc[:numrows]
ratio_bd = brildata_head["CH24RS10"] / brildata_head["CH48RS10"]
ratio_es = elasticsearch_head["CH24RS9"] / elasticsearch_head["CH48RS9"]

In [None]:
# Overlay the CH24/CH48 ratio of both sources; brildata is drawn first with a
# thick line so the elasticsearch curve stays visible on top of it.
plt.figure(figsize=(12,6))
plt.grid()
ratio_curves = [
    (ratio_bd, {"label": "brildata", "lw": 4}),
    (ratio_es, {"label": "elasticsearch"}),
]
for ratio_series, line_options in ratio_curves:
    plt.plot(ratio_series, **line_options)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)
plt.ylabel("CH24/CH48", fontsize=19)
plt.xlabel("Time", fontsize=19)
plt.legend()

In [None]:
# Overlay the channel 48 signal of both sources on a logarithmic y axis;
# brildata is drawn first with a thick line.
plt.figure(figsize=(12,6))
plt.grid()
ch48_curves = [
    (brildata["CH48RS10"], {"label": "brildata (RS 10)", "lw": 4}),
    (elasticsearch["CH48RS9"], {"label": "elasticsearch (RS 10)"}),
]
for ch48_series, line_options in ch48_curves:
    plt.plot(ch48_series, **line_options)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)
plt.ylabel("Signal current", fontsize=19)
plt.xlabel("Time", fontsize=19)
plt.yscale("log")
plt.legend(loc="upper right")