Found issues with hd5 files:
- starting timestamp is 1970/01/01 (some files for 2016/09)
- file contains fill number 0 with or without data but in the former case the data is just baseline
- file contains non matching data for claimed fill number
- hd5 duplicates for 2015/09 in bcml_150812-150912
- time discontinuity: missing fills or missing parts of fills

In [None]:
import pandas as pd, numpy as np, matplotlib.dates as dates, seaborn as sns, paramiko, os, re, sys, tables, glob, shutil
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
from ipywidgets import interact, IntSlider
if "./src" not in sys.path:
    sys.path.insert(0, "./src")
import src

# Define parameters

In [None]:
# Data-taking year to process. It is used further down to build the brilcalc
# csv path, to look up the nominal integrated luminosity, to name the output
# folders and to build the per-month hd5 file-name mask.
year = 2018

In [None]:
# Zero-padded month labels "01" ... "12". They are used to mask the hd5 file
# list down to one month when several months of data share the same folder.
months = ["{:02d}".format(month_number) for month_number in range(1, 13)]

# Integrated lumi per year in inverse barns (LHC delivered, nominal, taken
# from the twiki). Kept for reference only.
nominal_integrated_lumi = {
    2015: 4.21e15,
    2016: 40.99e15,
    2017: 49.79e15,
    2018: 67.86e15,
}

# Longest running sum used
rs = 10

# Column names for the channel ratio dataframe, four ratios per row
charge_sum_ratio_df_columns = [
    "CH5/CH1", "CH6/CH2", "CH7/CH3", "CH8/CH4",
    "CH17/CH41", "CH18/CH42", "CH19/CH43", "CH20/CH44",
    "CH21/CH45", "CH22/CH46", "CH23/CH47", "CH24/CH48",
    "CH25/CH33", "CH26/CH34", "CH27/CH35", "CH28/CH36",
    "CH29/CH37", "CH30/CH38", "CH31/CH39", "CH32/CH40",
    "CH24/CH5", "CH48/CH1", "CH24/CH6", "CH48/CH2",
    "CH24/CH7", "CH48/CH3", "CH24/CH8", "CH48/CH4",
]

In [None]:
# Folder on the mounted /brildata share that holds the BCM hd5 files
drive = "Z:/cmsusr/cmsnfsbrildata/brildata/bcml_18recup/bcml_181001_181031"

# Collect the .hd5 files sitting directly in that folder.
# recursive=True is carried over unchanged; since the pattern contains no
# "**" component it does not make glob descend into subfolders.
hd5_list = glob.glob(drive + '/*.hd5', recursive=True)

# Notebook cell output: number of files found
len(hd5_list)

## Read brilcalc lumi data 
Read the data from csv and build a dataframe with [lumi sum, fill]. The csv is exported from brilcalc, and one file contains one year of lumi info.

In [None]:
# Path of the csv file holding the exported brilcalc data for the whole
# selected year (one file per year, named after the year).
brilcalc_dir = "/".join(["..", "brilcalc_data", "{}.csv".format(year)])

In [None]:
# Read the brilcalc csv export. The column names are taken from the second
# line of the file (header=1) and the last three rows are discarded
# (presumably brilcalc's summary footer - confirm against the export).
# .copy() makes the trimmed table an independent frame, so the column
# assignments below do not write into a slice of the parsed csv
# (this avoids pandas' SettingWithCopyWarning).
brilcalc_table = pd.read_csv(brilcalc_dir, header=1).iloc[:-3].copy()

# Convert delivered lumi from [/ub] to [/b]; unparsable entries become NaN
brilcalc_table["delivered_[b]"] = pd.to_numeric(brilcalc_table["delivered(/ub)"], errors="coerce") * 1e6

# Split the combined "run:fill" column into separate (string) columns
brilcalc_table[['run', 'fill']] = brilcalc_table['#run:fill'].str.split(':', expand=True)

# Sum up delivered lumi per fill in a single pass; this table is later used to
# filter hd5 data for fills that brilcalc knows about. sort=False keeps the
# fills in order of first appearance in the csv.
delivered_per_fill = brilcalc_table.groupby("fill", sort=False)["delivered_[b]"].sum()
brilcalc_lumi_table = pd.DataFrame({"fill": pd.to_numeric(delivered_per_fill.index.tolist()),
                                    "cms_delivered": delivered_per_fill.tolist()})

# Total delivered lumi of the year and its relative deviation from the
# nominal reference value
integrated_lumi = brilcalc_lumi_table["cms_delivered"].sum()
print("Integrated lumi: {:.4E}".format(integrated_lumi), "[b-1]")
print("Error of integrated lumi:", (integrated_lumi - nominal_integrated_lumi[year])/(integrated_lumi)*100, "%")

## Scan hd5 data
Scan and read BCM detector data from /brildata on the .CMS network, from hd5 files. Make a dataframe of size (#timestamps X #channel*#rs)<br>
From the scanned data, select the fills for which there is valid lumi data (based on an exported brilcalc csv). Also select the longest running sum (12) and convert the data to current. Furthermore, drop those fills for which the data shows only the baseline, based on a predefined threshold (this may also drop fills with very little delivered lumi, but that data would be noise-dominated and of little use anyway).

In [None]:
# Switch for writing the output excel tables and plots to disk
save_output = True

# Output folders, one subfolder per year.
# NOTE(review): the folder names carry an "rs9" tag while `rs` is set to 10
# in the parameter cell - confirm the label still matches the data written.
excel_out_dir = "../excel_data_rs9/{}".format(year)
plot_out_dir = "../blm_fill_images_rs9/{}".format(year)

# Create both folders (including parents); already existing folders are fine
for _output_folder in (excel_out_dir, plot_out_dir):
    os.makedirs(_output_folder, exist_ok=True)

In [None]:
# Month-by-month scan of the hd5 files.
#
# For every month that has files this cell:
#   1. reads the files through src.read_hd5,
#   2. scales the selected running-sum columns of channels 1-48,
#   3. keeps only rows/fills whose fill number appears in brilcalc_lumi_table,
#   4. optionally plots the fills,
#   5. drops fills whose CH24/CH48 signal never leaves the baseline,
#   6. sums the remaining data per fill and records each fill's time range and
#      approximate sampling time,
#   7. optionally writes both tables to excel.
# channels_data_df, filtered_data_df etc. keep the values of the last month
# that had files, so the cells further down can use them.

# Loop-invariant setup: fills known to brilcalc, baseline threshold and the
# columns inspected for the baseline check.
selected_fills = list(brilcalc_lumi_table["fill"])
# Minimum (max - min) swing, in the units of the scaled channel data, that
# CH24 or CH48 must show within a fill for the fill to count as real signal.
th = 0.5e-10
baseline_columns = src.get_column_names([24, 48], [rs])

for month in months:

    # Select the hd5 files of the current month: the "_YYMM" tag has to occur
    # in the last 24 characters of the file path.
    month_tag = "_{}{}".format(str(year)[-2:], month)
    hd5_list_one_month = [path for path in hd5_list if month_tag in path[-24:]]
    no_files = len(hd5_list_one_month)
    print("{} files for {}/{}".format(no_files, year, month))
    if no_files == 0:
        continue

    # Read files
    result_df, fillnum_df, fillnum_list = src.read_hd5(hd5_list_one_month, drive)

    # Normalize with integration window length and convert from ADC to current
    channels_data_df = result_df[src.get_column_names(range(1, 49), [rs])] / src.γ[rs - 1] / src.β

    # Select data based on fill number and if the fill number is found in
    # brilcalc data. The row mask is computed once and applied to both frames;
    # both are then sorted by their index (date) so they stay aligned.
    print("Filtering valid fills with nonzero delivered lumi data.")
    valid_rows = fillnum_df["fill"].isin(selected_fills)
    filtered_data_df = channels_data_df[valid_rows].sort_index()
    filtered_fillnum_df = fillnum_df[valid_rows].sort_index()
    fill_numbers = np.asarray(fillnum_list)
    filtered_fillnum_list = sorted(fill_numbers[np.isin(fill_numbers, selected_fills)])
    print("{} valid fills found in hd5 files from {}/{}".format(len(filtered_fillnum_list), year, month))

    # Make plots
    if save_output:
        print("Saving plots of data of these fills.")
        src.plot_fill_data(filtered_data_df,
                           filtered_fillnum_df,
                           filtered_fillnum_list,
                           [1, 5, 24, 48],
                           out_dir=plot_out_dir + "/{}_{}".format(year, month), rs=rs)

    # Drop useless data: a fill is kept only if CH24 or CH48 swings by more
    # than the threshold within the fill.
    print("Dropping fills where only the baseline is visible.")
    fill_column = filtered_fillnum_df["fill"]
    bl_mask = []
    for fn in filtered_fillnum_list:
        fill_signal = filtered_data_df[fill_column == fn][baseline_columns]
        bl_mask.append((fill_signal.max() - fill_signal.min()).max() > th)
    filtered_fillnum_list_wo_bl = [fn for fn, has_signal in zip(filtered_fillnum_list, bl_mask) if has_signal]

    # Sum up charge (integrate dQ/dt dt: "integral" to 1s so if sampling time
    # is not 1s, the sums need to be scaled manually)
    print("Summing up collected charge for remaining ({}) fills.".format(len(filtered_fillnum_list_wo_bl)))
    charge_sums = pd.DataFrame([filtered_data_df[fill_column == fn].sum()
                                for fn in filtered_fillnum_list_wo_bl],
                               index=filtered_fillnum_list_wo_bl,
                               columns=filtered_data_df.columns.values)

    # Save summed up charge to excel file
    if save_output:
        charge_sums.to_excel(excel_out_dir + "/charge_sums_{}_{}.xlsx".format(year, month))
        print("Summed charges saved to excel file.")

    # Per fill: first and last timestamp, plus the approximate sampling time
    # taken as the mean step between the first (up to) 100 timestamps.
    # The timestamps come from the date-sorted frame so that consecutive
    # differences are real time steps; every remaining fill passed the
    # brilcalc filter, so its rows are all present there.
    fill_date_rows = []
    for fn in filtered_fillnum_list_wo_bl:
        timestamps = filtered_fillnum_df.index.values[(fill_column == fn).values]
        steps = np.diff(timestamps[:100])
        # a fill with a single sample has no step to average
        sampling_time = pd.Timedelta(np.mean(steps)) if steps.size else pd.NaT
        fill_date_rows.append([fn, timestamps.min(), timestamps.max(), sampling_time])
    fill_dates = pd.DataFrame(fill_date_rows,
                              columns=["fill", "data_start", "data_end", "sampling_time"])

    # Store the sampling time as float seconds at millisecond resolution.
    # (astype('timedelta64[ms]') / 1000 only produced floats on pandas < 2;
    # flooring to ms and using total_seconds() works on old and new versions.)
    fill_dates["sampling_time"] = pd.to_timedelta(fill_dates["sampling_time"]).dt.floor("ms").dt.total_seconds()

    # Save fill dates and sampling time to excel file
    if save_output:
        fill_dates.to_excel(excel_out_dir + "/fill_dates_{}_{}.xlsx".format(year, month))
        print("Fill data dates saved to excel file.")

In [None]:
# Pass the fill-filtered channel data (as left by the last month that had
# files in the scan loop) to src.freeze_header.
# NOTE(review): presumably this renders the table with a fixed header for
# scrolling in the notebook - confirm in src.
src.freeze_header(filtered_data_df)

In [None]:
# Time axis: one major tick every 48 hours, labelled as month/day
hours = dates.HourLocator(interval=48)
h_fmt = dates.DateFormatter('%m/%d')

fig, ax = plt.subplots(figsize=(20, 6))
plt.grid()

# Overlay channels 24 and 48 on one log-scale axis.
# NOTE(review): the column name, title and file name below are hard-coded to
# "RS9", while `rs` is 10 in the parameter cell and a later cell reads
# "CH24RS10" from the same dataframe - confirm which running sum is meant.
for ch in [24, 48]:

    # One column of the dataframe, ordered by time
    channel_series = channels_data_df['CH{}RS9'.format(ch)].sort_index()
    timestamps = list(map(pd.Timestamp, channel_series.index.values))
    curve_label = "CH{}: {}".format(ch, src.DETECTOR_NAMES[ch][0])
    ax.plot(timestamps, channel_series.values, label=curve_label, linewidth=1)

    # Plot formatting (re-applied after every curve, in this order)
    ax.xaxis.set_major_locator(hours)
    ax.xaxis.set_major_formatter(h_fmt)
    plt.yticks(fontsize=13)
    plt.xticks(fontsize=13)
    ax.set_title('CH 24, 48, RS 9', fontsize=19)
    ax.set_ylabel('Signal current (A)', fontsize=19)
    ax.set_xlabel('Time (UTC)', fontsize=19)
    ax.set_yscale("log")
    plt.legend(loc="best", prop={'size': 13})

# Write the finished figure next to the notebook's parent folder
plt.savefig("../rs9_201810_brildata.png")

In [None]:
# Time-ordered copy of the two channel columns that get pickled afterwards
df_to_save = channels_data_df.loc[:, ["CH24RS10", "CH48RS10"]].sort_index()

In [None]:
# Persist the selected channel data as a pickle file
pickle_path = "../pickle_data/201810_brildata_rs10"
df_to_save.to_pickle(pickle_path)