In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from os.path import join
from tqdm.notebook import tqdm
import os
from os.path import basename
import time

from qmlhep.config import processed_data_path, raw_data_path, signal_used

# Remove warnings: installing the "ignore" filter silences every warning issued
# through Python's `warnings` module from this point on (including warnings
# emitted by third-party libraries), which keeps the notebook output clean but
# can also hide useful diagnostics.
import warnings
warnings.filterwarnings("ignore")

# Data Exploration Notebook

This notebook explores the pre-processed data by plotting, for each feature, weighted histograms of the signal and background samples.

Author: Miguel Caçador Peixoto

In [None]:
bins = 50
num_cols = 4

# Load the pre-processed background and signal samples into a single frame.
# NOTE(review): `index_col` is not a documented `pd.read_hdf` parameter (extra
# keyword arguments are forwarded to the underlying HDF store) — confirm it is
# actually needed here.
data = pd.concat([
    pd.read_hdf(join(processed_data_path, "bkg.h5"), index_col=0),
    pd.read_hdf(join(processed_data_path, signal_used), index_col=0),
])

# Divide into signal and background. Explicit copies are taken because the
# 'weights' column of each subset is overwritten below; writing into a
# boolean-mask slice of `data` would otherwise be a chained assignment.
signal = data[data["label"] == 1].copy()
background = data[data["label"] == 0].copy()

# Ignore bookkeeping columns such as 'name' and 'weights' in the plotting of
# the data; everything else is treated as a feature to histogram.
features = list(signal.columns)
for x in ['name', 'weights', 'label']:
    features.remove(x)

# Normalize Monte Carlo weights: within each class the weights are rescaled so
# that they sum to half the number of events of that class.
background['weights'] = (background['weights'] / background['weights'].sum()) * len(background) / 2
signal['weights'] = (signal['weights'] / signal['weights'].sum()) * len(signal) / 2

## Plot
# One panel per feature. The row count is derived from the number of plotted
# features (not from the raw column count), so the grid always has enough
# panels. `squeeze=False` keeps `ax` two-dimensional even for a single row.
num_rows = max(1, int(np.ceil(len(features) / num_cols)))
fig, ax = plt.subplots(num_rows, num_cols, figsize=(40, 60), squeeze=False)

for i, x in enumerate(tqdm(features, total=len(features), desc="Processing...")):
    row, col = divmod(i, num_cols)
    panel = ax[row, col]

    # Use a common range for both classes so their bin edges coincide.
    hist_min = min(signal[x].min(), background[x].min())
    hist_max = max(signal[x].max(), background[x].max())
    hist_range = (hist_min, hist_max)

    # Plot weighted histograms of the background and signal data.
    # NOTE(review): the 'Sinal' legend label looks like it was meant to read
    # 'Signal'; left unchanged here.
    panel.hist(background[x], bins=bins, alpha=0.5, label='Background', weights=background['weights'], range=hist_range)
    panel.hist(signal[x], bins=bins, alpha=0.5, label='Sinal', weights=signal['weights'], range=hist_range)

    panel.set_title(x)
    panel.set_yscale('log')
    panel.autoscale(enable=True)
    panel.legend()

# Hide any trailing panels that did not receive a feature.
for panel in ax.flat[len(features):]:
    panel.set_visible(False)

fig.tight_layout()
plt.show()