In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from os.path import join
from tqdm import tqdm
import os
from os.path import basename
from config import bkg_data_path, signal_data_path

# Sanity checks

### Listing all the files in the background and signal directories (file paths)

In [2]:
# Background files sit directly inside their directory; signal files sit one
# sub-directory deeper.
bkg_files = glob.glob(join(bkg_data_path, "*.*"))
signal_files = glob.glob(join(signal_data_path, "*/*.*"))
all_files = [*bkg_files, *signal_files]

# One summary line per category; the space before each line break is part of
# the original output and is kept on purpose.
print(
    f"Signal files: {len(signal_files)} ",
    f"Background files: {len(bkg_files)} ",
    f"> Total: {len(all_files)}",
    sep="\n",
)

Signal files: 5 
Background files: 18 
> Total: 23


## Checking features

In [3]:
# Map every feature (column name) to the names of the files that contain it.
book = {}
for path in all_files:
    # Only the column names are needed, so avoid loading rows where possible.
    if path.endswith(".csv"):
        # nrows=0 parses just the header line instead of the whole file.
        features = list(pd.read_csv(path, nrows=0).columns)
    elif path.endswith(".h5"):
        data = pd.read_hdf(path)
        features = list(data.columns)
        # Saving memory
        del data
    else:
        # An unexpected extension used to surface as a NameError on `data`;
        # fail with a message that names the offending file instead.
        raise ValueError(f"Unsupported file type (expected .csv or .h5): {path}")

    file_name = basename(path)
    for feature in features:
        # Create the list on first sight of a feature, then record this file.
        book.setdefault(feature, []).append(file_name)

After building a dictionary with the structure:
- {feature: \[names_of_files\]}

we can compare the features of each file to see whether they all match.

In [25]:
# Report every feature that is absent from at least one file.
# The set of all file names is the same for every feature, so build it once
# instead of once per reported feature.
all_file_names = {basename(path) for path in all_files}

for feature, files_with_feature in book.items():
    # A feature present everywhere has exactly one entry per file.
    if len(files_with_feature) != len(all_files):
        print(f"\nFeature \"{feature}\" is missing on",  len(all_files)-len(files_with_feature), "file(s).")
        print("-> Files that are missing the feature:\n\t", all_file_names - set(files_with_feature))


Feature "gen_sample" is missing on 1 file(s).
-> Files that are missing the feature:
	 {'tZFCNC.h5'}

Feature "gen_filter" is missing on 1 file(s).
-> Files that are missing the feature:
	 {'tZFCNC.h5'}

Feature "gen_decay_filter" is missing on 1 file(s).
-> Files that are missing the feature:
	 {'tZFCNC.h5'}

Feature "MissingET_Eta" is missing on 22 file(s).
-> Files that are missing the feature:
	 {'WW_2L_PTW0to250.csv', 'ZZ_2L_PTZ500.csv', 'Zbb_2L_HT0to250.csv', 'Zbb_2L_HT250to500.csv', 'ZZ_2L_PTZ0to250.csv', 'Zjj_2L_HT500.csv', 'Zbb_2L_HT500.csv', 'mch45_HG_13TeV_HG3000_HQ1000_train.csv', 'ZZ_2L_PTZ250to500.csv', 'WW_2L_PTW500.csv', 'WZ_2L_PTZ500.csv', 'WW_2L_PTW250to500.csv', 'mch45_HG_13TeV_HG3000_HQ1000_test.csv', 'mch45_HG_13TeV_wohg_HQ1000_train.csv', 'WZ_2L_PTZ0to250.csv', 'mch45_HG_13TeV_wohg_HQ1000_test.csv', 'WZ_2L_PTZ250to500.csv', 'Zjj_2L_HT250to500.csv', 'ttbar_2L_PTtop0to100.csv', 'ttbar_2L_PTtop100to250.csv', 'Zjj_2L_HT0to250.csv', 'ttbar_2L_PTtop250.csv'}


Since the features highlighted by our little script are irrelevant, we can simply drop them from all the files when we pre-process the data.