In [6]:
import os
import warnings

import numpy as np
import pandas as pd

In [7]:
# Lower / upper edge of the mass window. bigdata_prep compares these against
# column 0 of the feature arrays, which make_features_baseline fills with
# m_jj * 1e-3 (presumably TeV if the inputs are in GeV -- TODO confirm units).
minmass, maxmass = 3.3, 3.7

In [8]:
def shuffle_XY(X,Y):
    """Shuffle ``X`` and ``Y`` in place along their first axis, keeping rows paired.

    Both inputs receive the same permutation, so ``X[i]`` and ``Y[i]`` still
    belong together afterwards.

    The permutation is drawn from the current state of NumPy's global legacy
    RNG. Unlike reseeding with a small random integer, this neither limits the
    number of reachable permutations nor discards a seed the caller set with
    ``np.random.seed`` beforehand.

    Parameters
    ----------
    X, Y : array-like accepted by ``np.random.shuffle``
        Must have the same length along the first axis.

    Returns
    -------
    tuple
        The same ``X`` and ``Y`` objects, shuffled in place.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length; the same RNG state would then
        yield unrelated permutations and silently break the pairing.
    """
    if len(X) != len(Y):
        raise ValueError(
            "X and Y must have the same length, got %d and %d" % (len(X), len(Y))
        )
    # Snapshot the generator, shuffle X, then rewind so that shuffling Y
    # consumes exactly the same random draws and gets the same permutation.
    rng_state = np.random.get_state()
    np.random.shuffle(X)
    np.random.set_state(rng_state)
    np.random.shuffle(Y)
    return X,Y

class no_logit_norm:
    """Standardise features with the per-column mean and standard deviation.

    The statistics are computed once, along axis 0 of the array given to the
    constructor, and reused by ``forward`` and ``inverse``.

    Attributes
    ----------
    mean : per-column mean of the reference array.
    std : per-column standard deviation of the reference array, with exact
        zeros replaced by 1 so constant columns do not cause a division by
        zero.
    """

    def __init__(self,array):
        self.mean = np.mean(array, axis=0)
        spread = np.std(array, axis=0)
        # A constant column has zero spread; dividing by it would fill the
        # output with inf/NaN. Using 1 instead maps such a column to 0 in
        # forward() and keeps inverse(forward(x)) == x.
        self.std = np.where(spread == 0, 1.0, spread)

    def forward(self,array0):
        """Return ``(standardised copy of array0, all-True boolean mask)``.

        The mask has one entry per row of ``array0`` and is always all True.
        """
        return (np.copy(array0)-self.mean)/self.std, np.ones(len(array0),dtype=bool)

    def inverse(self,array0):
        """Undo ``forward``: scale by ``std`` and add ``mean`` back (on a copy)."""
        return np.copy(array0)*self.std+self.mean

def make_features_baseline(features, label_arr, m2=False):
    """Build the per-event feature table from raw two-jet quantities.

    ``features`` is indexed by column position. ``file_loading`` passes the
    columns in the order
    ``pxj1, pyj1, pzj1, mj1, tau1j1, tau2j1, tau3j1,
    pxj2, pyj2, pzj2, mj2, tau1j2, tau2j2, tau3j2``.

    The two jets are ordered by mass (lighter first; on a tie jet 2 comes
    first). Output columns, one row per event:

    0. dijet invariant mass ``m_jj`` * 1e-3
    1. mass of the lighter jet * 1e-3
    2. heavier minus lighter mass * 1e-3 (or the heavier mass * 1e-3 if ``m2``)
    3. tau2/tau1 of the lighter jet
    4. tau2/tau1 of the heavier jet
    5. tau3/tau2 of the lighter jet
    6. tau3/tau2 of the heavier jet
    7. ``label_arr``

    The 1e-3 factor presumably converts GeV to TeV -- TODO confirm input units.

    Non-finite entries (e.g. a 0/0 tau ratio) are replaced via
    ``np.nan_to_num``. The jet ordering is applied with ``np.where``, so a
    non-finite ratio of one jet no longer wipes out the valid ratio of the
    other jet. The previous ``a*ind + b*(1-ind)`` blend turned ``NaN * 0``
    into NaN in the selected slot.

    Parameters
    ----------
    features : ndarray of shape (n_events, >=14)
    label_arr : array of length n_events, copied into the last output column
    m2 : bool, use the heavier jet mass instead of the mass difference in
        column 2

    Returns
    -------
    ndarray of shape (n_events, 8)
    """
    mj1 = features[:, 3]
    mj2 = features[:, 10]

    # Total energy: each jet contributes sqrt(|p|^2 + m^2).
    e_tot = (np.sqrt(features[:,0]**2+features[:,1]**2+features[:,2]**2+mj1**2)
             + np.sqrt(features[:,7]**2+features[:,8]**2+features[:,9]**2+mj2**2))
    # Squared magnitude of the summed three-momentum.
    p_tot2 = ((features[:,0]+features[:,7])**2
              + (features[:,1]+features[:,8])**2
              + (features[:,2]+features[:,9])**2)
    m_jj = np.sqrt(e_tot**2-p_tot2)

    # Zero denominators are expected; silence the floating-point warnings
    # locally (independent of the warning text, which differs between NumPy
    # versions) and let nan_to_num clean up below.
    with np.errstate(divide="ignore", invalid="ignore"):
        tau21_j1 = features[:, 5]/features[:, 4]
        tau32_j1 = features[:, 6]/features[:, 5]
        tau21_j2 = features[:, 12]/features[:, 11]
        tau32_j2 = features[:, 13]/features[:, 12]

    # True where jet 2 is strictly heavier, i.e. jet 1 is the lighter one.
    j2_heavier = mj2 > mj1
    m_light = np.where(j2_heavier, mj1, mj2)
    m_heavy = np.where(j2_heavier, mj2, mj1)
    third = m_heavy if m2 else m_heavy - m_light

    table = np.array([
        m_jj*1e-3,
        m_light*1e-3,
        third*1e-3,
        np.where(j2_heavier, tau21_j1, tau21_j2),
        np.where(j2_heavier, tau21_j2, tau21_j1),
        np.where(j2_heavier, tau32_j1, tau32_j2),
        np.where(j2_heavier, tau32_j2, tau32_j1),
        label_arr,
    ])
    return np.nan_to_num(table).T

def file_loading(filename, labels=True, signal=0):
    """Read an HDF file of jet features and return the baseline feature table.

    Parameters
    ----------
    filename : path handed to ``pd.read_hdf``.
    labels : if True, the per-event label is taken from the ``'label'``
        column; otherwise every event gets the constant ``signal``.
    signal : constant label used when ``labels`` is False.

    Returns
    -------
    The array produced by ``make_features_baseline`` (default ``m2=False``)
    for the 14 jet columns of the file.
    """
    jet_columns = [
        'pxj1', 'pyj1', 'pzj1', 'mj1', 'tau1j1', 'tau2j1', 'tau3j1',
        'pxj2', 'pyj2', 'pzj2', 'mj2', 'tau1j2', 'tau2j2', 'tau3j2',
    ]
    frame = pd.read_hdf(filename)

    if not labels:
        # No label column expected: tag every event with the same value.
        event_labels = np.ones((len(frame['pxj1'])), dtype=float)*signal
    else:
        event_labels = np.array(frame['label'], dtype=float)

    raw = np.array(frame[jet_columns], dtype=float)
    return make_features_baseline(raw, event_labels)

In [None]:
def bigdata_prep(data_file, extrabkg_file, signal_file, save=""):
    """Prepare and save normalised background, signal and test arrays.

    Steps:

    1. Load the three files with ``file_loading``. ``extrabkg_file`` is loaded
       without labels, so every event in it gets label 0.
    2. Keep label-0 events of ``data_file`` as background and label-1 events
       of ``signal_file`` as signal.
    3. Restrict background and signal to ``minmass < column 0 < maxmass``.
    4. Build the test set from the extra background and the first 20000
       signal events (the latter are NOT restricted to the mass window).
    5. Keep feature columns 1..4, standardise all three sets with the mean/std
       of the mass-window background, and write them as ``.npy`` files into
       ``data/``.

    Parameters
    ----------
    data_file, extrabkg_file, signal_file : paths passed to ``file_loading``.
    save : currently unused; kept for interface compatibility.

    Returns
    -------
    None. Writes ``data/X_test.npy``, ``data/signal.npy``, ``data/Y_test.npy``
    and ``data/bkg.npy``.
    """
    # Make sure the output directory exists *before* the slow loading steps,
    # so a missing folder cannot waste the whole run at the first np.save.
    os.makedirs("data", exist_ok=True)

    data = file_loading(data_file)
    print("Loaded data!")
    extra_bkg = file_loading(extrabkg_file, labels=False)
    print("Loaded extrabkg!")
    sig = file_loading(signal_file)
    print("Loaded signal!")
    # The last column holds the label (see make_features_baseline).
    sig = sig[sig[:,-1]==1]
    bkg = data[data[:,-1]==0]

    innersig_mask = (sig[:,0]>minmass) & (sig[:,0]<maxmass)
    innersig = sig[innersig_mask]

    innermask = (bkg[:,0]>minmass) & (bkg[:,0]<maxmass)
    innerbkg = bkg[innermask]

    # Hard-coded split of the extra background sample.
    # NOTE(review): 312858 presumably matches a split used elsewhere in the
    # project -- confirm against the size of the input file.
    extrabkg1 = extra_bkg[:312858]
    extrabkg2 = extra_bkg[312858:]

    # NOTE(review): sig[:20000] may overlap with the events written to
    # data/signal.npy below -- confirm this is intended downstream.
    X_test = np.concatenate((extrabkg2,sig[:20000],extrabkg1[:40000]))
    Y_test = X_test[:,-1]
    # Columns 1..4: lighter jet mass, mass difference and the two tau2/tau1
    # ratios. Column 0 (m_jj) and the label are dropped.
    X_test = X_test[:,1:5]

    # The normalisation is fitted on the mass-window background only and then
    # applied unchanged to the signal and test sets.
    normalisation = no_logit_norm(innerbkg[:,1:5])
    innerbkg, _ = normalisation.forward(innerbkg[:,1:5])
    innersig, _ = normalisation.forward(innersig[:,1:5])
    X_test, _ = normalisation.forward(X_test)

    print("BKG set: ", len(innerbkg), "; SIG set: ", len(innersig), "; Test set: ", len(X_test))

    np.save("data/X_test.npy", X_test)
    np.save("data/signal.npy", innersig)
    np.save("data/Y_test.npy", Y_test)
    np.save("data/bkg.npy", innerbkg)

In [12]:
# Input samples (absolute cluster paths).
bkg_file = "/hpcwork/rwth0934/LHCO_dataset/generated_mf/Pythia_QCD_Dijet_Events.h5"
data_file = "/hpcwork/rwth0934/LHCO_dataset/original/events_anomalydetection_v2.features.h5"
extrabkg_file = "/hpcwork/rwth0934/LHCO_dataset/original/events_anomalydetection_qcd_extra_inneronly_features.h5"

# Mind the naming: the generated QCD sample fills bigdata_prep's `data_file`
# slot (source of label-0 background), while the file called `data_file` here
# fills the `signal_file` slot (source of label-1 signal).
bigdata_prep(
    data_file=bkg_file,
    extrabkg_file=extrabkg_file,
    signal_file=data_file,
)

Loaded data!
Loaded extrabkg!
Loaded signal!
BKG set:  1224525 ; SIG set:  75299 ; Test set:  360000


In [5]:
# Quick size check of a saved array.
# NOTE(review): this reads "signal.npy" from the working directory, whereas
# bigdata_prep writes "data/signal.npy"; the captured shape also equals the
# printed BKG count rather than the SIG count -- confirm which file is meant.
X = np.load("signal.npy")
print(X.nbytes, X.shape, sep="\n")

39184800
(1224525, 4)
