In [1]:
# TensorFlow and tf.keras
import tensorflow as tf
#tf.enable_eager_execution()

from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

import keras
import keras.backend
from keras import layers
from keras import models
import keras.utils

print(tf.__version__)

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import math
import time
import h5py
import sklearn
from sklearn.utils import shuffle

import os
import datetime



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


1.13.1


## Preprocessing

#### This notebook creates three npz files for test, train, and example input variables that are read as input files to `CNN_1D_flat.ipynb`

In [2]:
# load output files from Showjets.ipynb

data = np.load('/mnt/data/ml/Constituent4vec_addmoretaus.npz')
jetdata = np.load('/mnt/data/ml/ShowJetsData_addmoretaus.npz')

In [3]:
# creates new directory daily to avoid writing over files

dirs = ['PreProcessing/']

day = datetime.date.today().strftime('%m%d%y')
    
for d in dirs:
    if not os.path.exists(d+day+'/'):
        print('creating '+d+day+'/')
        os.makedirs(d+day+'/')

creating PreProcessing/072320/


In [95]:
labels = data['labels']
jetlabels = jetdata['labels']

varDict = {}
varDictTrain = {}
varDictTest = {}

qcd_len = data['jetconstPt_log'].flatten()[np.where(labels[:,0] == 1)].shape[0]
z_len = data['jetconstPt_log'].flatten()[np.where(labels[:,1] == 1)].shape[0]

# make size of zz and qcd the same
data_len = np.max([qcd_len, z_len]) - np.abs(qcd_len - z_len)

# size of expert variables
data_len_xv = data_len // 20

len_test = 0
len_train = 0

mask_train = np.ones(shape=(data_len//20,20),dtype=bool)
mask_train[::4] = 0
mask_train[1::8] = 0

mask_train_xv = np.ones(shape=(data_len_xv),dtype=bool)
mask_train_xv[::4] = 0
mask_train_xv[1::8] = 0

mask_test = slice(0,data_len,4)
mask_example = slice(0,data_len,300)

### Select variables to retrieve from Showjets npz files

In [96]:
# variables to get from "jetdata" npz file

xaugs = ['chMult',
        'jetpull',
        'tau1_b05',
        'tau2_b05',
        'tau3_b05',
        'tau1_sd_b05',
        'tau2_sd_b05',
        'tau3_sd_b05',
        'tau1_b10',
        'tau2_b10',
        'tau3_b10',
        'tau1_sd_b10',
        'tau2_sd_b10',
        'tau3_sd_b10',
        'tau1_b15',
        'tau2_b15',
        'tau3_b15',
        'tau1_sd_b15',
        'tau2_sd_b15',
        'tau3_sd_b15',
        'tau1_b20',
        'tau2_b20',
        'tau3_b20',
        'tau1_sd_b20',
        'tau2_sd_b20',
        'tau3_sd_b20',
        'jetMass',
        'jetMassSD',
        ]


# variables to get from "data" (constituents) npz file

particle_list_vars = ['jetconstPt_log',
                      'jetconstEta_abs',
                      'jetconstE_log',
                      'jetconstPt_Jetlog',
                      'charge',
                      'isEle',
                      'isPho',
                      'isMuon',
                      'isCh',
                      'isNh',
                      'delta_eta',
                      'delta_phi',
                      'deltaR_jet',
                      'deltaR_subjet0',
                      'deltaR_subjet1',
                      'dxy',
                      'dz',
                     ]

In [105]:



# loop through particle list variables and reshape, apply testing and training masks

for varName in particle_list_vars:
    if 'labels' in varName: continue
    if 'jetconstEvnum' in varName: continue
    var = data[varName]

    
    qcd_var = var.flatten()[(np.where(labels[:,0] == 1))].reshape(qcd_len//20, 20)
    z_var   = var.flatten()[(np.where(labels[:,1] == 1))].reshape(z_len//20, 20)

    qcd_var  = qcd_var[:int(data_len//20)].reshape(data_len//20, 20)
    z_var    = z_var[:int(data_len//20)].reshape(data_len//20, 20)
    
    qcd_var_train = qcd_var[mask_train]
    qcd_var_train = qcd_var_train.reshape(qcd_var_train.shape[0]//20,20)
    z_var_train = z_var[mask_train]
    z_var_train = z_var_train.reshape(z_var_train.shape[0]//20,20)
  
    qcd_var_test = qcd_var[mask_test]
    z_var_test = z_var[mask_test]
    
    # if first variable, set lengths
    if 'jetconstPt_log' in varName:
        len_train = z_var_train.shape[0]
        len_test = z_var_test.shape[0]

    stackVar = np.vstack((qcd_var, z_var))
    stackVarTrain = np.vstack((qcd_var_train, z_var_train))
    stackVarTest = np.vstack((qcd_var_test, z_var_test))
    
    varDict.update({varName : stackVar})
    varDictTrain.update({varName : stackVarTrain})
    varDictTest.update({varName : stackVarTest})
    
    print(varName, var.shape)
    print(qcd_var.shape, 'qcd')
    print(z_var.shape, 'z')
    print('var, train, test:')
    print(qcd_var.shape)
    print(qcd_var_train.shape)
    print(qcd_var_test.shape)
    

    
# loop through expert variables and reshape, apply testing and training masks
    
for varName in xaugs:
    var = jetdata[varName]

    qcd_var = var.flatten()[(np.where(jetlabels[:,0] == 1))]
    z_var   = var.flatten()[(np.where(jetlabels[:,1] == 1))]

    qcd_var  = qcd_var[:int(data_len_xv)].reshape(data_len_xv, 1)
    z_var    = z_var[:int(data_len_xv)].reshape(data_len_xv, 1)
    

    qcd_var_train = qcd_var[mask_train_xv]
    z_var_train = z_var[mask_train_xv]

    qcd_var_test = qcd_var[mask_test]
    z_var_test = z_var[mask_test]

    stackVar = np.vstack((qcd_var, z_var))
    stackVarTrain = np.vstack((qcd_var_train, z_var_train))
    stackVarTest = np.vstack((qcd_var_test, z_var_test))
    
    varDict.update({varName : stackVar})
    varDictTrain.update({varName : stackVarTrain})
    varDictTest.update({varName : stackVarTest})
    
    print(varName, var.shape)
    print(qcd_var.shape, 'qcd')
    print(z_var.shape, 'z')
    print('var, train, test:')
    print(qcd_var.shape)
    print(qcd_var_train.shape)
    print(qcd_var_test.shape)

jetconstPt_log (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
jetconstEta_abs (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
jetconstE_log (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
jetconstPt_Jetlog (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
charge (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
isEle (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
isPho (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
isMuon (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(551735, 20)
(220695, 20)
isCh (1822770, 20)
(882778, 20) qcd
(882778, 20) z
var, train, test:
(882778, 20)
(5517

### Create labels array

In [7]:

qcd_lab = np.zeros([data_len, 2])
qcd_lab[:,0] = 1
z_lab = np.zeros([data_len, 2])
z_lab[:,1] = 1

qcd_lab_train = np.zeros([len_train, 2])
qcd_lab_train[:,0] = 1
z_lab_train = np.zeros([len_train, 2])
z_lab_train[:,1] = 1

qcd_lab_test = np.zeros([len_test, 2])
qcd_lab_test[:,0] = 1
z_lab_test = np.zeros([len_test, 2])
z_lab_test[:,1] = 1

labels = np.vstack((qcd_lab, z_lab))
trainlabels = np.vstack((qcd_lab_train, z_lab_train))
testlabels = np.vstack((qcd_lab_test, z_lab_test))



### Select variables for batch

Remove particle list variables and/or expert variables from batch by commenting them out. For example:

```
variables = []
expertVariables = []

# expertVariables = ['chMult',
#                    'jetpull',
#                    'tau1_b05',
#                    'tau2_b05',
#                    'tau3_b05',
#                    'tau1_sd_b05',
#                    'tau2_sd_b05',
#                    'tau3_sd_b05',
#                    'tau1_b10',
#                    'tau2_b10',
#                    'tau3_b10',
#                    'tau1_sd_b10',
#                    'tau2_sd_b10',
#                    'tau3_sd_b10',
#                    'tau1_b15',
#                    'tau2_b15',
#                    'tau3_b15',
#                    'tau1_sd_b15',
#                    'tau2_sd_b15',
#                    'tau3_sd_b15',
#                    'tau1_b20',
#                    'tau2_b20',
#                    'tau3_b20',
#                    'tau1_sd_b20',
#                    'tau2_sd_b20',
#                    'tau3_sd_b20',
#                    'jetMass',
#                    'jetMassSD',
#                   ]

variables = ['jetconstEta_abs',
                'jetconstE_log',
                'jetconstPt_Jetlog',
#                 'charge',
                'isEle',
                'isPho',
                'isMuon',
                'isCh',
                'isNh',
                'delta_eta',
                'delta_phi',
                'deltaR_jet',
                'deltaR_subjet0',
                'deltaR_subjet1',
                'dxy',
                'dz',
               ]


```

will create a batch without expert variables and without the charge variable. 

In [14]:
variables = []
expertVariables = []

expertVariables = ['chMult',
                   'jetpull',
                   'tau1_b05',
                   'tau2_b05',
                   'tau3_b05',
                   'tau1_sd_b05',
                   'tau2_sd_b05',
                   'tau3_sd_b05',
                   'tau1_b10',
                   'tau2_b10',
                   'tau3_b10',
                   'tau1_sd_b10',
                   'tau2_sd_b10',
                   'tau3_sd_b10',
                   'tau1_b15',
                   'tau2_b15',
                   'tau3_b15',
                   'tau1_sd_b15',
                   'tau2_sd_b15',
                   'tau3_sd_b15',
                   'tau1_b20',
                   'tau2_b20',
                   'tau3_b20',
                   'tau1_sd_b20',
                   'tau2_sd_b20',
                   'tau3_sd_b20',
                   'jetMass',
                   'jetMassSD',
                  ]

variables = ['jetconstEta_abs',
                'jetconstE_log',
                'jetconstPt_Jetlog',
                'charge',
                'isEle',
                'isPho',
                'isMuon',
                'isCh',
                'isNh',
                'delta_eta',
                'delta_phi',
                'deltaR_jet',
                'deltaR_subjet0',
                'deltaR_subjet1',
                'dxy',
                'dz',
               ]


allVariables = ['jetconstPt_log'] + variables + expertVariables
totalVar = len(allVariables)
variables_in_plots = '__'+str(totalVar)+'var__jetconstPt_log'


### Append input variables to batch, train, test, and example lists

In [101]:
batch = varDict['jetconstPt_log']
trainbatch = varDictTrain['jetconstPt_log']
testbatch = varDictTest['jetconstPt_log']

batch = batch.reshape((batch.shape[0], batch.shape[1], 1))
trainbatch = trainbatch.reshape((trainbatch.shape[0], trainbatch.shape[1], 1))
testbatch = testbatch.reshape((testbatch.shape[0], testbatch.shape[1], 1))

example_batch = trainbatch[mask_example]
example_labels = trainlabels[mask_example]
train_variables = [trainbatch]

example_variables = [example_batch]
train_variables = [trainbatch]
test_variables = [testbatch]

for var in variables:
    
    trainVar = varDictTrain[var]
    
    example_variables.append(trainVar[mask_example].reshape(trainVar[mask_example].shape[0], trainVar[mask_example].shape[1],1))
    train_variables.append(trainVar.reshape(trainVar.shape[0], trainVar.shape[1],1))
    test_variables.append(varDictTest[var].reshape(varDictTest[var].shape[0], varDictTest[var].shape[1],1))

for var in expertVariables:
    
    trainVar = varDictTrain[var]
    
    example_variables.append(trainVar[mask_example])
    train_variables.append(trainVar)
    test_variables.append(varDictTest[var])
    

print('labels', labels.shape)
print('batch', batch.shape)
print('train labels', trainlabels.shape)
print('train batch', trainbatch.shape)
print('test labels', testlabels.shape)
print('test batch', testbatch.shape)

labels (36455400, 2)
batch (1765556, 20, 1)
train labels (1103470, 2)
train batch (1103470, 20, 1)
test labels (441390, 2)
test batch (441390, 20, 1)


### Normalize x-axis for input variables

In [102]:
for i in range(len(test_variables)):
    minn = np.min(batch[i],axis=0)
    maxx = np.max(batch[i],axis=0)
    test_variables[i] = (test_variables[i] - np.min(test_variables[i]))/(np.max(test_variables[i]-np.min(test_variables[i])))
    train_variables[i] = (train_variables[i] - np.min(train_variables[i]))/(np.max(train_variables[i]-np.min(train_variables[i])))
    example_variables[i] = (example_variables[i] - np.min(example_variables[i]))/(np.max(example_variables[i]-np.min(example_variables[i])))
    
    

In [103]:
print('testing variables')
for tevar in test_variables:
    print(tevar.shape)
print()
print('training variables')
for trvar in train_variables:
    print(trvar.shape)
print()
print('example variables')
for exvar in example_variables:
    print(exvar.shape)

testing variables
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)
(441390, 20, 1)

training variables
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)
(1103470, 20, 1)

example variables
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)
(3679, 20, 1)


### Save arrays to test, train, example npz files

In [12]:
totalVar = len(allVariables)
nXvar = len(expertVariables)

file_extension = str(totalVar)+'var_'+str(nXvar)+'Xvar_'+'normalized.npz'

testname = 'test_variables_'+file_extension
trainname = 'train_variables_'+file_extension
examplename = 'example_variables_'+file_extension

np.savez('PreProcessing/'+day+'/'+testname, **{var:test for var, test in zip(allVariables, test_variables)}, labels=testlabels, variables=allVariables)
np.savez('PreProcessing/'+day+'/'+trainname, **{var:test for var, test in zip(allVariables, train_variables)}, labels=trainlabels, variables=allVariables)
np.savez('PreProcessing/'+day+'/'+examplename, **{var:test for var, test in zip(allVariables, example_variables)}, labels=example_labels, variables=allVariables)



In [13]:
!ls -t PreProcessing/{day}/*.npz | head -3

PreProcessing/072320/example_variables_17var_0Xvar_normalized.npz
PreProcessing/072320/train_variables_17var_0Xvar_normalized.npz
PreProcessing/072320/test_variables_17var_0Xvar_normalized.npz
