## Reads in the H1 DNN training and generates an output ROOT TTree with the DNN output added

This version saves all of the selected branches from the input TTree that go into the pandas df, all of the derived variables, and the DNN output.

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.colors import LogNorm
from matplotlib import rc
from numpy import inf
import os

from os import listdir


import uproot3

import matplotlib as mpl

from datetime import datetime
import subprocess

# Global matplotlib defaults for every figure drawn in this notebook.
mpl.rcParams.update({
    'font.size': 19,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'text.usetex' : False,
    'axes.labelsize': 18,
    'legend.frameon': False,
})



In [4]:
# Set to False on a machine without a GPU; read later when TensorFlow is configured.
has_gpu = True


# Name stem of the trained regression: the saved model directory and the
# scaler pickles are both looked up from this string further down.
regression_training_name = 'training_h1_reg_v4b_tau1b_2'

#data_dir = '/data/owen/DIS-reco/h1-fullsim-2021-09-27-v2a'
#data_dir = '/data/owen/DIS-reco/h1-fullsim-2021-10-12-v4a'
#data_dir = '/data/owen/DIS-reco/h1-2021-10-14-v5f'
#data_dir = 'MLAssistedUnfolding-files'
data_dir = '.'

input_file = 'rapgap_ep0607_tau1b_2_test.root'


dataset_type = 'Rapgap'
#dataset_type = 'Data'

# Both Monte Carlo dataset types read `input_file` from `data_dir`.
# 'Django' is handled as MC in the branch selection and event selection
# below, so it gets the same path handling as 'Rapgap' here.
if dataset_type in ('Rapgap', 'Django') :
    input_file = '%s/%s' % (data_dir,input_file)

# Data always comes from the fixed merged-data file name.
if dataset_type == 'Data' :
    input_file = '%s/all-h1-data.root' % data_dir


output_root_file = '%s/dnn-output-h1-v2-%s.root' % (data_dir, dataset_type)

print('\n\n Saving output in %s\n\n' % output_root_file )


#-- for testing
#max_events =   10000
#max_events =   100000

#-- for all events (kept as an int because it is used as an entry index)
max_events = int(1e9)









 Saving output in ./dnn-output-h1-v2-Rapgap.root




### Read in the input TTree.

The input TTree variables should be fairly self-explanatory from their names.

The obs_x, obs_y, and obs_Q2 are arrays that contain the following calculations:

0 - electron

1 - E0 E Sigma

2 - E0 Theta Sigma

3 - DA

4 - hadron

5 - ISigma

6 - IDA

7 - Theta Sigma gamma

8 - eSigma

They are there for convenience to compare the DNN predictions with the standard methods.  They are not inputs to the DNN, so don't worry about them if you don't need or want them.


In [5]:
# Maps the array-index suffix of a flattened obs_x / obs_y / obs_Q2 column
# to the short name of the reconstruction method stored at that index.
method_names = {
    '[0]': 'e',
    '[1]': 'E0ESigma',
    '[2]': 'E0ThetaSigma',
    '[3]': 'DA',
    '[4]': 'h',
    '[5]': 'ISigma',
    '[6]': 'IDA',
    '[7]': 'ThetaSigmagamma',
    '[8]': 'eSigma',
}

for index_tag, method in method_names.items() :
    print( ' %s is %s' % (index_tag, method))

 [0] is e
 [1] is E0ESigma
 [2] is E0ThetaSigma
 [3] is DA
 [4] is h
 [5] is ISigma
 [6] is IDA
 [7] is ThetaSigmagamma
 [8] is eSigma


In [7]:
%%time


ur_file = uproot3.open(input_file)

print (ur_file.keys()) 
# The minitree sits in a directory named after the dataset type.
ur_tree = ur_file['%s/minitree' % dataset_type ]
print(ur_tree)
ur_tree.show()

# Branches copied into the pandas data frame.  The order here fixes the
# column order of the data frame.
if dataset_type in ('Rapgap', 'Django') :
    selected_branches = [
        # radiation flags
        'has_isr', 'has_fsr',
        # electron calorimeter / closest-photon quantities
        'tower_sum_40', 'n_towers_40',
        'eta_pho_closest_to_ebeam', 'e_pho_closest_to_ebeam', 'phi_pho_closest_to_ebeam',
        # ('obs_x', 'obs_y', 'obs_Q2' and the from_tlv_gen_* branches are not read for MC)
        # detector-level electron and hadronic final state
        'obs_e_e', 'obs_e_phi',
        'obs_e_theta',
        'obs_hfs_e', 'obs_hfs_pz', 'obs_hfs_pt', 'obs_hfs_phi',
        'obs_DeltaPhi',
        'obs_e_trk_e',
        # new minimal set of gen-level variables
        'beam_e_e', 'beam_p_e',
        'gen_HFS_Sigma', 'gen_HFS_T',
        'gen_eUncomb_E', 'gen_eUncomb_theta',
        'gen_eRecomb_E', 'gen_eRecomb_theta',
        # tau1b
        'gen_tau1b',
        'obs_tau1bs', 'obs_tau1be', 'obs_tau1bda',
        'obs_cHFSs_M', 'obs_cHFSs_pt', 'obs_cHFSs_theta', 'obs_cHFSs_phi',
        'obs_cHFSs_N',
        'obs_cHFSs_dRavg',
        'obs_cHFSs_dR2avg',
        'obs_cHFSs_Empz',
        'obs_cHFSs_Eppz',
        'obs_cHFSe_Empz',
        'obs_cHFSe_Eppz',
        'obs_cHFSe_keE0',
        'obs_cHFSs_keE0',
        'obs_cHFSs_kesE0',
        # event weight
        'wgt',
    ]
else :
    # NOTE(review): this list has no 'obs_e_theta', 'obs_DeltaPhi' or
    # 'obs_cHFS*' branches, which the derived-variable and DNN-input cells
    # below read -- confirm the branch list before running on Data.
    selected_branches = [
        'tower_sum_40', 'n_towers_40',
        'eta_pho_closest_to_ebeam', 'e_pho_closest_to_ebeam', 'phi_pho_closest_to_ebeam',
        'obs_x', 'obs_y', 'obs_Q2',
        'obs_e_e', 'obs_e_pz', 'obs_e_pt', 'obs_e_phi',
        'obs_hfs_e', 'obs_hfs_pz', 'obs_hfs_pt', 'obs_hfs_phi',
        'obs_dphi',
        'Empz', 'obs_e_trk_e',
        'beam_e_e', 'beam_p_e',
    ]

pandas_df   =  ur_tree.pandas.df( selected_branches, entrystop=max_events, flatten=True )

print('\n\n Number of entries in pandas_df:  %d ' % pandas_df.shape[0] )

[b'Rapgap;1']
<TTree b'minitree' at 0x7fb49a41dc40>
wgt                        (no streamer)              asdtype('>f4')
gen_HFS_Sigma              (no streamer)              asdtype('>f4')
gen_HFS_T                  (no streamer)              asdtype('>f4')
gen_eUncomb_E              (no streamer)              asdtype('>f4')
gen_eUncomb_theta          (no streamer)              asdtype('>f4')
gen_eRecomb_E              (no streamer)              asdtype('>f4')
gen_eRecomb_theta          (no streamer)              asdtype('>f4')
obs_e_theta                (no streamer)              asdtype('>f4')
obs_DeltaPhi               (no streamer)              asdtype('>f4')
gen_tau1b                  (no streamer)              asdtype('>f4')
obs_tau1bs                 (no streamer)              asdtype('>f4')
obs_tau1bda                (no streamer)              asdtype('>f4')
obs_tau1bida               (no streamer)              asdtype('>f4')
obs_tau1be                 (no streamer)           

### Add any derived variables here

In [40]:
%%time
## tau1b: to be cleaned up
#
# Detector-level derived columns.  Each entry is a pandas `eval` assignment;
# they are applied in list order because later expressions use columns
# created by earlier ones.
obs_derived_expressions = [
    # --- electron / HFS building blocks (temporary naming)
    'obs_e_pz  = obs_e_e*cos(obs_e_theta)',
    'obs_e_pt  = obs_e_e*sin(obs_e_theta)',
    'obs_hfs_Sigma = obs_hfs_e - obs_hfs_pz',
    'obs_hfs_T     = obs_hfs_pt',

    # --- useful derived quantities
    'obs_e_Sigma      = obs_e_e - obs_e_pz',
    # Sigma/pt = (1-cos(theta))/sin(theta), i.e. tan(theta/2)
    'obs_e_tantheta   = obs_e_Sigma/obs_e_pt',
    'obs_e_Pt2        = obs_e_pt*obs_e_pt',
    'obs_hfs_tangamma = obs_hfs_Sigma/obs_hfs_pt',

    ###########################################################
    ### DIS-DNN originals
    ###########################################################
    'obs_hfs_Empz = obs_hfs_e - obs_hfs_pz',
    'obs_e_Empz = obs_e_e - obs_e_pz',

    'obs_event_Empz = obs_hfs_Empz + obs_e_Empz',

    # (hfs - e)/sqrt(2) and (hfs + e)/sqrt(2) combinations
    'rot_pt1 = 0.70710678 * obs_hfs_pt - 0.70710678 * obs_e_pt',
    'rot_pt2 = 0.70710678 * obs_hfs_pt + 0.70710678 * obs_e_pt',

    'rot_Empz1 = 0.70710678 * obs_hfs_Empz - 0.70710678 * obs_e_Empz',
    'rot_Empz2 = 0.70710678 * obs_hfs_Empz + 0.70710678 * obs_e_Empz',

    # Calorimeter/track energy ratio, capped at 4.  The upper mask uses >= so
    # that a value exactly at the cap stays at the cap instead of becoming 0.
    # NOTE(review): the training notebook should apply the same clipping.
    'e_ecal_over_trk_ratio = tower_sum_40/obs_e_trk_e',
    'e_ecal_over_trk_ratio = (e_ecal_over_trk_ratio<4)*e_ecal_over_trk_ratio + (e_ecal_over_trk_ratio>=4)*4',

    # Azimuthal difference electron - closest photon: wrap into (-pi, pi),
    # shift negative values by 2*pi, then flag "phi == 0" entries with -1.
    'dphi_pho_closest_to_ebeam = obs_e_phi - phi_pho_closest_to_ebeam',
    ( 'dphi_pho_closest_to_ebeam = (abs(dphi_pho_closest_to_ebeam)<3.14159265)*(dphi_pho_closest_to_ebeam)'
      '+(dphi_pho_closest_to_ebeam>3.14159265)*(dphi_pho_closest_to_ebeam-2*3.14159265)'
      ' + (dphi_pho_closest_to_ebeam<-3.14159265)*(dphi_pho_closest_to_ebeam+2*3.14159265)' ),
    'dphi_pho_closest_to_ebeam = (dphi_pho_closest_to_ebeam>0)*dphi_pho_closest_to_ebeam + (dphi_pho_closest_to_ebeam<0)*(dphi_pho_closest_to_ebeam+2*3.14159265)',
    'dphi_pho_closest_to_ebeam = (phi_pho_closest_to_ebeam!=0)*(dphi_pho_closest_to_ebeam)+(phi_pho_closest_to_ebeam==0)*(-1)',

    # Photon energy capped at 30.  This column is overwritten in place, so the
    # upper mask must be >=: with a strict > every already-capped value (== 30)
    # would be zeroed if this cell is executed a second time.
    'e_pho_closest_to_ebeam = (e_pho_closest_to_ebeam<30)*e_pho_closest_to_ebeam + (e_pho_closest_to_ebeam>=30)*30',

    # Tower multiplicity capped at 7.
    'n_towers_40 = (n_towers_40<7)*n_towers_40 + (n_towers_40>=7)*7',

    # True when the event has neither ISR nor FSR flagged.
    'has_norad = (has_isr==0) and (has_fsr==0)',

    ###########################################################

    # --- inelasticity
    'obs_kine_ye   = 1. - obs_e_Sigma/2./beam_e_e',
    'obs_kine_yda  = obs_hfs_tangamma/(obs_hfs_tangamma+obs_e_tantheta)',
    'obs_kine_yh   = obs_hfs_Sigma/2./beam_e_e',
    'obs_kine_ys   = obs_hfs_Sigma/(obs_hfs_Sigma+obs_e_Sigma)',

    # --- Q2
    'obs_kine_Q2e   = obs_e_Pt2/(1.-obs_kine_ye)',
    'obs_kine_Q2da  = 4.*beam_e_e*beam_e_e/obs_e_tantheta/(obs_hfs_tangamma+obs_e_tantheta)',
    'obs_kine_Q2s   = obs_e_Pt2/(1.-obs_kine_ys)',
    'obs_kine_Q2ida = obs_e_Pt2/(1.-obs_kine_yda)',

    # --- x
    'obs_kine_xe   = 1./beam_p_e * obs_kine_Q2e   /4. /beam_e_e/obs_kine_ye',
    'obs_kine_xda  = 1./beam_p_e * obs_kine_Q2da  /4. /beam_e_e/obs_kine_yda',
    'obs_kine_xis  = 1./beam_p_e * obs_kine_Q2s   /2. /obs_hfs_Sigma',
    # TODO (from the original author): check whether theta is correct here.
    'obs_kine_xida = 1./beam_p_e * obs_e_e * (1+cos(obs_e_theta))/2./obs_kine_yda',
]

for expression in obs_derived_expressions :
    pandas_df.eval( expression, inplace=True )



CPU times: user 278 ms, sys: 27.2 ms, total: 305 ms
Wall time: 248 ms


In [41]:
%%time
# Generator-level derived columns, built from the recombined electron
# (gen_eRecomb_*) and the generated hadronic final state (gen_HFS_*).
# Applied in list order: later expressions use columns created earlier.
gen_derived_expressions = [
    # useful derived quantities
    'gen_e_Sigma      = gen_eRecomb_E * (1. - cos(gen_eRecomb_theta))',
    'gen_e_tantheta   = sin(gen_eRecomb_theta/2.)/cos(gen_eRecomb_theta/2.)',
    'gen_e_Pt2        = gen_eRecomb_E*gen_eRecomb_E * sin(gen_eRecomb_theta)* sin(gen_eRecomb_theta)',
    'gen_HFS_tangamma = gen_HFS_Sigma/gen_HFS_T',

    # inelasticity
    'gen_kine_ye   = 1. - gen_e_Sigma/2./beam_e_e',
    'gen_kine_yda  = gen_HFS_tangamma/(gen_HFS_tangamma+gen_e_tantheta)',
    'gen_kine_ys   = gen_HFS_Sigma/(gen_HFS_Sigma+gen_e_Sigma)',

    # Q2
    'gen_kine_Q2e   = gen_e_Pt2/(1.-gen_kine_ye)',
    'gen_kine_Q2da  = 4.*beam_e_e*beam_e_e/gen_e_tantheta/(gen_HFS_tangamma+gen_e_tantheta)',
    'gen_kine_Q2s   = gen_e_Pt2/(1.-gen_kine_ys)',

    # x
    'gen_kine_xis  = 1./beam_p_e * gen_kine_Q2s   /2. /gen_HFS_Sigma',

    # target variables: logs of the Sigma-method x, y and Q2
    'gen_log_x  = log(gen_kine_xis)',
    'gen_log_y  = log(gen_kine_ys)',
    'gen_log_Q2 = log(gen_kine_Q2s)',
]

for expression in gen_derived_expressions :
    pandas_df.eval( expression, inplace=True )


CPU times: user 103 ms, sys: 16.1 ms, total: 119 ms
Wall time: 96.9 ms


In [42]:
%%time

    

# Transverse / longitudinal balance between the electron and the HFS, and
# the HFS polar angle.  Relies on obs_e_pt, obs_hfs_Empz and obs_e_Empz
# computed by the derived-variable cell above.
balance_expressions = (
    'obs_ptbal = 1. - obs_e_pt / obs_hfs_pt',
    'obs_pzbal = 1. - (obs_hfs_Empz + obs_e_Empz)/2./beam_e_e',
    'obs_hfs_theta = arctan2(obs_hfs_pt,obs_hfs_pz)',
)

for expression in balance_expressions :
    pandas_df.eval( expression, inplace=True )


CPU times: user 26.2 ms, sys: 9.01 ms, total: 35.2 ms
Wall time: 21.9 ms


## Apply any event selection here.

In [43]:
%%time

print('\n\n Number of entries in pandas_df before selection :  %d ' % pandas_df.shape[0] )

# Cuts are applied one after another, in this order.
selection_cuts = [
    'obs_event_Empz > 0',
    'obs_event_Empz > 46 and obs_event_Empz < 62',
]

# The generator-level Q2 cut only exists for Monte Carlo samples.
if dataset_type in ('Rapgap', 'Django') :
    selection_cuts.append('gen_kine_Q2s > 200')

selection_cuts.extend([
    'obs_hfs_pt > 0',
    'e_ecal_over_trk_ratio > 0',
])

for cut in selection_cuts :
    pandas_df = pandas_df.query(cut)

print('\n\n Number of entries in pandas_df after selection:  %d ' % pandas_df.shape[0] )







 Number of entries in pandas_df before selection :  198588 


 Number of entries in pandas_df after selection:  198588 
CPU times: user 142 ms, sys: 40.6 ms, total: 183 ms
Wall time: 178 ms


## Set up machine learning stuff

In [44]:
import os

# Expose only GPU 0; set before TensorFlow is imported below.
os.environ['CUDA_VISIBLE_DEVICES']="0"

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pickle import load

#-- Ben suggested to try this.  2021-08-07
from tensorflow.keras.callbacks import EarlyStopping
earlystopping = EarlyStopping(patience=10,
               verbose=True,
               restore_best_weights=True)

print(tf.config.list_physical_devices())

if has_gpu :
    physical_devices = tf.config.list_physical_devices('GPU')
    if not physical_devices :
        # has_gpu was requested but TensorFlow sees no GPU: continue on CPU
        # instead of failing with an IndexError on physical_devices[0].
        print(' has_gpu is True but no GPU device was found; running on CPU.')
    # Enable on-demand memory allocation on every visible GPU
    # (CUDA_VISIBLE_DEVICES above restricts this to device 0).
    for gpu_device in physical_devices :
        tf.config.experimental.set_memory_growth(gpu_device, True)

#####physical_devices = tf.config.list_physical_devices('CPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Load the inputs for the DNN and transform them.   Don't change anything here!

In [45]:
# Columns fed to the DNN, one feature per column, in exactly this order.
# NOTE: do not reorder, add or remove entries -- presumably the input scaler
# and the network were fitted with this column order (confirm against the
# training notebook).
dnn_input_columns = [
    'e_ecal_over_trk_ratio',
    'n_towers_40',
    'eta_pho_closest_to_ebeam',
    'e_pho_closest_to_ebeam',
    'dphi_pho_closest_to_ebeam',
    'obs_e_e',
    'obs_hfs_pz',
    'obs_hfs_e',
    'rot_pt1',
    'rot_Empz1',
    'rot_pt2',
    'obs_pzbal',
    'obs_ptbal',
    'obs_DeltaPhi',
    'obs_kine_ys',

    'obs_cHFSs_pt',
    'obs_cHFSs_M',
    'obs_cHFSs_theta',
    'obs_cHFSs_phi',
    'obs_cHFSs_N',
    'obs_cHFSs_dRavg',

    'obs_cHFSs_dR2avg',
    'obs_cHFSs_Empz',
    'obs_cHFSs_Eppz',
    'obs_cHFSe_Empz',
    'obs_cHFSe_Eppz',
    'obs_cHFSe_keE0',
    'obs_cHFSs_keE0',
    'obs_cHFSs_kesE0',
]

# np.c_ stacks the 1-D column arrays side by side: shape (n_events, n_features).
X = np.c_[ tuple( pandas_df[column].to_numpy() for column in dnn_input_columns ) ]


#-- Load the scaler transformations!  These are essential when reusing the training with a different dataset.
# NOTE(review): pickle files must come from a trusted source (unpickling can run code).
# The with-block closes the file handle once the scaler has been read.
with open('%s-scalers/input_scaler.pkl' % regression_training_name, 'rb') as input_scaler_file :
    scaler = load( input_scaler_file )
X = scaler.transform(X)

    

## Set up the regression network

In [46]:
# The saved regression network is stored under "<training name>_regression".
regression_model_path = '%s_regression' % regression_training_name
model_r = tf.keras.models.load_model( regression_model_path )
model_r.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1920      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               66048     
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              5

In [47]:
%%time

# Evaluate the regression network on the scaled inputs, 1000 events per batch.
mypreds_r = model_r.predict(
    X,
    batch_size=1000,
)

CPU times: user 586 ms, sys: 184 ms, total: 769 ms
Wall time: 371 ms


### Undo the variable transformations

In [48]:
# Load the target scaler saved with the training.  The with-block closes the
# file handle once the scaler has been read.
# NOTE(review): pickle files must come from a trusted source (unpickling can run code).
with open('%s-scalers/target_scaler.pkl' % regression_training_name , 'rb') as target_scaler_file :
    scalerY = load( target_scaler_file )

# Undo the target scaling, then exponentiate.
# Presumably the network was trained on log targets (cf. gen_log_x, gen_log_y,
# gen_log_Q2 above) -- confirm against the training notebook.
inv_trans_pred = scalerY.inverse_transform(mypreds_r)
pred_vals = np.exp( inv_trans_pred )

### Save the results in an output root file

This saves all of the variables in the pandas_df, including the derived ones, in addition to the DNN outputs.

In [49]:
# Columns present before the DNN outputs are attached.
pandas_df.columns

Index(['has_isr', 'has_fsr', 'tower_sum_40', 'n_towers_40',
       'eta_pho_closest_to_ebeam', 'e_pho_closest_to_ebeam',
       'phi_pho_closest_to_ebeam', 'obs_e_e', 'obs_e_phi', 'obs_e_theta',
       'obs_hfs_e', 'obs_hfs_pz', 'obs_hfs_pt', 'obs_hfs_phi', 'obs_DeltaPhi',
       'obs_e_trk_e', 'beam_e_e', 'beam_p_e', 'gen_HFS_Sigma', 'gen_HFS_T',
       'gen_eUncomb_E', 'gen_eUncomb_theta', 'gen_eRecomb_E',
       'gen_eRecomb_theta', 'gen_tau1b', 'obs_tau1bs', 'obs_tau1be',
       'obs_tau1bda', 'obs_cHFSs_M', 'obs_cHFSs_pt', 'obs_cHFSs_theta',
       'obs_cHFSs_phi', 'obs_cHFSs_N', 'obs_cHFSs_dRavg', 'obs_cHFSs_dR2avg',
       'obs_cHFSs_Empz', 'obs_cHFSs_Eppz', 'obs_cHFSe_Empz', 'obs_cHFSe_Eppz',
       'obs_cHFSe_keE0', 'obs_cHFSs_keE0', 'obs_cHFSs_kesE0', 'wgt',
       'obs_e_pz', 'obs_e_pt', 'obs_hfs_Sigma', 'obs_hfs_T', 'obs_e_Sigma',
       'obs_e_tantheta', 'obs_e_Pt2', 'obs_hfs_tangamma', 'obs_hfs_Empz',
       'obs_e_Empz', 'obs_event_Empz', 'rot_pt1', 'rot_pt2', 'rot_Empz1

### Add the DNN outputs to the pandas data frame

In [50]:
# Column i of pred_vals is stored under the i-th name below.
dnn_output_branches = ('dnn_x', 'dnn_Q2', 'dnn_y', 'dnn_tau1b')

for output_index, branch_name in enumerate(dnn_output_branches) :
    pandas_df[branch_name] = pred_vals[:, output_index]

In [51]:
# Columns present after the DNN outputs have been attached.
pandas_df.columns

Index(['has_isr', 'has_fsr', 'tower_sum_40', 'n_towers_40',
       'eta_pho_closest_to_ebeam', 'e_pho_closest_to_ebeam',
       'phi_pho_closest_to_ebeam', 'obs_e_e', 'obs_e_phi', 'obs_e_theta',
       'obs_hfs_e', 'obs_hfs_pz', 'obs_hfs_pt', 'obs_hfs_phi', 'obs_DeltaPhi',
       'obs_e_trk_e', 'beam_e_e', 'beam_p_e', 'gen_HFS_Sigma', 'gen_HFS_T',
       'gen_eUncomb_E', 'gen_eUncomb_theta', 'gen_eRecomb_E',
       'gen_eRecomb_theta', 'gen_tau1b', 'obs_tau1bs', 'obs_tau1be',
       'obs_tau1bda', 'obs_cHFSs_M', 'obs_cHFSs_pt', 'obs_cHFSs_theta',
       'obs_cHFSs_phi', 'obs_cHFSs_N', 'obs_cHFSs_dRavg', 'obs_cHFSs_dR2avg',
       'obs_cHFSs_Empz', 'obs_cHFSs_Eppz', 'obs_cHFSe_Empz', 'obs_cHFSe_Eppz',
       'obs_cHFSe_keE0', 'obs_cHFSs_keE0', 'obs_cHFSs_kesE0', 'wgt',
       'obs_e_pz', 'obs_e_pt', 'obs_hfs_Sigma', 'obs_hfs_T', 'obs_e_Sigma',
       'obs_e_tantheta', 'obs_e_Pt2', 'obs_hfs_tangamma', 'obs_hfs_Empz',
       'obs_e_Empz', 'obs_event_Empz', 'rot_pt1', 'rot_pt2', 'rot_Empz1

In [52]:
# branch_dict: output branch name -> dtype; data_dict: output branch name -> values.
branch_dict, data_dict = {}, {}

for column in pandas_df.columns :
    branch_name = column
    # Replace an array-index suffix such as '[3]' by '_<method name>'.
    for index_tag, method in method_names.items() :
        if index_tag in column :
            print( 'found %s in %s' % (index_tag, column))
            branch_name = column.replace( index_tag, '_%s' % method)
    # Strip any brackets that are left (index not listed in method_names).
    branch_name = branch_name.replace('[','').replace(']','')
    column_values = pandas_df[column]
    print( ' key  %s , dict_key %s' % (column, branch_name) )
    print( ' dtype for %s is ' % column, column_values.dtype)
    branch_dict[branch_name] = column_values.dtype
    data_dict[branch_name] = column_values.to_numpy()
    
    

 key  has_isr , dict_key has_isr
 dtype for has_isr is  int8
 key  has_fsr , dict_key has_fsr
 dtype for has_fsr is  int8
 key  tower_sum_40 , dict_key tower_sum_40
 dtype for tower_sum_40 is  float32
 key  n_towers_40 , dict_key n_towers_40
 dtype for n_towers_40 is  int64
 key  eta_pho_closest_to_ebeam , dict_key eta_pho_closest_to_ebeam
 dtype for eta_pho_closest_to_ebeam is  float32
 key  e_pho_closest_to_ebeam , dict_key e_pho_closest_to_ebeam
 dtype for e_pho_closest_to_ebeam is  float64
 key  phi_pho_closest_to_ebeam , dict_key phi_pho_closest_to_ebeam
 dtype for phi_pho_closest_to_ebeam is  float32
 key  obs_e_e , dict_key obs_e_e
 dtype for obs_e_e is  float32
 key  obs_e_phi , dict_key obs_e_phi
 dtype for obs_e_phi is  float32
 key  obs_e_theta , dict_key obs_e_theta
 dtype for obs_e_theta is  float32
 key  obs_hfs_e , dict_key obs_hfs_e
 dtype for obs_hfs_e is  float32
 key  obs_hfs_pz , dict_key obs_hfs_pz
 dtype for obs_hfs_pz is  float32
 key  obs_hfs_pt , dict_key obs_h

In [53]:
# Display the output-branch-name -> dtype mapping that defines the new tree.
branch_dict

{'has_isr': dtype('int8'),
 'has_fsr': dtype('int8'),
 'tower_sum_40': dtype('float32'),
 'n_towers_40': dtype('int64'),
 'eta_pho_closest_to_ebeam': dtype('float32'),
 'e_pho_closest_to_ebeam': dtype('float64'),
 'phi_pho_closest_to_ebeam': dtype('float32'),
 'obs_e_e': dtype('float32'),
 'obs_e_phi': dtype('float32'),
 'obs_e_theta': dtype('float32'),
 'obs_hfs_e': dtype('float32'),
 'obs_hfs_pz': dtype('float32'),
 'obs_hfs_pt': dtype('float32'),
 'obs_hfs_phi': dtype('float32'),
 'obs_DeltaPhi': dtype('float32'),
 'obs_e_trk_e': dtype('float32'),
 'beam_e_e': dtype('float32'),
 'beam_p_e': dtype('float32'),
 'gen_HFS_Sigma': dtype('float32'),
 'gen_HFS_T': dtype('float32'),
 'gen_eUncomb_E': dtype('float32'),
 'gen_eUncomb_theta': dtype('float32'),
 'gen_eRecomb_E': dtype('float32'),
 'gen_eRecomb_theta': dtype('float32'),
 'gen_tau1b': dtype('float32'),
 'obs_tau1bs': dtype('float32'),
 'obs_tau1be': dtype('float32'),
 'obs_tau1bda': dtype('float32'),
 'obs_cHFSs_M': dtype('float3

In [54]:
%%time

# Write every data-frame column (inputs, derived variables and DNN outputs)
# to a tree named 'dnnout'.  The with-block closes the output file once the
# tree has been filled.
with uproot3.recreate( output_root_file ) as root_file3 :

    # Declare the branches (name -> dtype), then fill them in a single call.
    root_file3['dnnout'] = uproot3.newtree( branch_dict )

    root_file3['dnnout'].extend( data_dict )

CPU times: user 2.45 s, sys: 27.5 ms, total: 2.48 s
Wall time: 2.5 s
