# Format data for training

In [30]:
# Inputs and options for the formatting step.

# Baseline settings; any sample-specific dict below may override these keys.
opts = {
    "jet_feats": ["pt", "eta", "phi", "en", "px", "py", "pz", "btag"],
    "njets": 10,
    "lep_feats": ["pt", "eta", "phi", "en", "px", "py", "pz"],
    "nleps": 2,
    "met_feats": ["phi", "sumEt", "px", "py"],
    "truth_feats": ["pt", "eta", "phi", "en", "px", "py", "pz"],
}

# Delphes sample, fully hadronic selection (MET features switched off).
delphes_tth_had = {
    "inpfile": '/scratch/snx3000/musella/delphes_tth.hd5',
    "outdir": '/scratch/snx3000/musella/delphes_tth_had',
    "selection": 'num_leptons == 0',
    "met_feats": None,
}

# CMS sample, fully hadronic selection (MET features switched off,
# b-tag column named "btagDeepCSV" instead of "btag").
cms_tth_had = {
    "inpfile": '/scratch/snx3000/musella/cms_tth_2.hd5',
    "outdir": '/scratch/snx3000/musella/cms_tth_had',
    "selection": 'num_leptons == 0',
    "met_feats": None,
    "jet_feats": ["pt", "eta", "phi", "en", "px", "py", "pz", "btagDeepCSV"],
}

# Pick the sample to process (alternative: delphes_tth_had).
choose = cms_tth_had

# Overlay the sample-specific options on top of the defaults.
opts = {**opts, **choose}

# Expose every option as a notebook-level variable (njets, outdir, ...);
# the cells below read these names directly.
globals().update(opts)

opts


{'jet_feats': ['pt', 'eta', 'phi', 'en', 'px', 'py', 'pz', 'btagDeepCSV'],
 'njets': 10,
 'lep_feats': ['pt', 'eta', 'phi', 'en', 'px', 'py', 'pz'],
 'nleps': 2,
 'met_feats': None,
 'truth_feats': ['pt', 'eta', 'phi', 'en', 'px', 'py', 'pz'],
 'inpfile': '/scratch/snx3000/musella/cms_tth_2.hd5',
 'outdir': '/scratch/snx3000/musella/cms_tth_had',
 'selection': 'num_leptons == 0'}

In [31]:
# Load the input table from `inpfile`; when a selection string is configured,
# keep only the rows that satisfy it.
df = pd.read_hdf(inpfile)
df = df if selection is None else df.query(selection)

# Placeholders for the arrays produced by the formatting cell; each one stays
# None when the corresponding section is skipped.
jetsa = hcanda = lepsa = meta = trutha = kina = None

In [32]:
# make dijet higgs candidate combination
def hcand(X):
    """Pick the jets of the first jet combination out of one event row.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on a frame whose
    first column is ``jets_cmb`` and whose remaining columns are the flattened
    per-jet features.  Relies on the notebook-level ``njf`` (number of features
    per jet), which is set by the formatting cell.

    Parameters
    ----------
    X : pandas.Series or numpy.ndarray
        One row: the combination entry followed by the jet feature values.
        Depending on the pandas version, ``raw=True`` hands over either a
        Series or a bare ndarray; both are accepted here.

    Returns
    -------
    numpy.ndarray
        float32 array with the features of the jets indexed by ``cmb[0,0]``,
        or a ``(2, njf)`` array of zeros when the combination entry holds
        lists instead of an index array.
    """
    # Normalise to an object ndarray: a raw ndarray has no `.values`, which
    # made the original `X[1:].values` fail, and positional access on a
    # label-indexed Series is deprecated.
    row = np.asarray(X, dtype=object)
    cmb = row[0]
    # NOTE(review): presumably an entry made of lists marks an event without a
    # valid combination -- confirm against the producer of `jets_cmb`.
    if isinstance(cmb[0], list):
        return np.zeros( (2,njf),np.float32 )
    jets = row[1:].reshape(1,-1,njf)
    # NOTE(review): assumes cmb is an integer index array with at least two
    # dimensions; the selected shape should match the (2, njf) fallback above
    # for the caller's np.vstack to be meaningful -- TODO confirm.
    return jets[:,cmb[0,0]].astype(np.float32)
    

In [33]:
# pad top kinematic fit solutions
def pad(X,npad=6):
    """Bring one event's kinematic-fit solutions to exactly ``npad`` entries.

    Parameters
    ----------
    X : array-like
        Solutions stacked along the first axis, each of shape ``(8, 4, 2)``.
        Anything with fewer than four dimensions (an empty array, NaN, None,
        ...) is treated as "no solutions".
    npad : int, optional
        Number of solutions to keep per event (default 6).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, npad, 8, 4, 2)``: missing solutions are
        zero-filled, surplus ones are dropped from the end.  The leading unit
        axis lets the caller stack events with ``np.vstack``.
    """
    sol_shape = (8,4,2)  # per-solution block shape, as hard-coded originally
    # np.ndim also copes with scalars / None, which have no `.shape`.
    if np.ndim(X) < 4:
        X = np.zeros((npad,)+sol_shape)
    else:
        X = np.asarray(X)
        nsol = X.shape[0]
        if nsol < npad:
            # pad up to npad (the original used a literal 6 here, which was
            # wrong for any non-default npad)
            X = np.vstack([X,np.zeros((npad-nsol,)+sol_shape)])
        elif nsol > npad:
            X = X[:npad]
    return X.reshape(-1,*X.shape)



In [34]:
# make output folder (pure Python instead of `! mkdir -p $outdir`, so paths
# containing spaces or shell metacharacters are handled correctly)
import os
os.makedirs(outdir, exist_ok=True)

In [35]:
from pyjlr.utils import make_p4

# Build the training arrays from `df` and save them as .npy files in `outdir`.
# Every 2D block that goes into the flat feature matrix is collected in `flats`.
flats = []

# --------------------------------------------------------------------------------------------------------------
# jets
if jet_feats is not None:
    print('formatting jets...')
    onejet = list(range(njets))
    for ijet in onejet:
        # presumably adds the four-momentum columns for this jet to df -- see pyjlr.utils.make_p4
        make_p4(df,'jets',ijet)
    njf = len(jet_feats)
    # jet-major column order: all features of jet 0, then jet 1, ...
    # This is what makes the reshape to (njets, njf) below valid.
    jet_feat_cols = ["jets_%s_%d" % (feat,jet) for jet in onejet for feat in jet_feats  ]
    jetsa = df[jet_feat_cols].values
    flats.append(jetsa)
    jetsa = jetsa.reshape(-1,njets,njf)
    np.save(outdir+"/jets",jetsa)
    print('done')

# --------------------------------------------------------------------------------------------------------------
# leptons
if lep_feats is not None:
    print('formatting leptons...')
    nlf = len(lep_feats)
    for ilep in range(nleps):
        make_p4(df,'leptons',ilep)
    # lepton-major column order, like the jets above.  The original looped over
    # features first, so the reshape to (nleps, nlf) mixed features of
    # different leptons within one row.  As a consequence the lepton columns of
    # the flat arrays are now lepton-major as well.
    lep_feat_cols = ["leptons_%s_%d" % (feat,lep) for lep in range(nleps) for feat in lep_feats  ]
    lepsa = df[lep_feat_cols].values
    flats.append(lepsa)
    lepsa = lepsa.reshape(-1,nleps,nlf)
    np.save(outdir+"/leps",lepsa)
    print('done')

# --------------------------------------------------------------------------------------------------------------
# met
if met_feats is not None:
    print('formatting met...')
    meta = df[ ["met_%s" % feat for feat in met_feats  ]  ].values
    flats.append(meta)
    np.save(outdir+"/met",meta)
    print('done')

# --------------------------------------------------------------------------------------------------------------
# flat array with all above
print('making flat (nokin) features...')
flata = np.hstack(flats)
np.save(outdir+"/flat_nokin",flata)
print('done')

# --------------------------------------------------------------------------------------------------------------
# jet combinations: higgs candidates and top kin fit solutions
# NOTE(review): the guard tests for a "jet_cmb" column while the body reads
# "jets_cmb" (and "kin_sols").  With the guard as written this section is
# skipped for inputs that only carry "jets_cmb", in which case flat.npy equals
# flat_nokin.npy.  Left unchanged because enabling it changes which files are
# produced -- confirm the intended column name.
if jet_feats is not None and "jet_cmb" in df.columns:
    print('formatting jet combinations...')
    twojets = list(itertools.combinations(onejet,2))

    twojets2ind ={  cmb:icomb for icomb,cmb in enumerate(twojets)  }
    jet_cols = ["jets_cmb"]+jet_feat_cols+["jets_jets_m2_%d%d" % x for x in twojets]

    # NOTE(review): this overwrites df["kin_sols"] with the padded arrays, so
    # the section is not safe to run twice on the same df.
    df["kin_sols"] = df["kin_sols"].apply(pad)

    hcanda = np.vstack( df[["jets_cmb"]+jet_feat_cols].apply(hcand,axis=1,raw=True).tolist() )
    kina = np.vstack(df["kin_sols"].tolist())

    # flatten everything but the event axis before adding to the flat features
    flats.append(hcanda.reshape(hcanda.shape[0],-1))
    flats.append(kina.reshape(kina.shape[0],-1))
    np.save(outdir+"/hcand",hcanda)
    np.save(outdir+"/kinsols",kina)
    print('done')

# --------------------------------------------------------------------------------------------------------------
# flat array with all above
print('making flat features...')
flata = np.hstack(flats)
np.save(outdir+"/flat",flata)
print('done')

# --------------------------------------------------------------------------------------------------------------
# target
print('making target...')
jlra = df["JLR"].values
np.save(outdir+"/target",jlra)
print('done')

# --------------------------------------------------------------------------------------------------------------
# truth level info
if truth_feats is not None:
    print('formatting truth...')
    ntf = len(truth_feats)
    truth_parts = ["top","atop","bottom","abottom"]
    # particle-major column order so that the reshape to (n particles, ntf)
    # keeps each particle's features together (the original was feature-major,
    # which scrambled the rows).
    truth_cols = ["%s_%s" % (part,feat) for part in truth_parts for feat in truth_feats  ]
    trutha = df[truth_cols].values
    trutha = trutha.reshape(-1,len(truth_parts),ntf)
    np.save(outdir+"/truth",trutha)
    print('done')
    

formatting jets...
done
formatting leptons...
done
making flat (nokin) features...
done
making flat features...
done
making target...
done
formatting truth...
done


In [36]:
# list output folder
! ls -ltrh $outdir

total 12G
-rw-r--r-- 1 musella d78 3.0G Jun 11 14:02 jets.npy
-rw-r--r-- 1 musella d78 528M Jun 11 14:03 leps.npy
-rw-r--r-- 1 musella d78 3.5G Jun 11 14:03 flat_nokin.npy
-rw-r--r-- 1 musella d78  38M Jun 11 14:03 target.npy
-rw-r--r-- 1 musella d78 3.5G Jun 11 14:03 flat.npy
-rw-r--r-- 1 musella d78 1.1G Jun 11 14:03 truth.npy
