# Electron Ion Collider Experiments: Tau Lepton ID

See eic-electron-id for an introduction to the physics, detectors, EIC, DIS, etc.

Tau identification with machine learning
* What is a tau particle? E.g. leptoquark events (reference for details)
* Separating tau jets from standard model jets. Machine learning.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the analysis table from CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv('data/LeptoAna_r05_p250_e20.csv')
# Disabled preprocessing steps, kept for reference:
#data = data.astype('float32')
# NOTE(review): data.info() reports nulls in the evtgen_tau_* / evtgen_uds_* columns,
# so a row-wise dropna would discard every row with a null there - confirm before re-enabling.
#data = data.dropna(axis=0)

In [3]:
# Summarise columns, dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14363 entries, 0 to 14362
Data columns (total 49 columns):
Row                          14363 non-null int64
event                        14363 non-null int64
evtgen_is_tau                14363 non-null int64
evtgen_tau_etotal            6556 non-null float64
evtgen_tau_eta               6556 non-null float64
evtgen_tau_phi               6556 non-null float64
evtgen_tau_decay_prong       14363 non-null int64
evtgen_tau_decay_hcharged    14363 non-null int64
evtgen_tau_decay_lcharged    14363 non-null int64
evtgen_is_uds                14363 non-null int64
evtgen_uds_etotal            6519 non-null float64
evtgen_uds_eta               6519 non-null float64
evtgen_uds_phi               6519 non-null float64
jet_id                       14363 non-null int64
jet_eta                      14363 non-null float64
jet_phi                      14363 non-null float64
jet_etotal                   14363 non-null float64
jet_etrans                   

In [4]:
# Preview a single row to check the column layout and typical values.
data.head(1)

Unnamed: 0,Row,event,evtgen_is_tau,evtgen_tau_etotal,evtgen_tau_eta,evtgen_tau_phi,evtgen_tau_decay_prong,evtgen_tau_decay_hcharged,evtgen_tau_decay_lcharged,evtgen_is_uds,...,jetshape_emcal_econe_r02,jetshape_emcal_econe_r03,jetshape_emcal_econe_r04,jetshape_emcal_econe_r05,tracks_count_r02,tracks_count_r04,tracks_rmax_r02,tracks_rmax_r04,tracks_chargesum_r02,tracks_chargesum_r04
0,2,0,1,34.829021,0.221435,0.760311,3,3,0,0,...,4.608803,4.749965,4.966215,5.408717,1,1,0.073429,0.073429,-1,-1


In [5]:
# replace values: DIS = 0, tau = 1
# note whitespace ' ' before ' DIS' and ' tau'
#map_replace = {
#'jet_type':{
#    ' DIS':0,
#    ' tau':1
#}
#}

#data.replace( map_replace, inplace=True )

In [6]:
# Class balance of the label column used as the classification target below.
data['evtgen_is_tau'].value_counts()

0    7807
1    6556
Name: evtgen_is_tau, dtype: int64

In [7]:
# Earlier feature/target choices, kept for reference:
#feature_cols = ['n_Above_0p1', 'eta_average', 'Delta_phi_std', 'tower_energy_sum']
#target_col = 'jet_type'
#feature_cols = ['tracks_count_r04', 'tracks_chargesum_r04', 'tracks_rmax_r04', 'jetshape_radius']

# Model input columns, grouped by column-name family. The concatenation order
# is the column order of `features`.
# Currently disabled (not passed to the model): jet_eta, jet_phi,
# jet_ncomp_above_10, tracks_count_r02, tracks_rmax_r02, tracks_chargesum_r02.
feature_cols = (
    # jet-level quantities (jet_e*, jet_p*, jet_m*)
    ['jet_etotal', 'jet_etrans', 'jet_ptotal', 'jet_ptrans', 'jet_minv', 'jet_mtrans']
    # component counts (jet_ncomp*)
    + ['jet_ncomp', 'jet_ncomp_above_0p1', 'jet_ncomp_above_1', 'jet_ncomp_emcal']
    # jetshape_* columns
    + ['jetshape_radius', 'jetshape_rms', 'jetshape_r90']
    + ['jetshape_econe_r01', 'jetshape_econe_r02', 'jetshape_econe_r03',
       'jetshape_econe_r04', 'jetshape_econe_r05']
    # jetshape_emcal_* columns
    + ['jetshape_emcal_radius', 'jetshape_emcal_rms', 'jetshape_emcal_r90']
    + ['jetshape_emcal_econe_r01', 'jetshape_emcal_econe_r02', 'jetshape_emcal_econe_r03',
       'jetshape_emcal_econe_r04', 'jetshape_emcal_econe_r05']
    # tracks_*_r04 columns
    + ['tracks_count_r04', 'tracks_rmax_r04', 'tracks_chargesum_r04']
)

# Binary label column (holds the values 0 and 1).
target_col = 'evtgen_is_tau'

# Split the table into the model inputs and the label.
features, target = data[feature_cols], data[target_col]

# Displayed as the cell output: number of rows per class.
target.value_counts()

0    7807
1    6556
Name: evtgen_is_tau, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# create training and testing vars
#X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.8)
#print (X_train.shape, y_train.shape)
#print (X_test.shape, y_test.shape)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same tools
# live in `sklearn.model_selection` (with a changed KFold signature, see below).
from sklearn.model_selection import cross_val_predict, KFold

# Per-class weights for the `class_weight=` option of the disabled alternatives
# below (class 0 weighted 100x relative to class 1). Unused by AdaBoost.
penalty = {
    0: 100,
    1: 1
}

# Classifier under test. The variable keeps the name `lr` from the first
# (logistic regression) attempt so the alternatives below can be swapped in.
#lr = LogisticRegression(class_weight=penalty)

#lr = DecisionTreeClassifier(class_weight=penalty, max_depth=30)

lr = AdaBoostClassifier(random_state=1)

#lr = RandomForestClassifier(class_weight=penalty, random_state=1, max_depth=20)
#lr = RandomForestClassifier(class_weight='balanced', random_state=1)
#lr = RandomForestClassifier(random_state=1)

# Shuffled 3-fold cross-validation. The legacy call KFold(n_samples, ...) used
# its default of 3 folds; in the current API the first argument is `n_splits`,
# so passing the sample count there would mean leave-one-out. Spell it out.
kf = KFold(n_splits=3, shuffle=True, random_state=1)

# Out-of-fold prediction for every row: each row is predicted by a model that
# did not see it during training. Results come back in the original row order.
predictions = cross_val_predict(lr, features, target, cv=kf)
#predictions = cross_val_predict(lr, features, target, cv=10)

#lr.fit( features, target )
#predictions = lr.predict(features)

# Reuse the index of `target` so the boolean comparisons below align row by
# row even if `data` no longer has a default 0..n-1 index.
predictions = pd.Series(predictions, index=target.index)

# Confusion-matrix masks (label 1 is the positive class) and their counts.
# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = int(fp_filter.sum())

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = int(tp_filter.sum())

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = int(fn_filter.sum())

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = int(tn_filter.sum())

# Rates
tpr = tp / (tp + fn)  # fraction of true label-1 rows predicted as 1
fpr = fp / (fp + tn)  # fraction of true label-0 rows predicted as 1

print( "True positive: "+str(tp))
print( "True negativee: "+str(tn))
print( "False positive: "+str(fp))
print( "False negative: "+str(fn))
print( "True Positive Rate: "+str(tpr) )
print( "False Positive Rate: "+str(fpr) )



True positive: 6020
True negativee: 7086
False positive: 721
False negative: 536
True Positive Rate: 0.9182428309945089
False Positive Rate: 0.09235301652363263
