In [2]:
# Import libraries
import pylab
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from scipy.stats.stats import pearsonr
%matplotlib inline

# Column names of the flattened 20x20 pixel image (20*20 = 400 columns)
pixels = ["pixel_{0}".format(i) for i in range(400)]

def to_image(df):
    """Turn the 400 pixel columns of `df` into an array of shape (n_rows, 20, 20, 1).

    The columns are selected through the module-level `pixels` list, which is
    looked up when the function is called.
    """
    flat = np.asarray(df[pixels])
    return flat.reshape(-1, 20, 20)[..., np.newaxis]

In [3]:
## Import the data ##

# Directory that holds one sub-directory per dataset
DATA_DIR = "/uscms_data/d3/bbonham/TrackerProject/TrackingDstar/LambaAnalyzer/output_of_postprocess/"

# List file names
datafilenames = ["SharedHits/NormalizedCharge/output_final.h5","SharedHits/AbsoluteCharge/output_final.h5",
                "Shared100NonShared/NormalizedCharge/output_final.h5","Shared100NonShared/AbsoluteCharge/output_final.h5",
                "AllHits/NormalizedCharge/output_final.h5","AllHits/AbsoluteCharge/output_final.h5"]

# Select file you want to use (index into datafilenames)
filenumber = 5
datafilename = datafilenames[filenumber]

# Import data
df = pd.read_hdf(DATA_DIR + datafilename, key="df", mode='r')

# Print dataframe info
# datafilename[:-16] strips the trailing "/output_final.h5" (16 characters)
print('\033[1m' + datafilename[:-16] + '\033[0m') # name printed in bold
df.info()  # info() prints its own report and returns None, so it is not wrapped in print

# Truth selections are kept as strings; later cells append them to a dataframe
# name and eval() the result, e.g. eval("df_test" + signalstring).
if filenumber in (0, 1):
    # Shared Hits signal region
    signalstring = "['nUniqueSimTracksInSharedHit']>=2"
    backgroundstring = "['nUniqueSimTracksInSharedHit']<=1"
elif filenumber in (2, 3, 4, 5):
    # Shared100NonShared and All Hits signal region
    signalstring = "['isSharedHit']==1"
    backgroundstring = "['isSharedHit']!=1"

IOError: File /uscms_data/d3/bbonham/TrackerProject/TrackingDstar/LambaAnalyzer/output_of_postprocess/AllHits/AbsoluteCharge/output_final.h5 does not exist

In [None]:
# Load the trained model that belongs to the selected dataset.
# The tag is the dataset path without its trailing "/output_final.h5"
# (16 characters), with '/' replaced by '_'.
model_tag = datafilename[:-16].replace('/','_')
model_path = '/uscms/home/bbonham/nobackup/TrackerProject/MergedHits/Brandon/Trained_Models_isShared=1/TrainedModel_' + model_tag + '.h5'
model = load_model(model_path)

In [None]:
# Rebuild the train/test partition that was used when the model was trained.
#
# The split is reproduced through the seed: it must be the same integer as at
# training time (always 10 here). Saving df_train and re-loading it would be the
# alternative; a consistent seed was chosen instead.
train_test_seed = 10
df_train, df_test = train_test_split(
    df,
    test_size=0.5,
    random_state=train_test_seed,
)
images_train, images_test = to_image(df_train), to_image(df_test)

In [None]:
# `signalstring` (set in the data-import cell) should be double-checked before plotting, or the ROC curve below may be wrong.

In [None]:
# Plot ROC curve

# Column 1 of the model output is used as the signal score
ret = model.predict(images_test)
# Truth labels (1 = signal, 0 = background) from the selection string; a plain
# integer mask also works when the test sample contains no signal rows.
y_true_test = eval("df_test"+signalstring).astype(int)
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_true_test, ret[:,1])
# Keep the area under its own name: rebinding `auc` to a number would shadow
# sklearn's auc() for the rest of the notebook.
auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(1)
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve (area = {:.3f})'.format(auc_keras))
plt.legend(loc='best')  # without a legend the curve label is never displayed
plt.show()

In [None]:
# Visualizing our training data by number of hits
n_sim_tracks_train = df_train["nUniqueSimTracksInSharedHit"]
merged_mask_train = n_sim_tracks_train > 1
print("shared hits with at least 2 sim tracks:")
# Fraction of training rows with more than one unique simulation track
print(float(merged_mask_train.sum())/len(n_sim_tracks_train))
# One bin per integer count 0..5
plt.hist(n_sim_tracks_train,histtype="step",bins=6,range=(-0.5,5.5))
plt.xlabel('# Unique Simulation Tracks')
plt.ylabel(r'Reconstructed $\Lambda$s')  # raw string: \L is not a valid escape sequence
plt.title("Distribution of Simulation Tracks in Training Data")
plt.show()

In [None]:
# Visualizing our training data by merged (2 hits or more) vs not merged (1 hit)
n_sim_tracks_train = df_train["nUniqueSimTracksInSharedHit"]
merged_mask_train = n_sim_tracks_train > 1
print("shared hits with at least 2 sim tracks:")
print(float(merged_mask_train.sum())/len(n_sim_tracks_train))
# Histogram the mask as 0/1 integers rather than booleans
plt.hist(merged_mask_train.astype(int),histtype="step",bins=2,range=(-0.5,1.5), color = 'r')
plt.title("Distribution of Merged Hits vs Not Merged Hits in Training Data")
plt.xticks([0,1],("Not Merged","Merged"))
plt.ylabel(r'Reconstructed $\Lambda$s')  # raw string: \L is not a valid escape sequence
plt.show()

In [None]:
# The following two cells depend on the definition of signal and background, which I'm not thrilled to change right now. 
# I would rather alter them automatically. 

In [None]:
# Separating signal and background for train and test data then preparing histograms for discriminant plot
# Uses `ret` (model predictions on images_test) from the ROC cell.

# Testing data
signal = ret[ eval("df_test"+signalstring) ]
background = ret[ eval("df_test"+backgroundstring) ]
signal_plt = signal[:,1]          # column 1 is used as the discriminant
background_plt = background[:,1]

# Training data
ret_train = model.predict(images_train)
signal_train = ret_train[ eval("df_train"+signalstring) ]
background_train = ret_train[ eval("df_train"+backgroundstring) ]
signal_train_plt = signal_train[:,1]
background_train_plt = background_train[:,1]

# np.histogram returns (counts, bin_edges); each histogram is computed once.
# Y_* holds the 30 counts, X_* the 31 bin edges.
Y_back_hist, X_back_hist = np.histogram(background_train_plt, bins = 30, range = (0,1))
Y_sig_hist, X_sig_hist = np.histogram(signal_train_plt, bins = 30, range = (0,1))

In [None]:
# Plotting Signal and Background Discriminants
# Test samples are drawn as filled histograms, train samples as points at the bin centres.
plt.figure(figsize=(15,8))
plt.hist(signal_plt, alpha = 0.5, color = 'b', label = 'Signal (test)', range = (0,1), bins = 30,log=True)
plt.hist(background_plt, color = 'r', alpha = 0.5, label = 'Background (test)', range = (0,1), bins = 30,log=True)
# Bin centres computed from the stored bin edges instead of a hard-coded half-width offset
back_centers = 0.5 * (X_back_hist[:-1] + X_back_hist[1:])
sig_centers = 0.5 * (X_sig_hist[:-1] + X_sig_hist[1:])
plt.scatter(back_centers, Y_back_hist, label='Background (train)', color ='r')
plt.scatter(sig_centers, Y_sig_hist, label='Signal (train)', color='b')
plt.legend(loc='best')
plt.xlabel('Discriminant')
plt.ylabel(r'Reconstructed $\Lambda$s')  # raw string: \L is not a valid escape sequence
plt.title('CNN Signal and Background Discriminants')
plt.show()

In [None]:
# Adding Discriminant branch to Testing data only, could easily do it for Training data but we don't need to (for our purpose below)
n_events_df_test = len(df_test.index) # number of events in df_test
discriminants_test = model.predict(images_test) # returns (prob(notmergedhit), prob(merged hit)), the second number is our discriminant
disc = discriminants_test[:, 1]
# DataFrame.insert raises if the column already exists, so remove a column left
# over from an earlier run of this cell first.
if "Discriminants" in df_test.columns:
    del df_test["Discriminants"]
df_test.insert(0, "Discriminants", disc) #inserting new column in our dataframe at position 0

In [None]:
# Pixel Picture Script
def cluster_map(data_f):
    """Draw one 20x20 charge map per cluster, in order of increasing discriminant.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Must contain the columns "Discriminants", "nUniqueSimTracksInSharedHit"
        and "pixel_0" ... "pixel_399".

    Returns
    -------
    None. One line of text is printed and one figure is shown per cluster.

    Notes
    -----
    After sorting, the last row (largest discriminant) is dropped, so N input
    rows produce N - 1 pictures. The calls in this notebook pass
    n_pictures + 1 rows for that reason.
    """
    # sorting by discriminant ascending; the slice drops the highest-discriminant row
    data_f = data_f.sort_values("Discriminants", ascending = True)[0:len(data_f.index)-1]

    pixelColumns = ["pixel_%i" % x for x in range(400)] # 400
    cluster_pixels = data_f[pixelColumns].values
    # Columns are looked up by name rather than by their position in the frame
    discriminants = data_f["Discriminants"].values
    n_sim_tracks = data_f["nUniqueSimTracksInSharedHit"].values

    for row, hit in enumerate(cluster_pixels):
        # Flat pixel index k maps to x = k % 20 and y = k // 20
        lit = np.flatnonzero(hit)
        x_pos = lit % 20
        y_pos = lit // 20
        charge = hit[lit]

        dis = np.around(discriminants[row], decimals = 5)
        text = r"Reconstructed $\Lambda$ " + str(row +1) + " with discriminant " + str(dis)
        print(text)

        # Plotting Colorbar
        fig = plt.figure()
        cax = plt.hist2d(x_pos,y_pos,weights=charge,bins=(20,20),range=((0,20),(0,20)))
        cb = fig.colorbar(cax[3])
        if charge.size:
            # an all-zero cluster has no maximum charge to use as a tick
            cb.set_ticks([0,charge.max()])
        cb.set_label("Scaled ADC",rotation=-90)

        # Title, uses truth value
        if n_sim_tracks[row] == 1 : # 1 hit
            plt.title('         Not Merged Cluster Charge Map (discriminant = {:.3f})'.format(dis))
        elif n_sim_tracks[row] > 1 : # 2 or more hits
            plt.title('         Merged Cluster Charge Map (discriminant = {:.3f})'.format(dis))
        else : # 0 hits
            plt.title('         Null Cluster Charge Map (discriminant = {:.3f})'.format(dis))

        plt.xlabel("x")
        plt.ylabel("y")
        plt.show()

### Full Discriminant Distribution

In [None]:
# Distribution of the discriminant over all testing events (100 bins, log-scale counts)
df_test["Discriminants"].plot(kind='hist', title = "Discriminant Distribution", bins = 100, figsize=(12,6),log=True);

### Discriminants between [0.0,0.1]

In [None]:
# Charge-map pictures for testing events with discriminant below 0.1

n_pictures = 1
# cluster_map is handed the first n_pictures + 1 matching rows
low_disc_events = df_test[df_test["Discriminants"] < 0.1]
cluster_map(low_disc_events[:(1+n_pictures)])

In [None]:
# Histogram of the discriminant for testing events below 0.1
low_disc_mask = df_test["Discriminants"] < 0.1
df_test.loc[low_disc_mask, "Discriminants"].plot(kind='hist', title = "Discriminant in the Range [0, 0.1]", bins = 100, figsize=(12,6));

### Discriminants between [0.4,0.6]

In [None]:
# Pixel Picture Script for [0.4, 0.6] discriminant testing events

n_pictures = 1
# Lower edge is 0.4 so the pictures come from the same window as the section
# heading and the matching histogram cell.
mid_disc_mask = (df_test["Discriminants"] > 0.4) & (df_test["Discriminants"] < 0.6)
cluster_map(df_test[mid_disc_mask][:(1+n_pictures)]);

In [None]:
# Histogram of the discriminant for testing events strictly between 0.4 and 0.6
mid_window = (df_test["Discriminants"] > 0.4) & (df_test["Discriminants"] < 0.6)
df_test.loc[mid_window, "Discriminants"].plot(kind='hist', title = "Discriminant in the Range [0.4, 0.6]", bins = 100, figsize=(12,6));

### Discriminants Between [0.9,1.0]

In [None]:
# Charge-map pictures for testing events with discriminant above 0.9
n_pictures = 1
# cluster_map is handed the first n_pictures + 1 matching rows
high_disc_events = df_test[df_test["Discriminants"] > 0.9]
cluster_map(high_disc_events[:(1+n_pictures)])

In [None]:
# Histogram of the discriminant for testing events above 0.9
high_disc_mask = df_test["Discriminants"] > 0.9
df_test.loc[high_disc_mask, "Discriminants"].plot(kind='hist' , title = "Discriminant in the Range [0.9, 1]", bins = 100, figsize=(12,6));

We look at the 2D charge maps flattened (summed) onto either the x or the y axis. <br>
pixelCharge$_i$ is the charge in bin $i$ of the flattened map, sum(pixelCharge) = 1  <br>
pixelPos$_i$ is the position of bin $i$ of the flattened map (range is 1 to 20)  <br>
$\overline{x} = \sum_{i=1}^{20} \text{pixelCharge}_i \ \times \ \text{pixelPos}_i$  <br>
$x^{RMS} = \sqrt{  \sum_{i=1}^{20} (\overline{x} - \text{pixelPos}_i \ \times \ \text{pixelCharge}_i)^2}$

In [None]:
# Script for dx, dy, x^RMS, y^RMS

def _projection_stats(profile):
    """Return (extent, spread) of a 1-D charge projection.

    extent: number of bins from the first to the last bin with positive charge
            (0 when no bin is positive).
    spread: sqrt(sum_i (mean - profile[i]*(i+1))**2), summed over the bins with
            positive charge, where mean = sum_i profile[i]*(i+1) over all bins.
            NOTE(review): this is the notebook's own "RMS" definition, not a
            charge-weighted standard deviation.
    """
    occupied = np.flatnonzero(profile > 0)
    if occupied.size == 0:
        # no positive charge: nothing to measure
        return 0, 0.0
    weighted = profile * np.arange(1, len(profile) + 1)  # charge times 1-based position
    mean = weighted.sum()
    spread = np.sqrt(np.sum((mean - weighted[occupied]) ** 2))
    return occupied[-1] - occupied[0] + 1, spread

# including all testing events
# NOTE: `data` is an alias of df_test, not a copy; columns inserted into `data`
# later therefore also appear in df_test.
data = df_test
# sorting by discriminant ascending
#data = data.sort_values("Discriminants", ascending = True)[0:len(data.index)-1]

shareds =  data
pixelColumns = ["pixel_%i" % x for x in range(400)]
# Deliberately not named `pixels`: to_image() reads the global `pixels` list of
# column names, and rebinding it to an array would break later to_image() calls.
cluster_pixels = shareds[pixelColumns].values
width = [] # dx
length = [] # dy
x_rms = []
y_rms = []

for row, hit in enumerate(cluster_pixels):
    # Coordinates and charges of the non-zero pixels; only needed by the
    # optional per-event display further down.
    lit = np.flatnonzero(hit)
    x_pos = lit % 20
    y_pos = lit // 20
    charge = hit[lit]

    # 20x20 map with the row order flipped (flat index k goes to row 19 - k // 20)
    arra = np.asarray(hit, dtype=float).reshape(20, 20)[::-1]

    # Evaluating width, height, x^RMS and y^RMS of every cluster
    charge_in_x = np.sum(arra,axis=0)
    charge_in_y = np.sum(arra,axis=1)
    wid, x_spread = _projection_stats(charge_in_x)
    le, y_spread = _projection_stats(charge_in_y)
    width.append(wid)
    length.append(le)
    x_rms.append(x_spread)
    y_rms.append(y_spread)

 
 # Uncomment this section to display each event information and pixel pictures
     # Event info
#    text = "Event " + str(row +1) + " with discriminant " + str(np.around(data.iloc[row,0], decimals = 5)) + ", width "+ str(wid)+ " and length " +str(le)
#    print text
#    # Plotting Colorbar  
#    fig=plt.figure()
#    ax=plt.axes()
#    cax = plt.hist2d(x_pos,y_pos,weights=charge,bins=(6,6),range=((0,6),(0,6)))
#    cb=fig.colorbar(cax[3])
#    cb.set_ticks([0,max(charge)])
#    cb.set_label("normalized adc",rotation=-90)
#
#    # Title, uses truth value
#    hits_column = df_test.columns.get_loc("nUniqueSimTracksInSharedHit")
#    if data.iloc[row,hits_column] == 1 : # 1 hit
#        plt.title("Not Merged Cluster Charge Map")
#    elif data.iloc[row,hits_column] > 1 : # 2 or more hits
#        plt.title("Merged Cluster Charge Map")
#    else : # 0 hits
#        plt.title("Null Cluster Charge Map")
#    
#    plt.xlabel("x")
#    plt.ylabel("y")
#    plt.show() 



In [None]:
# Adding the width/height and width-RMS/height-RMS branches to the testing data.
# Each column is inserted at position 1, so the resulting column order is
# Width RMS, Length RMS, Width, Length.
shape_columns = [
    ("Length", length),
    ("Width", width),
    ("Length RMS", y_rms),
    ("Width RMS", x_rms),
]
for column_name, column_values in shape_columns:
    # DataFrame.insert raises if the column already exists; drop the column
    # left by an earlier run so the cell can be re-executed.
    if column_name in data.columns:
        del data[column_name]
    data.insert(1, column_name, column_values)

#data.info()

In [None]:
# Separating signal and background for testing data
signal = data[ eval("data"+signalstring) ]
background = data[ eval("data"+backgroundstring) ]

# Plotting CNN Signal and Background Width
signal_plt_width = signal["Width"]
background_plt_width = background["Width"]
# density=True normalizes each histogram, so the two classes are compared by shape
plt.hist(signal_plt_width,  alpha = 0.5, color = 'b', label = 'Merged Hit', range = (1,10), bins = 9, density=True)
plt.hist(background_plt_width, alpha = 0.5, color = 'r', label = 'Not Merged Hit', range = (1,10), bins = 9, density=True)
plt.legend(loc='best')
# raw strings: \D and \L are not valid escape sequences
plt.title(r'Merged vs Not Merged $\Delta x$ in Testing Data')
plt.xlabel(r'$\Delta x$')
plt.ylabel(r'Reconstructed $\Lambda$s')
plt.show()

In [None]:
# Plotting CNN Signal and Background Height
# Uses `signal` / `background` from the width cell.
signal_plt_height = signal["Length"]
background_plt_height = background["Length"]
# density=True normalizes each histogram, so the two classes are compared by shape
plt.hist(signal_plt_height, alpha = 0.5, color = 'b', label = 'Merged Hit', range = (1,20), bins = 19, density=True)
plt.hist(background_plt_height, color = 'r', alpha = 0.5, label = 'Not Merged Hit', range = (1,20), bins = 19,density=True)
plt.legend(loc='best')
# raw strings: \D and \L are not valid escape sequences
plt.title(r'Merged vs Not Merged $\Delta y$ in Testing Data')
plt.xlabel(r'$\Delta y$')
plt.ylabel(r'Reconstructed $\Lambda$s')
plt.show()

In [None]:
# 2D histogram of Length vs Width in Testing for correlation in Merged Hits
x = signal_plt_width
y = signal_plt_height
correlation = pearsonr(x,y)[0]  # Pearson r; computed but currently not shown in the title
# raw strings: \D is not a valid escape sequence
plt.xlabel(r'$\Delta x$')
plt.ylabel(r'$\Delta y$')
plt.title(r'$\Delta y$ vs $\Delta x$ for Merged Clusters in Testing') #(corr = {:.4f})'.format(correlation))
plt.hist2d(x, y,bins = (20, 20), range =[[0, 20], [0,20]], cmap=plt.cm.jet) # cmap=plt.cm.Greys
plt.colorbar()
plt.show()

In [None]:
# 2D histogram of Length vs Width in Testing for correlation in Not Merged Hits
x = background_plt_width
y = background_plt_height
correlation = pearsonr(x,y)[0]  # Pearson r; computed but currently not shown in the title
# raw strings: \D is not a valid escape sequence
plt.xlabel(r'$\Delta x$')
plt.ylabel(r'$\Delta y$')
plt.title(r'$\Delta y$ vs $\Delta x$ for Not Merged Clusters in Testing') # (corr = {:.4f})'.format(correlation))
plt.hist2d(x, y,  bins = (20, 20), range =[[0, 20], [0,20]],  cmap=plt.cm.jet)
plt.colorbar()
plt.show()

In [None]:
# Plotting CNN Signal and Background Width RMS
# Same truth selection as the Width/Length cells: signalstring / backgroundstring
# follow the selected dataset instead of a hard-coded nUniqueSimTracksInSharedHit cut.
signal = data[ eval("data"+signalstring) ]
background = data[ eval("data"+backgroundstring) ]
signal_plt_width = signal["Width RMS"]
background_plt_width = background["Width RMS"]
# density=True normalizes each histogram, so the two classes are compared by shape
plt.hist(signal_plt_width, alpha = 0.5, color = 'b', label = 'Merged Hit', range = (0,100), bins = 80, density=True)
plt.hist(background_plt_width, color = 'r', alpha = 0.5, label = 'Not Merged Hit', range = (0,100), bins = 80, density=True)
plt.legend(loc='best')
plt.title('Merged vs Not Merged $x^{RMS}$ in Testing Data (Normalized)')
plt.xlabel('$x^{RMS}$')
plt.ylabel(r'Reconstructed $\Lambda$s')  # raw string: \L is not a valid escape sequence
plt.show()

In [None]:
# Plotting CNN Signal and Background Height RMS
# Uses `signal` / `background` from the Width RMS cell.
signal_plt_height = signal["Length RMS"]
background_plt_height = background["Length RMS"]
# density=True normalizes each histogram, so the two classes are compared by shape
plt.hist(signal_plt_height, alpha = 0.5, color = 'b', label = 'Merged Hit', range = (0,400), bins = 100, density=True)
plt.hist(background_plt_height, color = 'r', alpha = 0.5, label = 'Not Merged Hit', range = (0,400), bins = 100, density=True)
plt.legend(loc='best')
plt.title('Merged vs Not Merged $y^{RMS}$ in Testing Data (Normalized)')
plt.xlabel('$y^{RMS}$')
plt.ylabel(r'Reconstructed $\Lambda$s')  # raw string: \L is not a valid escape sequence
plt.show()

In [None]:
# 2D histogram of y^RMS against x^RMS for the signal ("Merged") clusters in the testing data
x, y = signal_plt_width, signal_plt_height
correlation = pearsonr(x, y)[0]  # Pearson r; computed but not shown in the title
plt.figure(figsize=(15,10))
plt.hist2d(x, y, bins=(60, 60), range=[[0, 100], [0, 100]], cmap=plt.cm.jet) # cmap=plt.cm.Greys
plt.colorbar()
plt.title('$y^{RMS}$ vs $x^{RMS}$ for Merged Clusters in Testing') #(corr = {:.4f})'.format(correlation))
plt.xlabel('$x^{RMS}$')
plt.ylabel('$y^{RMS}$')
plt.show()

In [None]:
# 2D histogram of y^RMS against x^RMS for the background ("Not Merged") clusters in the testing data
x, y = background_plt_width, background_plt_height
correlation = pearsonr(x, y)[0]  # Pearson r; computed but not shown in the title
plt.figure(figsize=(15,10))
# Axis range here is 0-60 on both axes
plt.hist2d(x, y, bins=(60, 60), range=[[0, 60], [0, 60]], cmap=plt.cm.jet) # cmap=plt.cm.Greys
plt.colorbar()
plt.title('$y^{RMS}$ vs $x^{RMS}$ for Not Merged Clusters in Testing') # (corr = {:.4f})'.format(correlation))
plt.xlabel('$x^{RMS}$')
plt.ylabel('$y^{RMS}$')
plt.show()