In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, precision_score, log_loss, recall_score, classification_report, f1_score, average_precision_score

def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    return plt.cm.get_cmap(name, n)

def roc_curve_and_score(label, pred_proba):
    """Returns the false positive rate (fpr), true positive rate(tpr) and
    the area under the curve (auc) of the ROC curve.

    Parameters
    ----------
    label : serie
        True binary labels.

    pred_proba : serie
        Target scores, can either be probability estimates of the positive class, 
        confidence values, or non-thresholded measure of decisions.

    Returns
    ------
    fpr: 
        ndarray of shape (>2,)

    tpr:
        ndarray of shape (>2,)
    auc:
        float
    """
    fpr, tpr, _ = roc_curve(label.ravel(), pred_proba.ravel())
    roc_auc = roc_auc_score(label.ravel(), pred_proba.ravel())
    return fpr, tpr, roc_auc

def roc_plot(names, label, probs):
    '''Plots the background efficiency (fpr) vs. signal efficiency (tpr).

    Parameters
    ----------
    names : list
        Name of the algorithms.

    label: serie
        True label of every event.

    probs : list
        Target scores, can either be probability estimates of the positive class, 
        confidence values, or non-thresholded measure of decisions.

    Returns
    ------
    ax:
        The axis for the plot.
    '''
    # Creating the list of colors
    main_colors = ['darkorange', 'green', 'crimson', 'blue', 'green', 'red', 'purple', 'pink', 'gray', 'olive', 'cyan', 'indigo']
    colors = main_colors[:len(names)]
    # Creating the figure an the axis
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    # Setting some parameters
    matplotlib.rcParams.update({'font.size': 14})
    plt.grid()

    # Plotting the curves
    for name, prob in zip(names, probs):
        rgb = np.random.rand(3,)
        fpr, tpr, roc_auc = roc_curve_and_score(label, prob)
        plt.plot(fpr, tpr, color=rgb, lw=2,
                label='{} AUC={:.3f}'.format(name, roc_auc))

    # Plotting the line for a random classifier
    plt.plot([1, 0], [0, 1], color='navy', lw=1, linestyle='--')

    # Adding the information to the plot
    plt.legend(loc="lower right")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('1 - Specificity')
    plt.ylabel('Recall')
    plt.title('ROC curve')
    plt.show()

    return ax

In [1]:
import pandas as pd
from benchtools.src.datatools import ascii_column

In [2]:
masterkey = ascii_column('../../events_LHCO2020_BlackBox1.masterkey')

In [6]:
masterkey.values

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [10]:
pd.Series(masterkey.iloc[:,0]).values

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
df_bb.assign(2100=pd.Series(df_key.iloc[:,0]).values)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-13-7f11cc8752e2>, line 1)

In [23]:
import numpy as np
import pandas as pd
a = np.zeros((2,10))
b = np.full((2, 10), 7.)

In [10]:
a

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [11]:
b

array([[7., 7., 7., 7., 7., 7., 7., 7., 7., 7.],
       [7., 7., 7., 7., 7., 7., 7., 7., 7., 7.]])

In [21]:
con = np.concatenate((a.T,b.T),axis=0)
con

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.],
       [7., 7.]])

In [24]:
pd.DataFrame(con)

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [18]:
dic = {}

In [19]:
dic['clf'] = [1.5,2,318]

In [20]:
dic['clf2'] = [3,5,8,6]

In [21]:
print(dic)

{'clf': [1.5, 2, 318], 'clf2': [3, 5, 8, 6]}


In [23]:
import pandas as pd
pd.DataFrame.from_dict(dic, orient='index').reset_index().rename(columns={'index':'clf'})

Unnamed: 0,clf,0,1,2,3
0,clf,1.5,2,318,
1,clf2,3.0,5,8,6.0


In [2]:
from benchtools.src.clustering import build_features
build_features(path_data='..\..\events_anomalydetection.h5', nbatch=2, outname='test_building', chunksize=100)

Part 1/2


100%|██████████| 100/100 [00:02<00:00, 42.16it/s]


Part 2/2


100%|██████████| 100/100 [00:02<00:00, 42.21it/s]


Done
Merging files


In [5]:
import pandas as pd
df = pd.read_csv("..\data\\test_building.csv")
df.head()

Unnamed: 0,pT_j1,m_j1,eta_j1,phi_j1,E_j1,tau_21_j1,nhadrons_j1,pT_j2,m_j2,eta_j2,phi_j2,E_j2,tau_21_j2,nhadrons_j2,m_jj,deltaR_j12,n_hadrons,label
0,1286.727685,106.912129,0.185508,-2.763676,1313.290435,0.624659,36,1283.220733,63.164215,0.064989,0.393688,1287.481934,0.713248,33,2580.489568,3.159663,109.0,0.0
1,1354.39407,614.269108,0.826505,1.365524,1943.559886,0.311688,84,1325.613761,439.06415,-0.874319,-1.786248,1916.370744,0.276881,97,3859.315047,3.581406,208.0,0.0
2,1214.955723,645.865619,-0.196786,2.040545,1396.840654,0.238205,119,1072.462085,113.76884,0.143831,-1.09033,1089.53063,0.726963,59,2480.769725,3.149348,196.0,0.0
3,1285.227873,516.835248,0.328693,2.975321,1450.485926,0.013429,65,1220.251279,174.796077,0.294854,-0.322661,1285.618789,0.706361,89,2609.893413,3.298155,183.0,0.0
4,1210.415787,129.499352,-0.744836,-2.883347,1567.3453,0.42355,54,1091.785816,155.362262,1.060534,0.264977,1772.340209,0.787662,57,3313.488835,3.629229,169.0,1.0


In [7]:
import pickle

models = pickle.load(open('sklearn_models.sav', 'rb'))

In [8]:
models

[]

In [9]:
df = pd.read_csv('../data/test_building.csv')
df.shape

(200, 18)

In [10]:
df

Unnamed: 0,pT_j1,m_j1,eta_j1,phi_j1,E_j1,tau_21_j1,nhadrons_j1,pT_j2,m_j2,eta_j2,phi_j2,E_j2,tau_21_j2,nhadrons_j2,m_jj,deltaR_j12,n_hadrons,label
0,1286.727685,106.912129,0.185508,-2.763676,1313.290435,0.624659,36,1283.220733,63.164215,0.064989,0.393688,1287.481934,0.713248,33,2580.489568,3.159663,109.0,0.0
1,1354.394070,614.269108,0.826505,1.365524,1943.559886,0.311688,84,1325.613761,439.064150,-0.874319,-1.786248,1916.370744,0.276881,97,3859.315047,3.581406,208.0,0.0
2,1214.955723,645.865619,-0.196786,2.040545,1396.840654,0.238205,119,1072.462085,113.768840,0.143831,-1.090330,1089.530630,0.726963,59,2480.769725,3.149348,196.0,0.0
3,1285.227873,516.835248,0.328693,2.975321,1450.485926,0.013429,65,1220.251279,174.796077,0.294854,-0.322661,1285.618789,0.706361,89,2609.893413,3.298155,183.0,0.0
4,1210.415787,129.499352,-0.744836,-2.883347,1567.345300,0.423550,54,1091.785816,155.362262,1.060534,0.264977,1772.340209,0.787662,57,3313.488835,3.629229,169.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1279.238347,122.028943,1.308153,-0.872599,2541.998010,0.537335,41,1072.828954,69.763470,-0.006886,2.275757,1075.120214,0.893122,38,2874.510989,3.411960,228.0,0.0
196,1594.468365,56.990307,0.466066,2.758662,1771.715684,0.330372,20,1424.730799,147.054933,0.588076,-0.445507,1684.703437,0.779314,48,3026.954860,3.206491,101.0,0.0
197,1207.849912,100.425203,1.631779,0.254791,3207.536132,0.746782,42,1130.133300,48.448257,-0.463122,-2.919449,1254.447766,0.676397,20,3742.912140,3.803211,81.0,0.0
198,1492.891237,267.991206,-0.166400,3.131674,1537.148653,0.040135,66,1454.718112,286.601910,-0.620268,-0.016186,1767.042145,0.775242,82,3074.076771,3.180412,203.0,0.0


In [11]:
from benchtools.src.clustering import jets, event_features, cluster_events, build_features

build_features(path_data="../data/events_anomalydetection_tiny.h5", nbatch=1, outname='test_building_nb', chunksize=10)

Part 1/1


100%|██████████| 10/10 [00:00<00:00, 49.01it/s]

Done
Merging files





In [None]:
df = pd.read_csv('../data/test_building_nb.csv')
df.shape

In [4]:
with open('try.txt') as f:
    lines = lines = [line.rstrip('\n') for line in f]
    
lines

['Uclustermodel.sav', 'GAN-AEmodel.sav']

In [1]:
import os
import pickle
import pandas as pd
from benchtools.src.clustering import jets, event_features, cluster_events, build_features
from benchtools.src.datatools import save_df

In [4]:
event = pd.read_hdf("../data/events_anomalydetection_tiny.h5", stop=1)
event_as_serie = event.iloc[0]
jets_first_event = jets(event_as_serie, R = 1.0, p = -1, minpt=20)
pickle.dump(jets_first_event, open('../data/jets_firts_event.sav', 'wb'))

TypeError: self.constits,self.jet cannot be converted to a Python object for pickling

In [4]:
import h5py
import numpy as np

In [5]:
f = h5py.File('MyDataset.h5', 'a')
for i in range(10):

  # Data to be appended
  new_data = np.ones(shape=(100,64,64)) * i
  new_label = np.ones(shape=(100,1)) * (i+1)

  if i == 0:
    # Create the dataset at first
    f.create_dataset('data', data=new_data, compression="gzip", chunks=True, maxshape=(None,64,64))
    f.create_dataset('label', data=new_label, compression="gzip", chunks=True, maxshape=(None,1)) 
  else:
    # Append new data to it
    f['data'].resize((f['data'].shape[0] + new_data.shape[0]), axis=0)
    f['data'][-new_data.shape[0]:] = new_data

    f['label'].resize((f['label'].shape[0] + new_label.shape[0]), axis=0)
    f['label'][-new_label.shape[0]:] = new_label


In [6]:
import pandas as pd

In [7]:
mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
df = pd.DataFrame(mydict)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [9]:
df.iloc[1:3]

Unnamed: 0,a,b,c,d
1,100,200,300,400
2,1000,2000,3000,4000


In [None]:
mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
df = pd.DataFrame(mydict)
df

In [1]:
import nexusformat.nexus as nx
f = nx.nxload('..\data\\tf_model_log.h5')
print(f.tree)

root:NXroot
  @backend = 'tensorflow'
  @keras_version = '2.7.0'
  @model_config = '{"class_name": "Sequential", "config": {"name"...'
  @training_config = '{"loss": "binary_crossentropy", "metrics": [[{...'
  model_weights:NXgroup
    @backend = 'tensorflow'
    @keras_version = '2.7.0'
    @layer_names = ['batch_normalization', 'dense', 'batch_normal...
    batch_normalization:NXgroup
      @weight_names = ['batch_normalization/gamma:0', 'batch_normali...
      batch_normalization:NXgroup
        beta:0 = float32(14)
        gamma:0 = float32(14)
        moving_mean:0 = float32(14)
        moving_variance:0 = float32(14)
    batch_normalization_1:NXgroup
      @weight_names = ['batch_normalization_1/gamma:0', 'batch_norma...
      batch_normalization_1:NXgroup
        beta:0 = float32(512)
        gamma:0 = float32(512)
        moving_mean:0 = float32(512)
        moving_variance:0 = float32(512)
    batch_normalization_2:NXgroup
      @weight_names = ['batch_normalization_2/gamma:0'

In [4]:
# Recreate the exact same model, including its weights and the optimizer
new_model = load_model('..\data\\tf_model_log.h5')

# Show the model architecture
new_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (BatchN  (None, 14)               56        
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 512)               7680      
                                                                 
 batch_normalization_1 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 batch_normalization_2 (Batc  (None, 512)              2

In [5]:
# Importing the data 
df = pd.read_csv("..\data\RD_dataset.csv")
df.head()

Unnamed: 0,pT_j1,m_j1,eta_j1,phi_j1,E_j1,tau_21_j1,nhadrons_j1,pT_j2,m_j2,eta_j2,phi_j2,E_j2,tau_21_j2,nhadrons_j2,m_jj,deltaR_j12,n_hadrons,label
0,1286.727685,106.912129,0.185508,-2.763676,1313.290435,0.624659,36,1283.220733,63.164215,0.064989,0.393688,1287.481934,0.713248,33,2580.489568,3.159663,109.0,0.0
1,1354.39407,614.269108,0.826505,1.365524,1943.559886,0.311688,84,1325.613761,439.06415,-0.874319,-1.786248,1916.370744,0.276881,97,3859.315047,3.581406,208.0,0.0
2,1214.955723,645.865619,-0.196786,2.040545,1396.840654,0.238205,119,1072.462085,113.76884,0.143831,-1.09033,1089.53063,0.726963,59,2480.769725,3.149348,196.0,0.0
3,1285.227873,516.835248,0.328693,2.975321,1450.485926,0.013429,65,1220.251279,174.796077,0.294854,-0.322661,1285.618789,0.706361,89,2609.893413,3.298155,183.0,0.0
4,1210.415787,129.499352,-0.744836,-2.883347,1567.3453,0.42355,54,1091.785816,155.362262,1.060534,0.264977,1772.340209,0.787662,57,3313.488835,3.629229,169.0,1.0


In [6]:
from benchtools.src.datatools import separate_data
df_RD = df.sample(100000, random_state = 1)
X, y = separate_data(df_RD, standarize=False)
X.drop(['m_jj', 'm_j1','m_j2'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training y 30% test

In [7]:
new_model.predict(X_test)

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [9]:
# Evaluate the restored model
loss, acc = new_model.evaluate(X_test, y_test, verbose=2)
print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

print(new_model.predict(X_test).shape)

938/938 - 4s - loss: 3026.4949 - binary_accuracy: 0.9099 - 4s/epoch - 5ms/step
Restored model, accuracy: 90.99%
(30000, 1)


In [1]:
from importlib.resources import path
import os
import argparse
import pickle
import os.path 
import pandas as pd
import numpy as np
from tqdm import tqdm
#from tabulate import tabulate
from math import ceil
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# Importing the classifiers
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans

# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model

# Importing the metrics
from sklearn.metrics import precision_score, log_loss, recall_score, f1_score, balanced_accuracy_score

In [2]:
pickle.load(open('..\data\sklearn_models_log.sav', 'rb'))

[('RandomForestClassifier',
  Pipeline(steps=[('ss', StandardScaler()),
                  ('clf', RandomForestClassifier(random_state=1))])),
 ('GradientBoostingClassifier',
  Pipeline(steps=[('ss', RobustScaler()),
                  ('clf', GradientBoostingClassifier(random_state=4))])),
 ('QuadraticDiscriminantAnalysis',
  Pipeline(steps=[('ss', RobustScaler()),
                  ('clf', QuadraticDiscriminantAnalysis())])),
 ('MLPClassifier',
  Pipeline(steps=[('ss', StandardScaler()),
                  ('clf', MLPClassifier(random_state=7))])),
 ('KMeans',
  Pipeline(steps=[('ss', StandardScaler()),
                  ('clf', KMeans(n_clusters=2, random_state=15))]))]