In [1]:
# --- Standard library -------------------------------------------------------
import json
import os
import time
from datetime import datetime
from pathlib import Path

# Silence TensorFlow's native (C++) logging. The variable has to be in the
# environment *before* TensorFlow is imported, otherwise it has no effect on
# the messages emitted while the library is loading.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Optional helper: wait for a free GPU before TensorFlow is imported.
# Import failure and wait() failure are reported separately so the message
# matches what actually went wrong; only Exception is caught so that a manual
# kernel interrupt is not silently swallowed.
try:
    import waitGPU
except ImportError:
    print("Failed to import waitGPU")
else:
    try:
        waitGPU.wait(ngpu=1)
    except Exception as err:
        print("waitGPU.wait failed |", err)

# --- Third-party ------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# --- Project-local ----------------------------------------------------------
from utils import do_tsne, do_pca, do_gif, df_plot, df_plot2

waitGPU: Waiting for the following conditions, checking every 10 seconds. 
Failed to import waitGPU


In [2]:
class BlackBox:
    """Self-training binary classifier over per-row histogram bins.

    Workflow (see ``self_train``): a small dense network is trained on the
    labelled rows of ``train.csv``; every run found in ``test.csv`` is then
    predicted, rows predicted with high confidence are appended to the
    training set (label 1 or 0), the remaining rows are collected as
    anomalies (label 2), and the network is retrained before the next run.

    All artefacts are written below ``<working_dir>/t-<threshold>-<timestamp>``.
    """

    def __init__(self, data_dir, working_dir, threshold=0.9):
        """Create the per-experiment output directory and reset all state.

        Params:
            data_dir - directory that contains ``train.csv`` and ``test.csv``
            working_dir - parent directory for this experiment's outputs
            threshold (float) - confidence cut used by ``predict_run``
        """
        self.data_dir = data_dir

        dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        current_dir = f"t-{threshold}-{dt}"

        self.working_dir = Path(working_dir).joinpath(current_dir)
        # No exist_ok: an already existing experiment directory is an error
        self.working_dir.mkdir(parents=True)

        print(f"Working dir: {self.working_dir}")

        self.df_train = None
        self.df_test = None
        self.bin_cols = []

        self.df_anomalies = None
        self.threshold = threshold

        # One dict per call of train_model; stays a list for the whole lifetime
        self.training_performance = []
        # Summary dict written by save() (trainings + durations)
        self.performance_summary = None
        self.model = None

        # Wall-clock marks: start of self_train, end of self_train, end of plots
        self.t1 = None
        self.t2 = None
        self.t3 = None

    def load_df(self, nbins=100, step=1):
        """Read train/test CSV files and choose the feature columns.

        Params:
            nbins (int) - upper bound (exclusive) of the bin index range;
                          clamped to the number of usable bin columns
            step (int) - stride between selected bin indices

        Sets ``self.df_train``, ``self.df_test`` and ``self.bin_cols``.
        """
        self.df_train = pd.read_csv(
            Path(self.data_dir, 'train.csv'))   # Hand picked data
        self.df_test = pd.read_csv(
            Path(self.data_dir, 'test.csv'))  # Data to be classified

        print(
            f"Train set {self.df_train.shape} | Test set {self.df_test.shape}")

        # Columns that hold histogram bins; first and last are dropped as
        # those are over/under flows
        usable_cols = [col for col in self.df_train.columns if 'bin_' in col][1:-1]

        nbins = min(nbins, len(usable_cols))

        # NOTE(review): range(1, nbins, step) stops at bin_{nbins-1}, so at most
        # nbins-1 columns are selected - confirm that this is intended.
        self.bin_cols = [f"bin_{i}" for i in range(1, nbins, step)]

    def create_model(self):
        """Build and compile a fresh, untrained network.

        Returns:
            model (keras.Sequential) - 50 -> 25 -> 1 dense layers with dropout
                                       0.1, sigmoid output, adam optimizer,
                                       binary cross-entropy loss
        """
        model = keras.Sequential()
        model.add(layers.Dense(units=50, activation="relu", input_shape=(len(self.bin_cols),)))
        model.add(layers.Dropout(0.1))
        model.add(layers.Dense(units=25, activation="relu"))
        model.add(layers.Dropout(0.1))
        model.add(layers.Dense(units=1, activation="sigmoid"))
        model.compile(optimizer="adam",
                      loss="binary_crossentropy",
                      metrics=["accuracy"])
        return model

    def train_model(self, df, train_index):
        """ Trains new model with a given dataset
            Params:
                df (pandas.DataFrame) - dataset for training new model; needs
                                        the bin columns, "entries" and "y"
                train_index - used as the TensorBoard sub-directory name

            Returns:
                model (keras.Sequential) - trained model

            Side effects:
                appends one dict with hold-out metrics to
                self.training_performance

            Raises:
                re-raises any exception after printing it
        """
        try:
            train_start_time = time.time()
            # Normalization, divide every bin value by total entries
            X = df.filter(self.bin_cols, axis=1).div(df.entries, axis=0)
            y = df["y"]

            # Stratified shuffle split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=y, test_size=0.25)

            X_train = np.asarray(X_train)
            X_test = np.asarray(X_test)
            y_train = np.asarray(y_train)
            y_test = np.asarray(y_test)

            # Calculate class weights. compute_class_weight returns the weights
            # in the order of `classes`, so key them by the label itself rather
            # than by position.
            classes = np.unique(y)
            weights = class_weight.compute_class_weight('balanced', classes=classes, y=y)
            cw = {int(label): float(weight) for label, weight in zip(classes, weights)}

            # Train
            es = EarlyStopping(monitor='loss', mode='min', patience=5, verbose=1)
            log_dir = self.working_dir.joinpath("tensorboard").joinpath(str(train_index))
            # str(): keeps working with TensorFlow versions that expect a plain string
            tensorboard_callback = TensorBoard(log_dir=str(log_dir), histogram_freq=1, profile_batch=0)

            model = self.create_model()
            model.fit(X_train, y_train, verbose=0,
                      batch_size=64,
                      validation_split=0.25,
                      epochs=1000,
                      shuffle=True,
                      callbacks=[es, tensorboard_callback],
                      class_weight=cw)

            # Predict on the hold-out part; flatten (n, 1) -> (n,) for sklearn
            y_pred = np.asarray(model.predict(X_test)).ravel() > 0.5

            self.training_performance.append({
                "acc": accuracy_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "df_size": df.shape[0],
                "ngood": df[df["y"] == 1].shape[0],
                "nbad": df[df["y"] == 0].shape[0],
                "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
                "train_time": int(time.time()-train_start_time)
            })
        except Exception as err:
            print("Failed train_model |", err)
            raise

        return model

    def predict_run(self, df_run):
        """ Does prediction with the current model (self.model) on a dataset

            df_run (pd.DataFrame) - dataset; needs the bin columns and
                                    "entries". It is not modified.

            Returns:
                (pd.DataFrame) - copy of df_run with two additional columns:
                                 y_pred - model prediction [0..1]
                                 y - label (1 good, 0 bad, 2 anomaly), int32

            Raises:
                re-raises any exception after printing it (for example when a
                prediction is NaN and no label can be assigned)
        """

        try:
            # Scale each row by dividing by number of total entries within a run
            X = df_run.filter(self.bin_cols, axis=1).div(
                df_run.entries, axis=0)

            # The network returns shape (n, 1); one value per row is needed
            y_pred = np.asarray(self.model.predict(np.asarray(X))).ravel()

            # Work on a copy so the caller's frame is never modified
            df_run = df_run.copy()

            # Predicted label by ANN
            df_run["y_pred"] = y_pred

            # Predictions with higher probability than threshold are considered as GOOD
            filter_good = df_run['y_pred'] >= self.threshold
            # Predictions with lower probability than threshold are considered as BAD
            filter_bad = df_run['y_pred'] <= 1-self.threshold
            # Predictions between lower and higher thresholds are considered as ANOMALIES
            filter_anon = (
                1-self.threshold < df_run['y_pred']) & (df_run['y_pred'] < self.threshold)

            # Create new column y and set final label there
            # (assignment order matters only if good and bad overlap: bad wins)
            df_run.loc[filter_good, 'y'] = 1
            df_run.loc[filter_bad, 'y'] = 0
            df_run.loc[filter_anon, 'y'] = 2
            df_run = df_run.astype({"y": "int32"})

        except Exception as err:
            print("Failed predict_run |", err)
            raise

        return df_run

    def _select_runs(self, nruns, oms_path):
        """Return the sorted run numbers of the test set that will be processed.

        Runs are kept only if the OMS table at oms_path lists them with
        delivered_lumi > 1 and duration >= 3600. If the table cannot be read
        or used, all runs are kept. nruns (if truthy) limits the result to the
        first nruns runs.
        """
        raw_runs = self.df_test.run.unique()
        try:
            oms = pd.read_csv(oms_path)

            oms = oms[(oms['delivered_lumi'] > 1) & (oms['duration'] >= 3600)]
            exclude_runs = set(raw_runs) - set(oms.run_number.unique())
            print("Excluding runs:", exclude_runs)
            raw_runs = list(set(raw_runs) - exclude_runs)
        except Exception as err:
            # Best effort: the OMS filter is optional, but say why it was skipped
            print("Cannot compare with OMS runs |", err)

        runs = sorted(raw_runs)

        if nruns:
            runs = runs[:nruns]

        return runs

    def self_train(self, nruns=None, oms_path="../data_raw/cmsoms_2017B.csv"):
        """Train the initial model, then grow the training set run by run.

        Params:
            nruns (int or None) - process only the first nruns runs
            oms_path - CSV with columns run_number, delivered_lumi and
                       duration used to filter the runs (see _select_runs)

        For every run: predict, store anomalies (y == 2) in self.df_anomalies,
        append the remaining rows to self.df_train, write plots and retrain.
        A failure inside one run is printed and the loop continues with the
        next run.
        """
        self.t1 = time.time()
        print("Training initial model", end=' | ')
        self.model = self.train_model(self.df_train, train_index=0)

        pca_save_path = self.working_dir.joinpath("pca/0.jpg")
        dftrain_save_path = self.working_dir.joinpath("df/0.jpg")

        # bin_cols is passed explicitly, like in every later call, so that the
        # first plot uses the same columns as the following ones
        do_pca(self.df_train, bin_cols=self.bin_cols,
               title="0", save_path=pca_save_path)

        df_plot(self.df_train, bin_cols=self.bin_cols,
                title="0", save_path=dftrain_save_path)

        runs = self._select_runs(nruns, oms_path)

        for train_index, run_number in enumerate(runs):
            step_no = train_index + 1
            print(f"{step_no}. Working with run {run_number}", end=' | ')

            try:
                # Dataset of a single run (boolean selection yields a new frame
                # and predict_run does not modify its input)
                df_run = self.df_test[self.df_test["run"] == run_number]

                if len(df_run) == 0:
                    print(f"Run {run_number} has no data in test dataset")
                    continue

                df_run = self.predict_run(df_run)

                # Take a subset of only anomalies (y=2)
                df_anomalies = df_run[df_run["y"] == 2].copy()
                if self.df_anomalies is None:
                    self.df_anomalies = df_anomalies
                else:
                    self.df_anomalies = pd.concat([self.df_anomalies, df_anomalies],
                                                  ignore_index=True, sort=False)

                # Take a subset of only good and bad predictions, but no anomalies
                df_confident = df_run[df_run["y"] != 2].copy()

                # Add new predictions to a training dataset
                self.df_train = pd.concat([self.df_train, df_confident],
                                          ignore_index=True, sort=False)

                pca_df_save_path = self.working_dir.joinpath(f"pca/{step_no}-{run_number}.jpg")
                pca_ta_save_path = self.working_dir.joinpath(f"pca_ta/{step_no}-{run_number}.jpg")
                dftrain_save_path = self.working_dir.joinpath(f"df/{step_no}-{run_number}.jpg")

                do_pca(self.df_train, bin_cols=self.bin_cols,
                       title=f"{step_no}", save_path=pca_df_save_path)

                # Training set + anomalies collected so far
                df_ta = pd.concat([self.df_train, self.df_anomalies],
                                  ignore_index=True, sort=False)

                df_plot2(df_ta, df_run, bin_cols=self.bin_cols,
                         title1=f"{step_no}", title2=f"{run_number}",
                         save_path=dftrain_save_path, show=False)

                do_pca(df_ta, bin_cols=self.bin_cols,
                       title=f"{step_no}", save_path=pca_ta_save_path)

                self.model = self.train_model(self.df_train, train_index=step_no)
            except Exception as err:
                print("Failed self_train |", err)

        self.t2 = time.time()

    @staticmethod
    def _elapsed(start, end):
        """Whole seconds between two time marks, or None if one is missing."""
        if start is None or end is None:
            return None
        return int(end - start)

    def save(self):
        """Write model, datasets, plots and the performance summary to disk.

        Outputs (all in self.working_dir): model.h5, df_train.csv,
        df_anomalies.csv, several .jpg plots produced by the utils helpers,
        duration.jpg and training_performance.json.

        self.training_performance is left untouched (a list), the written
        summary is kept in self.performance_summary, so save() may be called
        more than once.
        """
        self.model.save(str(self.working_dir.joinpath('model.h5')))

        # No run processed -> no anomaly frame yet; use an empty one instead
        anomalies = self.df_anomalies
        if anomalies is None:
            anomalies = pd.DataFrame(columns=self.df_train.columns)

        self.df_train.to_csv(self.working_dir.joinpath('df_train.csv'))
        anomalies.to_csv(self.working_dir.joinpath('df_anomalies.csv'))

        df_ta = pd.concat([self.df_train, anomalies],
                          ignore_index=True, sort=False)

        # TSNE for classified dataset
        do_tsne(self.df_train, bin_cols=self.bin_cols,
                save_path=self.working_dir.joinpath("tsne_df_train.jpg"))

        # TSNE for classified dataset + anomalies
        do_tsne(df_ta, bin_cols=self.bin_cols,
                save_path=self.working_dir.joinpath("tsne_df_ta.jpg"))

        # Plot histograms of classified dataset
        df_plot(self.df_train, bin_cols=self.bin_cols,
                save_path=self.working_dir.joinpath("df_train.jpg"))

        # Plot histograms of anomalies
        df_plot(anomalies, bin_cols=self.bin_cols,
                save_path=self.working_dir.joinpath("df_anomalies.jpg"))

        # Plot histograms of classified dataset + anomalies
        df_plot(df_ta, bin_cols=self.bin_cols,
                save_path=self.working_dir.joinpath("df_ta.jpg"))

        # PCA of classified dataset + anomalies
        do_pca(df_ta, bin_cols=self.bin_cols,
               save_path=self.working_dir.joinpath("pca_df_ta.jpg"))

        self.t3 = time.time()

        # Time taken for each training
        fig, ax = plt.subplots(figsize=(20, 20))
        ax.plot([x["train_time"] for x in self.training_performance], label="train_time")
        ax.legend()
        fig.savefig(self.working_dir.joinpath("duration.jpg"))
        plt.close(fig)

        # Durations are None (null in JSON) if self_train was never run
        summary = {
            "trainings": list(self.training_performance),
            "duration": {
                "train": self._elapsed(self.t1, self.t2),
                "visual": self._elapsed(self.t2, self.t3),
                "total": self._elapsed(self.t1, self.t3)
            }
        }
        self.performance_summary = summary

        with open(self.working_dir.joinpath('training_performance.json'), 'w') as fh:
            json.dump(summary, fh)

        print(summary["duration"])

In [3]:
def run_blackbox(base_dir, data_dir, threshold, nbins=60, step=2):
    """Run one complete experiment: load data, self-train, save results.

    Params:
        base_dir (pathlib.Path) - directory that contains the data folders
        data_dir (str) - name of the data folder below base_dir; outputs go
                         to base_dir/trainings/<data_dir>
        threshold (float) - confidence threshold passed to BlackBox
        nbins, step - forwarded to BlackBox.load_df

    Returns:
        box (BlackBox) - the trained and saved instance
    """
    box = BlackBox(data_dir=base_dir.joinpath(data_dir),
                   working_dir=base_dir.joinpath("trainings").joinpath(data_dir),
                   threshold=threshold)
    box.load_df(nbins=nbins, step=step)
    box.self_train()
    box.save()
    return box


if __name__ == "__main__":

    # False: single experiment on "data_small"; True: grid over data sets and thresholds
    batch = False

    base_dir = Path('../')

    if not batch:
        data_dir = "data_small"
        box = run_blackbox(base_dir, data_dir, threshold=0.9)

    else:
        data_dirs = ["data_0", "data_ext", "data_small"]
        thresholds = [0.7, 0.8, 0.9]

        for data_dir in data_dirs:
            for threshold in thresholds:
                box = run_blackbox(base_dir, data_dir, threshold)
                print()

Working dir: ../trainings/data_small/t-0.9-20210103-180215
Train set (834, 117) | Test set (26374, 116)
Training initial model | Epoch 00134: early stopping
Excluding runs: {297474, 297494, 297495, 297496, 297497, 297498, 297499, 297501, 297502, 297504, 299042, 299064, 299096, 297598, 297099, 298653, 299178, 299180, 299183, 299185, 298678, 297657, 297658, 297659, 297661, 297662, 297663, 297664, 297666, 297670, 297671, 297672, 297673, 297678, 297168, 297169, 297170, 297171, 297175, 297179, 297180, 297181, 297211, 297215, 297218, 297224, 297225, 299316, 299317, 299318, 299324, 299326, 299327, 297281, 297282, 297283, 297284, 297285, 297286, 297287, 297288, 297289, 297290, 297291, 297293, 297308, 297424, 297426, 297429, 297432, 297435, 298997, 298998, 299000, 297467, 297468, 297469}
1. Working with run 297056 | Epoch 00101: early stopping
2. Working with run 297057 | Epoch 00089: early stopping
3. Working with run 297100 | Epoch 00091: early stopping
4. Working with run 297101 | Epoch 0006

In [4]:
# %load_ext tensorboard
# %tensorboard --logdir ../trainings_test/