In [1]:
import sys
sys.path.append("../")

In [2]:
%load_ext autoreload
%autoreload 2

In [40]:
import pandas as pd
import os,sys
from pathlib import Path
import json 
import logging
# Send timestamped log records to stderr, showing INFO and above.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format='%(asctime)s %(levelname)-7s %(message)s',
)
# Set matplotlib's own logger to INFO as well.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.INFO)


#General ML 
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, silhouette_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn import svm



#In-house Module Imports
from config import Configuration 
from datasets import EclipseSampledDataset, VoltaSampledDataset
from utils import *

from modules.models import BaseAutoencoder
from modules.models_helper import *

In [4]:
def readModelConfig(conf, exp_name, cv_index, model_name, system):
    """Read the saved ``model_config.csv`` and return its first row as a dictionary.

    Parameters
    ----------
    conf : mapping
        Must provide ``conf['model_config_dir']`` as a path object supporting
        the ``/`` operator; the file read is ``<model_config_dir>/model_config.csv``.
    exp_name, cv_index, model_name, system
        Not used to build the path any more; kept so existing callers keep working.

    Returns
    -------
    dict
        One entry per CSV column, taken from the first row:
        * string values become ``pathlib.Path`` when the column name contains
          ``'dir'``, otherwise they stay plain strings;
        * missing values (NaN) become ``None``;
        * every other value (integers of any width, booleans, floats) is kept as read.
        An empty dictionary is returned when the file is missing or has no rows.
    """
    config_path = conf['model_config_dir'] / "model_config.csv"
    print(config_path)

    try:
        conf_csv = pd.read_csv(config_path)
    except FileNotFoundError:
        # Only "file missing" is treated as recoverable; any other read error
        # (permissions, malformed CSV, ...) is left to propagate.
        logging.warning("Config.csv doesn't exist: %s", config_path)
        return {}

    if conf_csv.empty:
        logging.warning("Config.csv has no rows: %s", config_path)
        return {}

    model_conf = {}
    for column in conf_csv.columns:
        # Positional access: do not rely on the row label being 0.
        value = conf_csv[column].iloc[0]
        if isinstance(value, str):
            model_conf[column] = Path(value) if 'dir' in column else value
        elif pd.isna(value):
            model_conf[column] = None
        else:
            # Covers every numeric/bool dtype, including floats that were
            # previously dropped and integer widths other than int64.
            model_conf[column] = value

    return model_conf

In [180]:
class Autoencoder(Model):
    """Autoencoder made of one dense "code" layer and one linear dense output layer.

    ``params`` is a dictionary of hyper-parameters. The constructor reads
    ``latent_dim`` and ``latent_activation``; ``get_config`` additionally reads
    ``model_name``, ``dropout``, ``regularizer``, ``regularization_rate`` and
    ``epochs``.

    NOTE(review): the decoder width is taken from a module-level ``x_dim``
    variable, not from ``params`` -- it must be defined before instantiation.
    """

    def __init__(self, params):
        super().__init__()
        self.params = params

        # Encoder: a single dense layer named 'code'.
        # (Disabled experiments, kept for reference:)
        #layers.Dropout(params['dropout']),
        #layers.Dense(1000, activation=params['hidden_acts']),
        code_layer = layers.Dense(
            params['latent_dim'],
            activation=params['latent_activation'],
            name='code',
        )
        self.encoder = tf.keras.Sequential([code_layer], name='Encoder')

        # Decoder: a single linear dense layer of width x_dim.
        # (Disabled experiments, kept for reference:)
        #layers.Dense(1000, activation=params['hidden_acts']),
        #layers.Dropout(params['dropout']),
        output_layer = layers.Dense(x_dim, activation='linear')
        self.decoder = tf.keras.Sequential([output_layer], name='Decoder')

    def call(self, x):
        """Encode ``x`` and return the decoder's output for that code."""
        return self.decoder(self.encoder(x))

    def get_config(self):
        """Return the hyper-parameters plus the model's ``optimizer`` and ``loss`` attributes.

        NOTE(review): ``self.optimizer`` / ``self.loss`` are presumably set by
        ``compile()`` -- confirm before calling this on an uncompiled model.
        """
        hyper = self.params
        return dict(
            model_name=hyper['model_name'],
            # Layer related
            latent_dim=hyper['latent_dim'],
            latent_activation=hyper['latent_activation'],
            dropout_rate=hyper['dropout'],
            regularizer=hyper['regularizer'],
            regularization_rate=hyper['regularization_rate'],
            # Compilation related
            optimizer=self.optimizer,
            loss=self.loss,
            # Training
            epochs=hyper['epochs'],
        )

In [181]:
# --- What to run -----------------------------------------------------------
MODEL_CONFIG = "proctor_exp_1_active_learning"  # change this
classifier_name = 'proctor'
SYSTEM = 'volta'  # volta or eclipse
CV_INDEX = 0  # it can be integer value within the range 0 1 2 3 4

# --- Features --------------------------------------------------------------
# FE_NAME is either tsfresh or mvts; EXP_NAME is derived from it, so be careful.
FE_NAME = 'tsfresh'
NUM_FEATURE = 250  # example: 250 ,2000, 4000

# --- Querying --------------------------------------------------------------
# One of: "uncertainty", "margin", "entropy", "random", "equal_app"
query_strategy = "random"
query_size = 250
repeat_num = 0

In [182]:
#Constants
FS_NAME = "CHI"
# Random and equal-per-app querying are reported as the baseline; every other
# strategy is reported as active learning.
method = "baseline" if query_strategy in ('random', 'equal_app') else "active_learning"
num_samples_per_pair = 1

# NOTE(review): `user` is not defined in this notebook's visible cells;
# presumably it comes from one of the star imports -- confirm.
if SYSTEM == 'volta':
    OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/active_learning_experiments'
elif SYSTEM == 'eclipse':
    OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/active_learning_experiments_final_hdfs'
else:
    # Fail here instead of leaving OUTPUT_DIR undefined (or stale from a
    # previous run of this cell) and breaking later in a confusing way.
    raise ValueError(f"Unknown SYSTEM {SYSTEM!r}; expected 'volta' or 'eclipse'")

EXP_NAME = f'{FE_NAME}_experiments'
FEATURE_SELECTION = False
SCALER = 'None' #For now, do the scaling inside the notebook, then you can move that to the class function

logging.warning('Results will be generated in {}, double check please!'.format(MODEL_CONFIG))



In [None]:
  conf = Configuration(ipython=True,
                     overrides={
                         'output_dir': Path(OUTPUT_DIR), #change
                         'system' : SYSTEM,
                         'exp_name':EXP_NAME,                                                  
                         'cv_fold':CV_INDEX, 
                         'model_config': MODEL_CONFIG,
                     })

with open(str(conf['experiment_dir']) + '/anom_dict.json') as f:
    ANOM_DICT = json.load(f)
with open(str(conf['experiment_dir']) + '/app_dict.json') as f:
    APP_DICT = json.load(f)    
    
APP_REVERSE_DICT = {}
for app_name, app_encoding in APP_DICT.items():
    APP_REVERSE_DICT[app_encoding] = app_name    

ANOM_REVERSE_DICT = {}
for anom_name, anom_encoding in ANOM_DICT.items():
    ANOM_REVERSE_DICT[anom_encoding] = anom_name      

In [None]:
# Arguments shared by both dataset loaders; exactly one of mvts/tsfresh is
# switched on depending on FE_NAME.
load_kwargs = dict(
    scaler=SCALER,
    cv_fold=CV_INDEX,
    borghesi=False,
    mvts=(FE_NAME == 'mvts'),
    tsfresh=(FE_NAME == 'tsfresh'),
)

if SYSTEM == 'eclipse':
    eclipseDataset = EclipseSampledDataset(conf)
    train_data, train_label, test_data, test_label = eclipseDataset.load_dataset(**load_kwargs)
elif SYSTEM == 'volta':
    voltaDataset = VoltaSampledDataset(conf)
    train_data, train_label, test_data, test_label = voltaDataset.load_dataset(**load_kwargs)
else:
    # Without this, the variables below would be undefined or stale.
    raise ValueError(f"Unknown SYSTEM {SYSTEM!r}; expected 'volta' or 'eclipse'")

# Features and labels must be row-aligned before anything else is done.
assert list(train_data.index) == list(train_label.index) #check the order of the labels
assert list(test_data.index) == list(test_label.index) #check the order of the labels

if FEATURE_SELECTION:
    selected_features = pd.read_csv(conf['experiment_dir'] / 'selected_features.csv')
    selected_feature_names = list(selected_features['0'].values)
    train_data = train_data[selected_feature_names]
    test_data = test_data[selected_feature_names]

# Add human-readable names next to the numeric encodings. A plain dict lookup
# is used so an unknown encoding raises KeyError instead of producing NaN.
train_label['anom_names'] = train_label['anom'].apply(lambda x: ANOM_REVERSE_DICT[x])
train_label['app_names'] = train_label['app'].apply(lambda x: APP_REVERSE_DICT[x])
test_label['anom_names'] = test_label['anom'].apply(lambda x: ANOM_REVERSE_DICT[x])
test_label['app_names'] = test_label['app'].apply(lambda x: APP_REVERSE_DICT[x])

# Drop every feature column that has a missing value in either split, so that
# train and test end up with the same columns.
all_data = pd.concat([train_data, test_data])
all_data = all_data.dropna(axis=1, how='any')
all_label = pd.concat([train_label,test_label])

train_data = all_data.loc[train_label.index]
test_data = all_data.loc[test_label.index]

logging.info("Train data shape %s",train_data.shape)
logging.info("Train label shape %s",train_label.shape)
logging.info("Test data shape %s",test_data.shape)
logging.info("Test label shape %s",test_label.shape)

logging.info("Train data label dist: \n%s",train_label['anom'].value_counts())
logging.info("Test data label dist: \n%s",test_label['anom'].value_counts())

In [185]:
# Keep untouched copies of the loaded feature frames; the cells below reassign
# train_data / test_data when scaling and selecting features.
orig_train_data, orig_test_data = train_data.copy(), test_data.copy()

In [186]:
# train_data = orig_train_data
# test_data = orig_test_data
# print(len(train_data.columns))

In [None]:
SCALER = 'MinMax'

# Both scalers are fitted on the training split only and then applied to both splits.
if SCALER == 'MinMax':

    minmax_scaler = MinMaxScaler().fit(train_data)
    train_data = pd.DataFrame(minmax_scaler.transform(train_data),columns=train_data.columns,index=train_data.index)
    test_data = pd.DataFrame(minmax_scaler.transform(test_data),columns=test_data.columns,index=test_data.index)

elif SCALER == 'Standard':

    # Standardize data (per feature Z-normalization, i.e. zero-mean and unit variance)
    scaler = StandardScaler().fit(train_data)
    train_data = pd.DataFrame(scaler.transform(train_data),columns=train_data.columns,index=train_data.index)
    test_data = pd.DataFrame(scaler.transform(test_data),columns=test_data.columns,index=test_data.index)

#Implement new feature selection strategies below
if FS_NAME == 'CHI':

    # NOTE(review): chi2 expects non-negative inputs, which MinMax scaling of
    # the training split provides -- re-check if SCALER is changed.
    selector = SelectKBest(chi2, k=NUM_FEATURE)
    selector.fit(train_data,train_label['anom'])
    selected_columns = train_data.columns[selector.get_support(indices=True)]
    train_data = train_data[selected_columns]
    # Select by label so the test columns match the training columns in both
    # content and order (`Index & Index` is not a set intersection in current pandas).
    test_data = test_data[selected_columns]

elif FS_NAME == 'TSFRESH':
    logging.warning("NUM_FEATURE parameter will be overwritten by the automatic selection process")

    y_train = train_label['anom']
    X_train = train_data

    relevant_features = set()

    # One-vs-rest selection: keep every feature judged relevant for any class.
    for label in y_train.unique():
        y_train_binary = y_train == label
        X_train_filtered = tsfresh.select_features(X_train, y_train_binary)
        print("Number of relevant features for class {}: {}/{}".format(label, X_train_filtered.shape[1], X_train.shape[1]))
        relevant_features = relevant_features.union(set(X_train_filtered.columns))

    # Index with an ordered list (a set is not a valid indexer and has no
    # stable order); the original column order is preserved.
    selected_columns = [column for column in train_data.columns if column in relevant_features]
    train_data = train_data[selected_columns]
    test_data = test_data[selected_columns]
    NUM_FEATURE = len(relevant_features)

elif FS_NAME == 'NONE':
    logging.info("No feature selection strategy is specified, will be using all features")
    NUM_FEATURE = len(train_data.columns)

logging.info(train_data.shape)
logging.info(test_data.shape)

In [None]:
#Read the node_ids considered labeled
cv_dir = conf['experiment_dir'] / f'CV_{CV_INDEX}'
labeled_train_label = pd.read_csv(cv_dir / f'labeled_train_label_{num_samples_per_pair}.csv', index_col=['node_id'])
labeled_test_label = pd.read_csv(cv_dir / f'labeled_test_label_{num_samples_per_pair}.csv', index_col=['node_id'])
node_indices_labeled = list(labeled_train_label['anom'].index.values)

logging.info("Labeled data label dist: \n%s",labeled_train_label['anom'].value_counts())
logging.info("Unlabeled data label dist: \n%s",labeled_test_label['anom'].value_counts())

#Set a new column for label status
# A set gives constant-time membership tests; the list version scanned the
# whole labeled list for every training node.
labeled_node_set = set(node_indices_labeled)
node_indices_unlabeled = [node for node in train_label.index if node not in labeled_node_set]

# label_status equals the anomaly label for labeled nodes and -1 for unlabeled ones.
train_label['label_status'] = np.where(
    train_label.index.get_level_values('node_id').isin(node_indices_unlabeled),
    -1,
    train_label['anom'],  # for the full data case
)

In [189]:
# Labeled pool: every row whose label_status is not -1
# (per the original note, one sample from each application-anomaly pair).
initial_labeled_pool = train_label.loc[train_label['label_status'].ne(-1)]

# Unlabeled pool: rows marked -1. Active learning and random sampling both
# query from this same pool.
initial_unlabeled_pool = train_label.loc[train_label['label_status'].eq(-1)]

In [190]:
params = {
    #Compile related
    'loss': 'mse',
    'optimizer': 'adadelta',
    'learning_rate': 1e-3,

    #Layer related
    'dropout': 0,  #This will add dropout after layers
    'regularizer': None,
    'regularization_rate': None,
    'hidden_layers': [],
    'hidden_acts': 'relu',
    'latent_dim': 2000,
    'latent_activation': 'relu',  #change the activation of the code layer

    #Training related
    'epochs': 10,
}

# Turn the optimizer name into an optimizer instance.
if params['optimizer'] == 'adam':
    opt = optimizers.Adam(learning_rate=params['learning_rate'])
elif params['optimizer'] == 'adadelta':
    opt = optimizers.Adadelta(learning_rate=params['learning_rate'])
elif params['optimizer'] == 'sgd':
    opt = optimizers.SGD(learning_rate=params['learning_rate'])
else:
    # Fail here instead of silently reusing an `opt` left over from a previous run.
    raise ValueError(f"Unsupported optimizer {params['optimizer']!r}; expected 'adam', 'adadelta' or 'sgd'")

# Turn the loss name into a loss instance.
if params['loss'] == 'mse':
    selected_loss = losses.MeanSquaredError()
elif params['loss'] == 'mae':
    selected_loss = losses.MeanAbsoluteError()
else:
    raise ValueError(f"Unsupported loss {params['loss']!r}; expected 'mse' or 'mae'")

# The model name encodes the code size, epoch count and code activation.
params['model_name'] = f"dae_{params['latent_dim']}_{params['epochs']}_{params['latent_activation']}"

In [None]:
# Shapes of the training features and labels right before the autoencoder is built.
logging.info(
    "Train data shape %s",
    train_data.shape,
)
logging.info(
    "Train label shape %s",
    train_label.shape,
)

In [None]:
#BUILD AE

TRAIN_DATA = train_data.values

# The Autoencoder class reads this module-level name for its output width.
x_dim = TRAIN_DATA.shape[1]

autoencoder = Autoencoder(params)

autoencoder.compile(optimizer=opt, loss=selected_loss)
autoencoder.build(TRAIN_DATA.shape)
autoencoder.encoder.summary()
autoencoder.decoder.summary()
# Result is discarded; the call only checks that every key get_config reads is present.
autoencoder.get_config()

# NOTE(review): with patience=10 and params['epochs']=10 this callback
# presumably never stops training early -- confirm this is intended.
es = EarlyStopping(monitor='val_loss',
                   verbose=1,
                   patience=10,
                   mode='min',
                   restore_best_weights=True)

# The input is also the target: the model is trained to reconstruct its input.
history = autoencoder.fit(TRAIN_DATA, TRAIN_DATA,
                          epochs=params['epochs'],
                          shuffle=True,
                          callbacks=[es],
                          validation_split=0.2,
                          )

plt.plot(history.history["loss"], label="Training Loss")
# .get() so a run without validation data does not raise KeyError.
if history.history.get("val_loss") is not None:
    plt.plot(history.history["val_loss"], label="Validation Loss")

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()  # the original referenced plt.show without calling it

autoencoder.save(str(conf['model_dir'] / (params['model_name'])))
logging.info("Model saved!")

### Train the supervised part

In [192]:
# Names used to locate the saved model and its configuration.
MODEL_NAME = MODEL_CONFIG
TOPOLOGY_NAME = params["model_name"]

# Supervised-stage switches.
STACKED = False #Set this when loading a stacked AE
FEATURE_SELECTION = False
PERCENTAGE = 0
REPEAT_SUPERVISED = 1

# Arguments are passed positionally, in the order readModelConfig declares them.
model_config = readModelConfig(conf, EXP_NAME, CV_INDEX, TOPOLOGY_NAME, SYSTEM)

/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/proctor_exp_1_active_learning/model_config.csv


In [193]:
# Test features and labels must be row-aligned before being converted to arrays.
assert test_label.index.tolist() == test_data.index.get_level_values('node_id').tolist()

# Array versions of the test split.
test_true_label = test_label['anom'].values
test_true_data = test_data.values
#test_true_data_fs = test_data[list(selected_features['0'].values)]

# Number of feature columns in the training data.
#train_data_fs = train_data[list(selected_features['0'].values)]
X_DIM = len(train_data.columns)

In [194]:
# Labeled pool: every row whose label_status is not -1
# (per the original note, one sample from each application-anomaly pair).
initial_labeled_pool = train_label.loc[train_label['label_status'].ne(-1)]

# Unlabeled pool: rows marked -1. Active learning and random sampling both
# query from this same pool.
initial_unlabeled_pool = train_label.loc[train_label['label_status'].eq(-1)]

In [199]:
# Classifier factories for the single-stage (multimodal=False) path.
# Lambdas so a fresh estimator is built for every repeat.
_SINGLE_STAGE_CLASSIFIERS = {
    #clf = svm.LinearSVC(max_iter=10000,C=100)
    #clf = svm.LinearSVC()
    'aksar_svm': lambda: svm.SVC(),
    'aksar_lr': lambda: LogisticRegression(random_state=1234,
                                           dual=True,
                                           penalty='l2',
                                           solver='liblinear'),
    'aksar_svm-scale': lambda: svm.SVC(C=1, kernel='rbf', probability=True),
}


def runMultipleTrainTestsProctorScikit(name,model_config,
                                       topology,X_train,y_train,X_test,y_test,
                                       repeat,cv_index,size,multimodal=True,stacked=False):
    """Train scikit-learn classifier(s) on encoded features and average the test scores.

    The saved model is loaded with ``LoadModel(model_config, topology)``; unless
    ``stacked`` is true only its ``encoder`` part is used. Train and test
    features are passed through it, classifiers are fitted on the encoded
    training data ``repeat`` times, and the scores are averaged.

    Parameters
    ----------
    name : str
        Classifier for the single-stage path: ``'aksar_svm'``, ``'aksar_lr'`` or
        ``'aksar_svm-scale'``. Ignored when ``multimodal`` is true.
    model_config, topology
        Forwarded unchanged to ``LoadModel``.
    X_train, y_train, X_test, y_test
        Training/test features and labels. Label ``0`` is treated as "no anomaly"
        in the multimodal path.
    repeat : int
        Number of fit/evaluate rounds to average over; must be at least 1.
    cv_index, size
        Only referenced by the disabled result dump at the end; kept for
        interface compatibility.
    multimodal : bool
        If true, fit one LinearSVC on "anomalous vs. not" and a second LinearSVC
        on the anomalous samples only, and combine them with
        ``filteredTestingProctorScikit``.
    stacked : bool
        If true the loaded model is used as-is instead of taking ``.encoder``.

    Returns
    -------
    dict
        Averages over ``repeat`` rounds, under the keys ``false_alarm_rate``,
        ``anom_miss_rate``, ``macro_fscore`` and ``weight_fscore``.

    Raises
    ------
    ValueError
        If ``repeat`` is below 1, or ``name`` is unknown while ``multimodal`` is false.
    """
    if repeat < 1:
        raise ValueError(f"repeat must be at least 1, got {repeat!r}")
    if not multimodal and name not in _SINGLE_STAGE_CLASSIFIERS:
        raise ValueError(
            f"Unknown classifier name {name!r}; expected one of {sorted(_SINGLE_STAGE_CLASSIFIERS)}"
        )

    reconstructedModel = LoadModel(model_config,topology)
    if not stacked:
        #It means loading a full autoencoder, keep only the encoder
        reconstructedModel = reconstructedModel.encoder

    # The encoded features do not change between repeats, so compute them once.
    hidden_test = reconstructedModel.predict(X_test)
    hidden_train = reconstructedModel.predict(X_train)

    if multimodal:
        y_train_array = np.asarray(y_train)

        #Only anom vs normal
        train_binary_label = y_train_array.copy()
        train_binary_label[train_binary_label != 0] = 1

        #Only anomalies
        anom_indices = np.where(y_train_array != 0)
        train_anom_label = y_train_array[anom_indices]
        # np.asarray so positional row selection also works for DataFrame input.
        train_anom_data = np.asarray(X_train)[anom_indices]
        hidden_train_anom = reconstructedModel.predict(train_anom_data)

    macro_fscore = 0
    weight_fscore = 0
    miss_rate = 0
    alarm_rate = 0

    for _ in range(repeat):

        if multimodal:
            clf_binary = svm.LinearSVC(max_iter=10000,C=100)
            clf_binary.fit(hidden_train, train_binary_label)

            clf_anom = svm.LinearSVC(max_iter=10000,C=100)
            clf_anom.fit(hidden_train_anom, train_anom_label)

            test_pred_label = filteredTestingProctorScikit(clf_binary,clf_anom,hidden_test)
        else:
            clf = _SINGLE_STAGE_CLASSIFIERS[name]()
            clf.fit(hidden_train, y_train)
            test_pred_label = clf.predict(hidden_test)

        score_report = classification_report(y_true=y_test, y_pred =test_pred_label,output_dict=True)
        logging.info("#############")
        logging.info(classification_report(y_true=y_test, y_pred =test_pred_label))
        logging.info("#############")
        alarm_report = falseAnomRateCalc(y_test,test_pred_label)

        macro_fscore += score_report['macro avg']['f1-score']
        weight_fscore += score_report['weighted avg']['f1-score']
        miss_rate += alarm_report['anom_miss_rate']
        alarm_rate += alarm_report['false_alarm_rate']

    score_dict = {
        'false_alarm_rate': alarm_rate / repeat,
        'anom_miss_rate': miss_rate / repeat,
        'macro_fscore': macro_fscore / repeat,
        'weight_fscore': weight_fscore / repeat,
    }

#     filename = f"test_Proctor-Kmeans-LogisticRegression-NoLP-NoTH_5_{size}_{cv_index}.json"
#     json_dump = json.dumps(score_dict)
#     f_json = open(model_config['results_dir'] / (filename),"w")
#     f_json.write(json_dump)
#     f_json.close()

    return score_dict

In [None]:
step = 10  # Number of samples moved from the unlabeled to the labeled pool per query round

# The application filter is fixed to "all" in this experiment.
train_app_names = "all"
test_app_names = "all"


def _score_labeled_pool(pool_data, pool_labels):
    """Fit the single-stage 'aksar_svm' classifier on the given pool and score it on the test split."""
    return runMultipleTrainTestsProctorScikit(
        'aksar_svm',
        model_config,
        TOPOLOGY_NAME,
        pool_data,
        pool_labels,
        test_true_data,
        test_true_label,
        REPEAT_SUPERVISED,
        CV_INDEX,
        num_samples_per_pair,
        multimodal=False,
        stacked=STACKED)


for repeat_num in range(0,10):

    # Every repeat starts again from the same initial pools.
    temp_labeled_pool_labels = initial_labeled_pool.copy()
    temp_unlabeled_pool_labels = initial_unlabeled_pool.copy()

    logging.info("Temp Labeled data label dist: \n%s",temp_labeled_pool_labels['anom'].value_counts())
    logging.info("Temp Unlabeled data label dist: \n%s",temp_unlabeled_pool_labels['anom'].value_counts())

    # Score the initial labeled pool before any query (query_iter = 0).
    train_semisup_data = train_data.loc[temp_labeled_pool_labels.index]
    train_semisup_label = temp_labeled_pool_labels['anom'].values
    temp_score_dict = _score_labeled_pool(train_semisup_data, train_semisup_label)

    num_queried = 0
    score_records = [{'query_iter': num_queried,
                      'macro_avg_f1_score': temp_score_dict['macro_fscore'],
                      'false_alarm_rate': temp_score_dict['false_alarm_rate'],
                      'anomaly_miss_rate': temp_score_dict['anom_miss_rate'],
                      'repeat_num': repeat_num}]

    for i in range(0,query_size,step):

        logging.info(f"Labeled Pool Size: {len(temp_labeled_pool_labels)};  Unlabeled Pool Size: {len(temp_unlabeled_pool_labels)} \n", )

        if temp_unlabeled_pool_labels.empty:
            logging.warning("Unlabeled pool exhausted after %d queried samples; stopping early", num_queried)
            break

        # Never ask for more rows than remain in the pool (sample() would raise).
        # NOTE(review): no random_state is set, so the selection differs between runs.
        batch_size = min(step, len(temp_unlabeled_pool_labels))
        temp_selected_labels = temp_unlabeled_pool_labels.sample(batch_size)
        temp_labeled_pool_labels = pd.concat([temp_labeled_pool_labels, temp_selected_labels])
        temp_unlabeled_pool_labels = temp_unlabeled_pool_labels.drop(index=temp_selected_labels.index)
        num_queried += batch_size

        logging.info("Temp Labeled data label dist: \n%s",temp_labeled_pool_labels['anom'].value_counts())
        logging.info("Temp Unlabeled data label dist: \n%s",temp_unlabeled_pool_labels['anom'].value_counts())

        train_semisup_data = train_data.loc[temp_labeled_pool_labels.index]
        train_semisup_label = temp_labeled_pool_labels['anom'].values
        temp_score_dict = _score_labeled_pool(train_semisup_data, train_semisup_label)

        score_records.append({'query_iter': num_queried,
                              'macro_avg_f1_score': temp_score_dict['macro_fscore'],
                              'false_alarm_rate': temp_score_dict['false_alarm_rate'],
                              'anomaly_miss_rate': temp_score_dict['anom_miss_rate'],
                              'repeat_num': repeat_num})

        # Build the frame in one go from the collected records
        # (DataFrame.append was removed in pandas 2 and is quadratic).
        scores = pd.DataFrame(score_records)

        scores['fold'] = CV_INDEX
        scores['method'] = method
        scores['query_strategy'] = query_strategy
        scores['model'] = "Proctor"
        scores['dataset'] = SYSTEM
        scores['fe'] = FE_NAME
        scores['feature_count'] = NUM_FEATURE
        scores['query_size'] = query_size

        scores = scores.sort_values(by = ['query_iter']).reset_index(drop = True)

        # The file is rewritten after every query round, so partial results
        # are on disk if the run is interrupted.
        filename = f"train:{train_app_names}#test:{test_app_names}#{FE_NAME}#{NUM_FEATURE}#{method}#{query_strategy}#{query_size}#Proctor#{repeat_num}.csv"
        scores.to_csv(Path(conf["results_dir"]) / filename)

        logging.info("Saving: %s", filename)
