In [1]:
import sys
# Append the parent directory (relative to the working directory) to the module
# search path. NOTE(review): presumably this is what lets the in-house imports
# below (config, datasets, utils, modules) resolve -- confirm the repo layout.
sys.path.append("../")

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to project .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier

In [4]:
import pandas as pd
import os,sys
from pathlib import Path
import json 
import logging
# Root logger: INFO and above, timestamped, written to stderr.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format='%(asctime)s %(levelname)-7s %(message)s',
)

# Pin matplotlib's logger to INFO as well.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.INFO)

from sklearn.semi_supervised import LabelPropagation, LabelSpreading

#General ML 
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, silhouette_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from modules.clustering_helpers import select_labeled_samples

#In-house Module Imports
from config import Configuration 
from datasets import EclipseSampledDataset, VoltaSampledDataset
from utils import *

## Guide

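This notebook generates label splits with different amounts of labeled data for the selected cross-validation fold (`CV_INDEX`). For each configured number of samples per pair, it writes the selected (labeled) labels and the remaining labels as CSV files into that fold's `CV_<index>` directory.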

In [28]:
# Update these: experiment selectors that the rest of the notebook reads.
# Fold index: passed as `cv_fold` to Configuration and load_dataset, and used
# in the CV_<index> output directory name.
CV_INDEX = 4
# Directory name appended to the per-user project path to form OUTPUT_DIR.
OUTER_DIR = "active_learning_experiments_final_hdfs"
# Target system; later cells only handle 'eclipse' and 'volta'.
SYSTEM = 'eclipse'
# Feature-extraction name; later compared against 'mvts' and 'tsfresh' to set
# the matching load_dataset flags.
FE_NAME = 'tsfresh'
# Experiment name handed to Configuration as the 'exp_name' override.
EXP_NAME = f'{FE_NAME}_experiments'

In [None]:
# Root output directory, passed to Configuration as 'output_dir'.
# NOTE(review): `user` is not defined in any visible cell -- presumably it is
# supplied by `from utils import *`; confirm, otherwise this line raises
# NameError on a fresh kernel.
OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/{OUTER_DIR}'
# When True, the data-loading cell keeps only the columns listed in
# selected_features.csv.
FEATURE_SELECTION = False
# Note this is the string 'None', not the None object; it is forwarded as
# `scaler=` to load_dataset. Presumably it disables scaling there -- TODO confirm.
SCALER = 'None' #For now, do the scaling inside the notebook, then you can move that to the class function
# Passed to Configuration as the 'model_config' override.
MODEL_CONFIG = f'{SYSTEM}_labeled_data_gen'
# Emitted at WARNING level so the target name stands out in the cell output.
logging.warning('Results will be generated in {}, double check please!'.format(MODEL_CONFIG))

In [None]:
# Notebook-level settings handed to Configuration as overrides.
conf_overrides = {
    'output_dir': Path(OUTPUT_DIR),  #change
    'system': SYSTEM,
    'exp_name': EXP_NAME,
    'cv_fold': CV_INDEX,
    'model_config': MODEL_CONFIG,
}

# Build the experiment configuration in notebook (ipython) mode.
conf = Configuration(ipython=True, overrides=conf_overrides)

# Load the anomaly and application encoding dictionaries stored as JSON in the
# experiment directory.
_experiment_dir_prefix = str(conf['experiment_dir'])

with open(_experiment_dir_prefix + '/anom_dict.json') as anom_file:
    ANOM_DICT = json.load(anom_file)
with open(_experiment_dir_prefix + '/app_dict.json') as app_file:
    APP_DICT = json.load(app_file)

# Inverted lookups: encoded value -> name.
APP_REVERSE_DICT = {encoding: name for name, encoding in APP_DICT.items()}
ANOM_REVERSE_DICT = {encoding: name for name, encoding in ANOM_DICT.items()}

In [None]:
# Load the train/test features and labels for the selected system and CV fold.
#
# The dataset class depends on SYSTEM; the load_dataset arguments are identical
# for both systems, so the call is made once after the dataset is chosen.
# An unsupported SYSTEM fails immediately with a clear message instead of a
# NameError on `train_data` further down.
if SYSTEM == 'eclipse':
    eclipseDataset = EclipseSampledDataset(conf)
    dataset = eclipseDataset
elif SYSTEM == 'volta':
    voltaDataset = VoltaSampledDataset(conf)
    dataset = voltaDataset
else:
    raise ValueError(
        f"Unsupported SYSTEM {SYSTEM!r}: expected 'eclipse' or 'volta'."
    )

train_data, train_label, test_data, test_label = dataset.load_dataset(
    cv_fold=CV_INDEX,
    scaler=SCALER,
    borghesi=False,
    # Exactly one of these flags is True when FE_NAME is 'mvts' or 'tsfresh'.
    mvts=(FE_NAME == 'mvts'),
    tsfresh=(FE_NAME == 'tsfresh'),
)

# Features and labels must be row-aligned: same index values in the same order.
assert list(train_data.index) == list(train_label.index), \
    "train_data and train_label indices differ or are ordered differently"
assert list(test_data.index) == list(test_label.index), \
    "test_data and test_label indices differ or are ordered differently"

# Optionally restrict both splits to the feature columns listed in
# selected_features.csv (read from its column named '0').
if FEATURE_SELECTION:
    selected_features = pd.read_csv(conf['experiment_dir'] / 'selected_features.csv')
    selected_columns = list(selected_features['0'].values)
    train_data = train_data[selected_columns]
    test_data = test_data[selected_columns]


def _decode_anomaly(label_row):
    """Return the anomaly name for a label row's encoded 'anom' value."""
    return ANOM_REVERSE_DICT[label_row['anom']]


# Attach the anomaly name next to its encoded value in both label frames.
train_label['anom_names'] = train_label.apply(_decode_anomaly, axis=1)
test_label['anom_names'] = test_label.apply(_decode_anomaly, axis=1)

# Drop any column containing a NaN on the combined frame, so that train and
# test end up with the same set of columns.
all_data = pd.concat([train_data, test_data]).dropna(axis=1, how='any')
all_label = pd.concat([train_label, test_label])

# Split the cleaned frame back into train/test using the label indices.
train_data = all_data.loc[train_label.index]
test_data = all_data.loc[test_label.index]
    
# Log the shape of each split after the cleanup above.
for shape_message, split_frame in (
    ("Train data shape %s", train_data),
    ("Train label shape %s", train_label),
    ("Test data shape %s", test_data),
    ("Test label shape %s", test_label),
):
    logging.info(shape_message, split_frame.shape)

# Log how many samples each encoded anomaly label has in each split.
train_label_counts = train_label['anom'].value_counts()
logging.info("Train data label dist: \n%s", train_label_counts)
test_label_counts = test_label['anom'].value_counts()
logging.info("Test data label dist: \n%s", test_label_counts)

In [32]:
# Per-system sampling settings used by the generation loop:
#   num_samples_per_pair -- the values iterated over, each passed as the second
#                           argument of select_labeled_samples
#   num_of_app           -- passed as the third argument of select_labeled_samples
#                           (presumably the number of applications -- confirm)
if SYSTEM == 'eclipse':
    num_samples_per_pair = [1,3,5]
    num_of_app = 6
elif SYSTEM == 'volta':
    num_samples_per_pair = [1,7,13]
    num_of_app = 11
else:
    # A bare `raise` here has no active exception to re-raise and would only
    # produce an uninformative RuntimeError; name the offending value instead.
    raise ValueError(
        f"Unsupported SYSTEM {SYSTEM!r}: expected 'eclipse' or 'volta'."
    )

In [None]:
# All CSVs for this fold are written into the fold's CV_<index> directory.
cv_output_dir = conf['experiment_dir'] / f'CV_{CV_INDEX}'

for num_sample_per_pair in num_samples_per_pair:
    # select_labeled_samples returns three values; the third (node indices)
    # is unpacked but not used in this cell.
    selection = select_labeled_samples(train_label, num_sample_per_pair, num_of_app)
    labeled_train_label, labeled_test_label, node_indices_labeled = selection

    labeled_counts = labeled_train_label['anom'].value_counts()
    logging.info("Labeled data label dist: \n%s", labeled_counts)
    unlabeled_counts = labeled_test_label['anom'].value_counts()
    logging.info("Unlabeled data label dist: \n%s", unlabeled_counts)

    # One pair of CSVs per sample count, distinguished by the numeric suffix.
    train_csv_path = cv_output_dir / f'labeled_train_label_{num_sample_per_pair}.csv'
    test_csv_path = cv_output_dir / f'labeled_test_label_{num_sample_per_pair}.csv'
    labeled_train_label.to_csv(train_csv_path)
    labeled_test_label.to_csv(test_csv_path)