In [3]:
# Put the parent directory on the import path. The in-house imports further
# down (config, datasets, utils) presumably live there — TODO confirm.
# The path is relative, so it depends on the kernel's working directory.
import sys
sys.path.append("../")

In [4]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import pandas as pd
import os,sys
from pathlib import Path
import json 
import logging
# Root logger: INFO and above go to stderr as "<timestamp> <LEVEL, padded to 7> <message>".
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format='%(asctime)s %(levelname)-7s %(message)s',
)

# Set matplotlib's own logger to INFO explicitly, independent of the root level.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.INFO)

#General ML 
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

#In-house Module Imports
from config import Configuration 
from datasets import EclipseSampledDataset
from utils import *

In [16]:
# Account name that gets interpolated into OUTPUT_DIR in the settings cell;
# change it before running the notebook under a different account.
user = "sencan"
# Logged at WARNING level so the reminder is visible in the cell output.
logging.warning(f'Are you sure that you are: {user}?')



In [17]:
# Run settings — update these if you are not the desired user.
# OUTPUT_DIR is derived from `user`, so the user cell must run first.
OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/feature_extraction_experiments'
SYSTEM = 'eclipse'
EXP_NAME = 'tsfresh_experiments'
CV_INDEX = 0
FEATURE_SELECTION = False
SCALER = 'None' #For now, do the scaling inside the notebook, then you can move that to the class function
# Kept in sync with the 'model_config' override handed to Configuration
# ('cluster_then_predict'), so the warning below names the folder that the
# results are really written to.
MODEL_CONFIG = 'cluster_then_predict'
logging.warning('Results will be generated in {}, double check please!'.format(MODEL_CONFIG))



In [18]:
# Build the run configuration from the notebook-level settings.
conf = Configuration(
    ipython=True,
    overrides={
        'output_dir': Path(OUTPUT_DIR), #change
        'system': SYSTEM,
        'exp_name': EXP_NAME,
        'cv_fold': CV_INDEX,
        # NOTE(review): literal value; the MODEL_CONFIG constant from the
        # settings cell is not consulted here.
        'model_config': 'cluster_then_predict',
    },
)

# Name -> encoding mappings stored as JSON inside the experiment directory.
with open(str(conf['experiment_dir']) + '/anom_dict.json') as f:
    ANOM_DICT = json.load(f)
with open(str(conf['experiment_dir']) + '/app_dict.json') as f:
    APP_DICT = json.load(f)

# Inverted copies (encoding -> name) for turning encoded labels back into names.
APP_REVERSE_DICT = {encoding: name for name, encoding in APP_DICT.items()}
ANOM_REVERSE_DICT = {encoding: name for name, encoding in ANOM_DICT.items()}

2022-01-21 12:35:32,309 INFO    Setting directory names
2022-01-21 12:35:32,316 INFO    Model config folder already exists, be careful, otherwise it will overwrite!
2022-01-21 12:35:32,319 INFO    Saving configuration as CSV


# The configuration used for this run:
# {'cv_fold': 0,
#  'exp_name': 'tsfresh_experiments',
#  'experiment_dir': PosixPath('/projectnb/peaclab-mon/sencan/feature_extraction_experiments/eclipse/tsfresh_experiments'),
#  'feature_extract': False,
#  'feature_select': False,
#  'hdf_data_path': PosixPath('/projectnb/peaclab-mon/aksar/datasets/eclipse_sampled_hdfs'),
#  'metadata_path': None,
#  'model_config': 'cluster_then_predict',
#  'model_config_dir': PosixPath('/projectnb/peaclab-mon/sencan/feature_extraction_experiments/eclipse/tsfresh_experiments/CV_0/cluster_then_predict'),
#  'model_dir': PosixPath('/projectnb/peaclab-mon/sencan/feature_extraction_experiments/eclipse/tsfresh_experiments/CV_0/cluster_then_predict/model'),
#  'num_split': 5,
#  'operation': 'read',
#  'output_dir': PosixPath('/projectnb/peaclab-mon/sencan/feature_extraction_experiments/eclipse'),
#  'plots_dir': PosixPath('/projectnb/peaclab-mon/sencan/feature_extraction_experiments/eclipse/tsfresh_experiments/C

In [19]:
def _anom_name(row):
    """Return the anomaly name for one label row via ANOM_REVERSE_DICT."""
    return ANOM_REVERSE_DICT[row['anom']]


# Load the train/test split described by `conf`.
eclipseDataset = EclipseSampledDataset(conf)
train_data, train_label, test_data, test_label = eclipseDataset.load_dataset(
    scaler=SCALER,
    borghesi=False,
    tsfresh=True,
)

# Features and labels must be row-aligned (same index, same order).
assert list(train_data.index) == list(train_label.index)
assert list(test_data.index) == list(test_label.index)

if FEATURE_SELECTION:
    # Keep only the columns listed in the stored selection (CSV column '0').
    selected_features = pd.read_csv(conf['experiment_dir'] / 'selected_features.csv')
    kept_columns = list(selected_features['0'].values)
    train_data = train_data[kept_columns]
    test_data = test_data[kept_columns]

# Attach a readable anomaly name next to the encoded 'anom' label.
train_label['anom_names'] = train_label.apply(_anom_name, axis=1)
test_label['anom_names'] = test_label.apply(_anom_name, axis=1)

# Drop every column that has a NaN in either split, then re-split by index
# so train and test end up with the same column set.
all_data = pd.concat([train_data, test_data]).dropna(axis=1, how='any')
all_label = pd.concat([train_label, test_label])

train_data = all_data.loc[train_label.index]
test_data = all_data.loc[test_label.index]

logging.info("Train data shape %s", train_data.shape)
logging.info("Train label shape %s", train_label.shape)
logging.info("Test data shape %s", test_data.shape)
logging.info("Test label shape %s", test_label.shape)

logging.info("Train data label dist: \n%s", train_label['anom'].value_counts())
logging.info("Test data label dist: \n%s", test_label['anom'].value_counts())

2022-01-21 12:35:39,484 INFO    BaseDataset Class Initialization
2022-01-21 12:35:39,484 INFO    HPCDataset Class Initialization
2022-01-21 12:35:39,485 INFO    EclipseSampledDataset Class Initialization
2022-01-21 12:36:21,859 INFO    Train data shape (1351, 121836)
2022-01-21 12:36:21,860 INFO    Train label shape (1351, 2)
2022-01-21 12:36:21,861 INFO    Test data shape (2462, 121836)
2022-01-21 12:36:21,861 INFO    Test label shape (2462, 2)
2022-01-21 12:36:31,423 INFO    Train data shape (1351, 115159)
2022-01-21 12:36:31,425 INFO    Train label shape (1351, 3)
2022-01-21 12:36:31,426 INFO    Test data shape (2462, 115159)
2022-01-21 12:36:31,426 INFO    Test label shape (2462, 3)
2022-01-21 12:36:31,428 INFO    Train data label dist: 
0    1217
2      34
1      34
4      33
3      33
Name: anom, dtype: int64
2022-01-21 12:36:31,431 INFO    Test data label dist: 
1    542
2    542
3    539
4    536
0    303
Name: anom, dtype: int64


Without feature selection, we would have 8866 columns in total, 7150 of which would be per-core metrics.

#### After feature selection, how many per-core metrics are there?

After feature selection, we have 2344 columns in total, 1939 of which are per-core metrics.

In [9]:
# Number of feature columns whose name contains 'per_core'.
count = sum('per_core' in column_name for column_name in train_data.columns)
print(count)

1939


In [28]:
SCALER = 'MinMax'

# Supported scaler names mapped to their scikit-learn transformer classes.
_SCALER_CLASSES = {'MinMax': MinMaxScaler, 'Standard': StandardScaler}

if SCALER not in _SCALER_CLASSES:
    # Fail loudly: otherwise `scaler` would be unbound here, or worse, a stale
    # `scaler` left in the kernel by another cell would be used silently.
    raise ValueError(
        f"Unsupported SCALER {SCALER!r}; expected one of {sorted(_SCALER_CLASSES)}"
    )
scaler = _SCALER_CLASSES[SCALER]()

# Random forest with a fixed seed, preceded by the scaler so that scaling is
# fit inside whatever the pipeline is fit on.
clf = RandomForestClassifier(random_state=0)

pipeline = Pipeline([('transformer', scaler), ('estimator', clf)])

### Running FR Tuncer with MVTS/Tsfresh Data

In [20]:
SCALER = 'MinMax'


def _rescale(fitted_scaler, frame):
    """Transform `frame` with `fitted_scaler`, keeping its column names and index."""
    return pd.DataFrame(
        fitted_scaler.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


# The scaler is fit on the training split only and then applied to both splits.
# Any other SCALER value leaves the data untouched.
if SCALER == 'MinMax':
    minmax_scaler = MinMaxScaler().fit(train_data)
    train_data = _rescale(minmax_scaler, train_data)
    test_data = _rescale(minmax_scaler, test_data)
elif SCALER == 'Standard':
    # Per-feature Z-normalization (zero mean, unit variance).
    scaler = StandardScaler().fit(train_data)
    train_data = _rescale(scaler, train_data)
    test_data = _rescale(scaler, test_data)

In [21]:
NEW_FS = True
NUM_FEATURES = 20000

if NEW_FS:
    # Keep the NUM_FEATURES columns with the highest chi-squared score against
    # the anomaly label. chi2 rejects negative inputs, so this presumably relies
    # on the MinMax scaling cell having run first — TODO confirm.
    selector = SelectKBest(chi2, k=NUM_FEATURES)
    selector.fit(train_data, train_label['anom'])
    selected_columns = train_data.columns[selector.get_support(indices=True)]
    train_data = train_data[selected_columns]
    # Index the test split with the exact same columns, in the same order.
    # The former `test_data.columns & selected_columns` used the `&` operator
    # as a set intersection, which pandas has deprecated and later removed.
    # It could also silently leave train and test with different columns,
    # whereas a missing column now raises a KeyError.
    test_data = test_data[selected_columns]

logging.info(train_data.shape)
logging.info(test_data.shape)

# NOTE(review): the selector is fit on the train split and the two splits are
# then concatenated for cross-validation, so held-out CV rows that came from
# the train split have already influenced feature selection. Consider moving
# SelectKBest into the Pipeline if unbiased CV scores are needed.
all_data = pd.concat([train_data, test_data])
all_data = all_data.dropna(axis=1, how='any')
all_label = pd.concat([train_label, test_label])

logging.info(all_data.shape)
logging.info(all_label.shape)

2022-01-21 12:36:43,768 INFO    (1351, 20000)
2022-01-21 12:36:43,769 INFO    (2462, 20000)
2022-01-21 12:36:44,750 INFO    (3813, 20000)
2022-01-21 12:36:44,751 INFO    (3813, 3)


In [22]:
# Seeded random forest wrapped in a single-step pipeline; no scaler step is
# included in this variant.
clf = RandomForestClassifier(random_state=0)
pipeline = Pipeline(steps=[('estimator', clf)])
# Variant with the scaler as a pipeline step, kept for reference:
#pipeline = Pipeline([('transformer', scaler), ('estimator', clf)])

In [23]:
# Metrics collected per fold; results appear under 'test_<metric>' keys.
scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted']

# 5-fold stratified CV (no shuffling) over the combined train+test rows,
# stratified on the encoded anomaly label.
skf = StratifiedKFold(n_splits=5)

scores = cross_validate(
    estimator=pipeline,
    X=all_data,
    y=all_label['anom'].values,
    scoring=scoring,
    cv=skf,
)

In [25]:
# Display the raw cross_validate result: per-fold fit/score times and test metrics.
scores

{'fit_time': array([14.36127019, 14.22909045, 14.40623927, 14.01253319, 14.13402009]),
 'score_time': array([0.38078332, 0.36517859, 0.36603427, 0.34352994, 0.36449242]),
 'test_precision_macro': array([0.98951867, 0.98436177, 0.98784029, 0.98828084, 0.99238613]),
 'test_recall_macro': array([0.98951945, 0.98434783, 0.98782609, 0.98778006, 0.99125858]),
 'test_f1_macro': array([0.98951166, 0.98434753, 0.98782586, 0.987894  , 0.99180277]),
 'test_f1_weighted': array([0.99212481, 0.98820423, 0.99082551, 0.99085529, 0.99343037])}

In [26]:
# tsfresh results with 20k features (NUM_FEATURES = 20000):
# mean weighted F1 over the 5 cross-validation folds.
scores['test_f1_weighted'].mean()

0.9910880447994387

In [12]:
# mvts results
# NOTE(review): the recorded output belongs to an earlier execution (In [12]),
# presumably with MVTS features loaded instead of tsfresh — TODO confirm.
# Re-running this cell now would just repeat the current `scores` mean.
scores['test_f1_weighted'].mean()

0.9826724483115286

In [13]:
# results after using per core metrics with feature selection (previously it was 0.90)
# NOTE(review): the recorded output belongs to an earlier execution (In [13]);
# it is not reproduced by the cells as currently ordered.
scores['test_f1_weighted'].mean()

0.923437823541215