In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier

In [3]:
import sys
sys.path.append("..")

In [4]:
import pandas as pd
import os,sys
from pathlib import Path
import json 
import logging
logging.basicConfig(format='%(asctime)s %(levelname)-7s %(message)s',
                    stream=sys.stderr, level=logging.INFO)
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.INFO)

from sklearn.semi_supervised import LabelPropagation, LabelSpreading

#General ML 
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, silhouette_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2,f_classif
from modules.clustering_helpers import select_labeled_samples
#In-house Module Imports
from config import Configuration 
from datasets import EclipseSampledDataset, VoltaSampledDataset
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from utils import *
import hdbscan
import re

In [75]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
import json

In [5]:
### new ML models
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

2022-05-06 06:00:53,402 INFO    Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


In [8]:
### Settings
user = "aksar"
logging.warning(f'Are you sure that you are: {user}?')
OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/active_learning_experiments'
classifier_name = 'rf'
num_samples_per_pair = 1
NUM_FEATURE  = 2000
SYSTEM = 'volta'
FE_NAME = 'tsfresh'
EXP_NAME  = 'tsfresh_experiments'
CV_INDEX = 0
FS_NAME = "CHI"
FEATURE_SELECTION = False
SCALER = 'None' #For now, do the scaling inside the notebook, then you can move that to the class function
MODEL_CONFIG = 'tuning_results' #rf_tuncer or rf_tuncer_worst_case
logging.warning('Results will be generated in {}, double check please!'.format(MODEL_CONFIG))



In [9]:
conf = Configuration(ipython=True,
                     overrides={
                         'output_dir': Path(OUTPUT_DIR), #change
                         'system' : SYSTEM,
                         'exp_name':EXP_NAME,                                                  
                         'cv_fold':CV_INDEX, 
                         'model_config': MODEL_CONFIG,
                     })

with open(str(conf['experiment_dir']) + '/anom_dict.json') as f:
    ANOM_DICT = json.load(f)
with open(str(conf['experiment_dir']) + '/app_dict.json') as f:
    APP_DICT = json.load(f) 
    
APP_REVERSE_DICT = {}
for app_name, app_encoding in APP_DICT.items():
    APP_REVERSE_DICT[app_encoding] = app_name    

ANOM_REVERSE_DICT = {}
for anom_name, anom_encoding in ANOM_DICT.items():
    ANOM_REVERSE_DICT[anom_encoding] = anom_name

2022-05-06 06:01:23,192 INFO    Setting directory names
2022-05-06 06:01:23,198 INFO    Model config folder already exists, be careful, otherwise it will overwrite!
2022-05-06 06:01:23,202 INFO    Saving configuration as CSV


# The configuration used for this run:
# {'cv_fold': 0,
#  'exp_name': 'tsfresh_experiments',
#  'experiment_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments'),
#  'feature_extract': False,
#  'feature_select': False,
#  'hdf_data_path': PosixPath('/projectnb/peaclab-mon/aksar/datasets/tpds_data_hdfs'),
#  'metadata_path': None,
#  'model_config': 'tuning_results',
#  'model_config_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/tuning_results'),
#  'model_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/tuning_results/model'),
#  'num_split': 5,
#  'operation': 'read',
#  'output_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta'),
#  'plots_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/tuning_results/model/plots'),
#  'processed_ldms_d

In [10]:
if SYSTEM == "eclipse":
        eclipseDataset = EclipseSampledDataset(conf)
        train_data, train_label, test_data, test_label = eclipseDataset.load_dataset(
            cv_fold=CV_INDEX,
            scaler=SCALER,
            borghesi=False,
            mvts=True if FE_NAME == "mvts" else False,
            tsfresh=True if FE_NAME == "tsfresh" else False,
        )

elif SYSTEM == "volta":
    voltaDataset = VoltaSampledDataset(conf)
    train_data, train_label, test_data, test_label = voltaDataset.load_dataset(
        cv_fold=CV_INDEX,
        scaler=SCALER,
        borghesi=False,
        mvts=True if FE_NAME == "mvts" else False,
        tsfresh=True if FE_NAME == "tsfresh" else False,
    )

assert list(train_data.index) == list(train_label.index)  # check the order of the labels
assert list(test_data.index) == list(test_label.index)  # check the order of the labels

if FEATURE_SELECTION:
    selected_features = pd.read_csv(conf["experiment_dir"] / "selected_features.csv")
    train_data = train_data[list(selected_features["0"].values)]
    test_data = test_data[list(selected_features["0"].values)]

train_label["anom_names"] = train_label.apply(lambda x: ANOM_REVERSE_DICT[x["anom"]], axis=1)
train_label["app_names"] = train_label["app"].apply(lambda x: APP_REVERSE_DICT[x])
test_label["anom_names"] = test_label.apply(lambda x: ANOM_REVERSE_DICT[x["anom"]], axis=1)
test_label["app_names"] = test_label["app"].apply(lambda x: APP_REVERSE_DICT[x])

all_data = pd.concat([train_data, test_data])
all_data = all_data.dropna(axis=1, how="any")
all_label = pd.concat([train_label, test_label])

train_data = all_data.loc[train_label.index]
test_data = all_data.loc[test_label.index]

logging.info("Train data shape %s", train_data.shape)
logging.info("Train label shape %s", train_label.shape)
logging.info("Test data shape %s", test_data.shape)
logging.info("Test label shape %s", test_label.shape)

logging.info("Train data label dist: \n%s", train_label["anom"].value_counts())
logging.info("Test data label dist: \n%s", test_label["anom"].value_counts())

2022-05-06 06:01:29,356 INFO    BaseDataset Class Initialization
2022-05-06 06:01:29,357 INFO    HPCDataset Class Initialization
2022-05-06 06:01:29,358 INFO    VoltaSampledDataset Class Initialization
2022-05-06 06:04:06,102 INFO    Train data shape (6326, 102311)
2022-05-06 06:04:06,121 INFO    Train label shape (6326, 2)
2022-05-06 06:04:06,122 INFO    Test data shape (14589, 102311)
2022-05-06 06:04:06,122 INFO    Test label shape (14589, 2)
2022-05-06 06:04:57,739 INFO    Train data shape (6326, 99169)
2022-05-06 06:04:57,740 INFO    Train label shape (6326, 4)
2022-05-06 06:04:57,741 INFO    Test data shape (14589, 99169)
2022-05-06 06:04:57,741 INFO    Test label shape (14589, 4)
2022-05-06 06:04:57,744 INFO    Train data label dist: 
0    5694
2     159
4     159
1     158
3     156
Name: anom, dtype: int64
2022-05-06 06:04:57,748 INFO    Test data label dist: 
0    13286
1      332
2      326
3      324
4      321
Name: anom, dtype: int64


In [11]:
SCALER = "MinMax"

if SCALER == "MinMax":

    minmax_scaler = MinMaxScaler().fit(train_data)
    train_data = pd.DataFrame(
        minmax_scaler.transform(train_data), columns=train_data.columns, index=train_data.index
    )
    test_data = pd.DataFrame(
        minmax_scaler.transform(test_data), columns=test_data.columns, index=test_data.index
    )

elif SCALER == "Standard":

    # Standardize data (per feature Z-normalization, i.e. zero-mean and unit variance)
    scaler = StandardScaler().fit(train_data)
    train_data = pd.DataFrame(
        scaler.transform(train_data), columns=train_data.columns, index=train_data.index
    )
    test_data = pd.DataFrame(
        scaler.transform(test_data), columns=test_data.columns, index=test_data.index
    )

# Implement new feature selection strategies below
if FS_NAME == "CHI":

    selector = SelectKBest(chi2, k=NUM_FEATURE)
    selector.fit(train_data, train_label["anom"])
    train_data = train_data[train_data.columns[selector.get_support(indices=True)]]
    selected_columns = train_data.columns
    test_data = test_data[test_data.columns & selected_columns]

elif FS_NAME == "TSFRESH":
    logging.warning(
        "NUM_FEATURE parameter will be overwritten by the automatic selection process"
    )

    y_train = train_label["anom"]
    X_train = train_data

    relevant_features = set()

    for label in y_train.unique():
        y_train_binary = y_train == label
        X_train_filtered = tsfresh.select_features(X_train, y_train_binary)
        print(
            "Number of relevant features for class {}: {}/{}".format(
                label, X_train_filtered.shape[1], X_train.shape[1]
            )
        )
        relevant_features = relevant_features.union(set(X_train_filtered.columns))
    train_data = train_data[relevant_features]
    test_data = test_data[relevant_features]
    NUM_FEATURE = len(relevant_features)

elif FS_NAME == "NONE":
    logging.info("No feature selection strategy is specified, will be using all features")
    NUM_FEATURE = len(train_data.columns)

logging.info(train_data.shape)
logging.info(test_data.shape)

2022-05-06 06:05:38,705 INFO    (6326, 2000)
2022-05-06 06:05:38,706 INFO    (14589, 2000)


In [12]:
labeled_train_label = pd.read_csv(
    conf["experiment_dir"]
    / f"CV_{CV_INDEX}"
    / f"labeled_train_label_{num_samples_per_pair}.csv",
    index_col=["node_id"],
)
labeled_test_label = pd.read_csv(
    conf["experiment_dir"]
    / f"CV_{CV_INDEX}"
    / f"labeled_test_label_{num_samples_per_pair}.csv",
    index_col=["node_id"],
)
node_indices_labeled = list(labeled_train_label["anom"].index.values)

logging.info("Labeled data label dist: \n%s", labeled_train_label["anom"].value_counts())
logging.info("Unlabeled data label dist: \n%s", labeled_test_label["anom"].value_counts())

2022-05-06 06:05:38,809 INFO    Labeled data label dist: 
2    12
4    11
3    11
1    11
0    11
Name: anom, dtype: int64
2022-05-06 06:05:38,812 INFO    Unlabeled data label dist: 
0    5683
4     148
2     147
1     147
3     145
Name: anom, dtype: int64


In [84]:
# Set a new column for label status
node_indices_unlabeled = []
for node in train_label.index:
    if node not in node_indices_labeled:
        node_indices_unlabeled.append(node)
train_label["label_status"] = train_label["anom"]  # for the full data case
train_label["label_status"] = np.where(
    train_label.index.get_level_values("node_id").isin(node_indices_unlabeled),
    -1,
    train_label["label_status"],
)

In [13]:
# initial_labeled_pool contains one sample from each application anomaly pair
initial_labeled_pool = train_label[(train_label["label_status"] != -1)]
# Active learning or random sampling will be querying from the same pool
initial_unlabeled_pool = train_label[(train_label["label_status"] == -1)]

if classifier_name == "rf":
    selected_classifier = RandomForestClassifier()
elif classifier_name == "lr":
    selected_classifier = LogisticRegression()
else:
    selected_classifier = RandomForestClassifier()

scores = pd.DataFrame()

all_app_names = list(APP_DICT.keys())
selected_apps = dict.fromkeys(all_app_names, 0)
selected_anoms = dict.fromkeys(list(ANOM_REVERSE_DICT.keys()), 0)

# Create the label and data for the starting condition composed of selected apps
y_initial = initial_labeled_pool
x_initial = train_data[train_data.index.get_level_values("node_id").isin(y_initial.index)]
y_initial = y_initial["anom"].to_numpy()
"""x_initial = x_initial.to_numpy()

x_unlabeled = train_data[
    train_data.index.get_level_values("node_id").isin(initial_unlabeled_pool.index)
]
y_unlabeled = initial_unlabeled_pool
x_unlabeled = x_unlabeled.to_numpy()"""

KeyError: 'label_status'

In [102]:
rf_param_grid = { 
            'n_estimators': [100, 200, 500, 1000, 2000, 10000],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : [4,8,12,None],
            'criterion' :['gini', 'entropy']
        }

lgbm_param_grid = {
            "num_leaves": [2, 8, 31, 128],
            "learning_rate": [0.01, 0.1, 0.3],
            "max_depth": [-1, 2, 8],
            "colsample_bytree": [0.5, 1.0],
        }

lr_param_grid = {'penalty' : ['l1', 'l2'],
                'C' : [0.1, 0.5, 1.0, 3.0, 5.0],
                'solver' : ['liblinear']}

mlp_param_grid = {
            "max_iter": [100, 200, 500, 1000],
            "hidden_layer_sizes": [(10, 10, 10), (30, 20, 10), (50, 100, 50), (100)],
            "alpha": [0.0001, 0.001, 0.01],
        }


In [88]:
rf_single_param_grid = { 
            'n_estimators': [10,50,100,200],
            'max_depth' : [None,2,4,8,10,20],
            'criterion' :['gini','entropy']
        }

In [89]:
rf_single_param_grid_trial = { 
            'n_estimators': [10],
            'max_depth' : [50],
            'criterion' :['gini']
        }

lr_single_param_grid = { 
            'penalty': ['l1','l2'],
             'C' : [0.1],
            'solver' : ['liblinear']
        }

In [17]:
###
models = ['random_forest', 'logistic_regression', 'lgbm','mlp']

In [26]:
%%time
for model in models:
    if model == 'random_forest':
        logging.info("Tunning Random Forest...")
        rfc = RandomForestClassifier(random_state=42)
        rfc.fit(train_data, train_label['anom'])  # previously we were giving x_initial, y_initial
        pred = rfc.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Inıtial Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])
        CV_rfc = GridSearchCV(estimator=rfc, param_grid=rf_param_grid, cv= 5)
        CV_rfc.fit(train_data, train_label['anom'])
        logging.info(CV_rfc.best_params_)
        best_max_features = CV_rfc.best_params_['max_features']
        best_n_estimators = CV_rfc.best_params_['n_estimators']
        best_max_depth    = CV_rfc.best_params_['max_depth']
        best_criterion    = CV_rfc.best_params_['criterion']
        tuned_rfc=RandomForestClassifier(random_state=42, max_features= best_max_features, n_estimators= best_n_estimators, max_depth=best_max_depth, criterion=best_criterion)
        tuned_rfc.fit(train_data, train_label['anom'])
        pred = tuned_rfc.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Tuned Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])
        
    elif model == 'lgbm':
        logging.info("Tunning LGBM...")
        train_data = train_data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
        lgbm = LGBMClassifier(objective='multiclass', random_state=5)
        lgbm.fit(x_initial, y_initial)
        pred = lgbm.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Inıtial Macro-Avg  F-1 for LGBM on Test data: ",report_dict["macro avg"]["f1-score"])
        CV_lgbm = GridSearchCV(estimator=lgbm, param_grid= lgbm_param_grid, cv= 5)
        CV_lgbm.fit(x_initial, y_initial)
        logging.info(CV_lgbm.best_params_)
        best_n_estimators  = CV_lgbm.best_params_['n_estimators']
        best_max_depth     = CV_lgbm.best_params_['max_depth']
        best_learning_rate = CV_lgbm.best_params_['learning_rate']
        best_lambda_l1     = CV_lgbm.best_params_['best_lambda_l1']
        best_lambda_l2     = CV_lgbm.best_params_['best_lambda_l2']
        tuned_lgbm = LGBMClassifier(random_state = 5, n_estimators= best_n_estimators, max_depth=best_max_depth, learning_rate = best_learning_rate, lambda_l1 = best_lambda_l1, lambda_l2 = best_lambda_l2 )
        tuned_lgbm.fit(x_initial, y_initial)
        pred = tuned_lgbm.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Tuned Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])
        
    elif model == 'logistic_regression':
        logging.info("Tunning Logistic Regression...")
        lr = LogisticRegression(random_state=0, dual=False, max_iter=12000)
        lr.fit(x_initial, y_initial)
        pred = lr.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Inıtial Macro-Avg  F-1 forLogistic Regression on Test data: ",report_dict["macro avg"]["f1-score"])
        CV_lr = GridSearchCV(estimator = lr, param_grid=lr_param_grid, cv= 5)
        CV_lr.fit(x_initial, y_initial)
        logging.info(CV_lr.best_params_)
        best_penalty = CV_lr.best_params_['penalty']
        best_C = CV_lr.best_params_['C']        
        best_solver = CV_lr.best_params_['solver']
        tuned_lr = LogisticRegression(max_iter=12000, dual=False, random_state=0, penalty= best_penalty, C = best_C, solver=best_solver)
        tuned_lr.fit(x_initial, y_initial)
        pred = tuned_lr.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Tuned Macro-Avg  F-1 for Logistic Regression Test data: ",report_dict["macro avg"]["f1-score"])
        
    elif model == "mlp":
        logging.info("Tunning MLP...")
        mlp = MLPClassifier(random_state=1, max_iter=300).fit(x_initial, y_initial)
        pred = mlp.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Tuned Macro-Avg  F-1 for MLP Regression Test data: ",report_dict["macro avg"]["f1-score"])
        CV_mlp = GridSearchCV(estimator = mlp, param_grid=mlp_param_grid, cv= 5)
        CV_mlp.fit(x_initial, y_initial)
        logging.info(CV_mlp.best_params_)
        
        best_max_iter           = CV_mlp.best_params_['max_iter']     
        best_hidden_layer_sizes = CV_mlp.best_params_['hidden_layer_sizes']
        best_alpha              = CV_mlp.best_params_['alpha']
        
        tuned_mlp = MLPClassifier(max_iter=best_max_iter,hidden_layer_sizes = best_hidden_layer_sizes, alpha = best_alpha)
        tuned_mlp.fit(x_initial, y_initial)
        pred = tuned_mlp.predict(test_data)
        report_dict = classification_report(test_label["anom"], pred, output_dict=True)
        print("Tuned Macro-Avg  F-1 for MLP Test data: ",report_dict["macro avg"]["f1-score"])

2022-04-19 11:33:27,880 INFO    Tunning MLP...


Tuned Macro-Avg  F-1 for MLP Regression Test data:  0.535104999484931


2022-04-19 11:41:44,106 INFO    {'alpha': 0.0001, 'hidden_layer_sizes': 100, 'max_iter': 100}


Tuned Macro-Avg  F-1 for MLP Test data:  0.551988777013474
CPU times: user 8min 16s, sys: 202 ms, total: 8min 16s
Wall time: 8min 16s


In [50]:
%%time
logging.info("Tunning Random Forest...")
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train_data, train_label['anom'])  # previously we were giving x_initial, y_initial
pred = rfc.predict(test_data)
initial_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Inıtial Macro-Avg  F-1 for Random Forest on Test data: ",initial_report_dict["macro avg"]["f1-score"])
CV_rfc = GridSearchCV(estimator=rfc, param_grid=rf_single_param_grid, cv= 5, scoring = 'f1_macro')
CV_rfc.fit(train_data, train_label['anom'])
logging.info(CV_rfc.best_params_)
pred = CV_rfc.predict(test_data)
final_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Tuned Macro-Avg  F-1 for Random Forest on Test data: ",final_report_dict["macro avg"]["f1-score"])

2022-05-06 01:41:48,755 INFO    Tunning Random Forest...


Inıtial Macro-Avg  F-1 for Random Forest on Test data:  0.9507058075295978


2022-05-06 02:02:44,711 INFO    {'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 10}


Tuned Macro-Avg  F-1 for Random Forest on Test data:  0.9522537586113472
CPU times: user 20min 13s, sys: 39.4 s, total: 20min 53s
Wall time: 20min 56s


In [None]:
%%time
logging.info(f"Tunning {MODEL}...")
clf.fit(train_data, train_label["anom"])  # previously we were giving x_initial, y_initial
pred = clf.predict(test_data)
initial_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print(
    f"Inıtial Macro-Avg  F-1 for {MODEL} on Test data: ",
    initial_report_dict["macro avg"]["f1-score"],
)
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring="f1_macro")
CV_clf.fit(train_data, train_label["anom"])
logging.info(CV_clf.best_params_)
pred = CV_clf.predict(test_data)
final_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print(
    f"Tuned Macro-Avg  F-1 for {MODEL} on Test data: ",
    final_report_dict["macro avg"]["f1-score"],
)
CV_clf.best_params_["initial_f1_score"] = initial_report_dict["macro avg"]["f1-score"]
CV_clf.best_params_["tuned_f1_score"] = final_report_dict["macro avg"]["f1-score"]

jsonpath = conf["results_dir"] / f"{MODEL}_Best_Params.json"
jsonpath.write_text(json.dumps(CV_clf.best_params_))

In [66]:
CV_rfc.best_params_['initial_f1_score'] = initial_report_dict["macro avg"]["f1-score"]
CV_rfc.best_params_['tuned_f1_score'] = final_report_dict["macro avg"]["f1-score"]

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 10}

In [69]:
CV_rfc.best_params_['tuned_f1_score'] = report_dict["macro avg"]["f1-score"]

In [70]:
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'n_estimators': 10,
 'tuned_f1_score': 0.9522537586113472}

In [90]:
%%time
logging.info("Tunning Random Forest...")
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train_data, train_label['anom'])  # previously we were giving x_initial, y_initial
pred = rfc.predict(test_data)
report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Inıtial Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])
CV_rfc = GridSearchCV(estimator=rfc, param_grid=rf_single_param_grid_trial, cv= 5, scoring = 'f1_macro')
CV_rfc.fit(train_data, train_label['anom'])
logging.info(CV_rfc.best_params_)
pred = CV_rfc.predict(test_data)
report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Tuned Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])

2022-05-06 05:21:30,981 INFO    Tunning Random Forest...


Inıtial Macro-Avg  F-1 for Random Forest on Test data:  0.938938633538768


2022-05-06 05:21:42,780 INFO    {'criterion': 'gini', 'max_depth': 50, 'n_estimators': 10}


Tuned Macro-Avg  F-1 for Random Forest on Test data:  0.8801554781753561
CPU times: user 11.6 s, sys: 482 ms, total: 12 s
Wall time: 12.1 s


In [92]:
jsonpath = conf['results_dir'] / 'RandomForest_Params.json'
jsonpath.write_text(json.dumps(CV_rfc.best_params_))

58

In [31]:
%%time
logging.info("Tunning Logistic Regression...")
lr = LogisticRegression(random_state=0, dual=False, max_iter=12000)
lr.fit(train_data, train_label['anom'])  # previously we were giving x_initial, y_initial
pred = lr.predict(test_data)
report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Inıtial Macro-Avg  F-1 for LR on Test data: ",report_dict["macro avg"]["f1-score"])
CV_lr = GridSearchCV(estimator=lr, param_grid=lr_single_param_grid, cv= 5, scoring = 'f1_macro')
CV_lr.fit(train_data, train_label['anom'])
logging.info(CV_lr.best_params_)
pred = CV_lr.predict(test_data)
report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Tuned Macro-Avg  F-1 for Random Forest on Test data: ",report_dict["macro avg"]["f1-score"])

2022-05-06 01:16:52,123 INFO    Tunning Logistic Regression...


Inıtial Macro-Avg  F-1 for LR on Test data:  0.9492626220891143


2022-05-06 01:17:59,087 INFO    {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


Tuned Macro-Avg  F-1 for Random Forest on Test data:  0.9294848976348679
CPU times: user 1min 3s, sys: 3.57 s, total: 1min 7s
Wall time: 1min 7s


In [24]:
tuned_mlp = MLPClassifier(max_iter=1000,hidden_layer_sizes = 100, alpha = 0.0001)
tuned_mlp.fit(x_initial, y_initial)
pred = tuned_mlp.predict(test_data)
report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print("Tuned Macro-Avg  F-1 for MLP Test data: ",report_dict["macro avg"]["f1-score"])

Tuned Macro-Avg  F-1 for MLP Test data:  0.5591269880294288


In [99]:
%%time
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(objective='multiclass')
#renamed_train_data = train_data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
#renamed_train_data = renamed_train_data.to_string(header=False)
lgbm.fit(train_data.values,train_label['anom'])
y_pred = lgbm.predict(test_data)
report_dict = classification_report(test_label["anom"], y_pred, output_dict=True)
print("Inıtial Macro-Avg  F-1 for LGBM Test data: ",report_dict["macro avg"]["f1-score"])

Inıtial Macro-Avg  F-1 for LGBM Test data:  0.9618439889706666
CPU times: user 1min 9s, sys: 1.35 s, total: 1min 10s
Wall time: 1min 10s


In [15]:
%%time
MODEL = 'lgbm'
if MODEL == "random_forest":
        param_grid = {
            "n_estimators": [8, 10, 20, 100, 200],
            "max_depth": [None, 4, 8, 10, 20],
            "criterion": ["gini", "entropy"],
        }

        clf = RandomForestClassifier(random_state=42)

elif MODEL == "logistic_regression":

    param_grid = {
        "penalty": ["l1", "l2"],
        "C": [0.001, 0.01, 0.1, 1.0, 10.0],
        "solver": ["liblinear"],
    }

    clf = LogisticRegression(random_state=0, dual=False, max_iter=12000)

elif MODEL == "mlp":

    param_grid = {
        "max_iter": [100, 200, 500, 1000],
        "hidden_layer_sizes": [(10, 10, 10), (30, 20, 10), (50, 100, 50), (100)],
        "alpha": [0.0001, 0.001, 0.01],
    }

    clf = MLPClassifier(random_state=1)

elif MODEL == "lgbm":

    param_grid = {
        "num_leaves": [2, 8, 31, 128],
        "learning_rate": [0.01, 0.1, 0.3],
        "max_depth": [-1, 2, 8],
        "colsample_bytree": [0.5, 1.0],
    }
    clf = LGBMClassifier(objective="multiclass", random_state=5)
    train_data = train_data.values

else:
    raise ("Invalid classifier")

logging.info(f"Tunning {MODEL}...")
clf.fit(train_data, train_label["anom"])  # previously we were giving x_initial, y_initial
pred = clf.predict(test_data)
initial_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print(
    f"Inıtial Macro-Avg  F-1 for {MODEL} on Test data: ",
    initial_report_dict["macro avg"]["f1-score"],
)
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring="f1_macro")
CV_clf.fit(train_data, train_label["anom"])
logging.info(CV_clf.best_params_)
pred = CV_clf.predict(test_data)
final_report_dict = classification_report(test_label["anom"], pred, output_dict=True)
print(
    f"Tuned Macro-Avg  F-1 for {MODEL} on Test data: ",
    final_report_dict["macro avg"]["f1-score"],
)
CV_clf.best_params_["initial_f1_score"] = initial_report_dict["macro avg"]["f1-score"]
CV_clf.best_params_["tuned_f1_score"] = final_report_dict["macro avg"]["f1-score"]

jsonpath = conf["results_dir"] / f"{MODEL}_Best_Params.json"
jsonpath.write_text(json.dumps(CV_clf.best_params_))

2022-05-06 06:05:57,427 INFO    Tunning lgbm...


Inıtial Macro-Avg  F-1 for lgbm on Test data:  0.9806170007445253


2022-05-06 08:31:54,666 INFO    {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 8, 'num_leaves': 128}


Tuned Macro-Avg  F-1 for lgbm on Test data:  0.9798569473448232
CPU times: user 2h 22min 55s, sys: 3min 2s, total: 2h 25min 58s
Wall time: 2h 25min 58s


160