# 실험 준비

In [85]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import pickle
import winsound

In [86]:
from src.models.BaseAutoEncoder import BaseSeq2Seq
from src.dataload.tabular import tabularDataset
from src.utils import inference, ensemble_inference
from src.simulation_trainer import BaseTrainer, NewTrainer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import random

In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def _normalize_tabular(df: pd.DataFrame, label_name: str = "label"):
    scaler = StandardScaler()
    X = scaler.fit_transform(df.drop(label_name, axis=1))
    y = df[label_name]
    return X, y

def simul_split_train_valid_test(df: pd.DataFrame, train_ratio: float = 0.7):
    X, y = _normalize_tabular(df)
    tmp = pd.DataFrame(X)
    tmp["label"] = y
    normal = tmp.loc[tmp["label"] == 0, :].reset_index(drop=True)
    abnormal = tmp.loc[tmp["label"] == 1, :].reset_index(drop=True)
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        normal.drop("label", axis=1),
        normal["label"],
        train_size=train_ratio,
        random_state=42,
        shuffle=False,
    )
    
    X_val, X_test, y_val, y_test = train_test_split(
        X_val_test, y_val_test, train_size=0.5, random_state=42, shuffle=False
    )
    X_test["label"] = y_test
    X_test = pd.concat([X_test, abnormal]).reset_index(drop=True)
    return (
        X_train.values,
        X_val.values,
        X_test.drop("label", axis=1).values,
        y_train.values,
        y_val.values,
        X_test["label"].values,
    )

In [88]:
from argparse import Namespace

config = {
    'trainer_name' : 'BaseTrainer',
    'project' : 'my_paper',
    'train_ratio': .7,
    'batch_size': 512,
    'n_epochs': 500,
    'early_stop_round': 1000,
    # 'hidden_size': [2, 4], # -> 더 큰 데이터에서 4를 진행해야 할 듯 (나중에 benchmark?)
    'hidden_size': [2],
    'window_size': 60,
    'data': 'tabular',
    # 'sampling_term': [1, 5],
    'sampling_term': [1, 5],
    # 'initial_epochs': [10],
    'initial_epochs': [5, 20],
    'sampling_ratio': [0.01, 0.1]
}

# gpu
gpu_id = 0 if torch.cuda.is_available() else -1

if gpu_id == 0:
    config['device'] = 'cuda:0'
else:
    config['device'] = 'cpu'

config = Namespace(**config)

print(config)

Namespace(batch_size=512, data='tabular', device='cuda:0', early_stop_round=1000, hidden_size=[2], initial_epochs=[5, 20], n_epochs=500, project='my_paper', sampling_ratio=[0.01, 0.1], sampling_term=[1, 5], train_ratio=0.7, trainer_name='BaseTrainer', window_size=60)


In [89]:
# torch.backends.cudnn.deterministic = True
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
# torch.cuda.manual_seed_all(0)

In [90]:
import os

PATH = './sim_data'
file_list = os.listdir(PATH)
file_list_csv = [file for file in file_list if file.endswith('.csv')]
file_list_csv[10:]

['smallNormal.csv',
 'smallTestNoise_010_01.csv',
 'smallTestNoise_010_05.csv',
 'smallTestNoise_010_09.csv',
 'smallTrainNoise_001_01.csv',
 'smallTrainNoise_001_05.csv',
 'smallTrainNoise_001_09.csv',
 'smallTrainNoise_010_01.csv',
 'smallTrainNoise_010_05.csv',
 'smallTrainNoise_010_09.csv']

- 진행한 데이터
  - smallNormal
  - smallTrainNoise_001_01
  - smallTrainNoise_001_09
  - smallTrainNoise_010_01
  - smallTrainNoise_010_09
  - smallTestNoise_010_01
  - smallTestNoise_010_09

In [91]:
data_list = ['smallTestNoise_010_09.csv']
#  'smallTrainNoise_010_01.csv', 'smallTrainNoise_010_09.csv' -> sampling_ratio 바꿔야 함
# 데이터 하나씩 하자 cache에 영향이 있을 듯

n_time = 1
save_idx = '_' + str(0)
# 모델 성능 결과 저장 경로
PATH = './0409_run_result_sim/'
ENSEMBLE_PATH = './0409_ensemble_sim_1/'

for d in data_list:
    data = pd.read_csv('./sim_data/' + d)
    config.data_name = d.split('.')[0]
    #     # 500epoch이고 train에 noise가 있는 경우에만 error, top, down 확인
    # if (config.n_epochs == 500) and ('Train' in config.data_name):
    #     is_debug = True
    # else:
    #     is_debug = False
    is_debug = False
        
    for i in range(n_time):

        print(f'<< {i+1}번재 시작 >>')
        (
            train_x,
            valid_x,
            test_x,
            train_y,
            valid_y,
            test_y,
        ) = simul_split_train_valid_test(data, config.train_ratio)
        
        # resize 'window_size' = 'col_len'
        config.window_size = train_x.shape[1]

        train_dataset = tabularDataset(train_x, train_y)
        valid_dataset = tabularDataset(valid_x, valid_y)
        test_dataset = tabularDataset(test_x, test_y)

        train_dataloader = DataLoader(
            train_dataset, shuffle=False, batch_size=config.batch_size
        )
        valid_dataloader = DataLoader(
            valid_dataset, shuffle=False, batch_size=config.batch_size
        )
        test_dataloader = DataLoader(
            test_dataset, shuffle=False, batch_size=config.batch_size
        )

        total_x = np.concatenate([train_x, valid_x, test_x])
        total_y = np.concatenate([train_y, valid_y, test_y])
        IR = round((len(total_y) - np.sum(total_y)) / np.sum(total_y), 4)
        
        # for inference
        total_dataset = tabularDataset(total_x, total_y)
        total_dataloader = DataLoader(
            total_dataset, shuffle=False, batch_size=config.batch_size
        )

        if is_debug is False:
            for hidden_size in config.hidden_size:
                print(f"-----BaseTrainer starts with hidden_size={hidden_size}-----")
                config.trainer_name = "BaseTrainer"

                model = BaseSeq2Seq(
                    input_size=config.window_size,
                    hidden_size=hidden_size,
                    output_size=config.window_size,
                    dropout_p=0.0,
                ).to(config.device)

                optimizer = optim.Adam(model.parameters())
                criterion = nn.MSELoss()

                # train
                trainer = BaseTrainer(model=model, optimizer=optimizer, crit=criterion)

                train_loss, val_loss, return_epoch, best_model = trainer.train(
                    train_loader=train_dataloader,
                    val_loader=valid_dataloader,
                    config=config,
                    use_wandb=False,
                )

                best_model.to("cpu")                        
                sampling_term = 0
                sampling_ratio = 0
                initial_epoch = 0

                df, tst_ano_score = ensemble_inference(
                    config,
                    total_dataloader,
                    best_model,
                    train_x,
                    valid_x,
                    total_x,
                    total_y,
                    return_epoch,
                    hidden_size,
                    train_loss,
                    val_loss,
                    IR,
                    sampling_term,
                    sampling_ratio,
                    initial_epoch,
                    PATH
                )
                
                df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                
                hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                with open(ENSEMBLE_PATH + 'base_' + config.data_name + hp + '.pickle', 'wb') as f:
                    pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)

                torch.cuda.empty_cache()
            
        for hidden_size in config.hidden_size:
            for sampling_ratio in config.sampling_ratio:
                for initial_epoch in config.initial_epochs: 
                    for sampling_term in config.sampling_term:
                        print(
                            f"-----NewTrainer starts-----"
                        )
                        config.trainer_name = "NewTrainer"

                        model = BaseSeq2Seq(
                            input_size=config.window_size,
                            hidden_size=hidden_size,
                            output_size=config.window_size,
                            dropout_p=0.0,
                        ).to(config.device)

                        optimizer = optim.Adam(model.parameters())
                        criterion = nn.MSELoss()

                        # train
                        trainer = NewTrainer(model=model, optimizer=optimizer, crit=criterion)
                        
                        train_loss, val_loss, return_epoch, best_model, errors, tops, downs = trainer.train(
                            train_x=train_x,
                            train_y=train_y,
                            train_loader=train_dataloader,
                            val_loader=valid_dataloader,
                            sampling_term=sampling_term,
                            initial_epoch=initial_epoch,
                            sampling_ratio=sampling_ratio,
                            config=config,
                            use_wandb=False,
                            is_debug=is_debug
                        )
                        if is_debug:
                            hp = '_h' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                            with open('./run_result_sim_error/newError_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(errors, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_top/newTop_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(tops, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_down/newDown_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(downs, f, pickle.HIGHEST_PROTOCOL)
                        
                        best_model.to("cpu")                                
                        
                        df, tst_ano_score = ensemble_inference(
                            config,
                            total_dataloader,
                            best_model,
                            train_x,
                            valid_x,
                            total_x,
                            total_y,
                            return_epoch,
                            hidden_size,
                            train_loss,
                            val_loss,
                            IR,
                            sampling_term,
                            sampling_ratio,
                            initial_epoch,
                            PATH
                        )
                        
                        df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                        
                        hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                        with open(ENSEMBLE_PATH + 'new_' + config.data_name + hp + '.pickle', 'wb') as f:
                            pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)
                        
                        torch.cuda.empty_cache()
                        
frequency = 800
duration = 2000
winsound.Beep(frequency, duration)

<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.561, valid_loss=0.560
New file generated!
-----NewTrainer starts-----
train_loss=0.560, valid_loss=0.569
-----NewTrainer starts-----
train_loss=0.560, valid_loss=0.568
-----NewTrainer starts-----
train_loss=0.557, valid_loss=0.566
-----NewTrainer starts-----
train_loss=0.572, valid_loss=0.581
-----NewTrainer starts-----
train_loss=0.512, valid_loss=0.564
-----NewTrainer starts-----
train_loss=0.548, valid_loss=0.602
-----NewTrainer starts-----
train_loss=0.518, valid_loss=0.565
-----NewTrainer starts-----
train_loss=0.517, valid_loss=0.567


In [92]:
os.listdir('./0409_run_result_sim') 

['result_smallNormal.csv',
 'result_smallTestNoise_010_01.csv',
 'result_smallTestNoise_010_09.csv',
 'result_smallTrainNoise_001_01.csv',
 'result_smallTrainNoise_001_09.csv',
 'result_smallTrainNoise_010_01.csv',
 'result_smallTrainNoise_010_09.csv']

In [93]:
cols = ['trainer_name', 'sampling_term','sampling_ratio','initial_epoch']
name = os.listdir('./0409_run_result_sim')[2]
tmp = pd.read_csv('./0409_run_result_sim/' + name)
tmp.groupby(cols)[['roc_auc','pr_auc']].agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,roc_auc,roc_auc,pr_auc,pr_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std
trainer_name,sampling_term,sampling_ratio,initial_epoch,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BaseTrainer,0,0.0,0,0.7892,,0.7601,
NewTrainer,1,0.01,5,0.8235,,0.8163,
NewTrainer,1,0.01,20,0.7865,,0.7616,
NewTrainer,1,0.1,5,0.7909,,0.7583,
NewTrainer,1,0.1,20,0.7735,,0.7424,
NewTrainer,5,0.01,5,0.7988,,0.7797,
NewTrainer,5,0.01,20,0.7897,,0.7652,
NewTrainer,5,0.1,5,0.7817,,0.752,
NewTrainer,5,0.1,20,0.7935,,0.7697,


In [94]:
# from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# anomaly_score = np.zeros((len(test_y), 8))

# result_df = pd.DataFrame(columns=['mean','median','max','min'], index=['roc_auc','pr_auc'])

# idx = 0
# for data_name in os.listdir(ENSEMBLE_PATH):
#     if config.data_name in data_name and 'base' in data_name:
#         with open(ENSEMBLE_PATH  + data_name, 'rb') as f:
#             tmp = pickle.load(f)
#             anomaly_score[:, idx] = tmp
#             idx += 1

# tst_ano_scr_med = np.median(anomaly_score, axis=1)
# tst_ano_scr_mean = np.mean(anomaly_score, axis=1)
# tst_ano_scr_max = np.max(anomaly_score, axis=1)
# tst_ano_scr_min = np.min(anomaly_score, axis=1)

# idx = 0
# for tst_ano_scr in [tst_ano_scr_mean, tst_ano_scr_med, tst_ano_scr_max, tst_ano_scr_min]:
#     roc_auc = roc_auc_score(test_y, tst_ano_scr)
#     _precision, _recall, _ = precision_recall_curve(test_y, tst_ano_scr)
#     pr_auc = auc(_recall, _precision)
#     result_df.iloc[0, idx] = roc_auc
#     result_df.iloc[1, idx] = pr_auc
#     idx += 1

# result_df

In [95]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

anomaly_score = np.zeros((len(test_y), 8))

result_df = pd.DataFrame(columns=['mean','median','max','min'], index=['roc_auc','pr_auc'])

idx = 0
for data_name in os.listdir(ENSEMBLE_PATH):
    if config.data_name in data_name and 'new' in data_name:
        with open(ENSEMBLE_PATH  + data_name, 'rb') as f:
            tmp = pickle.load(f)
            anomaly_score[:, idx] = tmp
            idx += 1

tst_ano_scr_med = np.median(anomaly_score, axis=1)
tst_ano_scr_mean = np.mean(anomaly_score, axis=1)
tst_ano_scr_max = np.max(anomaly_score, axis=1)
tst_ano_scr_min = np.min(anomaly_score, axis=1)

idx = 0
for tst_ano_scr in [tst_ano_scr_mean, tst_ano_scr_med, tst_ano_scr_max, tst_ano_scr_min]:
    roc_auc = roc_auc_score(test_y, tst_ano_scr)
    _precision, _recall, _ = precision_recall_curve(test_y, tst_ano_scr)
    pr_auc = auc(_recall, _precision)
    result_df.iloc[0, idx] = roc_auc
    result_df.iloc[1, idx] = pr_auc
    idx += 1

result_df

Unnamed: 0,mean,median,max,min
roc_auc,0.805187,0.792862,0.811166,0.807141
pr_auc,0.788619,0.770093,0.795773,0.791171
