# 실험 준비

In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import pickle
import winsound

In [2]:
from src.models.BaseAutoEncoder import BaseSeq2Seq
from src.dataload.tabular import tabularDataset
from src.utils import ensemble_inference, inference
from src.simulation_trainer import BaseTrainer, NewTrainer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import random

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def _normalize_tabular(df: pd.DataFrame, label_name: str = "label"):
    scaler = StandardScaler()
    X = scaler.fit_transform(df.drop(label_name, axis=1))
    y = df[label_name]
    return X, y

def simul_split_train_valid_test(df: pd.DataFrame, train_ratio: float = 0.7):
    X, y = _normalize_tabular(df)
    tmp = pd.DataFrame(X)
    tmp["label"] = y
    normal = tmp.loc[tmp["label"] == 0, :].reset_index(drop=True)
    abnormal = tmp.loc[tmp["label"] == 1, :].reset_index(drop=True)
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        normal.drop("label", axis=1),
        normal["label"],
        train_size=train_ratio,
        random_state=42,
        shuffle=False,
    )
    
    X_val, X_test, y_val, y_test = train_test_split(
        X_val_test, y_val_test, train_size=0.5, random_state=42, shuffle=False
    )
    X_test["label"] = y_test
    X_test = pd.concat([X_test, abnormal]).reset_index(drop=True)
    return (
        X_train.values,
        X_val.values,
        X_test.drop("label", axis=1).values,
        y_train.values,
        y_val.values,
        X_test["label"].values,
    )

In [4]:
from argparse import Namespace

config = {
    'trainer_name' : 'BaseTrainer',
    'project' : 'my_paper',
    'train_ratio': .7,
    'batch_size': 512,
    'n_epochs': 1000,
    'early_stop_round': 50,
    'hidden_size': [2],
    'window_size': 60,
    'data': 'tabular',
    # 'sampling_term': [1, 5],
    'sampling_term': [1, 5],
    # 'initial_epochs': [10],
    'initial_epochs': [5, 20],
    'sampling_ratio': [0.01, 0.1]
}

# gpu
gpu_id = 0 if torch.cuda.is_available() else -1

if gpu_id == 0:
    config['device'] = 'cuda:0'
else:
    config['device'] = 'cpu'

config = Namespace(**config)

print(config)

Namespace(batch_size=512, data='tabular', device='cuda:0', early_stop_round=50, hidden_size=[2], initial_epochs=[5, 20], n_epochs=1000, project='my_paper', sampling_ratio=[0.01, 0.1], sampling_term=[1, 5], train_ratio=0.7, trainer_name='BaseTrainer', window_size=60)


In [5]:
# torch.backends.cudnn.deterministic = True
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
# torch.cuda.manual_seed_all(0)

- 저장해야 할 것
  - epoch=500일 때만
    - top, down
    - error
  - 항상
    - 모델 성능 result

### 시간이 너무 오래걸려서
- big은 제외하고 일단 진행 -> 사이즈를 줄이던지 교수님들한테 피드백받고 추가하자
- small에서 hidden_size=4의 경우는 데이터가 부족한 것 같다. 일단은 **h=4는 제외**하고 진행
- **weight=0.5는 제외**하고 일단 진행
- 500 epoch 부터는 sampling_term=1만 진행
- simulation은 일단 500빼자 -> bench만 해보자
- *BUGFIX* : sampling_term은 다 다시해야함
### 일지 (1회당 30분 걸림, 일단 3회 진행)
- `smallNormal.csv`
    - epoch=1000 & early_stop=50: 3회
    - epoch=500: 3회
- `smallTestNoise_010_01.csv`
    - epoch=1000 & early_stop=50: 3회
    - epoch=500: 3회
- `smallTestNoise_010_09.csv`
    - epoch=1000 & early_stop=50: 3회
- `smallTestNoise_010_05.csv`
  - 제외
- `smallTrainNoise_001_01.csv`
  - epoch=1000 & early_stop=50: 3회
- `smallTrainNoise_001_05.csv`
  - 제외
- `smallTrainNoise_001_09.csv`
  - epoch=1000 & early_stop=50: 3회
  - epoch=500 & init=20 & samTerm=[1,5,10]-> 1회 top down 분석
- `smallTrainNoise_010_01.csv`
  - epoch=1000 & early_stop=50: 3회
  - epoch=500 & init=10 3회-> top down 분석
- `smallTrainNoise_010_05.csv`
  - 제외
- `smallTrainNoise_010_09.csv`
  - epoch=1000 & early_stop=50: 3회
  - epoch=500 & init=10 1회-> top down 분석

### Ensemble
- 위의 config -> 총 8개 모델 이용해서 Ensemble

```
'sampling_term': [1, 5],
'initial_epochs': [5, 20],
'sampling_ratio': [0.01, 0.1]
```

- 실험진행 데이터
  - `smallTrainNoise_010_09.csv`: 3 
  - `smallTrainNoise_010_01.csv`: 3
  - `smallTrainNoise_001_09.csv`: 3
  - `smallTrainNoise_001_01.csv`: 3
  - `smallTestNoise_010_09.csv`: 3
  - `smallTestNoise_010_01.csv`: 3
  - `smallNormal.csv`: 3

In [6]:
data_list = ['smallTrainNoise_010_09.csv']

n_time = 1
PATH = './run_result_sim_nodrop/'
# save_idx = '_2'

for d in data_list:
    data = pd.read_csv('./sim_data/' + d)
    config.data_name = d.split('.')[0]
        # 500epoch이고 train에 noise가 있는 경우에만 error, top, down 확인
    if (config.n_epochs == 500) and ('Train' in config.data_name):
        is_debug = True
    else:
        is_debug = False
        
    for i in range(n_time):

        print(f'<< {i+1}번재 시작 >>')
        (
            train_x,
            valid_x,
            test_x,
            train_y,
            valid_y,
            test_y,
        ) = simul_split_train_valid_test(data, config.train_ratio)
        
        # resize 'window_size' = 'col_len'
        config.window_size = train_x.shape[1]

        train_dataset = tabularDataset(train_x, train_y)
        valid_dataset = tabularDataset(valid_x, valid_y)
        test_dataset = tabularDataset(test_x, test_y)

        train_dataloader = DataLoader(
            train_dataset, shuffle=False, batch_size=config.batch_size
        )
        valid_dataloader = DataLoader(
            valid_dataset, shuffle=False, batch_size=config.batch_size
        )
        test_dataloader = DataLoader(
            test_dataset, shuffle=False, batch_size=config.batch_size
        )

        total_x = np.concatenate([train_x, valid_x, test_x])
        total_y = np.concatenate([train_y, valid_y, test_y])
        IR = round((len(total_y) - np.sum(total_y)) / np.sum(total_y), 4)
        
        # for inference
        total_dataset = tabularDataset(total_x, total_y)
        total_dataloader = DataLoader(
            total_dataset, shuffle=False, batch_size=config.batch_size
        )

        # if is_debug is False:
        #     for hidden_size in config.hidden_size:
        #         print(f"-----BaseTrainer starts with hidden_size={hidden_size}-----")
        #         config.trainer_name = "BaseTrainer"

        #         model = BaseSeq2Seq(
        #             input_size=config.window_size,
        #             hidden_size=hidden_size,
        #             output_size=config.window_size,
        #             dropout_p=0.2,
        #         ).to(config.device)

        #         optimizer = optim.Adam(model.parameters())
        #         criterion = nn.MSELoss()

        #         # train
        #         trainer = BaseTrainer(model=model, optimizer=optimizer, crit=criterion)

        #         train_loss, val_loss, return_epoch, best_model = trainer.train(
        #             train_loader=train_dataloader,
        #             val_loader=valid_dataloader,
        #             config=config,
        #             use_wandb=False,
        #         )

        #         best_model.to("cpu")                        
        #         sampling_term = 0
        #         sampling_ratio = 0
        #         initial_epoch = 0

        #         df, tst_ano_score = ensemble_inference(
        #             config,
        #             total_dataloader,
        #             best_model,
        #             train_x,
        #             valid_x,
        #             total_x,
        #             total_y,
        #             return_epoch,
        #             hidden_size,
        #             train_loss,
        #             val_loss,
        #             IR,
        #             sampling_term,
        #             sampling_ratio,
        #             initial_epoch,
        #             PATH
        #         )
        #         df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                
        #         #hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
        #         with open('./ensemble_sim_base_1/' + config.data_name + '.pickle', 'wb') as f:
        #             pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)
        #         torch.cuda.empty_cache()
            
        for hidden_size in config.hidden_size:
            for sampling_ratio in config.sampling_ratio:
                for initial_epoch in config.initial_epochs: 
                    for sampling_term in config.sampling_term:
                        print(
                            f"-----NewTrainer starts-----"
                        )
                        config.trainer_name = "NewTrainer"

                        model = BaseSeq2Seq(
                            input_size=config.window_size,
                            hidden_size=hidden_size,
                            output_size=config.window_size,
                            dropout_p=0.2,
                        ).to(config.device)

                        optimizer = optim.Adam(model.parameters())
                        criterion = nn.MSELoss()

                        # train
                        trainer = NewTrainer(model=model, optimizer=optimizer, crit=criterion)
                        
                        train_loss, val_loss, return_epoch, best_model, errors, tops, downs = trainer.train(
                            train_x=train_x,
                            train_y=train_y,
                            train_loader=train_dataloader,
                            val_loader=valid_dataloader,
                            sampling_term=sampling_term,
                            initial_epoch=initial_epoch,
                            sampling_ratio=sampling_ratio,
                            config=config,
                            use_wandb=False,
                            is_debug=is_debug
                        )
                        if is_debug:
                            hp = '_h' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                            with open('./run_result_sim_error/newError_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(errors, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_top/newTop_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(tops, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_down/newDown_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(downs, f, pickle.HIGHEST_PROTOCOL)
                        
                        best_model.to("cpu")                                

                        df, tst_ano_score = ensemble_inference(
                            config,
                            total_dataloader,
                            best_model,
                            train_x,
                            valid_x,
                            total_x,
                            total_y,
                            return_epoch,
                            hidden_size,
                            train_loss,
                            val_loss,
                            IR,
                            sampling_term,
                            sampling_ratio,
                            initial_epoch,
                            PATH
                        )
                        df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                        
                        hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                        with open('./ensemble_sim_nodrop_1/' + config.data_name + hp + '.pickle', 'wb') as f:
                            pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)
                        
                        torch.cuda.empty_cache()
                                                
frequency = 800
duration = 2000
winsound.Beep(frequency, duration)

<< 1번재 시작 >>
-----NewTrainer starts-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


Early Stopped! in Epoch 97:
train_loss=0.821, valid_loss=0.810
New file generated!
-----NewTrainer starts-----
Early Stopped! in Epoch 281:
train_loss=0.797, valid_loss=0.740
-----NewTrainer starts-----
Early Stopped! in Epoch 194:
train_loss=0.796, valid_loss=0.756
-----NewTrainer starts-----
Early Stopped! in Epoch 85:
train_loss=0.813, valid_loss=0.818
-----NewTrainer starts-----
Early Stopped! in Epoch 488:
train_loss=0.730, valid_loss=0.733
-----NewTrainer starts-----
Early Stopped! in Epoch 565:
train_loss=0.762, valid_loss=0.783
-----NewTrainer starts-----
Early Stopped! in Epoch 159:
train_loss=0.766, valid_loss=0.802
-----NewTrainer starts-----
Early Stopped! in Epoch 108:
train_loss=0.771, valid_loss=0.810


In [7]:
df = pd.read_csv('./run_result_sim/result_' + config.data_name + '.csv')
cols = ['trainer_name', 'sampling_term','sampling_ratio','initial_epoch']
df.groupby(cols)[['roc_auc','pr_auc']].agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,roc_auc,roc_auc,pr_auc,pr_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std
trainer_name,sampling_term,sampling_ratio,initial_epoch,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BaseTrainer,0,0.0,0,0.702217,0.012748,0.638217,0.016841
NewTrainer,1,0.01,5,0.699967,0.012785,0.634067,0.016909
NewTrainer,1,0.01,10,0.6977,0.018557,0.628667,0.026074
NewTrainer,1,0.01,20,0.705667,0.006673,0.643167,0.007978
NewTrainer,1,0.1,5,0.698014,0.008948,0.633457,0.012629
NewTrainer,1,0.1,10,0.699975,0.016628,0.634725,0.022941
NewTrainer,1,0.1,20,0.692275,0.011288,0.623475,0.01661
NewTrainer,5,0.01,5,0.696933,0.009504,0.633867,0.008292
NewTrainer,5,0.01,20,0.701067,0.013002,0.6361,0.020236
NewTrainer,5,0.1,5,0.685867,0.009851,0.615333,0.014739


In [8]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

anomaly_score = np.zeros((len(test_y), 8))

result_df = pd.DataFrame(columns=['mean','median','max','min'], index=['roc_auc','pr_auc'])

PATH = './ensemble_sim_base_1/'
idx = 0
for data_name in os.listdir(PATH):
    if config.data_name in data_name:
        with open(PATH + data_name, 'rb') as f:
            tmp = pickle.load(f)
            anomaly_score[:, idx] = tmp
            idx += 1

tst_ano_scr_med = np.median(anomaly_score, axis=1)
tst_ano_scr_mean = np.mean(anomaly_score, axis=1)
tst_ano_scr_max = np.max(anomaly_score, axis=1)
tst_ano_scr_min = np.min(anomaly_score, axis=1)

idx = 0
for tst_ano_scr in [tst_ano_scr_mean, tst_ano_scr_med, tst_ano_scr_max, tst_ano_scr_min]:
    roc_auc = roc_auc_score(test_y, tst_ano_scr)
    _precision, _recall, _ = precision_recall_curve(test_y, tst_ano_scr)
    pr_auc = auc(_recall, _precision)
    result_df.iloc[0, idx] = roc_auc
    result_df.iloc[1, idx] = pr_auc
    idx += 1

result_df
# if config.data == "tabular":
#     tst_y_true = total_y[-len(tst_ano_scr):]
# else:
    # tst_y_true = total_y[tst_start_idx : len(window_anomaly_score_result)] # 여기 time_series 계산




Unnamed: 0,mean,median,max,min
roc_auc,0.755279,0.74738,0.706091,0.798706
pr_auc,0.708022,0.696569,0.637348,0.755849
