# 실험 준비

In [17]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import pickle
import winsound

In [18]:
from src.models.BaseAutoEncoder import BaseSeq2Seq
from src.dataload.tabular import tabularDataset
from src.utils import inference, ensemble_inference
from src.simulation_trainer import BaseTrainer, NewTrainer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import random

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def _normalize_tabular(df: pd.DataFrame, label_name: str = "label"):
    scaler = StandardScaler()
    X = scaler.fit_transform(df.drop(label_name, axis=1))
    y = df[label_name]
    return X, y

def simul_split_train_valid_test(df: pd.DataFrame, train_ratio: float = 0.7):
    X, y = _normalize_tabular(df)
    tmp = pd.DataFrame(X)
    tmp["label"] = y
    normal = tmp.loc[tmp["label"] == 0, :].reset_index(drop=True)
    abnormal = tmp.loc[tmp["label"] == 1, :].reset_index(drop=True)
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        normal.drop("label", axis=1),
        normal["label"],
        train_size=train_ratio,
        random_state=42,
        shuffle=False,
    )
    
    X_val, X_test, y_val, y_test = train_test_split(
        X_val_test, y_val_test, train_size=0.5, random_state=42, shuffle=False
    )
    X_test["label"] = y_test
    X_test = pd.concat([X_test, abnormal]).reset_index(drop=True)
    return (
        X_train.values,
        X_val.values,
        X_test.drop("label", axis=1).values,
        y_train.values,
        y_val.values,
        X_test["label"].values,
    )

In [20]:
from argparse import Namespace

config = {
    'trainer_name' : 'BaseTrainer',
    'project' : 'my_paper',
    'train_ratio': .7,
    'batch_size': 512,
    'n_epochs': 500,
    'early_stop_round': 1000,
    # 'hidden_size': [2, 4], # -> 더 큰 데이터에서 4를 진행해야 할 듯 (나중에 benchmark?)
    'hidden_size': [2],
    'window_size': 60,
    'data': 'tabular',
    # 'sampling_term': [1, 5],
    'sampling_term': [1, 5],
    # 'initial_epochs': [10],
    'initial_epochs': [5, 20],
    'sampling_ratio': [0.01, 0.1]
}

# gpu
gpu_id = 0 if torch.cuda.is_available() else -1

if gpu_id == 0:
    config['device'] = 'cuda:0'
else:
    config['device'] = 'cpu'

config = Namespace(**config)

print(config)

Namespace(batch_size=512, data='tabular', device='cuda:0', early_stop_round=1000, hidden_size=[2], initial_epochs=[5, 20], n_epochs=500, project='my_paper', sampling_ratio=[0.01, 0.1], sampling_term=[1, 5], train_ratio=0.7, trainer_name='BaseTrainer', window_size=60)


In [21]:
# torch.backends.cudnn.deterministic = True
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
# torch.cuda.manual_seed_all(0)

In [22]:
import os

PATH = './tabular_data'
file_list = os.listdir(PATH)
file_list_csv = [file for file in file_list if file.endswith('.csv')]
file_list_csv

['abalone9-18.csv',
 'shuttle-c0-vs-c4.csv',
 'vowel0.csv',
 'wine.csv',
 'yeast-1-2-8-9_vs_7.csv',
 'yeast4.csv',
 'yeast5.csv',
 'yeast6.csv']

In [23]:
data_list = file_list[1:]

n_time = 1
save_idx = '_' + str(0)
# 모델 성능 결과 저장 경로
PATH = './0410_run_result_tabular/'
ENSEMBLE_PATH = './0410_ensemble_sim_1/'

for d in data_list:
    data = pd.read_csv('./tabular_data/' + d)
    config.data_name = d.split('.')[0]
    #     # 500epoch이고 train에 noise가 있는 경우에만 error, top, down 확인
    # if (config.n_epochs == 500) and ('Train' in config.data_name):
    #     is_debug = True
    # else:
    #     is_debug = False
    is_debug = False
        
    for i in range(n_time):

        print(f'<< {i+1}번재 시작 >>')
        (
            train_x,
            valid_x,
            test_x,
            train_y,
            valid_y,
            test_y,
        ) = simul_split_train_valid_test(data, config.train_ratio)
        
        # resize 'window_size' = 'col_len'
        config.window_size = train_x.shape[1]

        train_dataset = tabularDataset(train_x, train_y)
        valid_dataset = tabularDataset(valid_x, valid_y)
        test_dataset = tabularDataset(test_x, test_y)

        train_dataloader = DataLoader(
            train_dataset, shuffle=False, batch_size=config.batch_size
        )
        valid_dataloader = DataLoader(
            valid_dataset, shuffle=False, batch_size=config.batch_size
        )
        test_dataloader = DataLoader(
            test_dataset, shuffle=False, batch_size=config.batch_size
        )

        total_x = np.concatenate([train_x, valid_x, test_x])
        total_y = np.concatenate([train_y, valid_y, test_y])
        IR = round((len(total_y) - np.sum(total_y)) / np.sum(total_y), 4)
        
        # for inference
        total_dataset = tabularDataset(total_x, total_y)
        total_dataloader = DataLoader(
            total_dataset, shuffle=False, batch_size=config.batch_size
        )

        if is_debug is False:
            for hidden_size in config.hidden_size:
                print(f"-----BaseTrainer starts with hidden_size={hidden_size}-----")
                config.trainer_name = "BaseTrainer"

                model = BaseSeq2Seq(
                    input_size=config.window_size,
                    hidden_size=hidden_size,
                    output_size=config.window_size,
                    dropout_p=0.0,
                ).to(config.device)

                optimizer = optim.Adam(model.parameters())
                criterion = nn.MSELoss()

                # train
                trainer = BaseTrainer(model=model, optimizer=optimizer, crit=criterion)

                train_loss, val_loss, return_epoch, best_model = trainer.train(
                    train_loader=train_dataloader,
                    val_loader=valid_dataloader,
                    config=config,
                    use_wandb=False,
                )

                best_model.to("cpu")                        
                sampling_term = 0
                sampling_ratio = 0
                initial_epoch = 0

                df, tst_ano_score = ensemble_inference(
                    config,
                    total_dataloader,
                    best_model,
                    train_x,
                    valid_x,
                    total_x,
                    total_y,
                    return_epoch,
                    hidden_size,
                    train_loss,
                    val_loss,
                    IR,
                    sampling_term,
                    sampling_ratio,
                    initial_epoch,
                    PATH
                )
                
                df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                
                hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                with open(ENSEMBLE_PATH + 'base_' + config.data_name + hp + '.pickle', 'wb') as f:
                    pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)

                torch.cuda.empty_cache()
            
        for hidden_size in config.hidden_size:
            for sampling_ratio in config.sampling_ratio:
                for initial_epoch in config.initial_epochs: 
                    for sampling_term in config.sampling_term:
                        print(
                            f"-----NewTrainer starts-----"
                        )
                        config.trainer_name = "NewTrainer"

                        model = BaseSeq2Seq(
                            input_size=config.window_size,
                            hidden_size=hidden_size,
                            output_size=config.window_size,
                            dropout_p=0.0,
                        ).to(config.device)

                        optimizer = optim.Adam(model.parameters())
                        criterion = nn.MSELoss()

                        # train
                        trainer = NewTrainer(model=model, optimizer=optimizer, crit=criterion)
                        
                        train_loss, val_loss, return_epoch, best_model, errors, tops, downs = trainer.train(
                            train_x=train_x,
                            train_y=train_y,
                            train_loader=train_dataloader,
                            val_loader=valid_dataloader,
                            sampling_term=sampling_term,
                            initial_epoch=initial_epoch,
                            sampling_ratio=sampling_ratio,
                            config=config,
                            use_wandb=False,
                            is_debug=is_debug
                        )
                        if is_debug:
                            hp = '_h' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                            with open('./run_result_sim_error/newError_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(errors, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_top/newTop_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(tops, f, pickle.HIGHEST_PROTOCOL)
                            with open('./run_result_sim_down/newDown_'+ config.data_name + hp + save_idx + '.pickle', 'wb') as f:
                                pickle.dump(downs, f, pickle.HIGHEST_PROTOCOL)
                        
                        best_model.to("cpu")                                
                        
                        df, tst_ano_score = ensemble_inference(
                            config,
                            total_dataloader,
                            best_model,
                            train_x,
                            valid_x,
                            total_x,
                            total_y,
                            return_epoch,
                            hidden_size,
                            train_loss,
                            val_loss,
                            IR,
                            sampling_term,
                            sampling_ratio,
                            initial_epoch,
                            PATH
                        )
                        
                        df.to_csv(PATH + "result_" + config.data_name + ".csv", index=False)
                        
                        hp = '_hs' + str(hidden_size) + '_st' + str(sampling_term) + '_sr' + str(sampling_ratio) + '_ie' + str(initial_epoch)
                        with open(ENSEMBLE_PATH + 'new_' + config.data_name + hp + '.pickle', 'wb') as f:
                            pickle.dump(tst_ano_score, f, pickle.HIGHEST_PROTOCOL)
                        
                        torch.cuda.empty_cache()
                        
frequency = 800
duration = 2000
winsound.Beep(frequency, duration)

<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.029, valid_loss=0.027
New file generated!
-----NewTrainer starts-----
train_loss=0.018, valid_loss=0.009
-----NewTrainer starts-----
train_loss=0.166, valid_loss=0.005
-----NewTrainer starts-----
train_loss=0.052, valid_loss=0.014
-----NewTrainer starts-----
train_loss=0.008, valid_loss=0.007
-----NewTrainer starts-----
train_loss=0.031, valid_loss=0.019
-----NewTrainer starts-----
train_loss=0.010, valid_loss=0.012
-----NewTrainer starts-----
train_loss=0.057, valid_loss=0.025
-----NewTrainer starts-----
train_loss=0.071, valid_loss=0.087
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.521, valid_loss=0.636
New file generated!
-----NewTrainer starts-----
train_loss=0.500, valid_loss=0.531
-----NewTrainer starts-----
train_loss=0.442, valid_loss=0.566
-----NewTrainer starts-----
train_loss=0.471, valid_loss=0.500
-----NewTrainer starts-----
train_loss=0.534, valid_loss=0.591
-----NewTrainer starts-----
train_loss=0.416, valid_loss=0.495
-----NewTrainer starts-----
train_loss=0.483, valid_loss=0.602
-----NewTrainer starts-----
train_loss=0.413, valid_loss=0.515
-----NewTrainer starts-----
train_loss=0.495, valid_loss=0.668
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.448, valid_loss=0.495
New file generated!
-----NewTrainer starts-----
train_loss=0.480, valid_loss=0.507
-----NewTrainer starts-----
train_loss=0.468, valid_loss=0.504
-----NewTrainer starts-----
train_loss=0.444, valid_loss=0.479
-----NewTrainer starts-----
train_loss=0.448, valid_loss=0.504
-----NewTrainer starts-----
train_loss=0.434, valid_loss=0.530
-----NewTrainer starts-----
train_loss=0.411, valid_loss=0.480
-----NewTrainer starts-----
train_loss=0.421, valid_loss=0.489
-----NewTrainer starts-----
train_loss=0.406, valid_loss=0.512
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.323, valid_loss=0.715
New file generated!
-----NewTrainer starts-----
train_loss=0.327, valid_loss=0.534
-----NewTrainer starts-----
train_loss=0.396, valid_loss=0.617
-----NewTrainer starts-----
train_loss=0.274, valid_loss=0.462
-----NewTrainer starts-----
train_loss=0.386, valid_loss=0.699
-----NewTrainer starts-----
train_loss=0.297, valid_loss=0.588
-----NewTrainer starts-----
train_loss=0.423, valid_loss=0.744
-----NewTrainer starts-----
train_loss=0.346, valid_loss=0.698
-----NewTrainer starts-----
train_loss=0.350, valid_loss=0.569
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.647, valid_loss=0.509
New file generated!
-----NewTrainer starts-----
train_loss=0.427, valid_loss=0.425
-----NewTrainer starts-----
train_loss=0.503, valid_loss=0.482
-----NewTrainer starts-----
train_loss=0.495, valid_loss=0.438
-----NewTrainer starts-----
train_loss=0.493, valid_loss=0.506
-----NewTrainer starts-----
train_loss=0.500, valid_loss=0.572
-----NewTrainer starts-----
train_loss=0.474, valid_loss=0.479
-----NewTrainer starts-----
train_loss=0.363, valid_loss=0.408
-----NewTrainer starts-----
train_loss=0.369, valid_loss=0.425
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.568, valid_loss=0.539
New file generated!
-----NewTrainer starts-----
train_loss=0.630, valid_loss=0.662
-----NewTrainer starts-----
train_loss=0.333, valid_loss=0.449
-----NewTrainer starts-----
train_loss=0.504, valid_loss=0.506
-----NewTrainer starts-----
train_loss=0.469, valid_loss=0.512
-----NewTrainer starts-----
train_loss=0.495, valid_loss=0.627
-----NewTrainer starts-----
train_loss=0.313, valid_loss=0.467
-----NewTrainer starts-----
train_loss=0.325, valid_loss=0.493
-----NewTrainer starts-----
train_loss=0.275, valid_loss=0.407
<< 1번재 시작 >>
-----BaseTrainer starts with hidden_size=2-----


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = y_test


train_loss=0.374, valid_loss=0.463
New file generated!
-----NewTrainer starts-----
train_loss=0.572, valid_loss=0.689
-----NewTrainer starts-----
train_loss=0.409, valid_loss=0.512
-----NewTrainer starts-----
train_loss=0.369, valid_loss=0.478
-----NewTrainer starts-----
train_loss=0.391, valid_loss=0.385
-----NewTrainer starts-----
train_loss=0.373, valid_loss=0.518
-----NewTrainer starts-----
train_loss=0.375, valid_loss=0.561
-----NewTrainer starts-----
train_loss=0.399, valid_loss=0.564
-----NewTrainer starts-----
train_loss=0.294, valid_loss=0.408


In [24]:
os.listdir('./0410_run_result_tabular') 

['result_abalone9-18.csv',
 'result_shuttle-c0-vs-c4.csv',
 'result_vowel0.csv',
 'result_wine.csv',
 'result_yeast-1-2-8-9_vs_7.csv',
 'result_yeast4.csv',
 'result_yeast5.csv',
 'result_yeast6.csv']

In [34]:
cols = ['trainer_name', 'sampling_term','sampling_ratio','initial_epoch']
name = os.listdir('./0410_run_result_tabular')[7]
tmp = pd.read_csv('./0410_run_result_tabular/' + name)
tmp.groupby(cols)[['roc_auc','pr_auc']].agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,roc_auc,roc_auc,pr_auc,pr_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std
trainer_name,sampling_term,sampling_ratio,initial_epoch,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BaseTrainer,0,0.0,0,0.6241,,0.1941,
NewTrainer,1,0.01,5,0.7567,,0.2808,
NewTrainer,1,0.01,20,0.695,,0.2647,
NewTrainer,1,0.1,5,0.6569,,0.221,
NewTrainer,1,0.1,20,0.735,,0.2977,
NewTrainer,5,0.01,5,0.7465,,0.2826,
NewTrainer,5,0.01,20,0.6578,,0.2195,
NewTrainer,5,0.1,5,0.5623,,0.1452,
NewTrainer,5,0.1,20,0.6388,,0.2125,


In [26]:
# from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# anomaly_score = np.zeros((len(test_y), 8))

# result_df = pd.DataFrame(columns=['mean','median','max','min'], index=['roc_auc','pr_auc'])

# idx = 0
# for data_name in os.listdir(ENSEMBLE_PATH):
#     if config.data_name in data_name and 'base' in data_name:
#         with open(ENSEMBLE_PATH  + data_name, 'rb') as f:
#             tmp = pickle.load(f)
#             anomaly_score[:, idx] = tmp
#             idx += 1

# tst_ano_scr_med = np.median(anomaly_score, axis=1)
# tst_ano_scr_mean = np.mean(anomaly_score, axis=1)
# tst_ano_scr_max = np.max(anomaly_score, axis=1)
# tst_ano_scr_min = np.min(anomaly_score, axis=1)

# idx = 0
# for tst_ano_scr in [tst_ano_scr_mean, tst_ano_scr_med, tst_ano_scr_max, tst_ano_scr_min]:
#     roc_auc = roc_auc_score(test_y, tst_ano_scr)
#     _precision, _recall, _ = precision_recall_curve(test_y, tst_ano_scr)
#     pr_auc = auc(_recall, _precision)
#     result_df.iloc[0, idx] = roc_auc
#     result_df.iloc[1, idx] = pr_auc
#     idx += 1

# result_df

In [27]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

anomaly_score = np.zeros((len(test_y), 8))

result_df = pd.DataFrame(columns=['mean','median','max','min'], index=['roc_auc','pr_auc'])

idx = 0
for data_name in os.listdir(ENSEMBLE_PATH):
    if config.data_name in data_name and 'new' in data_name:
        with open(ENSEMBLE_PATH  + data_name, 'rb') as f:
            tmp = pickle.load(f)
            anomaly_score[:, idx] = tmp
            idx += 1

tst_ano_scr_med = np.median(anomaly_score, axis=1)
tst_ano_scr_mean = np.mean(anomaly_score, axis=1)
tst_ano_scr_max = np.max(anomaly_score, axis=1)
tst_ano_scr_min = np.min(anomaly_score, axis=1)

idx = 0
for tst_ano_scr in [tst_ano_scr_mean, tst_ano_scr_med, tst_ano_scr_max, tst_ano_scr_min]:
    roc_auc = roc_auc_score(test_y, tst_ano_scr)
    _precision, _recall, _ = precision_recall_curve(test_y, tst_ano_scr)
    pr_auc = auc(_recall, _precision)
    result_df.iloc[0, idx] = roc_auc
    result_df.iloc[1, idx] = pr_auc
    idx += 1

result_df

Unnamed: 0,mean,median,max,min
roc_auc,0.694233,0.671298,0.733159,0.704587
pr_auc,0.248507,0.233635,0.255832,0.289785
