In [1]:
# !pip install pytorch-tabnet

In [None]:
import os
from glob import glob
import math

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.metrics import Metric

from google.colab import drive
import torch.nn.functional as F
import torch.optim as optim
# from datasets import Dataset
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler # 표준화 패키지 라이브러리
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import pickle
from tqdm import tqdm
import random
from warnings import filterwarnings
import gc
from collections import Counter
import matplotlib.pyplot as plt

# Silence every Python warning for the rest of the session. This also hides
# pandas / sklearn warnings that may point at real problems.
filterwarnings('ignore')
# CPU device handle; it is not referenced by any other cell visible in this review.
device = torch.device('cpu')
# Last expression of the cell: shows the working directory that the relative
# data paths used below are resolved against.
os.getcwd()

# Data Load

In [None]:
# Locate the input pickles. Both lists are sorted by file name.
path = './data/*.pkl'
pca_file_lst = sorted(glob('../kyungho/3W-main/data/total_data_psudo_labeling*.pkl'))

# The first match of the glob is skipped, and slot 6 is pinned to one specific
# file regardless of what the glob returned for that position.
all_pickles = sorted(glob(path))
file_lst = all_pickles[1:]
file_lst[6] = './data/pre_processed_split_0tpt_data_6.pkl'

print(file_lst, pca_file_lst)

In [None]:
# Load each event pickle into a module-level frame named df_event_<i>.
# zip() stops at the shorter list, so the number of frames loaded is also
# bounded by len(pca_file_lst); pca_path itself is only printed here.
for i, (file_path, pca_path) in enumerate(zip(file_lst, pca_file_lst)):
    print(i, file_path, pca_path)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as f:
        event_df = pickle.load(f)
    # Replace whatever index was pickled with a fresh 0..n-1 index.
    event_df.reset_index(inplace = True, drop = True)
    globals()['df_event_{}'.format(i)] = event_df

## Outlier 제거
- 참고: https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles

In [None]:
# Add delta and remove outlier
def add_delta(data_frame):
    """Add a per-instance percentage-change column for every sensor column.

    For each of 'P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP' and 'T-JUS-CKP' a new
    column ``<column>_pctchg`` is written, holding ``pct_change()`` computed
    separately inside each ``id_label`` group. NaN results (for example the
    first row of every group) are replaced by 0; infinite results are left
    untouched.

    Args:
        data_frame: DataFrame containing 'id_label' and the five sensor
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the five ``*_pctchg`` columns added.
    """
    for column in ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP']:
        pct_change = data_frame.groupby(['id_label'])[column].pct_change()
        # Fill the NaNs on the Series before storing it. The previous chained
        # assignment (frame[col][mask] = 0) writes to a temporary object and is
        # a silent no-op when pandas copy-on-write is active.
        data_frame[column + '_pctchg'] = pct_change.fillna(0)
    return data_frame

def remove_outlier(df):
    """Keep the 'WELL' rows of ``df`` and drop IQR outliers on the sensor columns.

    Steps:
      1. Keep only rows whose ``id_label`` starts with 'WELL'.
      2. Add the ``*_pctchg`` columns through ``add_delta``.
      3. Drop every row in which at least one of the five sensor columns lies
         outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``, with the quartiles taken
         over the filtered frame. The ``*_pctchg`` columns take no part in
         this test.

    Args:
        df: DataFrame with 'id_label' and the five sensor columns.

    Returns:
        A new DataFrame containing the surviving rows.
    """
    sensor_cols = ['P-PDG', 'P-TPT', 'T-TPT','P-MON-CKP', 'T-JUS-CKP',]
    well_rows = add_delta(df[df['id_label'].str.startswith('WELL')])

    sensors = well_rows[sensor_cols]
    q1, q3 = sensors.quantile(0.25), sensors.quantile(0.75)
    iqr = q3 - q1

    too_low = sensors < (q1 - 1.5 * iqr)
    too_high = sensors > (q3 + 1.5 * iqr)
    return well_rows[~(too_low | too_high).any(axis=1)]

# Create df_event_<i>_out = remove_outlier(df_event_<i>) for the nine event
# frames, using the same globals()-based naming as the loading cell.
for event_idx in range(9):
    source_frame = globals()['df_event_{}'.format(event_idx)]
    globals()['df_event_{}_out'.format(event_idx)] = remove_outlier(source_frame)

# Train Test split

In [None]:
# Group every instance id found in the nine event frames by the prefix of its
# id_label (WELL, SIMULATED, DRAWN). Ids matching none of the prefixes are ignored.
instance_dict = {'WELL': [], 'SIMULATED': [], 'DRAWN': []}
for event_idx in range(9):
    event_frame = globals()['df_event_{}'.format(event_idx)]
    for instance in event_frame['id_label'].unique():
        for prefix, members in instance_dict.items():
            if instance.startswith(prefix):
                members.append(instance)
                break

In [None]:
# check the number of events in 'WELL' data
# The text after the last '_' of each WELL id is counted.
class_lst = [instance.split('_')[-1] for instance in instance_dict['WELL']]
Counter(class_lst)

In [None]:
# split train and test instances
def split_train_test_instance(instance_dict, dTypes = ['WELL', 'SIMULATED', 'DRAWN'],train_ratio = 0.6, test_ratio = 0.2):
    """Split instance ids into disjoint train / validation / test lists.

    The test list is sampled from the 'WELL' instances only. The remaining
    'WELL' instances plus the instances of every non-'WELL' type in ``dTypes``
    are shuffled and cut into train and validation.

    Previously, when ``dTypes`` contained 'WELL', all WELL instances were
    appended to the train/val pool a second time. That pool then held the
    sampled test instances and duplicate WELL ids. Each instance now lands in
    exactly one list.

    Args:
        instance_dict: mapping with at least a 'WELL' key and one key per entry
            of ``dTypes``; each value is a list of instance ids.
        dTypes: data types whose instances count towards the split sizes.
            The default list is never mutated.
        train_ratio: fraction of the counted instances used for training.
        test_ratio: fraction of the counted instances used for testing; the
            validation fraction is ``1 - train_ratio - test_ratio``.

    Returns:
        ``(train_lst, val_lst, test_lst)``.

    Raises:
        ValueError: if more test instances are requested than there are
            'WELL' instances.
    """
    well_lst = instance_dict['WELL']

    total_len = sum(len(instance_dict[dType]) for dType in dTypes)
    num_of_test = int(total_len*test_ratio)
    num_of_val = int(total_len*(1-train_ratio-test_ratio))
    num_of_train = total_len - num_of_test - num_of_val

    if num_of_test > len(well_lst):
        raise ValueError(
            'test_ratio asks for {} test instances but only {} WELL instances exist'.format(
                num_of_test, len(well_lst)))

    # extract test_lst first(Only WELL data)
    test_lst = random.sample(well_lst, num_of_test)
    held_out = set(test_lst)

    # then pool the remaining WELL ids with the non-WELL ids --> shuffle --> cut
    rest_well_lst = [i for i in well_lst if i not in held_out]
    other_lst = [i for dType in dTypes if dType != 'WELL' for i in instance_dict[dType]]
    total_lst = rest_well_lst + other_lst
    random.shuffle(total_lst)

    train_lst = total_lst[:num_of_train]
    val_lst = total_lst[num_of_train:]

    return train_lst, val_lst, test_lst

In [None]:
# Seed Python's RNG before splitting: split_train_test_instance relies on
# random.sample / random.shuffle, so without a seed every run of the notebook
# produces a different train/val/test assignment.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

train_ratio = 0.7
test_ratio = 0.15
val_ratio = 1 - train_ratio - test_ratio
dTypes = ['WELL', 'SIMULATED', 'DRAWN']
train_lst, val_lst, test_lst = split_train_test_instance(instance_dict, dTypes = dTypes, train_ratio = train_ratio, test_ratio = test_ratio)

# Sanity output: the three lists with their sizes, the total number of known
# instances, the number of distinct ids across the three lists, and the
# trailing '_<suffix>' values present in each split.
print(train_lst, len(train_lst), val_lst, len(val_lst), test_lst, len(test_lst), sep = '\n')
print('total number of lst:', len(instance_dict['WELL'])+len(instance_dict['DRAWN'])+len(instance_dict['SIMULATED']))
print('total number of lst after removing duplicated:', len(set(train_lst + val_lst + test_lst)))
print('train:', sorted(set([i.split('_')[-1] for i in train_lst])), 'val:', sorted(set([i.split('_')[-1] for i in val_lst])), 'test:', sorted(set([i.split('_')[-1] for i in test_lst])), sep = '\n')

In [None]:
# Reference: https://rfriend.tistory.com/590
def make_empty_df(pca_n_components):
    """Return an empty DataFrame carrying the windowed-feature column layout.

    Column order: for each of the five sensors the bare sensor name followed
    by ``_std``, ``_pctchg_mean`` and ``_pctchg_std``; then 'class'; then the
    first ``pca_n_components`` names out of 'pca_1' .. 'pca_5'.

    Args:
        pca_n_components: how many of the five PCA column names to append
            (used as a slice end).

    Returns:
        A DataFrame with those columns and no rows.
    """
    layout = []
    for sensor in ('P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP'):
        layout += [sensor, sensor + '_std', sensor + '_pctchg_mean', sensor + '_pctchg_std']
    layout.append('class')

    pca_names = ['pca_{}'.format(k) for k in range(1, 6)]
    layout += pca_names[:pca_n_components]
    return pd.DataFrame(columns=layout)


def apply_window(data_frame, window_size = 5000, overlap_ratio = 0.2, pca_n_components = 2):
    """Aggregate ``data_frame`` into one feature row per overlapping row window.

    Windows are cut by row position over the whole frame: window ``n`` covers
    rows ``int(window_size*n*(1-overlap_ratio))`` up to (excluding)
    ``int(window_size*(n+1) - window_size*overlap_ratio*n)``. Nothing restricts
    a window to a single ``id_label``. Windows that would run past the end of
    the frame are dropped.

    Each output row holds, per sensor, the window mean and std of the sensor
    column and of its ``<sensor>_pctchg`` column, the window 'class' label and
    ``pca_1 .. pca_<pca_n_components>`` (the mean of each PCA score).

    Label rule: 0 when every 'class' value in the window equals 0; otherwise
    the most frequent class value below 10 (which can itself be 0). Windows
    with no class value below 10 are skipped.

    NOTE(review): a fresh PCA is fitted on every window and PCA centres its
    input, so the mean of each score is expected to be ~0. Confirm that the
    pca_* features carry the information intended.

    Args:
        data_frame: frame with 'id_label', 'class', the five sensor columns and
            their ``*_pctchg`` columns. All columns other than 'id_label' and
            'class' are fed to PCA.
        window_size: number of rows per window.
        overlap_ratio: fraction of a window shared with the next one; must be
            below 1.
        pca_n_components: number of PCA components; 0 / None disables PCA.

    Returns:
        DataFrame with one row per kept window and a fresh 0..n-1 index.

    Raises:
        ValueError: if ``window_size*(1-overlap_ratio)`` is not positive.
    """
    sensor_cols = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP']
    n_pca = int(pca_n_components) if pca_n_components else 0
    pca_cols = ['pca_{}'.format(k + 1) for k in range(n_pca)]
    out_columns = list(make_empty_df(0).columns) + pca_cols

    stride = window_size*(1-overlap_ratio)
    if stride <= 0:
        raise ValueError('window_size*(1-overlap_ratio) must be positive, got {}'.format(stride))

    pca = PCA(n_components=n_pca) if n_pca else None
    n_rows = len(data_frame)

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating inside the loop is quadratic in the number of windows.
    records = []
    for n in tqdm(range(int(n_rows/stride))):
        start = int(window_size*n*(1-overlap_ratio))
        stop = int(window_size*(n+1) - window_size*overlap_ratio*n)
        if stop > n_rows:
            continue
        sliced_df = data_frame.iloc[start:stop]

        class_counts = sliced_df['class'].value_counts()
        if (sliced_df['class'] != 0).any():
            # default mode: most frequent class value below 10
            candidates = class_counts.index[class_counts.index < 10]
            if len(candidates) == 0:
                # No usable label in this window (previously an IndexError
                # swallowed by a bare except).
                continue
            class_label = int(candidates[0])
        else:
            class_label = 0

        record = {}
        for sensor in sensor_cols:
            record[sensor] = sliced_df[sensor].mean()
            record[sensor + '_std'] = sliced_df[sensor].std()
            record[sensor + '_pctchg_mean'] = sliced_df[sensor + '_pctchg'].mean()
            record[sensor + '_pctchg_std'] = sliced_df[sensor + '_pctchg'].std()
        record['class'] = class_label

        if pca is not None:
            try:
                pca_data = pca.fit_transform(sliced_df.drop(['id_label', 'class'], axis=1))
            except ValueError:
                # Best-effort skip of windows that PCA rejects (presumably
                # NaN / inf values in the window). Any other exception is a
                # real error and is allowed to surface.
                continue
            for k, pca_col in enumerate(pca_cols):
                record[pca_col] = pca_data[:, k].mean()

        records.append(record)

    return pd.DataFrame(records, columns=out_columns)

def build_dataset(data_frame, train_lst, val_lst, test_lst, window_size = 5000, overlap_ratio = 0.2, pca_n_components = 2):
  """Split ``data_frame`` by instance id and window each part.

  Rows are assigned to train / val / test by whether their ``id_label`` is in
  the corresponding list; each subset is then passed through ``apply_window``.

  Args:
    data_frame: frame accepted by ``apply_window`` (needs 'id_label').
    train_lst, val_lst, test_lst: instance ids of each split.
    window_size, overlap_ratio, pca_n_components: forwarded to
      ``apply_window``. ``pca_n_components`` used to be accepted here but
      silently dropped, so ``apply_window`` always ran with its own default.

  Returns:
    ``(train_df, val_df, test_df)`` of windowed features.
  """
  gc.collect()
  window_kwargs = dict(window_size = window_size, overlap_ratio = overlap_ratio, pca_n_components = pca_n_components)

  windowed = []
  for id_lst in (train_lst, val_lst, test_lst):
    subset = data_frame[data_frame['id_label'].isin(id_lst)]
    windowed.append(apply_window(subset, **window_kwargs))

  train_df, val_df, test_df = windowed
  return train_df, val_df, test_df

In [None]:
# applying window
window_size = 1200
window_size_short = window_size
window_size_long =  window_size
pca_n_components = 2
overlap_ratio = 0.95

# Events 2, 4, 6 and 8 are windowed with window_size_short, all other events
# with window_size_long (both currently equal window_size). The results are
# stored as df_event_<i>_window_train / _val / _test.
short_window_events = {2, 4, 6, 8}
for event_idx in range(9):
    event_window = window_size_short if event_idx in short_window_events else window_size_long
    event_splits = build_dataset(globals()['df_event_{}_out'.format(event_idx)], train_lst, val_lst, test_lst,
                                 window_size = event_window, overlap_ratio = overlap_ratio,
                                 pca_n_components = pca_n_components)
    for split_name, split_df in zip(('train', 'val', 'test'), event_splits):
        globals()['df_event_{}_window_{}'.format(event_idx, split_name)] = split_df
gc.collect()


In [None]:
# Merge the per-event windowed frames into one frame per split.
# For training, events 0, 3 and 4 are down-sampled to 25 % of their windows.
# The sampling is seeded so the training set is the same on every run;
# previously .sample() drew a different subset each time.
SAMPLE_SEED = 42
train_keep_fraction = {0: 0.25, 3: 0.25, 4: 0.25}

train_parts = []
for event_idx in range(9):
    part = globals()['df_event_{}_window_train'.format(event_idx)]
    keep_fraction = train_keep_fraction.get(event_idx)
    if keep_fraction is not None:
        part = part.sample(int(len(part)*keep_fraction), random_state = SAMPLE_SEED)
    train_parts.append(part)
merged_df_train = pd.concat(train_parts)

# Validation and test keep every window of every event.
merged_df_val = pd.concat([globals()['df_event_{}_window_val'.format(event_idx)] for event_idx in range(9)])

merged_df_test = pd.concat([globals()['df_event_{}_window_test'.format(event_idx)] for event_idx in range(9)])


In [None]:
# Class distribution of each split, printed in the order train, val, test.
for split_df in (merged_df_train, merged_df_val, merged_df_test):
    print(split_df['class'].value_counts())

In [None]:
# Exclude rows where P-PDG_pctchg_mean is -inf or inf: train (14 rows), val (10), test (13).
# The filter itself keeps the rows whose P-PDG_pctchg_std is not NaN; only the
# P-PDG column is checked.
merged_df_train = merged_df_train[merged_df_train['P-PDG_pctchg_std'].notna()]
merged_df_val = merged_df_val[merged_df_val['P-PDG_pctchg_std'].notna()]
merged_df_test = merged_df_test[merged_df_test['P-PDG_pctchg_std'].notna()]

In [None]:
# Model input columns: four aggregates per sensor followed by the two PCA columns.
features = [sensor + suffix
            for sensor in ('P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP')
            for suffix in ('', '_std', '_pctchg_mean', '_pctchg_std')] + ['pca_1', 'pca_2']

X_train_pre = merged_df_train.loc[:,features]
y_train = merged_df_train['class']
# Last expression: display the (not yet normalized) training features.
X_train_pre

In [None]:
# normalize
# pandas applies the arithmetic below column-wise (Series aligned on column names).
features = ['P-PDG', 'P-PDG_std','P-PDG_pctchg_mean', 'P-PDG_pctchg_std',
            'P-TPT', 'P-TPT_std', 'P-TPT_pctchg_mean', 'P-TPT_pctchg_std',
            'T-TPT', 'T-TPT_std', 'T-TPT_pctchg_mean', 'T-TPT_pctchg_std',
            'P-MON-CKP', 'P-MON-CKP_std', 'P-MON-CKP_pctchg_mean', 'P-MON-CKP_pctchg_std',
            'T-JUS-CKP', 'T-JUS-CKP_std', 'T-JUS-CKP_pctchg_mean', 'T-JUS-CKP_pctchg_std',
            'pca_1', 'pca_2']

# Explicit copies so the in-place standardisation below never writes into
# merged_df_train / merged_df_val / merged_df_test.
X_train_pre, y_train = merged_df_train.loc[:,features].copy(), merged_df_train['class']
X_val_pre, y_val = merged_df_val.loc[:,features].copy(), merged_df_val['class']
X_test_pre, y_test = merged_df_test.loc[:,features].copy(), merged_df_test['class']

# Columns that get standardised. The five *_pctchg_std columns are the only
# features left out; the <sensor>_std columns ARE standardised.
columns = ['P-PDG', 'P-PDG_pctchg_mean','P-TPT','P-TPT_pctchg_mean', 'T-TPT','T-TPT_pctchg_mean', 'P-MON-CKP','P-MON-CKP_pctchg_mean', 'T-JUS-CKP','T-JUS-CKP_pctchg_mean',
           'P-PDG_std', 'P-TPT_std', 'T-TPT_std', 'P-MON-CKP_std', 'T-JUS-CKP_std',
           'pca_1', 'pca_2']

# Mean / std are estimated on the TRAINING split only and reused for val and
# test. Standardising each split with its own statistics puts the three
# splits on different scales and uses evaluation data to shape the inputs.
train_mean = X_train_pre.loc[:,columns].mean()
# A constant training column has std 0; divide by 1 instead so it becomes 0
# rather than NaN / inf.
train_std = X_train_pre.loc[:,columns].std().replace(0, 1)

X_train_pre.loc[:,columns] = (X_train_pre.loc[:,columns] - train_mean)/train_std
X_val_pre.loc[:,columns] = (X_val_pre.loc[:,columns] - train_mean)/train_std
X_test_pre.loc[:,columns] = (X_test_pre.loc[:,columns] - train_mean)/train_std

# X_* are aliases of the (now standardised) X_*_pre frames.
X_train = X_train_pre
X_val = X_val_pre
X_test = X_test_pre

In [None]:
# training
# Turn the feature frames and label series into plain numpy arrays (labels
# cast to int) for the TabNet cells below.
# NOTE: this rebinds X_* / y_* from pandas objects to arrays, so the cell
# cannot be re-run on its own; re-run the normalization cell first.
X_train, X_val, X_test = [frame[features].values for frame in (X_train, X_val, X_test)]
y_train, y_val, y_test = [labels.values.astype(int) for labels in (y_train, y_val, y_test)]

In [None]:
# options
# Optimiser class and its keyword arguments, mini-batch size and
# early-stopping patience used by the training cell.
optimizer_fn = torch.optim.AdamW
optimizer_params = {'lr': 0.0001}
batch_size, patience = 64, 10

In [None]:
# One-line description of the current run, in the same format as the result
# lines kept as comments under the training cell.
# NOTE(review): the "no_weigth_loss", "no Norm" and "no Simul" tags are fixed
# text, not derived from the configuration; confirm they match the run.
run_summary = f'{optimizer_fn.__name__}, {train_ratio}, {val_ratio: .2f}, {test_ratio}, long = {window_size_long} short = {window_size_short}, {overlap_ratio}, lr = {optimizer_params}, batch_size = {batch_size}, no_weigth_loss, patience = {patience}, val_acc = ?, test_acc = ?, no Norm, no Simul'
print(run_summary)

In [None]:
max_epochs = 200

# weighted cross entropy
# One weight per class, ordered by ascending class label:
# weight = 1 - (share of that class in the training labels).
# NOTE(review): this assumes TabNet indexes classes in the same ascending
# label order; confirm against the pytorch-tabnet target mapping.
total_obs = len(y_train)
class_counts = sorted(Counter(y_train).items())
for key, value in class_counts:
    print(f'{key} is in event, {value}')

weight_for_class = torch.tensor([1 - (value/total_obs) for _, value in class_counts], dtype=torch.float32)
print(weight_for_class)

clf = TabNetClassifier(
    optimizer_fn=optimizer_fn,
    optimizer_params=optimizer_params,
    )

# The classifier picks its own device; the loss weights must live on the same
# device as the network outputs, otherwise the weighted loss raises a
# device-mismatch error on a CUDA machine.
result = clf.fit(
    X_train = X_train,
    y_train = y_train,
    eval_set=[(X_val, y_val)],
    eval_metric = ["accuracy", ],
    loss_fn = nn.CrossEntropyLoss(weight_for_class.to(clf.device)),
    batch_size = batch_size,
    max_epochs = max_epochs,
    drop_last = True,
    patience = patience
)
# AdamW, 0.7,  0.15, 0.15, long = 1200 short = 1200, 0.95, lr = {'lr': 0.001}, batch_size = 64, no_weigth_loss, patience = 10, val_acc = 0.96924, test_acc = 0.954, no Norm, no Simul
# AdamW, 0.7,  0.15, 0.15, long = 1200 short = 1200, 0.95, lr = {'lr': 0.001}, batch_size = 128, no_weigth_loss, patience = 15, val_acc = .883, test_acc = .957, no Norm, no Simul

In [None]:
# Predict the test windows and show the accuracy as the cell output.
# Arguments follow the sklearn (y_true, y_pred) order, matching the
# classification_report call further down. Accuracy is symmetric, so the
# value is unchanged.
preds = clf.predict(X_test)
accuracy_score(y_test, preds)

In [None]:
# saving_path_name = './tabnet_test_acc_9904'
# clf.save_model(saving_path_name)

In [None]:
# Text report comparing the test labels with the predictions.
report_text = classification_report(y_test, preds)
print(report_text)