# 1. 필요 라이브러리 다운로드

In [None]:
!pip install pytorch-tabnet
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt
import random
import time
import copy
from glob import glob

import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0
Mounted at /content/drive


# 2. 데이터 처리
## 2.1 Raw Data -> 1차 데이터 정리
    * 'P-JUS-CKGL','T-JUS-CKGL', 'QGL'은 라벨 전반으로 없으니 제외
    * 인스턴스 내부에 결측된 요소가 10%가 넘는 칼럼이 존재한다면 제거
    * 단 클래스가 결측된 것은 예외 처리
    * 10% 미만의 결측은 linear 보간
    * 6번 라벨만 T-TPT가 비정상적으로 결측이 많아 이는 0으로 pseudo value를 부여한다.
    * 아래 데이터는 이미 만들어진 데이터이다.

### 2.1.1 데이터 만들기

In [None]:
def make_instance_name(x):
    """Extract the instance name from the CSV path of an observation.

    Args:
        x (str): File path of the observation. It must end in
            ``/<name>.csv``.

    Returns:
        str: The last path component without its ``.csv`` extension.

    Raises:
        ValueError: If ``x`` does not end in ``/<name>.csv``.
    """
    # Imported locally: the notebook's import cell does not import ``re``.
    import re

    pattern = r"/([^/]+)\.csv$"
    match1 = re.search(pattern, x)
    if match1 is None:
        raise ValueError(f"cannot extract an instance name from path: {x!r}")
    return match1.group(1)

def missing_data_proportion(df):
    """Compute, per instance, the percentage of missing values in each column.

    Args:
        df (pd.DataFrame): Observation-level frame of one label. It must
            contain an 'id_label' column identifying the instance of each row.

    Returns:
        pd.DataFrame: One row per unique 'id_label' value (in order of first
        appearance) and one column per column of ``df``. Each cell is the
        percentage of NaN values, rounded to 3 decimals.
    """
    instance_ids = []
    percentages = []

    for instance_id in df['id_label'].unique():
        rows = df.loc[df['id_label'] == instance_id]
        nan_counts = rows.isna().sum().values
        percentages.append(np.round(nan_counts / len(rows) * 100, decimals=3))
        instance_ids.append(instance_id)

    return pd.DataFrame(data=percentages, index=instance_ids, columns=df.columns)

def interplolate_without_class(df, method='linear'):
    """Interpolate missing feature values while leaving 'class' untouched.

    Only numeric columns are interpolated. Non-numeric columns (such as a
    string 'id_label') are copied through unchanged, because interpolating
    object-dtype columns is deprecated in recent pandas releases.

    Args:
        df (pd.DataFrame): Observation-level frame of one label. It must
            contain a 'class' column.
        method (str): Interpolation method passed to
            ``DataFrame.interpolate``. Defaults to 'linear'.

    Returns:
        pd.DataFrame: A new frame with the non-class columns in their original
        order followed by the original 'class' column. ``df`` is not modified.

    Raises:
        KeyError: If ``df`` has no 'class' column.
    """
    # NOTE(review): interpolation runs over the whole frame, so a gap at the
    # start of one instance is filled using the previous instance's values.
    # Confirm whether this should be done per 'id_label' instead.
    feature_columns = df.columns.drop('class')
    class_column = df['class']

    features = df[feature_columns].copy()
    numeric_columns = features.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        features[numeric_columns] = features[numeric_columns].interpolate(method=method)

    features['class'] = class_column
    return features

label_nums = 9
# Glob pattern matching the merged per-label CSV files (one file per label).
# TODO: set this to the location of your data before running the cell.
merged_data_path = ""
# The sorted matches are assigned to labels 0, 1, 2, ... in that order.
data_paths = sorted(glob(merged_data_path))
if len(data_paths) < label_nums:
    raise FileNotFoundError(
        f"merged_data_path={merged_data_path!r} matched {len(data_paths)} file(s); "
        f"at least {label_nums} are required (one per label)"
    )


df_dict = {label : pd.read_csv(path) for label, path in enumerate(tqdm(data_paths))}

# An instance is discarded when any of its columns has more than `threshold`
# percent missing values.
threshold = 10

before_interpolation_description = []
after_interpolation_description = []
observation_nums_lst = []
for label in tqdm(range(label_nums)):
    # 1. Add the 'id_label' column ("<file name>_<label>") and drop unused columns.
    df_dict[label]['id_label'] =  df_dict[label]['file_name'].apply(make_instance_name) + '_' + str(label)
    df_dict[label].drop(['Unnamed: 0', 'timestamp', 'event_type', 'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL', 'instance_type','file_name'], axis=1, inplace=True)

    # 2. Percentage of missing values per instance and column.
    missing_proportion_df = missing_data_proportion(df_dict[label])

    # 3. Keep only the instances in which no column exceeds the threshold.
    # NOTE(review): the 'class' column takes part in this check, although the
    # notes for this step say missing class values are exempt - confirm.
    index = missing_proportion_df[(missing_proportion_df > threshold).sum(axis=1) == 0].index
    df_dict[label] = df_dict[label][df_dict[label]['id_label'].isin(index)]
    print(f"{label} 라벨의 최종 관측 개수: {len(df_dict[label])}")

    # 4. Interpolate every column except 'class', recording NaN counts before and after.
    observation_nums_lst.append(len(df_dict[label]))
    before_interpolation_description.append(df_dict[label].isna().sum().values)
    df_dict[label] = interplolate_without_class(df_dict[label])
    after_interpolation_description.append(df_dict[label].isna().sum().values)

In [None]:
# Rebuild label 6 with the 'T-TPT' column dropped before the missing-value
# filter, so that 'T-TPT' cannot cause an instance to be discarded.
df_notpt = {6 : pd.read_csv(data_paths[6])}
threshold = 10
label = 6

before_interpolation_description = []
after_interpolation_description = []
observation_nums_lst = []

# 1. Add the 'id_label' column ("<file name>_<label>") and drop unused columns (including 'T-TPT').
df_notpt[label]['id_label'] =  df_notpt[label]['file_name'].apply(make_instance_name) + '_' + str(label)
df_notpt[label].drop(['Unnamed: 0', 'timestamp', 'event_type', 'T-TPT','P-JUS-CKGL', 'T-JUS-CKGL', 'QGL', 'instance_type','file_name'], axis=1, inplace=True)

# 2. Percentage of missing values per instance and column.
missing_proportion_df = missing_data_proportion(df_notpt[label])

# 3. Keep only the instances in which no column exceeds the threshold.
# The mask is built from df_notpt itself: a mask taken from df_dict[label]
# (already filtered in the previous cell) does not share df_notpt's index.
index = missing_proportion_df[(missing_proportion_df > threshold).sum(axis=1) == 0].index
df_notpt[label] = df_notpt[label][df_notpt[label]['id_label'].isin(index)]
print(f"{label} 라벨의 최종 관측 개수: {len(df_notpt[label])}")

# 4. Interpolate every column except 'class', recording NaN counts of the
# frame that is actually interpolated (df_notpt, not df_dict).
observation_nums_lst.append(len(df_notpt[label]))
before_interpolation_description.append(df_notpt[label].isna().sum().values)
df_notpt[label] = interplolate_without_class(df_notpt[label])
after_interpolation_description.append(df_notpt[label].isna().sum().values)

df_notpt = df_notpt[label]

df = df_dict[label]

# Re-attach a 'T-TPT' column: zeros for row labels 0..5804789, followed by the
# 'T-TPT' values of the filtered label-6 frame. The assignment aligns on the
# row index.
# NOTE(review): 5804790 is a data-specific constant; presumably it is the first
# row label that survives in df_dict[6] - confirm against the data set.
t_tpt=pd.Series([0.]*5804790)

t_tpt_2 = df['T-TPT']
result = pd.concat([t_tpt, t_tpt_2])
df_notpt['T-TPT'] = result

# Desired column order.
new_order = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'id_label', 'class']

# Reorder the columns.
df_notpt = df_notpt.reindex(columns=new_order)

In [None]:
# Collect the cleaned frame of every label. Label 6 uses the frame that was
# rebuilt without the 'T-TPT' filter (df_notpt); every other label uses its own
# frame from df_dict. (The branches were previously swapped, which assigned the
# label-6 frame to all other labels.)
labels = [0,1,2,3,4,5,6,7,8]
raw_data_dict = {}
for label in tqdm(labels):
    if label == 6:
        raw_data_dict[label] = df_notpt
    else:
        raw_data_dict[label] = df_dict[label]

100%|██████████| 9/9 [00:37<00:00,  4.12s/it]


## 2.2 1차 데이터 -> 2차 데이터

    * 5개 feature에 대해서 차분을 진행해 또 다른 5개 칼럼을 만든다.
    * 총 10개 칼럼에 대해 표준화를 진행한다.
    * 총 10개 feature 형성 완료

### 2.2.1 데이터 만들기

In [None]:
# For every instance: append the first difference of each feature, then
# standardize all columns within that instance.
labels = [0,1,2,3,4,5,6,7,8]
feature_columns = list(raw_data_dict[0].drop(columns=['id_label', 'class']).columns)
# Derived from the feature names so a diff column can never be paired with the
# wrong feature.
diff_columns = ['Diff-' + column for column in feature_columns]
# Interleave the columns: feature, its diff, next feature, its diff, ...
merged_columns = [val for pair in zip(feature_columns, diff_columns) for val in pair]

scaled_diff_dict = {}
for label in tqdm(labels):
    label_df = raw_data_dict[label]
    instance_lst = label_df['id_label'].unique()
    # Collect per-instance frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop copies all previous rows every iteration.
    scaled_frames = []
    # sort=False keeps the instances in order of first appearance.
    for i, (instance, instance_df) in enumerate(label_df.groupby('id_label', sort=False)):
        instance_df = instance_df.reset_index(drop=True)
        id_label = instance_df['id_label']
        class_ = instance_df['class']
        instance_df = instance_df.drop(columns=['id_label', 'class'])

        # The first row has no predecessor, so its difference is set to 0.
        diff_df = instance_df.diff().fillna(0).add_prefix('Diff-')
        result_df = pd.concat([instance_df, diff_df], axis=1)[merged_columns]

        scaler = StandardScaler()
        result_df = pd.DataFrame(scaler.fit_transform(result_df), columns=merged_columns)
        result_df['id_label'] = id_label
        result_df['class'] = class_
        scaled_frames.append(result_df)
        if (i+1) % 10 == 0:
            print(f"{label} : {i+1}/{len(instance_lst)} instance 차분 및 표준화 완료")

    result = pd.concat(scaled_frames).reset_index(drop=True) if scaled_frames else pd.DataFrame()
    scaled_diff_dict[label] = result

  0%|          | 0/9 [00:00<?, ?it/s]

0 : 10/511 instance 차분 및 표준화 완료
0 : 20/511 instance 차분 및 표준화 완료
0 : 30/511 instance 차분 및 표준화 완료
0 : 40/511 instance 차분 및 표준화 완료
0 : 50/511 instance 차분 및 표준화 완료
0 : 60/511 instance 차분 및 표준화 완료
0 : 70/511 instance 차분 및 표준화 완료
0 : 80/511 instance 차분 및 표준화 완료
0 : 90/511 instance 차분 및 표준화 완료
0 : 100/511 instance 차분 및 표준화 완료
0 : 110/511 instance 차분 및 표준화 완료
0 : 120/511 instance 차분 및 표준화 완료
0 : 130/511 instance 차분 및 표준화 완료
0 : 140/511 instance 차분 및 표준화 완료
0 : 150/511 instance 차분 및 표준화 완료
0 : 160/511 instance 차분 및 표준화 완료
0 : 170/511 instance 차분 및 표준화 완료
0 : 180/511 instance 차분 및 표준화 완료
0 : 190/511 instance 차분 및 표준화 완료
0 : 200/511 instance 차분 및 표준화 완료
0 : 210/511 instance 차분 및 표준화 완료
0 : 220/511 instance 차분 및 표준화 완료
0 : 230/511 instance 차분 및 표준화 완료
0 : 240/511 instance 차분 및 표준화 완료
0 : 250/511 instance 차분 및 표준화 완료
0 : 260/511 instance 차분 및 표준화 완료
0 : 270/511 instance 차분 및 표준화 완료
0 : 280/511 instance 차분 및 표준화 완료
0 : 290/511 instance 차분 및 표준화 완료
0 : 300/511 instance 차분 및 표준화 완료
0 : 310/511 instanc

 11%|█         | 1/9 [14:08<1:53:06, 848.27s/it]

1 : 10/128 instance 차분 및 표준화 완료
1 : 20/128 instance 차분 및 표준화 완료
1 : 30/128 instance 차분 및 표준화 완료
1 : 40/128 instance 차분 및 표준화 완료
1 : 50/128 instance 차분 및 표준화 완료
1 : 60/128 instance 차분 및 표준화 완료
1 : 70/128 instance 차분 및 표준화 완료
1 : 80/128 instance 차분 및 표준화 완료
1 : 90/128 instance 차분 및 표준화 완료
1 : 100/128 instance 차분 및 표준화 완료
1 : 110/128 instance 차분 및 표준화 완료
1 : 120/128 instance 차분 및 표준화 완료


 22%|██▏       | 2/9 [17:47<55:47, 478.20s/it]  

2 : 10/19 instance 차분 및 표준화 완료


 33%|███▎      | 3/9 [17:49<26:04, 260.80s/it]

3 : 10/106 instance 차분 및 표준화 완료
3 : 20/106 instance 차분 및 표준화 완료
3 : 30/106 instance 차분 및 표준화 완료
3 : 40/106 instance 차분 및 표준화 완료
3 : 50/106 instance 차분 및 표준화 완료
3 : 60/106 instance 차분 및 표준화 완료
3 : 70/106 instance 차분 및 표준화 완료
3 : 80/106 instance 차분 및 표준화 완료
3 : 90/106 instance 차분 및 표준화 완료
3 : 100/106 instance 차분 및 표준화 완료


 44%|████▍     | 4/9 [19:04<15:36, 187.33s/it]

4 : 10/344 instance 차분 및 표준화 완료
4 : 20/344 instance 차분 및 표준화 완료
4 : 30/344 instance 차분 및 표준화 완료
4 : 40/344 instance 차분 및 표준화 완료
4 : 50/344 instance 차분 및 표준화 완료
4 : 60/344 instance 차분 및 표준화 완료
4 : 70/344 instance 차분 및 표준화 완료
4 : 80/344 instance 차분 및 표준화 완료
4 : 90/344 instance 차분 및 표준화 완료
4 : 100/344 instance 차분 및 표준화 완료
4 : 110/344 instance 차분 및 표준화 완료
4 : 120/344 instance 차분 및 표준화 완료
4 : 130/344 instance 차분 및 표준화 완료
4 : 140/344 instance 차분 및 표준화 완료
4 : 150/344 instance 차분 및 표준화 완료
4 : 160/344 instance 차분 및 표준화 완료
4 : 170/344 instance 차분 및 표준화 완료
4 : 180/344 instance 차분 및 표준화 완료
4 : 190/344 instance 차분 및 표준화 완료
4 : 200/344 instance 차분 및 표준화 완료
4 : 210/344 instance 차분 및 표준화 완료
4 : 220/344 instance 차분 및 표준화 완료
4 : 230/344 instance 차분 및 표준화 완료
4 : 240/344 instance 차분 및 표준화 완료
4 : 250/344 instance 차분 및 표준화 완료
4 : 260/344 instance 차분 및 표준화 완료
4 : 270/344 instance 차분 및 표준화 완료
4 : 280/344 instance 차분 및 표준화 완료
4 : 290/344 instance 차분 및 표준화 완료
4 : 300/344 instance 차분 및 표준화 완료
4 : 310/344 instanc

 56%|█████▌    | 5/9 [20:55<10:39, 159.84s/it]

5 : 10/442 instance 차분 및 표준화 완료
5 : 20/442 instance 차분 및 표준화 완료
5 : 30/442 instance 차분 및 표준화 완료
5 : 40/442 instance 차분 및 표준화 완료
5 : 50/442 instance 차분 및 표준화 완료
5 : 60/442 instance 차분 및 표준화 완료
5 : 70/442 instance 차분 및 표준화 완료
5 : 80/442 instance 차분 및 표준화 완료
5 : 90/442 instance 차분 및 표준화 완료
5 : 100/442 instance 차분 및 표준화 완료
5 : 110/442 instance 차분 및 표준화 완료
5 : 120/442 instance 차분 및 표준화 완료
5 : 130/442 instance 차분 및 표준화 완료
5 : 140/442 instance 차분 및 표준화 완료
5 : 150/442 instance 차분 및 표준화 완료
5 : 160/442 instance 차분 및 표준화 완료
5 : 170/442 instance 차분 및 표준화 완료
5 : 180/442 instance 차분 및 표준화 완료
5 : 190/442 instance 차분 및 표준화 완료
5 : 200/442 instance 차분 및 표준화 완료
5 : 210/442 instance 차분 및 표준화 완료
5 : 220/442 instance 차분 및 표준화 완료
5 : 230/442 instance 차분 및 표준화 완료
5 : 240/442 instance 차분 및 표준화 완료
5 : 250/442 instance 차분 및 표준화 완료
5 : 260/442 instance 차분 및 표준화 완료
5 : 270/442 instance 차분 및 표준화 완료
5 : 280/442 instance 차분 및 표준화 완료
5 : 290/442 instance 차분 및 표준화 완료
5 : 300/442 instance 차분 및 표준화 완료
5 : 310/442 instanc

 67%|██████▋   | 6/9 [39:34<24:17, 485.88s/it]

6 : 10/221 instance 차분 및 표준화 완료
6 : 20/221 instance 차분 및 표준화 완료
6 : 30/221 instance 차분 및 표준화 완료
6 : 40/221 instance 차분 및 표준화 완료
6 : 50/221 instance 차분 및 표준화 완료
6 : 60/221 instance 차분 및 표준화 완료
6 : 70/221 instance 차분 및 표준화 완료
6 : 80/221 instance 차분 및 표준화 완료
6 : 90/221 instance 차분 및 표준화 완료
6 : 100/221 instance 차분 및 표준화 완료
6 : 110/221 instance 차분 및 표준화 완료
6 : 120/221 instance 차분 및 표준화 완료
6 : 130/221 instance 차분 및 표준화 완료
6 : 140/221 instance 차분 및 표준화 완료
6 : 150/221 instance 차분 및 표준화 완료
6 : 160/221 instance 차분 및 표준화 완료
6 : 170/221 instance 차분 및 표준화 완료
6 : 180/221 instance 차분 및 표준화 완료
6 : 190/221 instance 차분 및 표준화 완료
6 : 200/221 instance 차분 및 표준화 완료
6 : 210/221 instance 차분 및 표준화 완료
6 : 220/221 instance 차분 및 표준화 완료


 78%|███████▊  | 7/9 [43:42<13:36, 408.34s/it]

7 : 10/15 instance 차분 및 표준화 완료


 89%|████████▉ | 8/9 [43:53<04:41, 281.82s/it]

8 : 10/81 instance 차분 및 표준화 완료
8 : 20/81 instance 차분 및 표준화 완료
8 : 30/81 instance 차분 및 표준화 완료
8 : 40/81 instance 차분 및 표준화 완료
8 : 50/81 instance 차분 및 표준화 완료
8 : 60/81 instance 차분 및 표준화 완료
8 : 70/81 instance 차분 및 표준화 완료
8 : 80/81 instance 차분 및 표준화 완료


100%|██████████| 9/9 [44:17<00:00, 295.31s/it]


## 2.3 2차 데이터 -> 3차 데이터

    * PCA 데이터를 추가할 것이다.
    * 이 때 원본 표준화 데이터에 대한 pca, 차분 표준화 데이터에 대한 pca, 전체 표준화 데이터에 대한 pca, 이렇게 3번에 걸쳐 pca를 진행한다.
    * 적정 차원 축소 수는 elbow rule을 통해 결정한다.
    * 3,4,5로 결정

### 2.3.1 적정 차원수 결정

In [None]:
def determine_pca_dimension(X, max_components, threshold):
    """Return the number of PCA components needed to reach a variance threshold.

    Args:
        X: Data passed to ``PCA.fit``.
        max_components: Value passed to ``PCA(n_components=...)``.
        threshold (float): Required cumulative explained-variance ratio.

    Returns:
        int: The smallest number of leading components whose cumulative
        explained-variance ratio is >= ``threshold``. If the threshold is never
        reached, the number of fitted components is returned.
    """
    pca = PCA(n_components=max_components)
    pca.fit(X)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    # The cumulative sum is non-decreasing, so searchsorted finds the first
    # position with a value >= threshold. np.argmax on the boolean mask would
    # return 0 (i.e. "1 component") when no position qualifies; here that case
    # is capped at the number of fitted components instead.
    first_reaching = int(np.searchsorted(cumulative_variance_ratio, threshold, side='left'))
    optimal_dimension = min(first_reaching + 1, len(cumulative_variance_ratio))
    return optimal_dimension

def determine_svd_dimension(X, max_components, threshold):
    """Return the number of truncated-SVD components needed to reach a variance threshold.

    Args:
        X: Data passed to ``TruncatedSVD.fit``.
        max_components: Value passed to ``TruncatedSVD(n_components=...)``.
        threshold (float): Required cumulative explained-variance ratio.

    Returns:
        int: The smallest number of leading components whose cumulative
        explained-variance ratio is >= ``threshold``. If the threshold is never
        reached, the number of fitted components is returned.
    """
    svd = TruncatedSVD(n_components=max_components)
    svd.fit(X)
    cumulative_variance_ratio = np.cumsum(svd.explained_variance_ratio_)

    # The cumulative sum is non-decreasing, so searchsorted finds the first
    # position with a value >= threshold. np.argmax on the boolean mask would
    # return 0 (i.e. "1 component") when no position qualifies; here that case
    # is capped at the number of fitted components instead.
    first_reaching = int(np.searchsorted(cumulative_variance_ratio, threshold, side='left'))
    n_components = min(first_reaching + 1, len(cumulative_variance_ratio))

    return n_components

# For every instance, find how many PCA / SVD components reach `threshold`
# cumulative explained variance on (a) the original features, (b) the diff
# features and (c) all features. The most frequent count per group is then used
# as the number of components for that group.
labels = [0,1,2,3,4,5,6,7,8]
normal_columns = [column for column in scaled_diff_dict[0].columns if 'Diff' not in column and column != 'id_label' and column != 'class']
diff_columns = [column for column in scaled_diff_dict[0].columns if 'Diff' in column and column != 'id_label' and column != 'class']

normal_pca_lst = []
diff_pca_lst = []
total_pca_lst = []

# The NMF lists are never filled in this cell.
normal_nmf_lst = []
diff_nmf_lst = []
total_nmf_lst = []

normal_svd_lst = []
diff_svd_lst = []
total_svd_lst = []

threshold = 0.9

for label in tqdm(labels):
    df = scaled_diff_dict[label]
    instance_lst = df['id_label'].unique()
    for instance in tqdm(instance_lst):
        # Feature columns only; the loop variable `label` is no longer
        # overwritten with the instance's class column.
        instance_df = df[df['id_label'] == instance].drop(columns=['id_label', 'class'])
        normal_df = instance_df[normal_columns]
        diff_df = instance_df[diff_columns]
        total_df = instance_df

        normal_pca_lst.append(determine_pca_dimension(X = normal_df, max_components=normal_df.shape[1], threshold = threshold))
        diff_pca_lst.append(determine_pca_dimension(X = diff_df, max_components=diff_df.shape[1], threshold = threshold))
        total_pca_lst.append(determine_pca_dimension(X = total_df, max_components=total_df.shape[1], threshold = threshold))

        normal_svd_lst.append(determine_svd_dimension(X = normal_df, max_components=normal_df.shape[1], threshold = threshold))
        diff_svd_lst.append(determine_svd_dimension(X = diff_df, max_components=diff_df.shape[1], threshold = threshold))
        total_svd_lst.append(determine_svd_dimension(X = total_df, max_components=total_df.shape[1], threshold = threshold))

# (dimension, number of instances) pairs sorted by dimension.
normal_pca_counter = sorted(Counter(normal_pca_lst).items(), key = lambda x:x[0])
diff_pca_counter = sorted(Counter(diff_pca_lst).items(), key = lambda x:x[0])
total_pca_counter = sorted(Counter(total_pca_lst).items(), key = lambda x:x[0])

# Most frequent dimension; on a tie, max() keeps the smallest dimension because
# the pairs are sorted by dimension.
normal_pca_components = max(normal_pca_counter, key=lambda x:x[1])[0]
diff_pca_components = max(diff_pca_counter, key=lambda x:x[1])[0]
total_pca_components = max(total_pca_counter, key=lambda x:x[1])[0]

normal_svd_counter = sorted(Counter(normal_svd_lst).items(), key = lambda x:x[0])
diff_svd_counter = sorted(Counter(diff_svd_lst).items(), key = lambda x:x[0])
total_svd_counter = sorted(Counter(total_svd_lst).items(), key = lambda x:x[0])

normal_svd_components = max(normal_svd_counter, key=lambda x:x[1])[0]
diff_svd_components = max(diff_svd_counter, key=lambda x:x[1])[0]
total_svd_components = max(total_svd_counter, key=lambda x:x[1])[0]

print("normal_pca 적정 차원 축소 수", normal_pca_counter)
print("diff_pca 적정 차원 축소 수", diff_pca_counter)
print("total_pca 적정 차원 축소 수", total_pca_counter)

print("normal_svd 적정 차원 축소 수", normal_svd_counter)
print("diff_svd 적정 차원 축소 수", diff_svd_counter)
print("total_svd 적정 차원 축소 수", total_svd_counter)

# Only the PCA settings are kept. The SVD counts are printed for comparison,
# but an 'svd' entry was previously assigned here and then immediately
# overwritten by this PCA-only dictionary.
decomposition_components = {'pca' : {'normal': normal_pca_components,'diff':diff_pca_components, 'total':total_pca_components}}

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/511 [00:00<?, ?it/s][A
  0%|          | 1/511 [00:00<06:06,  1.39it/s][A
  0%|          | 2/511 [00:01<06:28,  1.31it/s][A
  1%|          | 3/511 [00:02<06:39,  1.27it/s][A
  1%|          | 4/511 [00:03<06:42,  1.26it/s][A
  1%|          | 5/511 [00:03<06:45,  1.25it/s][A
  1%|          | 6/511 [00:04<06:40,  1.26it/s][A
  1%|▏         | 7/511 [00:05<06:38,  1.26it/s][A
  2%|▏         | 8/511 [00:06<06:39,  1.26it/s][A
  2%|▏         | 9/511 [00:07<06:26,  1.30it/s][A
  2%|▏         | 10/511 [00:07<06:29,  1.29it/s][A
  2%|▏         | 11/511 [00:08<06:50,  1.22it/s][A
  2%|▏         | 12/511 [00:10<08:02,  1.03it/s][A
  3%|▎         | 13/511 [00:11<08:07,  1.02it/s][A
  3%|▎         | 14/511 [00:11<07:56,  1.04it/s][A
  3%|▎         | 15/511 [00:12<07:40,  1.08it/s][A
  3%|▎         | 16/511 [00:13<07:23,  1.12it/s][A
  3%|▎         | 17/511 [00:14<07:00,  1.18it/s][A
  4%|▎         | 18/511 [00:15<06:45,  1.21i

normal_pca 적정 차원 축소 수 [(1, 578), (2, 451), (3, 651), (4, 186), (5, 1)]
diff_pca 적정 차원 축소 수 [(2, 165), (3, 696), (4, 833), (5, 173)]
total_pca 적정 차원 축소 수 [(2, 28), (3, 308), (4, 345), (5, 463), (6, 439), (7, 268), (8, 14), (9, 2)]
normal_svd 적정 차원 축소 수 [(1, 577), (2, 450), (3, 653), (4, 186), (5, 1)]
diff_svd 적정 차원 축소 수 [(2, 165), (3, 696), (4, 833), (5, 173)]
total_svd 적정 차원 축소 수 [(2, 28), (3, 308), (4, 345), (5, 461), (6, 440), (7, 269), (8, 14), (9, 2)]





### 2.3.2 데이터 만들기

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def apply_decomposition(X, method, n_components):
    """Project ``X`` onto ``n_components`` components with the given method.

    Args:
        X: Data passed to the decomposition's ``fit_transform``.
        method (str): Name of the decomposition. Only 'pca' is implemented.
        n_components: Number of components passed to the decomposition.

    Returns:
        The transformed data returned by ``fit_transform``.

    Raises:
        ValueError: If ``method`` is not a supported decomposition name.
    """
    if method == 'pca':
        decomposition_method = PCA(n_components=n_components)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"unsupported decomposition method: {method!r} (only 'pca' is implemented)"
        )

    X_decomposition = decomposition_method.fit_transform(X)
    return X_decomposition

# Apply the selected decomposition to every instance, separately for the
# original features ('normal'), the diff features ('diff') and all features
# ('total'), and gather the per-instance results per label.
decomposed_data = {}
labels = [0,1,2,3,4,5,6,7,8]
for label in tqdm(labels):
    df = scaled_diff_dict[label]

    # Per-instance frames are collected and concatenated once per label;
    # growing a DataFrame with pd.concat inside the loop copies all previous
    # rows every iteration.
    pca_frames = {'normal': [], 'diff': [], 'total': []}

    # sort=False keeps the instances in order of first appearance.
    for instance, instance_df in tqdm(df.groupby('id_label', sort=False)):
        instance_df = instance_df.reset_index(drop=True)
        id_label = instance_df['id_label']
        class_ = instance_df['class']
        instance_df = instance_df.drop(columns=['id_label', 'class'])
        feature_views = {
            'normal': instance_df[normal_columns],
            'diff': instance_df[diff_columns],
            'total': instance_df,
        }

        for method, n_components_dict in decomposition_components.items():
            for kind, features in feature_views.items():
                decomposed_df = pd.DataFrame(
                    apply_decomposition(X = features, method = method, n_components = n_components_dict[kind])
                )
                decomposed_df['id_label'] = id_label
                decomposed_df['class'] = class_

                # Only PCA results are stored.
                if method == 'pca':
                    pca_frames[kind].append(decomposed_df)

    decomposed_data[label] = {
        'pca': {
            kind: (pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame([]))
            for kind, frames in pca_frames.items()
        }
    }

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/511 [00:00<?, ?it/s][A
  0%|          | 1/511 [00:00<04:56,  1.72it/s][A
  0%|          | 2/511 [00:01<05:42,  1.48it/s][A
  1%|          | 3/511 [00:02<05:42,  1.48it/s][A
  1%|          | 4/511 [00:02<06:19,  1.34it/s][A
  1%|          | 5/511 [00:03<06:44,  1.25it/s][A
  1%|          | 6/511 [00:04<06:24,  1.31it/s][A
  1%|▏         | 7/511 [00:05<06:09,  1.36it/s][A
  2%|▏         | 8/511 [00:05<05:59,  1.40it/s][A
  2%|▏         | 9/511 [00:06<05:58,  1.40it/s][A
  2%|▏         | 10/511 [00:07<05:49,  1.43it/s][A
  2%|▏         | 11/511 [00:07<05:51,  1.42it/s][A
  2%|▏         | 12/511 [00:08<06:04,  1.37it/s][A
  3%|▎         | 13/511 [00:09<06:08,  1.35it/s][A
  3%|▎         | 14/511 [00:10<06:15,  1.33it/s][A
  3%|▎         | 15/511 [00:11<06:34,  1.26it/s][A
  3%|▎         | 16/511 [00:11<06:12,  1.33it/s][A
  3%|▎         | 17/511 [00:12<06:11,  1.33it/s][A
  4%|▎         | 18/511 [00:13<06:15,  1.31i

## 2.4 3차 데이터 -> 4차 데이터

    * 2차 데이터와 3차 데이터에서 만들어진 각각의 데이터 프레임을 하나로 병합해 total_data를 만든다

### 2.4.1 데이터 만들기

In [None]:
# Merge, per label, the scaled features with the three PCA projections into a
# single frame; 'id_label' and 'class' are appended once at the end.
labels = [0,1,2,3,4,5,6,7,8]
total_df_dict = {}
meta_columns = ['id_label', 'class']
pca_column_names = {
    'normal': ['Normal-0','Normal-1','Normal-2'],
    'diff': ['Diff-0','Diff-1','Diff-2','Diff-3'],
    'total': ['Total-0','Total-1','Total-2','Total-3','Total-4'],
}
for label in labels:
    pca_frames = decomposed_data[label]['pca']
    # Give the numbered PCA output columns descriptive names.
    for kind, names in pca_column_names.items():
        pca_frames[kind].columns = names + meta_columns

    scaled_df = scaled_diff_dict[label]
    pieces = [scaled_df.drop(columns=meta_columns)]
    pieces.extend(pca_frames[kind].drop(columns=meta_columns) for kind in ('normal', 'diff', 'total'))
    pieces.extend([scaled_df['id_label'], scaled_df['class']])
    result = pd.concat(pieces, axis=1)

    # The source frames are deleted once they have been merged.
    del decomposed_data[label], scaled_diff_dict[label]
    total_df_dict[label] = result

## 2.5 4차 데이터 -> 5차 데이터

    * 전체를 몇 등분 할 것인지 결정
    * 실험을 위해 4~10등분 데이터를 모두 만듦

### 2.5.1 데이터 만들기

In [None]:
def summarize_timeseries(df_dict, fold, option_dict, labels=(0, 1, 2, 3, 4, 5, 6, 7, 8)):
    """Summarize every instance as per-segment means and standard deviations.

    Each instance is cut into ``fold`` consecutive segments of
    ``len(instance) // fold`` rows (trailing remainder rows are ignored). For
    every segment ``i`` and every enabled column group, the mean and/or std of
    each column is stored in a column named ``<column>-<mean|std>-<i>``.

    Args:
        df_dict (dict): Maps a label to a frame holding the feature columns
            plus 'id_label' and 'class'.
        fold (int): Number of segments per instance.
        option_dict (dict): Switches for the statistics to compute. Required
            keys are ``<group>_mean`` and ``<group>_std`` for the groups
            'normal_columns', 'diff_columns', 'normal_pca', 'diff_pca' and
            'total_pca'.
        labels: Labels (keys of ``df_dict``) to process.

    Returns:
        dict: Maps each label to a frame with one row per instance: the
        summary columns followed by 'id_label' and 'label'.

    Raises:
        ValueError: If an instance has fewer rows than ``fold``.
    """
    column_groups = {
        'normal_columns': ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP'],
        'diff_columns': ['Diff-P-PDG', 'Diff-P-TPT', 'Diff-T-TPT', 'Diff-P-MON-CKP', 'Diff-T-JUS-CKP'],
        'normal_pca': ['Normal-0', 'Normal-1', 'Normal-2'],
        'diff_pca': ['Diff-0', 'Diff-1', 'Diff-2', 'Diff-3'],
        'total_pca': ['Total-0', 'Total-1', 'Total-2', 'Total-3', 'Total-4'],
    }
    # Enabled (columns, statistic) pairs in output order: group by group, mean
    # before std within a group.
    enabled = [
        (columns, stat)
        for group, columns in column_groups.items()
        for stat in ('mean', 'std')
        if option_dict[f'{group}_{stat}']
    ]

    result_label_dict = {}
    for label in tqdm(labels):
        df = df_dict[label]
        # One dict per instance, turned into a frame once per label; repeated
        # pd.concat inside the loops copies all previous data every time.
        rows = []

        # sort=False keeps the instances in order of first appearance.
        for instance, instance_df in df.groupby('id_label', sort=False):
            features = instance_df.drop(columns=['id_label', 'class'])

            duration = len(features) // fold
            if duration == 0:
                raise ValueError(
                    f"instance {instance!r} has {len(features)} row(s), "
                    f"fewer than fold={fold}"
                )

            row = {}
            # Exactly `fold` segments. Deriving the start points from
            # range(0, length, duration) yields extra segments whenever the
            # remainder is at least as long as one segment.
            for i in range(fold):
                segment = features.iloc[i * duration:(i + 1) * duration]
                for columns, stat in enabled:
                    values = getattr(segment[columns], stat)()
                    for column in columns:
                        row[f'{column}-{stat}-{i}'] = values[column]

            row['id_label'] = instance
            row['label'] = label
            rows.append(row)

        result_label_dict[label] = pd.DataFrame(rows)

    return result_label_dict

# Build the summarized data set for every segment count from 4 to 10.
# Every statistic (mean and std) of every column group is enabled.
option_dict = {
    f'{group}_{stat}': True
    for group in ('normal_columns', 'diff_columns', 'normal_pca', 'diff_pca', 'total_pca')
    for stat in ('mean', 'std')
}
labels = [0,1,2,3,4,5,6,7,8]

folds_lst = list(range(4, 11))
summarized_data_dict = {}
for fold in tqdm(folds_lst):
    summarized_data = summarize_timeseries(df_dict=total_df_dict, fold=fold, option_dict=option_dict, labels=labels)
    # Keyed by the number of segments.
    summarized_data_dict[fold] = summarized_data

  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [06:50<54:42, 410.27s/it][A
 22%|██▏       | 2/9 [08:48<27:47, 238.24s/it][A
 33%|███▎      | 3/9 [08:50<13:04, 130.76s/it][A
 44%|████▍     | 4/9 [09:44<08:22, 100.43s/it][A
 56%|█████▌    | 5/9 [11:28<06:45, 101.40s/it][A
 67%|██████▋   | 6/9 [20:24<12:28, 249.47s/it][A
 78%|███████▊  | 7/9 [22:37<07:02, 211.10s/it][A
 89%|████████▉ | 8/9 [22:44<02:26, 146.21s/it][A
100%|██████████| 9/9 [23:09<00:00, 154.36s/it]
 14%|█▍        | 1/7 [23:09<2:18:55, 1389.24s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [07:04<56:35, 424.41s/it][A
 22%|██▏       | 2/9 [09:00<28:21, 243.06s/it][A
 33%|███▎      | 3/9 [09:03<13:20, 133.36s/it][A
 44%|████▍     | 4/9 [09:57<08:31, 102.26s/it][A
 56%|█████▌    | 5/9 [11:45<06:56, 104.22s/it][A
 67%|██████▋   | 6/9 [20:32<12:24, 248.05s/it][A
 78%|███████▊  | 7/9 [22:45<07:00, 210.38s/it][A
 89%|████████▉ | 8/9 [22:53<02:25

In [None]:
# Preview the summarized feature table built with fold=4 for label 0.
summarized_data_dict[4][0]

Unnamed: 0,P-PDG-mean-0,P-TPT-mean-0,T-TPT-mean-0,P-MON-CKP-mean-0,T-JUS-CKP-mean-0,P-PDG-std-0,P-TPT-std-0,T-TPT-std-0,P-MON-CKP-std-0,T-JUS-CKP-std-0,...,Total-2-mean-3,Total-3-mean-3,Total-4-mean-3,Total-0-std-3,Total-1-std-3,Total-2-std-3,Total-3-std-3,Total-4-std-3,id_label,label
0,0.000000,0.615676,0.852359,0.019990,0.356914,0.000000,4.828952e-01,0.715777,0.928559,0.847099,...,0.006595,0.055714,-0.204601,1.165634,1.151504,0.912399,1.072548,0.942410,WELL-00001_20170201020207_0,0
1,0.000000,1.010183,-0.443731,0.069040,-0.108259,0.000000,9.555700e-01,0.786857,0.941658,0.960409,...,-0.162147,-0.230606,0.190239,1.156304,1.052537,0.684557,1.141042,1.116439,WELL-00001_20170201070114_0,0
2,0.000000,-0.199333,0.899593,0.018440,-0.003933,0.000000,5.618810e-01,0.579134,0.972609,0.967689,...,-0.199396,0.071058,0.072987,0.937882,0.867845,1.040346,1.018352,0.541980,WELL-00001_20170201120124_0,0
3,0.000000,0.945954,0.975394,-0.010227,0.158668,0.000000,7.623852e-01,0.512879,0.922656,0.955883,...,0.279359,0.071011,0.002012,0.917650,1.050012,1.012898,0.855504,0.765179,WELL-00001_20170201170311_0,0
4,0.000000,0.718127,0.859184,0.002534,0.194236,0.000000,9.367770e-01,0.093983,0.938001,0.851993,...,-0.085899,-0.373878,0.100636,1.288146,1.191901,1.070106,1.090296,1.097628,WELL-00001_20170201220228_0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,0.000000,0.000000,-1.084698,-0.619569,-0.873057,0.000000,0.000000e+00,0.541548,0.064531,0.650739,...,-0.031827,-0.241164,-0.505133,0.734929,1.142791,1.170620,1.252313,0.826140,WELL-00006_20170828040343_0,0
507,0.000000,-0.141943,-0.484313,-1.299088,-0.281014,0.000000,2.775869e-17,0.913991,0.249999,0.902782,...,-0.096985,-0.081024,-0.027350,0.803104,0.641046,0.577895,1.043387,0.997537,WELL-00006_20170828090311_0,0
508,0.000000,-0.168303,0.115903,1.182953,0.060705,0.000000,2.775867e-17,0.719313,0.313409,0.947469,...,0.226965,-0.360206,-0.040033,0.825982,0.905999,0.842239,0.747907,0.836908,WELL-00006_20170828140031_0,0
509,0.377788,0.202585,-0.913643,-0.025344,-0.662716,0.632955,4.652730e-01,0.481394,0.755114,0.308405,...,-0.295297,-0.007106,-0.036361,2.024198,1.459070,1.214713,1.136419,1.069263,WELL-00007_20170801180000_0,0
