# Import

In [1]:
import os
import sys
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Data Load

In [2]:
# Load the raw training and test tables from the working directory
TRAIN_DF_RAW, TEST_DF_RAW = (pd.read_csv(path) for path in ("./train.csv", "./test.csv"))

# Feature columns: every training column except the timestamp and the label
COLUMNS_IN_TRAIN_DATASET = TRAIN_DF_RAW.columns.drop(labels=["Timestamp", 'anomaly'])

# Normalize

In [3]:
# Normalization statistics: per-column minimum and maximum of the training features
TRN_MIN = TRAIN_DF_RAW.loc[:, COLUMNS_IN_TRAIN_DATASET].min()
TRN_MAX = TRAIN_DF_RAW.loc[:, COLUMNS_IN_TRAIN_DATASET].max()

def normalize(df, col_min=None, col_max=None):
    """Min-max scale every column of ``df`` using fixed per-column statistics.

    Columns whose minimum and maximum differ are mapped with
    ``(x - min) / (max - min)``. Columns that are constant in the reference
    statistics (``max == min``) cannot be divided by their range; they are
    shifted by the minimum instead, so the reference value becomes 0 rather than
    staying in raw units. Leaving them raw would put them on a different scale
    from every other feature.

    Args:
        df: DataFrame whose columns all appear in the statistics.
        col_min: Per-column minimum, indexable by column name. Defaults to the
            module-level ``TRN_MIN``.
        col_max: Per-column maximum, indexable by column name. Defaults to the
            module-level ``TRN_MAX``.

    Returns:
        A scaled copy of ``df``; the input frame is not modified.
    """
    lo = TRN_MIN if col_min is None else col_min
    hi = TRN_MAX if col_max is None else col_max

    ndf = df.copy()
    for c in df.columns:
        if hi[c] != lo[c]:
            ndf[c] = (df[c] - lo[c]) / (hi[c] - lo[c])
        else:
            # Zero range: only remove the offset so the constant maps to 0.
            ndf[c] = df[c] - lo[c]
    return ndf

# Scale the feature columns of both splits with the training-set statistics
TRAIN_DF, TEST_DF = (
    normalize(raw[COLUMNS_IN_TRAIN_DATASET]) for raw in (TRAIN_DF_RAW, TEST_DF_RAW)
)

# Hyperparameter

In [4]:
# Hyperparameter settings
SEED = 42  # fixed seed so weight initialisation and dropout masks repeat across runs
# Seeds PyTorch's random number generators; some GPU kernels may still be
# non-deterministic, so this does not by itself guarantee bit-identical results.
torch.manual_seed(SEED)

WINDOW_GIVEN = 40   # number of consecutive rows given to the model as input
WINDOW_SIZE = 41    # full window length; the row at offset WINDOW_SIZE - 1 is the target
BATCH_SIZE = 1024
N_HIDDENS = 150     # hidden size of the GRU
N_HIDDENS_2 = 70    # width of the linear layer between the GRU and the output layer
N_LAYERS = 3        # number of stacked GRU layers
N_EPOCHS = 5

# Custom dataset

In [5]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a table of time-ordered rows.

    A window starts every ``stride`` rows. Each sample pairs the first
    ``WINDOW_GIVEN`` rows of the window (the model input) with the single row at
    offset ``WINDOW_SIZE - 1`` from the window start (the target), and carries
    the timestamp of that target row. All windows are materialised eagerly in
    ``__init__``.
    """

    def __init__(self, ts, df, stride=1):
        """Precompute inputs, targets and timestamps for every window.

        Args:
            ts: Sequence of timestamps, one per row of ``df``.
            df: Row-major table of feature values; converted to float32.
            stride: Step between consecutive window start offsets.
        """
        self.ts = np.array(ts)
        self.vals = np.array(df, dtype=np.float32)

        # Start offsets of all windows that fit entirely inside the series.
        n_rows = len(self.ts)
        self.valid_idx = np.arange(0, n_rows - WINDOW_SIZE + 1, stride)
        self.num_win = len(self.valid_idx)

        # The last row of each window supplies both the timestamp and the target.
        last_rows = self.valid_idx + WINDOW_SIZE - 1
        self.pre_ts = self.ts[last_rows]

        windows = []
        for start in self.valid_idx:
            windows.append(self.vals[start:start + WINDOW_GIVEN])
        self.pre_in = np.array(windows)

        self.pre_tgt = self.vals[last_rows]

    def __len__(self):
        """Return the number of windows."""
        return self.num_win

    def __getitem__(self, idx):
        """Return the timestamp, input tensor and target tensor of window ``idx``."""
        sample = {"timestamps": self.pre_ts[idx]}
        sample["input"] = torch.from_numpy(self.pre_in[idx])
        sample["target"] = torch.from_numpy(self.pre_tgt[idx])
        return sample

In [6]:
# Window the normalized training frame and batch the windows in their original order
DATASET_TRAIN = TimeSeriesDataset(ts=TRAIN_DF_RAW["Timestamp"], df=TRAIN_DF, stride=1)
TRAIN_LOADER = DataLoader(dataset=DATASET_TRAIN, batch_size=BATCH_SIZE, shuffle=False)

# Model Define

In [7]:
class GRU_Linear(nn.Module):
    """Bidirectional multi-layer GRU followed by two linear layers.

    The network reads a window of rows and emits one value per tag, squashed to
    the open interval (0, 1) by a sigmoid.
    """

    def __init__(self, n_tags):
        """Build the layers.

        Args:
            n_tags: Number of features per row; used as both the GRU input size
                and the size of the final output.
        """
        super().__init__()
        self.gru = nn.GRU(
            input_size=n_tags,
            hidden_size=N_HIDDENS,
            # Follow the hyperparameter instead of a hard-coded literal so that
            # changing N_LAYERS actually changes the network depth.
            num_layers=N_LAYERS,
            bidirectional=True,
            dropout=0.1,
        )
        # A bidirectional GRU concatenates both directions, hence N_HIDDENS * 2.
        self.fc = nn.Linear(N_HIDDENS * 2, N_HIDDENS_2)
        self.dense = nn.Linear(N_HIDDENS_2, n_tags)
        self.relu = nn.LeakyReLU(negative_slope=0.1)

    def forward(self, input_sequence):
        """Map a batch of windows to one output row per window.

        Args:
            input_sequence: Tensor whose first two axes are swapped before the
                GRU call. The GRU is built without ``batch_first``, so the
                caller is expected to pass (batch, sequence, n_tags).

        Returns:
            Tensor with ``n_tags`` sigmoid-activated values per batch element.
        """
        # The GRU expects the sequence axis first.
        input_sequence = input_sequence.transpose(0, 1)
        self.gru.flatten_parameters()
        gru_outputs, _ = self.gru(input_sequence)
        # Keep only the GRU output at the final sequence position.
        last_gru_output = gru_outputs[-1]

        output = self.fc(last_gru_output)
        output = self.relu(output)
        output = self.dense(output)
        output = torch.sigmoid(output)

        return output

In [8]:
# Place the model on the GPU when CUDA is available; otherwise fall back to the CPU
# instead of failing outright on machines without a GPU.
MODEL = GRU_Linear(n_tags=TRAIN_DF.shape[1]).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
criterion = nn.MSELoss()
optimizer = optim.Adam(MODEL.parameters(), lr=1e-5)

# Train Model

In [9]:
def train_model(model, train_loader, optimizer, criterion, n_epochs, device):
    """Train ``model`` for ``n_epochs`` and remember the best epoch.

    Args:
        model: Module to optimise; it is put in training mode every epoch.
        train_loader: Iterable of batches, each a mapping with ``"input"`` and
            ``"target"`` tensors. Must support ``len()``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss callable taking ``(outputs, targets)``.
        n_epochs: Number of passes over ``train_loader``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A tuple ``(train_losses, best_model)``. ``train_losses`` holds the mean
        batch loss of each epoch. ``best_model`` is a dict with the lowest mean
        loss (``"loss"``), an independent copy of the model state taken at the
        end of that epoch (``"state"``) and its 1-based epoch number
        (``"epoch"``).
    """
    import copy

    train_losses = []
    best_model = {
        "loss": float('inf'),
        "state": None,
        "epoch": 0
    }

    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0.0

        with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{n_epochs}", unit="batch") as t:
            for batch in t:
                inputs = batch["input"].to(device)
                targets = batch["target"].to(device)

                outputs = model(inputs)
                loss = criterion(outputs, targets)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Read the scalar once; it is used for both the sum and the bar.
                batch_loss = loss.item()
                epoch_loss += batch_loss

                t.set_postfix(loss=batch_loss)

        avg_epoch_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_epoch_loss)

        print(f"Epoch {epoch + 1}/{n_epochs}, Average Train Loss: {avg_epoch_loss:.4f}")

        if avg_epoch_loss < best_model["loss"]:
            # state_dict() returns references to the live parameter tensors, so
            # it must be deep-copied; otherwise later optimiser steps would
            # silently overwrite the stored "best" weights.
            best_model["state"] = copy.deepcopy(model.state_dict())
            best_model["loss"] = avg_epoch_loss
            best_model["epoch"] = epoch + 1

    return train_losses, best_model

In [10]:
# Train on whichever device already holds the model's parameters, so the batches
# always land on the same device as the weights.
train_losses, best_model = train_model(
    MODEL,
    TRAIN_LOADER,
    optimizer,
    criterion,
    N_EPOCHS,
    device=next(MODEL.parameters()).device,
)

# Continue with the weights of the lowest-loss epoch rather than whatever the
# final epoch left behind ("state" stays None when no epoch was run).
if best_model["state"] is not None:
    MODEL.load_state_dict(best_model["state"])

Epoch 1/5: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 486/486 [00:17<00:00, 27.19batch/s, loss=0.0258]


Epoch 1/5, Average Train Loss: 0.0644


Epoch 2/5: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 486/486 [00:17<00:00, 27.11batch/s, loss=0.0059]


Epoch 2/5, Average Train Loss: 0.0197


Epoch 3/5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 486/486 [00:17<00:00, 27.32batch/s, loss=0.00342]


Epoch 3/5, Average Train Loss: 0.0129


Epoch 4/5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 486/486 [00:18<00:00, 26.81batch/s, loss=0.00282]


Epoch 4/5, Average Train Loss: 0.0120


Epoch 5/5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 486/486 [00:18<00:00, 26.56batch/s, loss=0.00263]

Epoch 5/5, Average Train Loss: 0.0118





# THRESHOLD
- 훈련 데이터의 예측 오차(예측값과 실제값의 평균 절대 오차) 계산

In [11]:
# Score every training window with the trained model: the score of a window is
# the mean absolute difference between the model output and the target row.
MODEL.eval()
# Use the device the model lives on instead of assuming CUDA is present.
_threshold_device = next(MODEL.parameters()).device
train_errors = []
with torch.no_grad():
    for batch in TRAIN_LOADER:
        inputs = batch["input"].to(_threshold_device)
        targets = batch["target"].to(_threshold_device)
        outputs = MODEL(inputs)
        errors = torch.mean(torch.abs(targets - outputs), dim=1).cpu().numpy()
        train_errors.extend(errors)

# Threshold: mean training error plus two standard deviations
THRESHOLD = np.mean(train_errors) + 2 * np.std(train_errors)

# Inference and Detect anomaly

In [12]:
# Window the normalized test frame (default stride) and batch it in original order
DATASET_TEST = TimeSeriesDataset(ts=TEST_DF_RAW["Timestamp"], df=TEST_DF)
TEST_LOADER = DataLoader(dataset=DATASET_TEST, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
def inference(model, data_loader, device='cuda'):
    """Run ``model`` over ``data_loader`` and collect per-feature absolute errors.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        data_loader: Iterable of batches with ``"input"``, ``"target"`` and
            ``"timestamps"`` entries.
        device: Device the input and target tensors are moved to.

    Returns:
        A tuple ``(timestamps, distances)`` of NumPy arrays: the timestamps of
        all samples in iteration order, and ``|target - prediction|`` for every
        sample.
    """
    model.eval()
    collected_ts, collected_err = [], []

    with torch.no_grad():
        progress = tqdm(data_loader, desc="Inference", unit="batch")
        for batch in progress:
            x = batch["input"].to(device)
            y = batch["target"].to(device)

            abs_err = (y - model(x)).abs()

            collected_ts.extend(batch["timestamps"])
            collected_err.extend(abs_err.cpu().tolist())

    return np.array(collected_ts), np.array(collected_err)

# Absolute errors for every test window, then one score per window (mean over features)
timestamps, distances = inference(model=MODEL, data_loader=TEST_LOADER)

ANOMALY_SCORE = distances.mean(axis=1)

Inference: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 440/440 [00:08<00:00, 53.60batch/s]


In [14]:
def put_labels(distance, threshold):
    """Turn scores into 0/1 labels: 1 where a score is strictly above ``threshold``.

    Args:
        distance: NumPy array of scores.
        threshold: Cut-off value.

    Returns:
        Array of the same shape and dtype as ``distance`` holding 0 or 1.
    """
    above = distance > threshold
    return np.where(above, 1, 0).astype(np.asarray(distance).dtype)

import datetime

def fill_blank(check_ts, labels, total_ts):
    """Spread labels of scored timestamps over the full timestamp list.

    Both timestamp sequences are walked once, front to back, so they are
    expected to be in ascending order. Every entry of ``total_ts`` that matches
    the next pending scored timestamp receives its label; every other entry
    receives 0.

    Args:
        check_ts: Timestamps (``"%Y-%m-%d %H:%M:%S"`` strings) that have a label.
        labels: Labels paired position-wise with ``check_ts``.
        total_ts: All timestamps that need a label, in the same string format.

    Returns:
        ``np.int8`` array with one label per entry of ``total_ts``.
    """
    fmt = "%Y-%m-%d %H:%M:%S"

    def to_datetime(raw):
        # Surrounding whitespace is tolerated in the raw strings.
        return datetime.datetime.strptime(raw.strip(), fmt)

    exhausted = (None, None)
    pending = ((to_datetime(raw), lab) for raw, lab in zip(check_ts, labels))
    scored_ts, scored_label = next(pending, exhausted)

    filled = []
    for raw in total_ts:
        now = to_datetime(raw)

        # Drop scored entries that lie strictly before the current timestamp.
        while scored_ts and now > scored_ts:
            scored_ts, scored_label = next(pending, exhausted)

        if scored_ts != now:
            # No score available for this timestamp.
            filled.append(0)
            continue

        filled.append(scored_label)
        scored_ts, scored_label = next(pending, exhausted)

    return np.array(filled, dtype=np.int8)

In [15]:
# Prediction: label every scored window, then align the labels to all test rows
LABELS = put_labels(distance=ANOMALY_SCORE, threshold=THRESHOLD)
PREDICTION = fill_blank(
    check_ts=timestamps,
    labels=LABELS,
    total_ts=np.array(TEST_DF_RAW["Timestamp"]),
).flatten().tolist()

# Submission

In [16]:
# Fill the submission template's 'anomaly' column with the predictions and save it
sample_submission = pd.read_csv("./sample_submission.csv").assign(anomaly=PREDICTION)
sample_submission.to_csv(
    './baseline_submission.csv',
    index=False,
    encoding='UTF-8-sig',
)