# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from collections import Counter
import tensorflow as tf
from tensorflow.keras import Model, models, layers, optimizers, regularizers
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load Data

In [2]:
# Read the training and test tables from CSV files located in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('2_train.csv', '2_test.csv'))

In [3]:
# Remove the timestamp column and the 'Unnamed: 0' column (presumably an index that was
# saved into the CSV - verify) so that every remaining column is used as a model feature.
# The frames are re-bound instead of mutated in place, and errors='ignore' makes the cell
# safe to re-run: the previous in-place drop raised KeyError on a second execution because
# the columns were already gone.
unused_columns = ['Timestamp', 'Unnamed: 0']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

In [7]:
# NOTE(review): these two assignments are repeated verbatim in a later cell of this
# notebook; this copy looks like a leftover from out-of-order execution - confirm and remove.
shift_by = -5
n_features = train.shape[1]

In [9]:
# NOTE(review): this cell calls temporalize(), which is only defined in a cell further down
# the notebook, so a fresh top-to-bottom run fails here with NameError. An identical cell
# follows that definition; this copy appears to be a leftover from out-of-order execution.
timesteps = abs(shift_by)

# Temporalize
x = temporalize(train, timesteps)
print(x.shape)

(105114, 5, 33)


# Transform to Sequence Data

In [4]:
# Every column still present in `train` is fed to the model as one feature.
n_features = len(train.columns)
# Signed offset; only its magnitude is used later, as the window length (timesteps).
shift_by = -5

In [5]:
# An LSTM model takes 3-D input shaped (samples, timesteps, features), so the flat table
# is converted into overlapping fixed-length sequences.

def temporalize(X, timesteps):
    """Slice a 2-D table into overlapping windows of `timesteps` consecutive rows.

    Window ``i`` holds rows ``i + 2`` through ``i + timesteps + 1`` (by position), and
    ``len(X) - timesteps - 1`` windows are produced. The first two rows of ``X`` therefore
    never appear in any window; this offset is kept as-is so window counts and alignment
    stay the same for existing callers.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Rows are time-ordered records, columns are features. Rows are addressed by
        position, so the result does not depend on the DataFrame's index labels.
    timesteps : int
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, timesteps, n_features)``. The result is always 3-D,
        including when there is a single window, a single feature, or no window at all
        (``n_windows == 0`` when ``X`` is too short).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    values = np.asarray(X)
    if values.ndim != 2:
        raise ValueError(
            "temporalize expects a 2-D table of shape (rows, features), got ndim=%d" % values.ndim
        )

    n_windows = max(len(values) - timesteps - 1, 0)
    # Broadcasting a column of window starts against a row of in-window offsets yields an
    # (n_windows, timesteps) grid of row positions; the offsets start at 2 to reproduce the
    # historical window alignment.
    row_positions = np.arange(n_windows)[:, None] + np.arange(2, timesteps + 2)[None, :]
    # Integer-array indexing gathers every window in one step and keeps all three axes.
    return values[row_positions]

In [6]:
# Window length is the magnitude of the (negative) shift_by offset.
timesteps = abs(shift_by)

# Temporalize: turn the training table into 3-D windows and show the resulting shape.
x = temporalize(train, timesteps)
print(x.shape)

(26058, 5, 4)


# Split Train / Valid / Test

In [10]:
# Split into train, valid, and test 
x_train = x
x_test = temporalize(test, timesteps)
x_train, x_valid = train_test_split(x_train, test_size=0.2)

print(len(x_train))  
print(len(x_valid)) 
print(len(x_test))

84091
21023
105114


# Standardize

In [11]:
def flatten(X):
    """Return the final timestep of every window as a 2-D float array.

    Parameters
    ----------
    X : numpy.ndarray
        3-D array indexed as (sample, timestep, feature).

    Returns
    -------
    numpy.ndarray
        Newly allocated float64 array of shape (X.shape[0], X.shape[2]) whose row ``k``
        is ``X[k, X.shape[1] - 1, :]``.
    """
    last_step = X.shape[1] - 1
    last_rows = np.empty((X.shape[0], X.shape[2]))  # sample x features array.
    # One slice assignment copies the last timestep of all samples at once.
    last_rows[:, :] = X[:, last_step, :]
    return last_rows

def scale(X, scaler):
    """Apply a fitted scaler to every timestep of every window and return a new array.

    The input is left untouched. The previous implementation wrote the scaled values back
    into ``X``, so re-running the scaling cell scaled the data a second time, and integer
    input was silently truncated on assignment.

    Parameters
    ----------
    X : numpy.ndarray
        3-D array indexed as (sample, timestep, feature).
    scaler : object
        Fitted transformer exposing ``transform(rows)`` for a 2-D (rows, features) array.
        Assumes rows are transformed independently of each other, as element-wise scalers
        such as MinMaxScaler do.

    Returns
    -------
    numpy.ndarray
        Scaled copy with the same shape as ``X``.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        # Nothing to transform; hand back an empty float copy without calling the scaler.
        return X.astype(float)

    # Stack all timesteps into one (sample * timestep, feature) table so the scaler is
    # called once; the explicit copy guarantees the caller's buffer is never modified even
    # if the scaler works in place.
    rows = X.reshape(-1, X.shape[-1]).copy()
    return np.asarray(scaler.transform(rows)).reshape(X.shape)

In [12]:
# Learn the per-feature min/max from the training windows only (using the last timestep
# of each window, as returned by flatten), then apply that same scaler to every split.
scaler = MinMaxScaler()
scaler.fit(flatten(x_train))

x_train_scaled, x_valid_scaled, x_test_scaled = (
    scale(split, scaler) for split in (x_train, x_valid, x_test)
)

# Training LSTM Autoencoder

In [13]:
# Learning rate handed to the Adam optimizer.
lr = 1e-3
# Number of windows per gradient update.
batch = 256
# Maximum number of passes over the training windows.
epochs = 20

In [14]:
# Early stopping: end training once the training loss ("loss") has gone 2 epochs without
# improving.
# NOTE(review): validation data is passed to fit(); monitoring "val_loss" may be what was
# intended - confirm.
callback = tf.keras.callbacks.EarlyStopping(patience=2, monitor="loss")

# LSTM autoencoder: the encoder compresses each (timesteps, n_features) window to a single
# 32-unit vector, RepeatVector copies that vector once per timestep, and the decoder
# expands it back to one n_features-wide output per timestep.
lstm_ae = models.Sequential([
    # Encoder
    layers.LSTM(64, activation='relu', input_shape=(timesteps, n_features), return_sequences=True),
    layers.LSTM(32, activation='relu', return_sequences=False),
    layers.RepeatVector(timesteps),
    # Decoder
    layers.LSTM(32, activation='relu', return_sequences=True),
    layers.LSTM(64, activation='relu', return_sequences=True),
    layers.TimeDistributed(layers.Dense(n_features)),
])

lstm_ae.summary()

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2022-05-04 19:56:18.554804: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-04 19:56:18.556341: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 5, 64)             25088     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 repeat_vector (RepeatVector  (None, 5, 32)            0         
 )                                                               
                                                                 
 lstm_2 (LSTM)               (None, 5, 32)             8320      
                                                                 
 lstm_3 (LSTM)               (None, 5, 64)             24832     
                                                                 
 time_distributed (TimeDistr  (None, 5, 33)            2145      
 ibuted)                                                

In [15]:
# Compile: minimise mean squared reconstruction error with Adam at the configured rate.
lstm_ae.compile(optimizer=optimizers.Adam(learning_rate=lr), loss="mse")

# Fit: the scaled training windows serve as both input and target, and the validation
# windows are evaluated the same way after every epoch.
history = lstm_ae.fit(
    x=x_train_scaled,
    y=x_train_scaled,
    batch_size=batch,
    epochs=epochs,
    callbacks=[callback],
    validation_data=(x_valid_scaled, x_valid_scaled),
)

Epoch 1/20


2022-05-04 19:56:19.657166: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-05-04 19:56:20.447263: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-04 19:57:17.280520: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Threshold

In [27]:
# Reconstruction-error cutoff: a test window whose per-window RMSE exceeds this value is
# labelled 1, otherwise 0.
# NOTE(review): the value is hard-coded; how 0.1 was chosen is not shown in this notebook.
threshold = 0.1

In [18]:
# Show the (rows, columns) size of the test table.
test.shape

(105120, 33)

In [20]:
# NOTE(review): in the visible notebook test_predictions is assigned by the
# lstm_ae.predict(...) cell that comes after this one, so on a fresh top-to-bottom run
# this line raises NameError - move it below that cell.
test_predictions.shape

(105114, 5, 33)

In [19]:
# Reconstruct every scaled test window with the trained autoencoder.
test_predictions = lstm_ae.predict(x_test_scaled)

2022-05-04 20:18:58.421412: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [21]:
# Overall reconstruction error on the test windows. The square root is applied, so the
# value stored in `mse` is a root-mean-squared error despite the variable's name.
flat_shape = (test_predictions.shape[0], test_predictions.shape[1] * test_predictions.shape[2])
residuals = test_predictions.reshape(flat_shape) - x_test_scaled.reshape(flat_shape)
mse = np.sqrt(np.mean(np.power(residuals, 2)))
print(mse)

0.030434571140730105


In [28]:
# Label every test window: 1 when its reconstruction RMSE is above `threshold`, else 0.
# Both arrays are reshaped to (windows, timesteps * features) once and the error is
# reduced along axis 1, replacing the per-window Python loop that re-derived the reshaped
# views on every iteration. The resulting labels are the same plain list of ints.
flat_shape = (test_predictions.shape[0], test_predictions.shape[1] * test_predictions.shape[2])
squared_error = np.power(
    test_predictions.reshape(flat_shape) - x_test_scaled.reshape(flat_shape), 2
)
window_rmse = np.sqrt(np.mean(squared_error, axis=1))
predict = (window_rmse > threshold).astype(int).tolist()

In [29]:
# Wrap the 0/1 labels in a one-column DataFrame and count how many windows fall in each
# class. Note that `predict` is re-bound here from a list to a DataFrame.
predict = pd.DataFrame(predict, columns=['Prediction'])
predict.value_counts()

Prediction
0             104518
1                596
dtype: int64

In [30]:
# Show only the windows labelled 1; the index is the window's position in the test set.
predict[predict['Prediction']==1]

Unnamed: 0,Prediction
2227,1
2228,1
2229,1
2230,1
2231,1
...,...
105109,1
105110,1
105111,1
105112,1


In [31]:
# Write the label of every test window to CSV, overwriting any existing file. The
# DataFrame index is written as the first, unnamed column (pandas default).
predict.to_csv('2_predict2.csv', mode='w')

In [61]:
# Write only the windows labelled 1 to a separate CSV for inspection; the index column
# keeps each window's position in the test set.
predict[predict['Prediction']==1].to_csv('check.csv', mode='w')