#  Import Libraries

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from collections import Counter
import tensorflow as tf
from tensorflow.keras import Model, models, layers, optimizers, regularizers
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load Data

In [2]:
# Read the train and test CSV files from the working directory, in that order.
train, test = (pd.read_csv(path) for path in ('1_train.csv', '1_test.csv'))

In [3]:
# Remove the columns that are not model inputs.
# Re-binding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again after the columns are already
# gone no longer raises KeyError.
# NOTE(review): 'Unnamed: 0' is presumably a saved index column - confirm.
NON_FEATURE_COLUMNS = ['Timestamp', 'Unnamed: 0']
train = train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
test = test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
# Curve shifting only needs to be applied over 2 rows; afterwards the original
# break rows are removed.
# NOTE(review): the sentence above mentions 2 rows while shift_by is 5 -
# confirm which one is intended.

shift_by = 5  # how many time steps to shift the label by


def sign(x):
    """Return -1 for a negative x and 1 otherwise (0 counts as positive)."""
    return -1 if x < 0 else 1


def curve_shift(df, shift_by):
    """Spread every positive label over its neighbouring rows and drop the original.

    For each row where ``df['y'] == 1`` the ``abs(shift_by)`` adjacent rows are
    also labelled 1: the rows *before* it when ``shift_by`` is negative, the
    rows *after* it when ``shift_by`` is positive. The rows that originally had
    ``y == 1`` are then removed.

    Args:
        df: DataFrame containing a binary label column named ``'y'``.
        shift_by: Signed number of rows to shift the label by.

    Returns:
        A new DataFrame with the shifted binary label as the first column
        (still named ``'y'``). The input ``df`` is left untouched.
    """
    labelcol = 'y'
    step = sign(shift_by)

    # Accumulate the label with successively shifted copies of itself so every
    # row within abs(shift_by) steps of a positive row becomes non-zero.
    shifted = df[labelcol].copy()
    for _ in range(abs(shift_by)):
        shifted = shifted + shifted.shift(step).fillna(0)

    # Work on a copy: DataFrame.insert mutates in place, which previously left
    # a stray 'ytmp' column on the caller's frame and made a second call fail.
    out = df.copy()
    out.insert(loc=0, column=labelcol + 'tmp', value=shifted)

    # Remove the rows where the original label is 1.
    out = out.drop(out[out[labelcol] == 1].index)

    # Replace the original label column by the shifted one.
    out = out.drop(labelcol, axis=1)
    out = out.rename(columns={labelcol + 'tmp': labelcol})

    # Make the label binary again (accumulation can produce values > 1).
    out.loc[out[labelcol] > 0, labelcol] = 1

    return out

In [23]:
# Apply the label shift with a negative offset and preview the result.
# NOTE(review): `df` is not defined in any visible cell - confirm which frame
# this refers to; unless it is defined elsewhere this raises NameError on
# Restart & Run All.
shifted_df = curve_shift(df, -shift_by)
shifted_df.head()

Unnamed: 0.1,y,Unnamed: 0,CSU,STS,FTS,FMU,ECU_CURRENT,ECU_VOLTAGE,ANU,TRO
0,0.0,127828,30.92,25.49,29.57,1925.55,4175.333333,4.2,0.0,0.0
1,0.0,127846,30.95,25.28,30.54,1937.25,4157.333333,4.2,0.0,0.0
2,0.0,127862,30.94,25.35,30.58,1966.73,4159.0,4.2,0.0,0.0
3,0.0,127879,30.97,25.4,30.61,1954.58,4151.0,4.233333,0.0,0.0
4,0.0,127894,30.97,25.27,30.57,1952.1,4136.666667,4.233333,0.0,0.0


In [24]:
shifted_df.columns

Index(['y', 'Unnamed: 0', 'CSU', 'STS', 'FTS', 'FMU', 'ECU_CURRENT',
       'ECU_VOLTAGE', 'ANU', 'TRO'],
      dtype='object')

In [25]:
# drop remove columns
shifted_df = shifted_df[['CSU', 'STS', 'FTS', 'ECU_VOLTAGE', 'ECU_CURRENT', 'FMU', 'TRO', 'ANU', 'y']]

# 데이터와 라벨 x, y로 분리
input_x = shifted_df.drop('y', axis=1).values
input_y = shifted_df['y'].values

n_features = input_x.shape[1]
print(n_features)

8


# Transform to Series Data 

In [4]:
# NOTE(review): in the visible cells below only abs(shift_by) is used, as the
# window length.
shift_by = -5
# Number of columns in the training frame; used as the model's feature width.
n_features = train.shape[1]

In [5]:
# An LSTM expects 3-D input of shape (samples, timesteps, features), so the
# tabular data is converted into overlapping sequences.

def temporalize(X, timesteps):
    """Cut a 2-D table into overlapping windows of ``timesteps`` consecutive rows.

    Window ``i`` holds the rows at positions ``i + 2 .. i + timesteps + 1``
    (this offset is kept from the original implementation, so the first two
    rows never appear in any window), and ``len(X) - timesteps - 1`` windows
    are produced.

    Rows are selected by position, so the index of ``X`` does not have to be
    a default RangeIndex.

    Args:
        X: DataFrame or 2-D array of shape (rows, features).
        timesteps: Number of consecutive rows per window.

    Returns:
        ndarray of shape (windows, timesteps, features). The result is always
        3-D for 2-D input, including when ``timesteps`` or the feature count
        is 1, and is empty when ``X`` is too short for a single window.
    """
    values = np.asarray(X)
    n_windows = max(len(values) - timesteps - 1, 0)
    # Row positions of every window at once: one row of indices per window.
    row_positions = np.arange(n_windows)[:, None] + np.arange(2, timesteps + 2)[None, :]
    return values[row_positions]

In [6]:
# Window length is the magnitude of the configured shift.
timesteps = abs(shift_by)

# Temporalize: turn the training frame into overlapping windows and check the
# resulting array shape.
x = temporalize(train, timesteps)
print(x.shape)

(26058, 5, 4)


# Split Train / Valid / Test

In [7]:
# Split into train, valid, and test
x_train = x
x_test = temporalize(test, timesteps)
# A fixed random_state makes the train/valid partition - and the scaler and
# model fitted on it - reproducible across kernel restarts.
# NOTE(review): the windows overlap, so a shuffled split puts near-duplicate
# windows on both sides; confirm this is acceptable for validation.
x_train, x_valid = train_test_split(x_train, test_size=0.2, random_state=42)

print(len(x_train))
print(len(x_valid))
print(len(x_test))

20846
5212
26490


# Standardize

In [8]:
def flatten(X):
    """Return the final timestep of every window as a 2-D float64 array.

    Args:
        X: 3-D array indexed as (sample, timestep, feature).

    Returns:
        New array of shape (X.shape[0], X.shape[2]) holding, for each sample,
        the row at the last timestep.
    """
    final_step = X.shape[1] - 1
    # astype always copies, so the result never aliases X; float64 matches
    # the dtype of a default np.empty buffer.
    return X[:, final_step, :].astype(np.float64)

def scale(X, scaler):
    """Apply a fitted scaler to every timestep of every window.

    All windows are stacked into one (rows, features) matrix, transformed in
    a single call and reshaped back. The input is NOT modified; previously
    the scaled values were written back into ``X``, which changed the caller's
    array and truncated the result when ``X`` had an integer dtype.

    Args:
        X: Array of shape (samples, timesteps, features).
        scaler: Fitted object exposing ``transform`` on a 2-D
            (rows, features) array, applied row by row.

    Returns:
        New array with the same shape as ``X`` containing the scaled values.
    """
    X = np.asarray(X)
    if X.size == 0:
        # Nothing to transform; still hand back a fresh float array.
        return X.astype(np.float64)
    width = X.shape[-1]
    stacked = X.reshape(-1, width)
    transformed = np.asarray(scaler.transform(stacked))
    return transformed.reshape(X.shape)

In [33]:
# Fit the min-max scaler on the training windows only (flatten() supplies one
# row per window), then apply that same scaler to every split.
scaler = MinMaxScaler()
scaler.fit(flatten(x_train))

x_train_scaled, x_valid_scaled, x_test_scaled = (
    scale(split, scaler) for split in (x_train, x_valid, x_test)
)

# Training LSTM Autoencoder

In [34]:
# Training hyperparameters used by the compile/fit cell below.
# Maximum number of passes over the training windows.
epochs = 20
# Mini-batch size passed to fit().
batch = 256
# Learning rate handed to the Adam optimizer.
lr = 0.001

In [35]:
# Early stopping: watch the validation loss (validation data is passed to
# fit()) rather than the training loss, which keeps falling while the model
# overfits. The best-epoch weights are restored when training stops.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)

lstm_ae = models.Sequential()
# Encoder: compress each (timesteps, n_features) window into one 32-d vector.
lstm_ae.add(layers.LSTM(64, activation='relu', input_shape=(timesteps, n_features), return_sequences=True))
lstm_ae.add(layers.LSTM(32, activation='relu', return_sequences=False))
# Repeat the encoded vector once per timestep so the decoder can unroll it.
lstm_ae.add(layers.RepeatVector(timesteps))
# Decoder: mirror of the encoder, emitting n_features values per timestep.
lstm_ae.add(layers.LSTM(32, activation='relu', return_sequences=True))
lstm_ae.add(layers.LSTM(64, activation='relu', return_sequences=True))
lstm_ae.add(layers.TimeDistributed(layers.Dense(n_features)))

lstm_ae.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 5, 64)             17664     
                                                                 
 lstm_13 (LSTM)              (None, 32)                12416     
                                                                 
 repeat_vector_3 (RepeatVect  (None, 5, 32)            0         
 or)                                                             
                                                                 
 lstm_14 (LSTM)              (None, 5, 32)             8320      
                                                                 
 lstm_15 (LSTM)              (None, 5, 64)             24832     
                                                                 
 time_distributed_3 (TimeDis  (None, 5, 4)             260       
 tributed)                                            

In [36]:
# Compile with mean-squared reconstruction error and Adam at the configured
# learning rate.
lstm_ae.compile(optimizer=optimizers.Adam(learning_rate=lr), loss="mse")

# Train as an autoencoder: each split serves as both input and target.
history = lstm_ae.fit(
    x=x_train_scaled,
    y=x_train_scaled,
    validation_data=(x_valid_scaled, x_valid_scaled),
    batch_size=batch,
    epochs=epochs,
    callbacks=[callback],
)

Epoch 1/20


2022-05-04 19:37:12.756601: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-04 19:37:25.300436: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# threshold

In [56]:
# Cut-off on the per-window reconstruction error (computed on scaled data):
# windows whose error exceeds it are flagged as 1 further below.
threshold = 0.1

In [45]:
# (rows, columns) of the test frame before it is cut into windows.
test.shape

(26496, 4)

In [46]:
# Reconstruct every scaled test window with the trained autoencoder.
test_predictions = lstm_ae.predict(x_test_scaled)

In [47]:
# Overall reconstruction error across all test windows.
# The mean runs over every element, so no reshape is needed and no sample
# count or window size has to be hard-coded.
# NOTE: because of the square root this value is an RMSE; the name `mse` is
# kept unchanged for any later cells that read it.
mse = np.sqrt(np.mean(np.square(test_predictions - x_test_scaled)))
print(mse)

0.028156282495992665


In [57]:
# Flag each test window whose reconstruction RMSE exceeds the threshold.
# The error is reduced over every non-sample axis in one vectorised step, so
# neither the number of windows nor the window size is hard-coded.
squared_error = np.square(test_predictions - x_test_scaled)
window_rmse = np.sqrt(squared_error.mean(axis=tuple(range(1, squared_error.ndim))))
# Plain list of 0/1 ints, one entry per test window (1 = above threshold).
predict = (window_rmse > threshold).astype(int).tolist()

In [58]:
# Wrap the 0/1 flags in a single-column frame and count each class.
# NOTE(review): `predict` is rebound here from a list to a DataFrame.
predict = pd.DataFrame(predict, columns=['Prediction'])
predict.value_counts()

Prediction
0             26393
1                97
dtype: int64

In [60]:
# Show only the windows flagged as 1; the index is the window number.
predict[predict['Prediction']==1]

Unnamed: 0,Prediction
14,1
15,1
16,1
17,1
18,1
...,...
26194,1
26213,1
26404,1
26405,1


In [62]:
# Write the flag of every test window to CSV, overwriting any existing file.
# NOTE(review): the saved index is the window number; per temporalize(),
# window i covers test rows i+2 .. i+timesteps+1, so it is not a test-row id.
predict.to_csv('1_predict1.csv', mode='w')

In [61]:
# Write only the flagged windows to a separate CSV for manual inspection,
# overwriting any existing file.
predict[predict['Prediction']==1].to_csv('check.csv', mode='w')