In [None]:
# Mount Google Drive under /content/gdrive/ so files stored there can be read
# from this Colab session.
from google.colab import drive 
drive.mount('/content/gdrive/')

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline

from numpy.random import seed
import tensorflow as tf

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers

# misc
import random as rn

# manual parameters
RANDOM_SEED = 10

# Seed every random number source used here (NumPy, Python's `random`,
# TensorFlow) with the same value to ensure reproducibility.
for _seed_rng in (np.random.seed, rn.seed, tf.random.set_seed):
    _seed_rng(RANDOM_SEED)

In [None]:
# Read the raw dataset from the mounted Drive folder and print a summary of
# its columns.
DATASET_PATH = "./gdrive/MyDrive/challenges/DHCP.csv"
full_df = pd.read_csv(DATASET_PATH)
full_df.info()

In [None]:
# Find the index label of the first row whose timestamp equals the split
# point; the train/test split is made right after this row.
split_mask = full_df['Timestamp'] == '20210630_2350-0000'
idx_half = full_df.index[split_mask].tolist()[0]
print(idx_half)

In [None]:
# Replace every missing value in the frame with 0.
full_df = full_df.fillna(0)

In [None]:
# Count missing values per column and display only the columns that have any.
null_values = full_df.isna().sum()
null_values.loc[null_values.ne(0)]

In [None]:
# Every column except the timestamp is used as a model feature.
TIMESTAMP_FIELD = 'Timestamp'
VALID_COLUMNS_IN_TRAIN_DATASET = full_df.columns.drop(TIMESTAMP_FIELD)
VALID_COLUMNS_IN_TRAIN_DATASET

In [None]:
# Print the summary of full_df again (pandas DataFrame.info).
full_df.info()

In [None]:
# Positional split: rows up to and including position idx_half form the
# training set, the remaining rows form the test set.
split_at = idx_half + 1
train_set = full_df.iloc[:split_at]
test_temp_set = full_df.iloc[split_at:]

for subset in (train_set, test_temp_set):
    print(subset)
print("==== shape")
for subset in (train_set, test_temp_set):
    print(subset.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaling on the training rows only, then apply that same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler().fit(train_set[VALID_COLUMNS_IN_TRAIN_DATASET])
X_train = scaler.transform(train_set[VALID_COLUMNS_IN_TRAIN_DATASET])
X_test = scaler.transform(test_temp_set[VALID_COLUMNS_IN_TRAIN_DATASET])
print(X_train)
print(X_train.shape)

In [None]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
n_train_rows, n_train_features = X_train.shape
X_train = X_train.reshape((n_train_rows, 1, n_train_features))
print("Training data shape:", X_train.shape)

n_test_rows, n_test_features = X_test.shape
X_test = X_test.reshape((n_test_rows, 1, n_test_features))
print("Test data shape:", X_test.shape)

In [None]:
def autoencoder_model(X):
    """Build an uncompiled LSTM autoencoder sized from ``X``.

    The encoder stacks a 32-unit and an 8-unit LSTM. The 8-unit output is
    repeated ``X.shape[1]`` times and decoded by an 8-unit and a 32-unit LSTM,
    followed by a time-distributed Dense layer with ``X.shape[2]`` outputs.

    Args:
        X: Object exposing ``shape``; ``shape[1]`` and ``shape[2]`` give the
            two input dimensions of the model (the data itself is not read).

    Returns:
        A Keras ``Model`` from the input layer to the reconstruction output.
    """
    n_steps, n_features = X.shape[1], X.shape[2]

    inputs = Input(shape=(n_steps, n_features))

    # Encoder: compress each sample down to an 8-dimensional vector.
    encoded = LSTM(32, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = LSTM(8, activation='relu', return_sequences=False)(encoded)

    # Decoder: repeat the encoding once per step and expand it back out.
    decoded = RepeatVector(n_steps)(encoded)
    decoded = LSTM(8, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(32, activation='relu', return_sequences=True)(decoded)
    reconstruction = TimeDistributed(Dense(n_features))(decoded)

    return Model(inputs=inputs, outputs=reconstruction)

In [None]:
# Build the autoencoder for the training tensor's shape and compile it with
# the Adam optimizer and a mean-absolute-error loss.
model = autoencoder_model(X_train)
model.compile(loss='mae', optimizer='adam')
model.summary()

In [None]:
# Training hyper-parameters.
epochs = 50
batch = 64

# Checkpoint file names embed the epoch number and the validation loss; a
# checkpoint is only written when val_loss improves (save_best_only=True).
modelpath = "./LSTM_Autoencoder_model/"+"ae4_{epoch:02d}-{val_loss:.8f}.hdf5"
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=modelpath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)
# Stop once val_loss has not improved for 5 epochs.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

# The autoencoder is trained to reproduce its own input; 20% of the training
# data is held out for validation.
training_run = model.fit(
    X_train, X_train,
    epochs=epochs,
    batch_size=batch,
    validation_split=0.2,
    callbacks=[checkpointer, early_stopping_callback],
)
history = training_run.history

In [None]:
# Draw the training and validation loss curves on one set of axes.
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'valid loss')):
    plt.plot(history[history_key], label=curve_label)
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('loss')
plt.show()

In [None]:
from tensorflow.keras.models import load_model

# Reload the best checkpoint written during training.
#
# The checkpoint file name embeds the epoch and validation loss, so it changes
# from run to run; hard-coding one specific name breaks a fresh
# "Restart & Run All". Because checkpoints are saved with save_best_only=True,
# a file is written only when val_loss improves, which makes the most recently
# written checkpoint the best one of the latest training run.
CHECKPOINT_DIR = './LSTM_Autoencoder_model/'
checkpoint_files = [
    os.path.join(CHECKPOINT_DIR, file_name)
    for file_name in os.listdir(CHECKPOINT_DIR)
    if file_name.startswith('ae4_') and file_name.endswith('.hdf5')
]
if not checkpoint_files:
    raise FileNotFoundError(
        f"No 'ae4_*.hdf5' checkpoint found in {CHECKPOINT_DIR!r}; run the training cell first."
    )

best_checkpoint = max(checkpoint_files, key=os.path.getmtime)
print("Loading checkpoint:", best_checkpoint)
model = load_model(best_checkpoint)

In [None]:
# Reconstruct the training data with the autoencoder and inspect how the
# per-row reconstruction error is distributed.
X_pred = model.predict(X_train)
print(X_train)
print("="*50)
print(X_pred)
print("##### shape #####")
print(X_pred.shape)

# Drop the length-1 middle axis and label the reconstruction with the same
# columns and index as the training features.
X_pred = X_pred.reshape(X_pred.shape[0], X_pred.shape[2])
X_pred = pd.DataFrame(X_pred, columns=train_set[VALID_COLUMNS_IN_TRAIN_DATASET].columns)
X_pred.index = train_set[VALID_COLUMNS_IN_TRAIN_DATASET].index

print(train_set)
print("="*50)
print(X_pred.index)

# Per-row mean absolute error between the reconstruction and the scaled input.
scored = pd.DataFrame(index=train_set[VALID_COLUMNS_IN_TRAIN_DATASET].index)
Xtrain = X_train.reshape(X_train.shape[0], X_train.shape[2])
scored['Loss_mae'] = np.mean(np.abs(X_pred-Xtrain), axis = 1)

plt.figure(figsize=(16,9), dpi=80)
plt.title('Loss Distribution', fontsize=16)
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a density-normalised histogram plus KDE is its documented replacement.
sns.histplot(scored['Loss_mae'], bins=20, kde=True, stat='density',
             kde_kws=dict(cut=3), color='blue');
plt.xlim([0.0,.5])

In [None]:
# Reconstruct the test data and flag rows whose reconstruction error exceeds
# the threshold.
test_features = test_temp_set[VALID_COLUMNS_IN_TRAIN_DATASET]

X_pred = model.predict(X_test)
X_pred = pd.DataFrame(
    X_pred.reshape(X_pred.shape[0], X_pred.shape[2]),  # drop the length-1 middle axis
    columns=test_features.columns,
    index=test_features.index,
)
print(test_features.index)

# Per-row mean absolute error between the reconstruction and the scaled input.
Xtest = X_test.reshape(X_test.shape[0], X_test.shape[2])
scored = pd.DataFrame(index=test_features.index)
scored['Loss_mae'] = (X_pred - Xtest).abs().mean(axis=1)
scored['Threshold'] = 0.03
prediction_result = scored['Loss_mae'] > scored['Threshold']
scored['Prediction'] = prediction_result
scored.head()

In [None]:
# Score the training rows the same way as the test rows, then stack the
# training scores on top of the test scores.
X_pred_train = model.predict(X_train)
# Drop the length-1 middle axis and label like the training features.
X_pred_train = X_pred_train.reshape(X_pred_train.shape[0], X_pred_train.shape[2])
X_pred_train = pd.DataFrame(X_pred_train, columns=train_set[VALID_COLUMNS_IN_TRAIN_DATASET].columns)
X_pred_train.index = train_set[VALID_COLUMNS_IN_TRAIN_DATASET].index

scored_train = pd.DataFrame(index=train_set[VALID_COLUMNS_IN_TRAIN_DATASET].index)
scored_train['Loss_mae'] = np.mean(np.abs(X_pred_train-Xtrain), axis = 1)
scored_train['Threshold'] = 0.03
# NOTE: training rows are flagged in 'Anomaly' while test rows are flagged in
# 'Prediction', so after the concat each of those columns is NaN for the
# other half of the rows.
scored_train['Anomaly'] = scored_train['Loss_mae'] > scored_train['Threshold']

# Select only the test rows of `scored` before concatenating. On the first run
# `scored` holds exactly those rows, so the result is unchanged; on a re-run
# this stops the training rows from being appended a second time, which would
# otherwise shift every later positional slice of `scored`.
scored = pd.concat([scored_train, scored.loc[test_temp_set.index]])

In [None]:
# Plot `scored` on a log-scaled y axis limited to [1e-2, 1e2].
scored.plot(logy=True,  figsize=(16,9), ylim=[1e-2,1e2], color=['blue','red'])

In [None]:
# Take the 'Prediction' values from position idx_half+1 onward and renumber
# them starting at 0.
temp_scored = scored['Prediction'].iloc[idx_half+1:].reset_index(drop=True)
temp_scored

In [None]:
# Convert the boolean flags to 1 (True) / 0 (anything else) in one vectorized
# step. This covers every row regardless of how many there are, instead of
# looping over a hard-coded row count that raises KeyError on a shorter series
# and leaves trailing rows unconverted on a longer one.
temp_scored = temp_scored.eq(True).astype(int)

In [None]:
# Show how many rows carry each distinct value in temp_scored.
temp_scored.value_counts()

In [None]:
# Wrap the predictions in a single-column frame, print it (the Korean label
# in the message means "prediction result"), and write it to CSV together
# with the row index.
ANSWER_CSV = 'IP_answer8.csv'
answer = pd.DataFrame(data=temp_scored, columns=['Prediction'])
print(f'예측결과. \n{answer}\n')
answer.to_csv(ANSWER_CSV, index=True)