In [12]:
import pandas as pd
from keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import random
import operator
from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers
import os

In [13]:
file_name = './388.parquet'
data = pd.read_parquet(file_name)
data.dropna(inplace = True)
data.reset_index(drop=True, inplace = True)
names = list(data)
size = 0.8
test = data[int(data.shape[0]*size):]
data = data[:int(data.shape[0]*size)]

In [14]:
seed_value= 0
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

#general_variables
timestep = 20



#FIT scaler on train data
scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = StandardScaler()
scaler = scaler.fit(data.drop('timestamp', axis = 1).values)



diffs = dict(data['timestamp'].diff().apply(lambda x: x/np.timedelta64(1, 'm')).fillna(0).astype('int64'))
large_diffs = {i:diffs[i] for i in diffs.keys() if diffs[i] > 15}


diffs_test = dict(test['timestamp'].diff().apply(lambda x: x/np.timedelta64(1, 'm')).fillna(0).astype('int64'))
large_diffs_test = {i:diffs_test[i] for i in diffs_test.keys() if diffs_test[i] > 15}


lk = list(large_diffs.keys())
lk.sort()
lk_test = list(large_diffs_test.keys())
lk_test.sort()

l_mod = [0] + lk + [data.shape[0] +1]
l_mod_test = [0] + lk_test + [test.shape[0] +1]

list_of_dfs = [data.iloc[l_mod[n]:l_mod[n+1]] for n in range(len(l_mod)-1)]
list_of_dfs = [i for i in list_of_dfs if i.shape[0] > timestep]

list_of_dfs_test = [test.iloc[l_mod_test[n]:l_mod_test[n+1]] for n in range(len(l_mod_test)-1)]
list_of_dfs_test = [i for i in list_of_dfs_test if i.shape[0] > timestep]


lenghts = {i:list_of_dfs[i].shape[0] for i in range(len(list_of_dfs))}
lenghts_test = {i:list_of_dfs_test[i].shape[0] for i in range(len(list_of_dfs_test))}

data_gens = []
for df in list_of_dfs:
    normalized = scaler.transform(df.drop('timestamp', axis = 1).values)
    data_gens.append(TimeseriesGenerator(normalized, normalized,
                               length=timestep, sampling_rate=1,
                               batch_size=1000))


data_gens_test = []
for df in list_of_dfs_test:
    normalized = scaler.transform(df.drop('timestamp', axis = 1).values)
    data_gens_test.append(TimeseriesGenerator(normalized, normalized,
                               length=timestep, sampling_rate=1,
                               batch_size=1000))  

train = np.vstack([np.array(i[0][0][:]) for i in data_gens])
test = np.vstack([np.array(i[0][0][:]) for i in data_gens_test])

Labels = test[:,-1,-1]

train = train[:,:,:-1]
test = test[:,:,:-1]

y_train = train[:, -1, :]
y_test = test[:, -1, :]

def autoencoder_model(X):
    inputs = Input(shape=(timestep, X.shape[2]))
    L1 = LSTM(16, activation = 'selu', return_sequences = True)(inputs)
    L2 = LSTM(16, activation = 'selu', return_sequences = True)(L1)
    L3 = LSTM(8, activation = 'selu', return_sequences = False)(L2)
    L4 = Dense(16, activation = 'selu')(L3)
    L5 = Dense(16, activation = 'selu')(L4)
    output = Dense(X.shape[2])(L5)
    model = Model(inputs = inputs, outputs = output)
    return model

model = autoencoder_model(train)
model.compile(optimizer=keras.optimizers.RMSprop(1e-3),
    loss=keras.losses.MeanSquaredError())
model.fit(train, y_train,batch_size = 128, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x20023a80bc88>

In [15]:
predictions = model.predict(test, verbose = True)
predictions_df = pd.DataFrame(predictions, columns=names[1:-1])
val_df = pd.DataFrame(y_test, columns=names[1:-1])
normal_diff = val_df - predictions_df

normal_diff['Total absolute reproduction error'] = normal_diff.abs().sum(axis=1)
M = max(normal_diff['Total absolute reproduction error'])
normal_diff['real_label'] = Labels

normal_diff['real_label'] = normal_diff['real_label'].apply(lambda x: 1 if x > 0 else 0)

normal_diff['Softmax'] = normal_diff['Total absolute reproduction error'].apply(lambda x: x/M) 

from sklearn.metrics import roc_auc_score
# calculate scores
auc = roc_auc_score(normal_diff['real_label'], normal_diff['Softmax'])
print(auc)

0.6266266266266266
