In [3]:
import numpy
import time
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed

%matplotlib inline

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import json

class TSdataset:
    """Univariate time series loaded from an anomaly-detection benchmark file.

    Supported ``source`` tags are ``"NAB"``, ``"UCR"`` and ``"YAHOO"``.
    After construction the instance exposes:

    * ``source``    -- the source tag that was passed in.
    * ``ds_name``   -- data-set identifier derived from ``path``.
    * ``df``        -- DataFrame with a ``value`` column; NAB and UCR loading
      also adds a 0/1 ``is_anomaly`` column.
    * ``ts``        -- the raw values as a NumPy array.
    * ``ts_scaled`` -- ``ts`` min-max scaled, shaped ``(n, 1)``.
    * ``MinMaxs``   -- the ``MinMaxScaler`` fitted on ``ts``.
    """

    def __init__(self, path:str, source:str):
        """Load the file at ``path`` according to the ``source`` convention.

        Parameters
        ----------
        path : str
            Location of the data file, using ``/`` as separator.  For NAB the
            last two components (``<category>/<file>.csv``) are used as the
            key into the label windows; for UCR the anomaly range is parsed
            from the last two ``_``-separated fields of the file name.
        source : str
            One of ``"NAB"``, ``"UCR"`` or ``"YAHOO"``.

        Raises
        ------
        ValueError
            If ``source`` is not one of the supported tags.

        Notes
        -----
        Loading a NAB file downloads the label windows from the NAB GitHub
        repository (see ``_get_NAB_anomaly``).
        """
        self.source = source
        self._features = {}
        self.MinMaxs = MinMaxScaler()
        path_parts = str(path).split('/')

        if source == "NAB":
            self.ds_name = '/'.join(path_parts[-2:])
            self.df = pd.read_csv(path, parse_dates=[0], index_col=0)
            self.ts = np.array(self.df.value)
            self._features['DS_name'] = self.ds_name
            self._get_NAB_anomaly()

        elif source == "UCR":
            # File stem, e.g. "071_UCR_Anomaly_<name>_<train_end>_<begin>_<end>".
            stem = path_parts[-1].split('.')[0]
            fields = stem.split('_')
            # The stem identifies the data set (the previous code kept only
            # the last character of it).
            self.ds_name = stem
            self.ts = np.genfromtxt(path)
            self.df = pd.DataFrame(self.ts, columns=['value'])
            self._features['DS_name'] = self.ds_name
            # Builtin ``int``: the ``np.int`` alias was removed in NumPy 1.24.
            anomaly = np.zeros(len(self.df), dtype=int)
            # The last two fields hold the anomaly begin/end positions.
            anomaly[int(fields[-2]):int(fields[-1])] = 1
            self.df['is_anomaly'] = anomaly

        elif source == "YAHOO":
            self.ds_name = '/'.join(path_parts[-2:])
            # NOTE(review): no label column is created here; plot() assumes
            # the CSV already provides 'is_anomaly' -- confirm for your files.
            self.df = pd.read_csv(path).set_index('timestamp')
            self.ts = np.array(self.df['value'])
            self._features['DS_name'] = self.ds_name

        else:
            raise ValueError(
                f"Unknown source {source!r}; expected 'NAB', 'UCR' or 'YAHOO'."
            )

        self.ts_scaled = self.MinMaxs.fit_transform(self.ts.reshape(-1, 1))

    def _get_NAB_anomaly(self, path:str = None):
        """Add the 0/1 ``is_anomaly`` column from NAB label windows.

        Parameters
        ----------
        path : str, optional
            Local JSON file mapping data-set names to lists of
            ``[start, end]`` timestamps.  When omitted, the combined windows
            file is downloaded from the NAB GitHub repository.

        Raises
        ------
        KeyError
            If ``self.ds_name`` is missing from the windows file, or a window
            boundary is not present in the frame's index.
        """
        if path is None:
            with urllib.request.urlopen("https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_windows.json") as url:
                windows = json.load(url)
        else:
            with open(path, "r") as jsonF:
                windows = json.load(jsonF)

        labels = np.zeros(len(self.df), dtype=int)
        index = self.df.index
        for start, end in windows[self.ds_name]:
            first = index.get_loc(pd.to_datetime(start))
            last = index.get_loc(pd.to_datetime(end))
            # NAB windows include both boundary timestamps.
            labels[first:last + 1] = 1
        self.df['is_anomaly'] = labels

    def _get_anomaly_window(self):
        """Return the ``[first, last]`` bounds of every labelled anomaly run.

        Bounds are inclusive.  For NAB they are the index values (timestamps)
        at those positions; for the other sources they are integer positions.
        The result has shape ``(n_runs, 2)``.
        """
        # Zero-padding on both sides lets diff() see runs touching either end.
        padded = np.concatenate([[0], self.df['is_anomaly'], [0]])
        edges = np.diff(padded).nonzero()[0]
        # Pair up (rise, fall) positions; the fall is one past the run's end.
        edges = edges.reshape((-1, 2)) + np.array([0, -1])
        if self.source == 'NAB':
            return np.array(self.df.index)[edges]
        return edges

    def plot(self, width:int = 25, height:int = 8):
        """Plot the ``value`` column with anomaly windows shaded in red.

        Parameters
        ----------
        width, height : int
            Figure size in inches.
        """
        my_alpha = 0.4
        # Integer-position windows (YAHOO/UCR) are widened by two samples on
        # each side -- presumably so very short windows stay visible.  NAB
        # windows are timestamps and are drawn as-is.
        extend_window = 2 if self.source in ['YAHOO', 'UCR'] else 0

        plt.figure(figsize=(width, height))
        for first, last in self._get_anomaly_window():
            if extend_window:
                first, last = first - extend_window, last + extend_window
            # ymin/ymax are axes fractions, so 0..1 spans the full height.
            plt.axvspan(first, last, ymin=0.0, ymax=1.0, alpha=my_alpha, color='red')
        plt.plot(self.df['value'], zorder=1)
        values = self.df['value'].values
        plt.ylim((values.min(), values.max()))
        plt.draw()


In [None]:
# UCR file to analyse; the trailing name fields encode the labelled anomaly range.
ucr_file = '../../metaFeaturesTS/data/UCR_Anomaly_FullData/071_UCR_Anomaly_DISTORTEDltstdbs30791AS_23000_52600_52800.txt'
ds = TSdataset(ucr_file, 'UCR')

In [None]:
# Draw the raw series with the labelled anomaly window shaded in red.
ds.plot()

In [None]:
# Shape of the leading 40% of the scaled series (the slice used for training below).
head_len = int(len(ds.ts) * 0.4)
ds.ts_scaled[:head_len].shape

In [None]:
# Length, in samples, of each sliding window fed to the model.
TIME_STEPS = 115*4


def create_sequences(values, time_steps=TIME_STEPS):
    """Slice ``values`` into every overlapping window of ``time_steps`` rows.

    Parameters
    ----------
    values : sequence or numpy.ndarray
        Series to window along its first axis.
    time_steps : int
        Number of consecutive rows in each window.

    Returns
    -------
    numpy.ndarray
        Stacked windows; the first axis has ``len(values) - time_steps + 1``
        entries and the second has ``time_steps``.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1 or ``values`` is shorter than
        ``time_steps`` (no complete window exists).
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")
    n_windows = len(values) - time_steps + 1
    if n_windows < 1:
        raise ValueError(
            f"need at least {time_steps} values to build one window, "
            f"got {len(values)}"
        )
    return np.stack(
        [values[start : start + time_steps] for start in range(n_windows)]
    )


# Train only on the first 40% of the scaled series.
train_cutoff = int(len(ds.ts) * 0.4)
train_values = ds.ts_scaled[:train_cutoff]
x_train = create_sequences(train_values, TIME_STEPS)
print("Training input shape: ", x_train.shape)

In [None]:
# Convolutional autoencoder: two strided Conv1D layers encode the window, two
# strided Conv1DTranspose layers decode it, and a final transpose convolution
# maps back to a single channel.
strided_relu = dict(kernel_size=7, padding="same", strides=2, activation="relu")

autoencoder_layers = [
    layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
    layers.Conv1D(filters=32, **strided_relu),
    layers.Dropout(rate=0.2),
    layers.Conv1D(filters=16, **strided_relu),
    layers.Conv1DTranspose(filters=16, **strided_relu),
    layers.Dropout(rate=0.2),
    layers.Conv1DTranspose(filters=32, **strided_relu),
    layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
]

model = keras.Sequential(autoencoder_layers)
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss="mse")
model.summary()

In [None]:
# Autoencoder training: the windows are both input and target.  The last 10%
# of the windows are held out for validation, and training stops once the
# validation loss has not improved for 5 consecutive epochs.
# (`workers` was dropped: it only ever applied to generator/Sequence input
# and is no longer accepted by recent Keras versions.)
history = model.fit(
    x_train,
    x_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
    ],
)

In [None]:
# Training vs. validation loss per epoch.
loss_curves = {
    "Training Loss": history.history["loss"],
    "Validation Loss": history.history["val_loss"],
}
for curve_label, curve in loss_curves.items():
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

In [None]:
# Reconstruct the training windows and score each one by its mean absolute
# error over the time axis.
x_train_pred = model.predict(x_train)
train_abs_err = np.abs(x_train_pred - x_train)
train_mae_loss = np.mean(train_abs_err, axis=1)

plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# Reconstruction loss threshold (hard-coded; the detection cells further down
# apply their own literal cut-offs rather than this variable).
threshold = 0.001
print("Reconstruction error threshold: ", threshold)

In [None]:
# Overlay one training window and its reconstruction.
sample_idx = 100
for window in (x_train[sample_idx], x_train_pred[sample_idx]):
    plt.plot(window)
plt.show()

In [None]:
# Score the whole scaled series (training portion included).
df_test_value = ds.ts_scaled
fig, ax = plt.subplots()
ax.plot(df_test_value)
plt.show()

# Window the full series with the default TIME_STEPS.
x_test = create_sequences(ds.ts_scaled)
print("Test input shape: ", x_test.shape)

# Per-window mean absolute reconstruction error, flattened to 1-D.
x_test_pred = model.predict(x_test)
test_abs_err = np.abs(x_test_pred - x_test)
test_mae_loss = np.mean(test_abs_err, axis=1).reshape((-1))

plt.hist(test_mae_loss, bins=50)
plt.xlabel("test MAE loss")
plt.ylabel("No of samples")
plt.show()

# Flag every window whose error exceeds the cut-off.
anomalies = test_mae_loss > 0.02
anomaly_count = np.sum(anomalies)
print("Number of anomaly samples: ", anomaly_count)
print("Indices of anomaly samples: ", np.where(anomalies))

In [None]:
# Re-flag the windows with a stricter cut-off; this replaces the mask
# computed in the previous cell.
anomalies = test_mae_loss > 0.03
anomaly_count = np.sum(anomalies)
anomaly_positions = np.where(anomalies)
print("Number of anomaly samples: ", anomaly_count)
print("Indices of anomaly samples: ", anomaly_positions)

In [None]:
# A data point i belongs to the TIME_STEPS windows that start at
# i - TIME_STEPS + 1 ... i (inclusive).  It is reported as anomalous only when
# every one of those windows was flagged.  The slice therefore has to end at
# data_idx + 1; stopping at data_idx would leave out the window starting at i.
anomalous_data_indices = []
for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
    covering_windows = anomalies[data_idx - TIME_STEPS + 1 : data_idx + 1]
    if np.all(covering_windows):
        anomalous_data_indices.append(data_idx)

In [None]:
# Rows flagged as anomalous, drawn in red on top of the full frame.
df_subset = ds.df.iloc[anomalous_data_indices]
fig, ax = plt.subplots()
for frame, style in ((ds.df, {}), (df_subset, {"color": "r"})):
    frame.plot(legend=False, ax=ax, **style)
plt.show()

In [None]:
# Re-draws the flagged rows onto `ax`, which is the axes object created in the
# previous cell (this cell depends on that cell having run).
# NOTE(review): that figure has already been shown by plt.show(), so this may
# not render anything new -- confirm whether this cell is still needed.
df_subset.plot(legend=False, ax=ax, color="r")