In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from scipy import signal

from keras import Sequential
from keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Dropout, Input
from keras.optimizers import Adam 

# Initialise plotly's offline mode for rendering inside the notebook.
offline.init_notebook_mode(connected=True)

# Put the modeling toolbox directory first on the import search path.
# This has to run before the `metric_processor` import below.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# NOTE(review): presumably resolved through the sys.path entry added above -- confirm.
from metric_processor import MetricProcessor

## Helper functions

In [None]:
def plot_history(network_history, title):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Parameters
    ----------
    network_history : object
        Must expose a ``history`` mapping with 'loss' and 'val_loss'
        sequences (one value per epoch).
    title : str
        Title shown above the figure.
    """
    losses = network_history.history

    plt.figure(figsize=(10, 5))
    # Training curve first, validation second -- the legend below relies on this order.
    for key in ('loss', 'val_loss'):
        plt.semilogy(losses[key])

    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(['Training', 'Validation'])
    plt.grid()
    plt.show()
    
def _print_threshold_report(th, mse_train, mse_test, mse_attacks):
    """Print the rates and the F20 score obtained with error threshold ``th``.

    Train/Test TPR is the fraction of samples whose error does not exceed
    ``th``; TNR is the fraction of attack samples whose error is not below it.
    """
    print('Train TPR: {}'.format(1 - np.sum(mse_train > th) / len(mse_train)))
    print('Test TPR: {}'.format(1 - np.sum(mse_test > th) / len(mse_test)))
    print('TNR: {}'.format(1 - np.sum(mse_attacks < th) / len(mse_attacks)))

    true_positives = np.sum(mse_test < th)
    false_negatives = np.sum(mse_test > th)
    false_positives = np.sum(mse_attacks < th)

    beta = 20
    if true_positives == 0:
        # Precision and recall are both 0 (or undefined) here, which makes the
        # F-score a 0/0; report nan explicitly instead of dividing by zero.
        f_beta = float('nan')
    else:
        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / (true_positives + false_negatives)
        f_beta = (1 + (beta ** 2)) * precision * recall / ((beta ** 2) * precision + recall)
    print('F20: {}'.format(f_beta))


def print_scores(mse_train, mse_test, mse_attacks):
    """Print detection scores for two reconstruction-error thresholds.

    The scores are computed from the per-sample errors passed in. Earlier the
    arguments were overwritten with values derived from the notebook globals
    ``train_``, ``test_`` and ``attacks_``, so the caller's errors were ignored.

    Parameters
    ----------
    mse_train, mse_test : sequence of float
        Per-sample errors of the training and held-out test samples.
    mse_attacks : sequence of float
        Per-sample errors of the attack samples.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    mse_train = np.array(mse_train, dtype=float)
    mse_test = np.array(mse_test, dtype=float)
    mse_attacks = np.array(mse_attacks, dtype=float)

    th = np.quantile(mse_train, 0.99)
    print('Thresholding the 99% quantile')
    _print_threshold_report(th, mse_train, mse_test, mse_attacks)

    print('\n-------------------\n')

    # NOTE(review): this threshold is taken from the *test* errors while the
    # first one uses the training errors; kept as found -- confirm it is intended.
    th = np.quantile(mse_test, 0.999)
    print('Thresholding the 99.9% quantile')
    _print_threshold_report(th, mse_train, mse_test, mse_attacks)
    
def print_series(train_, train_re, test_, test_re, attacks_, attacks_re):
    """Plot five randomly chosen series per data set next to their predictions.

    The figure has three rows (train, test, attacks) and five columns. Each
    panel overlays one input series and the prediction stored at the same
    index.

    Sample indices are drawn separately for every row and are bounded by that
    row's own size. The previous hard-coded range of 3000 raised an IndexError
    for any set with fewer rows.

    Parameters
    ----------
    train_, test_, attacks_ : indexable collections of series
        Input series, one per sample.
    train_re, test_re, attacks_re : indexable collections of series
        Predictions aligned index-by-index with the matching input set.

    Raises
    ------
    ValueError
        If one of the sets (or its predictions) is empty.
    """
    n_examples = 5
    rows = (
        ('train', train_, train_re),
        ('test', test_, test_re),
        ('attacks', attacks_, attacks_re),
    )

    # Validate before creating the figure so a failure leaves no empty plot behind.
    available = []
    for label, originals, predictions in rows:
        n_available = min(len(originals), len(predictions))
        if n_available == 0:
            raise ValueError('No {} samples available to plot'.format(label))
        available.append(n_available)

    f, axs = plt.subplots(len(rows), n_examples, figsize=(20, 15))

    for i, (label, originals, predictions) in enumerate(rows):
        ims = np.random.randint(0, available[i], n_examples)
        for j, idx in enumerate(ims):
            axs[i, j].plot(originals[idx], label=label)
            axs[i, j].plot(predictions[idx], label="pred")
            axs[i, j].legend()

def filter_signal(yn):
    """Smooth a 1-D signal with two forward passes of a Butterworth filter.

    A 3rd-order Butterworth filter (``signal.butter(3, 0.05)``) is applied
    twice with ``signal.lfilter``. For each pass the initial filter state is
    ``lfilter_zi`` scaled by the first sample of that pass's input.

    Parameters
    ----------
    yn : array_like
        Signal to filter.

    Returns
    -------
    numpy.ndarray
        The filtered signal, same length as ``yn``. An empty input is returned
        as an empty array; it used to raise IndexError on ``yn[0]``.
    """
    yn = np.asarray(yn)
    if yn.size == 0:
        # Nothing to filter, and no first sample to scale the initial state with.
        return yn

    b, a = signal.butter(3, 0.05)
    zi = signal.lfilter_zi(b, a)
    z, _ = signal.lfilter(b, a, yn, zi=zi * yn[0])
    z2, _ = signal.lfilter(b, a, z, zi=zi * z[0])

    return z2

In [None]:
path = '../../machine_learning/cloud_functions/data-large.csv'
data = pd.read_csv(path)

columns = ['attack',
           'dimension',
           'temporal_difference-series',
           'temporal_histogram_distance-series',
           'temporal_gaussian_mse-series']

# read_csv already returns a DataFrame; keep only the needed columns and
# drop rows with a missing value in any of them.
df = data[columns].dropna()

# Number of leading samples kept per series; shorter series are zero-padded.
length = 15


def _parse_series(text, length):
    """Parse a bracketed, space-separated number string; keep the first `length` values."""
    cleaned = text.replace('[', '').replace(']', '')
    # Builtin float instead of the np.float alias, which recent NumPy releases no longer provide.
    return np.fromstring(cleaned, dtype=float, sep=' ')[:length]


def _pad_series(values, length):
    """Right-pad `values` with zeros up to `length` samples (unchanged if long enough)."""
    missing = length - len(values)
    if missing > 0:
        return np.append(values, np.zeros(missing))
    return values


rendition_labels = ['1080p', '720p', '480p', '360p', '240p', '144p']

series = []
series_1 = []
series_2 = []
attack_ID = []

# NOTE(review): series_2 is only recomputed on '1080p' rows and is carried over
# to every following row until the next '1080p' row. That is only meaningful if
# each '1080p' row precedes the rows that belong to it -- confirm the CSV
# ordering (also after dropna) guarantees this.
time_series_2 = None

for _, row in df.iterrows():
    # Both input series are scaled by the row's `dimension` value.
    time_series = _pad_series(
        row['dimension'] * _parse_series(row['temporal_histogram_distance-series'], length),
        length)
    time_series_1 = _pad_series(
        row['dimension'] * _parse_series(row['temporal_gaussian_mse-series'], length),
        length)

    if row['attack'] == '1080p':
        raw_difference = _parse_series(row['temporal_difference-series'], length)
        # Filter before padding (as before); an empty series has nothing to filter.
        if len(raw_difference) > 0:
            raw_difference = filter_signal(raw_difference)
        time_series_2 = _pad_series(raw_difference, length)

    if time_series_2 is None:
        # Previously surfaced as a NameError on the first non-'1080p' row.
        raise ValueError(
            "Row with attack={!r} appears before any '1080p' row; "
            "no temporal_difference series is available for it".format(row['attack']))

    series.append(time_series)
    series_1.append(time_series_1)
    series_2.append(time_series_2)

    # 1 for the plain rendition labels, 0 for everything else (the rows with 0
    # are the ones later used as attacks).
    attack_ID.append(1 if row['attack'] in rendition_labels else 0)

df['series'] = series
df['series_1'] = series_1
df['series_2'] = series_2
df['attack_ID'] = attack_ID



In [None]:
# Fixed seed so the shuffle -- and therefore the train/test split -- is the
# same on every run of the notebook.
SPLIT_SEED = 42
df = df.sample(frac=1, random_state=SPLIT_SEED)

# attack_ID == 1 rows are split 80/20 into train/test; attack_ID == 0 rows
# form the attack set.
df_0 = df[df['attack_ID'] == 0]
df_1 = df[df['attack_ID'] == 1]

n_train = int(0.8 * df_1.shape[0])
df_train = df_1[:n_train]
df_test = df_1[n_train:]
df_attacks = df_0


def _stack_column(frame, column):
    """Stack the per-row arrays stored in `column` into one 2-D array."""
    return np.stack(frame[column].to_numpy())


train, test, attacks = (
    _stack_column(part, 'series') for part in (df_train, df_test, df_attacks))

train_1, test_1, attacks_1 = (
    _stack_column(part, 'series_1') for part in (df_train, df_test, df_attacks))

train_2, test_2, attacks_2 = (
    _stack_column(part, 'series_2') for part in (df_train, df_test, df_attacks))

# Free memory
del df, df_train, df_attacks, df_0, df_1


In [None]:
# Work on copies of the stacked arrays.
train_, test_, attacks_ = (np.copy(arr) for arr in (train, test, attacks))
train_1, test_1, attacks_1 = (np.copy(arr) for arr in (train_1, test_1, attacks_1))
train_2, test_2, attacks_2 = (np.copy(arr) for arr in (train_2, test_2, attacks_2))

# One scaler object, refitted for each group below (StandardScaler is the
# alternative that was tried here).
scaler = MinMaxScaler()


def _fit_and_scale(fitted_scaler, fit_part, *other_parts):
    """Fit `fitted_scaler` on `fit_part`, then return all parts scaled with it."""
    scaled = [fitted_scaler.fit_transform(fit_part)]
    for part in other_parts:
        scaled.append(fitted_scaler.transform(part))
    return scaled


def _with_feature_axis(arr):
    """Append a trailing axis of size 1 to `arr`."""
    return arr.reshape(arr.shape + (1,))


# Each group is scaled with statistics fitted on its own training part.
train_, test_, attacks_ = _fit_and_scale(scaler, train_, test_, attacks_)
train_1_, test_1_, attacks_1_ = _fit_and_scale(scaler, train_1, test_1, attacks_1)
train_2_, test_2_, attacks_2_ = _fit_and_scale(scaler, train_2, test_2, attacks_2)

# Model inputs come from the scaled series_1 group.
train_X, test_X, attacks_X = (np.array(arr) for arr in (train_1_, test_1_, attacks_1_))
print(train_X.shape)
train_X, test_X, attacks_X = (
    _with_feature_axis(arr) for arr in (train_X, test_X, attacks_X))
print(train_X.shape)

# The scaled series_2 group gets the same trailing feature axis.
train_2_, test_2_, attacks_2_ = (
    _with_feature_axis(arr) for arr in (train_2_, test_2_, attacks_2_))
print(train_2_.shape)

In [None]:
# LSTM regressor from train_X-shaped inputs to train_2_-shaped targets.
#
# The LSTM returns its full output sequence so that the Dense layers, which
# act on the last axis, yield one prediction per timestep. The output shape is
# then (batch, timesteps, train_2_.shape[2]), matching the targets used when
# fitting. With return_sequences=False the network produced a single
# (batch, 1) vector that cannot be compared against (samples, timesteps, 1)
# targets.
model = Sequential()
model.add(LSTM(64, activation='relu',
               input_shape=(train_X.shape[1], train_X.shape[2]),
               return_sequences=True))
model.add(Dense(32))
model.add(Dense(train_2_.shape[2]))

adam = Adam(lr=0.0001)
model.compile(optimizer=adam, loss='mse')
model.summary()

In [None]:
# Train on (train_X -> train_2_) and track the loss on the test pair after every epoch.
fit_options = dict(
    epochs=20,
    batch_size=256,
    shuffle=True,
    verbose=1,
    validation_data=(test_X, test_2_),
)
history = model.fit(train_X, train_2_, **fit_options)

In [None]:
# Loss curves of the training run stored in `history`.
plot_history(network_history=history, title='AE')

In [None]:
# Inputs as [samples, timesteps, features] with a single feature.
n_in = train_X.shape[1]
sequence = train_X.reshape((train_X.shape[0], n_in, 1))

# LSTM autoencoder: the encoder LSTM output is repeated once per timestep and
# decoded by a second LSTM into one value per timestep.
model = Sequential([
    LSTM(100, activation='relu', input_shape=(n_in, 1)),
    RepeatVector(n_in),
    LSTM(100, activation='relu', return_sequences=True),
    TimeDistributed(Dense(1)),
])
adam = Adam(lr=0.0001)
model.compile(optimizer=adam, loss='mse')
model.summary()

# The network is trained to reproduce its own input.
model.fit(sequence, sequence, epochs=20, verbose=1)

In [None]:
# Run the current model over the three input sets (train, test, attacks -- in that order).
train_re, test_re, attacks_re = (
    model.predict(inputs, batch_size=2048)
    for inputs in (train_X, test_X, attacks_X)
)

In [None]:
def _per_sample_mse(originals, reconstructions):
    """Return a list with the MSE between each sample and the prediction at the same index."""
    return [
        mean_squared_error(originals[idx], reconstructions[idx])
        for idx in range(len(originals))
    ]


mse_train = _per_sample_mse(train_X, train_re)
mse_test = _per_sample_mse(test_X, test_re)
mse_attacks = _per_sample_mse(attacks_X, attacks_re)

In [None]:
# Mean per-sample error for (train, test, attacks).
tuple(np.mean(errors) for errors in (mse_train, mse_test, mse_attacks))

In [None]:
# Standard deviation of the per-sample error for (train, test, attacks).
tuple(np.std(errors) for errors in (mse_train, mse_test, mse_attacks))

In [None]:
# Threshold-based detection scores for the per-sample errors computed above.
print_scores(mse_train=mse_train, mse_test=mse_test, mse_attacks=mse_attacks)

In [None]:
# Plot each prediction against the inputs it was computed from. The *_re arrays
# come from model.predict on train_X / test_X / attacks_X (the scaled series_1
# group), whereas train_ / test_ / attacks_ hold the scaled `series` group, so
# plotting those would overlay two different signals.
print_series(train_X, train_re, test_X, test_re, attacks_X, attacks_re)