In [None]:
import sys
import glob, os
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
from plotly import tools

from scipy.spatial import distance
from scipy import linalg
from scipy import signal

%matplotlib inline
offline.init_notebook_mode()

In [None]:
# Load the CSV export and show its first rows.
path = '../../machine_learning/cloud_functions/data-large.csv'
data = pd.read_csv(filepath_or_buffer=path)
df = pd.DataFrame(data=data)
df.head(n=5)

In [None]:
# Keep only the six `temporal_*-series` columns followed by
# `dimension`, `attack` and `title`.
series_columns = [
    'temporal_canny-series',
    'temporal_cross_correlation-series',
    'temporal_dct-series',
    'temporal_difference-series',
    'temporal_histogram_distance-series',
    'temporal_gaussian-series',
]
info_columns = ['dimension', 'attack', 'title']
columns = series_columns + info_columns

df = df.loc[:, columns]
df.head()

In [None]:
def select_time_series(df, series_name, asset_name=None):
    """Return the rows of one asset with its string-encoded series parsed.

    Args:
        df: DataFrame with at least the columns `title`, `dimension`,
            `attack` and `series_name`.
        series_name: name of the column holding the series as a
            bracketed, space-separated string (e.g. ``'[1. 2. 3.]'``).
        asset_name: asset to select. When given, ``'/tmp/'`` is prepended
            before it is compared against `title`. When omitted (or
            falsy), the title of one randomly sampled row is used as is.

    Returns:
        A new DataFrame (a copy, so `df` is left untouched) with the columns
        `series_name`, `dimension`, `attack` and an added `series` column
        holding one float numpy array per row.
    """
    if not asset_name:
        asset_name = df.sample(1)['title'].values[0]
    else:
        asset_name = '/tmp/' + asset_name
    print('{} was selected'.format(str(asset_name)))

    # Copy the selection: adding a column to a filtered view of `df`
    # triggers pandas' SettingWithCopyWarning.
    wanted = [series_name, 'dimension', 'attack']
    asset_info = df.loc[df['title'] == asset_name, wanted].copy()

    # The builtin `float` replaces the `np.float` alias, which was removed
    # from NumPy; both denote the same 64-bit float dtype.
    asset_info['series'] = [
        np.fromstring(raw.replace('[', '').replace(']', ''), dtype=float, sep=' ')
        for raw in asset_info[series_name]
    ]
    return asset_info


# Series analysis

In [None]:
# Pick the series column to study. No asset name is passed, so
# `select_time_series` falls back to a randomly sampled video.

series = 'temporal_gaussian-series'
df_series = select_time_series(df, series_name=series)
display(df_series.head())
print(df_series.shape)


In [None]:

# Plot the parsed series of every row in `df_series`, one line per row,
# labelled with the row's `attack` value.
scaling = True  # when True, each series is divided by the row's `dimension`

data = []
for _, row in df_series.iterrows():
    # Build a new array instead of dividing in place: the arrays are stored
    # inside `df_series`, so `/=` rescaled them again on every re-run.
    Y = row['series'] / row['dimension'] if scaling else row['series']

    trace = go.Scatter(
        # The x-axis is the frame index of this row's own series, so rows
        # of different lengths (or an empty frame) are handled as well.
        x=list(range(len(Y))),
        y=Y,
        name=row['attack'],
        mode='lines',
    )
    data.append(trace)

layout = {"title": series,
      "legend":{"x": 1, "y":.95},
      "xaxis": {"title": "Frame", },
      "yaxis": {"title": series},
      "hovermode":"closest"
      }
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)


# Autoencoder

Let's try an LSTM autoencoder to reconstruct the Gaussian time series. Note that the model defined below is currently built from `Dense` layers only.

In [None]:
# Read the CSV again so this section starts from the full table.
path = '../../machine_learning/cloud_functions/data-large.csv'
data = pd.read_csv(filepath_or_buffer=path)
df = pd.DataFrame(data=data)

In [None]:
# Restrict to the columns of interest, then discard every row that has a
# missing value in any of them.
df = df[columns].dropna()

In [None]:
# Every series is cut or zero-padded to this fixed length.
SERIES_LENGTH = 70
# `attack` values that get attack_ID 1; every other value gets attack_ID 0.
POSITIVE_ATTACKS = {'1080p', '720p', '480p', '360p', '240p', '144p'}

series = []
attack_ID = []

for _, row in df.iterrows():
    # The builtin `float` replaces the `np.float` alias, which was removed
    # from NumPy; both denote the same 64-bit float dtype.
    parsed = np.fromstring(row['temporal_gaussian-series'].replace('[', '').replace(']', ''),
                           dtype=float, sep=' ')[:SERIES_LENGTH]

    # Start from zeros and copy the parsed values in, so shorter series end
    # up zero-padded on the right.
    time_series = np.zeros(SERIES_LENGTH)
    time_series[:len(parsed)] = parsed

    series.append(time_series)
    attack_ID.append(1 if row['attack'] in POSITIVE_ATTACKS else 0)

df['series'] = series
df['attack_ID'] = attack_ID


In [None]:
# Partition the rows by label: attack_ID 0 -> df_0, attack_ID 1 -> df_1.
labels = df['attack_ID']
df_0 = df.loc[labels == 0]
df_1 = df.loc[labels == 1]

In [None]:
# First 80% of the attack_ID == 1 rows go to training, the rest to testing;
# all attack_ID == 0 rows are kept aside as `df_attacks`.
n_train = int(0.8 * df_1.shape[0])
df_train, df_test = df_1[:n_train], df_1[n_train:]
df_attacks = df_0

In [None]:
# Stack the per-row series into 2-D arrays with one sample per row.
# The previous version started each array from `np.empty(shape=(70,))` and
# vstack-ed every series onto it, which left an uninitialised row at index 0
# of both arrays and re-copied the whole array on every iteration.
# `reshape(-1, 70)` keeps the (n, 70) shape even when a frame has no rows.
train = np.array(df_train['series'].tolist(), dtype=float).reshape(-1, 70)
test = np.array(df_test['series'].tolist(), dtype=float).reshape(-1, 70)

In [None]:
# Stack the series of (at most) the first 30000 attack rows into a 2-D array.
# Building the array in one step avoids the uninitialised first row that the
# previous `np.empty(shape=(70,))` seed left in place, and the repeated
# copying done by vstack-ing inside a loop.
# `reshape(-1, 70)` keeps the (n, 70) shape even when there are no rows.
attacks = np.array(df_attacks[:30000]['series'].tolist(), dtype=float).reshape(-1, 70)

In [None]:
# Free memory: drop the DataFrame names listed below (`df_test` is not
# among them and stays defined).
del df
del df_train
del df_attacks
del df_0
del df_1

In [None]:
from keras.layers import Lambda, LSTM, RepeatVector, TimeDistributed, Dense
from keras.optimizers import Adam 
from keras import backend as K
from keras import Sequential
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [None]:
# Force every sample matrix to the shape (number of rows, 70).
train = np.reshape(train, (train.shape[0], 70))
test = np.reshape(test, (test.shape[0], 70))
attacks = np.reshape(attacks, (attacks.shape[0], 70))

In [None]:
scaler = StandardScaler()

# Scale copies; `train`, `test` and `attacks` themselves are left untouched.
train_ = train.copy()
test_ = test.copy()
attacks_ = attacks.copy()

# Each matrix is flattened to a single column, passed through the scaler and
# folded back to its 2-D shape. Only the training matrix is used for fitting;
# the other two are transformed with the already fitted scaler.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
attack_rows, attack_cols = attacks.shape

train_ = scaler.fit_transform(train_.reshape((train_rows * train_cols, 1)))
train_ = train_.reshape((train_rows, train_cols))

test_ = scaler.transform(test_.reshape((test_rows * test_cols, 1)))
test_ = test_.reshape((test_rows, test_cols))

attacks_ = scaler.transform(attacks_.reshape((attack_rows * attack_cols, 1)))
attacks_ = attacks_.reshape((attack_rows, attack_cols))

In [None]:
# define model: two linear Dense layers, n_in -> latent_dim -> n_in,
# trained with Adam on a mean-squared-error loss.
n_in = 70
latent_dim = 30
model = Sequential([
    Dense(latent_dim, activation='linear', input_shape=(n_in,)),
    Dense(n_in, activation='linear'),
])
# NOTE(review): newer Keras releases name this argument `learning_rate` --
# confirm against the installed Keras version before changing it.
adam = Adam(lr=0.001)
model.compile(loss='mse', optimizer=adam)
model.summary()

# TODO: fine-tune the model
# n_in = 70
# latent_dim = 16
# model = Sequential()
# model.add(Dense(4 * latent_dim, activation='linear', input_shape=(n_in,)))
# model.add(Dense(2 * latent_dim, activation='linear'))
# model.add(Dense(latent_dim, activation='linear'))
# model.add(Dense(2 * latent_dim, activation='linear'))
# model.add(Dense(4 * latent_dim, activation='linear'))
# model.add(Dense(n_in, activation='linear'))
# adam = Adam(lr=0.001)
# model.compile(optimizer=adam, loss='mse')
# model.summary()

In [None]:
# Fit the model to reproduce its own (scaled) input; the scaled test matrix
# is used as validation data in the same input == target fashion.
fit_options = dict(epochs=100, batch_size=128, shuffle=True, verbose=1)
history = model.fit(train_, train_, validation_data=(test_, test_), **fit_options)

In [None]:
import matplotlib.pyplot as plt

def plot_history(network_history, title):
    """Plot training and validation loss per epoch on a log-scaled y-axis.

    Args:
        network_history: object whose `history` mapping provides the
            'loss' and 'val_loss' sequences.
        title: text shown as the figure title.
    """
    losses = network_history.history

    plt.figure(figsize=(10, 5))
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    # Curves are drawn in the same order as the legend entries below.
    for key in ('loss', 'val_loss'):
        plt.semilogy(losses[key])
    plt.legend(['Training', 'Validation'])
    plt.show()

In [None]:
# Show the loss curves recorded while fitting.
# NOTE(review): the title says LSTM, but the model compiled in this notebook
# is built from Dense layers -- confirm the intended label.
plot_history(network_history=history, title='AE LSTM')

In [None]:
# Reconstruct the *scaled* inputs. The model was fitted on `train_`, and the
# reconstruction errors are measured against `train_`/`test_`/`attacks_`, so
# predicting from the unscaled `train`/`test`/`attacks` compared values on
# two different scales.
# NOTE(review): thresholds tuned on the old error values should be re-checked.
train_re = model.predict(train_, batch_size=128)
test_re = model.predict(test_, batch_size=128)
attacks_re = model.predict(attacks_, batch_size=128)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def _rowwise_mse(actual, rebuilt):
    """Return a list with the MSE between each row of `actual` and the same row of `rebuilt`."""
    return [mean_squared_error(actual[k], rebuilt[k]) for k in range(len(actual))]


# One reconstruction error per sample, for each of the three sets.
mse_train = _rowwise_mse(train_, train_re)
mse_test = _rowwise_mse(test_, test_re)
mse_attacks = _rowwise_mse(attacks_, attacks_re)

In [None]:
# Mean reconstruction error for train, test and attacks (in that order).
tuple(np.mean(errors) for errors in (mse_train, mse_test, mse_attacks))

In [None]:
# Standard deviation of the reconstruction error for train, test and attacks
# (in that order).
tuple(np.std(errors) for errors in (mse_train, mse_test, mse_attacks))

In [None]:
# Reconstruction-error threshold used for the three ratios printed below.
th = 10

train_above = np.array(mse_train) > th
test_above = np.array(mse_test) > th
attacks_below = np.array(mse_attacks) < th

# Share of train errors above `th`, share of test errors above `th`, and
# share of attack errors below `th`.
print(sum(train_above) / len(mse_train))
print(sum(test_above) / len(mse_test))
print(sum(attacks_below) / len(mse_attacks))

In [None]:
# One histogram per error population. The traces are named and the axes
# titled so the figure can be read on its own; previously the legend only
# showed plotly's default trace labels.
data = [
    go.Histogram(x=mse_train, name='train'),
    go.Histogram(x=mse_test, name='test'),
    go.Histogram(x=mse_attacks, name='attacks'),
]
hist_layout = {
    "title": "Reconstruction error (MSE) distribution",
    "xaxis": {"title": "MSE"},
    "yaxis": {"title": "Count"},
}

fig = go.Figure(data=data, layout=hist_layout)
offline.iplot(fig)