In [None]:
# Turn off Tensorflow warnings

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import keras as k
import pickle

## Helper function
def save_object(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is replaced. The highest pickle protocol available is used.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def skipper(fname):
    """Yield the data lines of the text file ``fname``.

    Lines whose first non-whitespace character is ``'n'`` are dropped, and
    the first of the remaining lines is treated as a header and dropped too.
    Every other line is yielded unchanged (including its line ending).

    NOTE(review): filtering on the prefix 'n' is unusual; presumably it
    matches non-numeric rows in the CSV exports -- confirm against the data.
    """
    with open(fname) as fin:
        header_pending = True
        for line in fin:
            if line.lstrip().startswith('n'):
                continue
            if header_pending:
                # First surviving line is the header: consume it silently.
                header_pending = False
                continue
            yield line
        
# Load one feature matrix per traffic class from its CSV export.

browse_data = np.loadtxt(skipper('Data/featurePCAP_Browsing.csv'), delimiter=',')
chat_data = np.loadtxt(skipper('Data/featurePCAP_Chat.csv'), delimiter=',')
mail_data = np.loadtxt(skipper('Data/featurePCAP_Mail.csv'), delimiter=',')
p2p_data = np.loadtxt(skipper('Data/featurePCAP_P2P2.csv'), delimiter=',')
raw_data = dict(browse=browse_data, chat=chat_data, mail=mail_data, p2p=p2p_data)

# Standardize: a single scaler is fit on the rows of all classes together and
# then applied to each class separately.
# NOTE(review): the scaler is fit before the train/test split, so its
# statistics include the rows that later become test data -- confirm this is
# intended.
total_data = np.concatenate(list(raw_data.values()))
scaler = StandardScaler()
scaler.fit(total_data)
scaled_data = {name: scaler.transform(rows) for name, rows in raw_data.items()}

# Keep the fitted scaler on disk next to the notebook.
save_object(scaler, 'scaler.pkl')

del raw_data

def _to_sequences(rows, seq_len=5):
    """Group consecutive rows into non-overlapping windows of ``seq_len`` rows.

    Trailing rows that do not fill a complete window are dropped. The result
    has shape ``(len(rows) // seq_len, seq_len, ...)``; when there are fewer
    than ``seq_len`` rows an empty array with that shape is returned.
    """
    n_seq = rows.shape[0] // seq_len
    return rows[:n_seq * seq_len].reshape((n_seq, seq_len) + rows.shape[1:])


# Per-class containers. The plain ones hold individual rows; the ``lstm_``
# ones hold the same rows grouped into windows of 5 consecutive rows.
train_data, test_data = dict(), dict()
train_labels, test_labels = dict(), dict()
lstm_train_data, lstm_train_labels = dict(), dict()
lstm_test_data, lstm_test_labels = dict(), dict()
for index, data_type in enumerate(['browse','chat','mail','p2p']):
    samples = scaled_data[data_type]
    # Ordered 70/30 split: the first 70% of the rows train, the rest test.
    cut = int(0.7 * len(samples))
    train, test = samples[:cut], samples[cut:]
    one_hot = np.eye(4)[index]  # one-hot label of this class

    lstm_train_data[data_type] = _to_sequences(train)
    lstm_train_labels[data_type] = np.tile(one_hot, (train.shape[0]//5, 1))
    # The test windows and labels must be built from the *test* rows; they
    # were previously built from the training rows, which made the LSTM
    # evaluation run on the data the model had been trained on.
    lstm_test_data[data_type] = _to_sequences(test)
    lstm_test_labels[data_type] = np.tile(one_hot, (test.shape[0]//5, 1))
    train_data[data_type] = train
    test_data[data_type] = test
    train_labels[data_type] = np.tile(one_hot, (train.shape[0], 1))
    test_labels[data_type] = np.tile(one_hot, (test.shape[0], 1))

# The scaled per-class arrays are not needed once the splits exist.
del scaled_data

# Persist every per-class dictionary as '<name>.pkl'.
for _stem, _per_class in (
    ('train_data', train_data),
    ('train_labels', train_labels),
    ('test_data', test_data),
    ('test_labels', test_labels),
    ('lstm_train_data', lstm_train_data),
    ('lstm_train_labels', lstm_train_labels),
    ('lstm_test_data', lstm_test_data),
    ('lstm_test_labels', lstm_test_labels),
):
    save_object(_per_class, _stem + '.pkl')

_class_order = ['browse','chat','mail','p2p']


def _stack_classes(per_class):
    """Concatenate the per-class arrays of ``per_class`` in fixed class order."""
    return np.concatenate([per_class[name] for name in _class_order])


# From here on each name refers to one array covering all classes instead of
# a dictionary keyed by class.
train_data = _stack_classes(train_data)
train_labels = _stack_classes(train_labels)
test_data = _stack_classes(test_data)
test_labels = _stack_classes(test_labels)
lstm_train_data = _stack_classes(lstm_train_data)
lstm_train_labels = _stack_classes(lstm_train_labels)
lstm_test_data = _stack_classes(lstm_test_data)
lstm_test_labels = _stack_classes(lstm_test_labels)


## Custom layer: used to map a function over a layer

@tf.keras.utils.register_keras_serializable()
class MapLayer(k.layers.Layer):
    """Keras layer that applies a callable to its input via ``tf.map_fn``.

    NOTE(review): ``get_config`` stores a serialized form of ``f`` but no
    ``from_config`` is defined here -- confirm that reloading a saved model
    restores ``f`` as a callable.
    """

    def __init__(self, f, **kwargs):
        """Store the callable ``f``; other keyword arguments go to ``Layer``."""
        super().__init__(**kwargs)
        self.f = f

    def call(self, inputs):
        """Return ``tf.map_fn(self.f, inputs)``."""
        mapped = tf.map_fn(self.f, inputs)
        return mapped

    def get_config(self):
        """Return the base layer config extended with the serialized ``f``."""
        base_config = super().get_config()
        return {**base_config, 'f': tf.keras.utils.serialize_keras_object(self.f)}

## Small helper function used to compose layers

def compose(layers, input_layer):
    """Apply each callable in ``layers`` in order, starting from ``input_layer``.

    Returns the output of the last callable, or ``input_layer`` itself when
    ``layers`` is empty.
    """
    result = input_layer
    for apply_layer in layers:
        result = apply_layer(result)
    return result


In [None]:
## CNN SAE

# Activation and activity regularizer shared by the models in this notebook.
activation = k.activations.relu
regularizer = k.regularizers.L1(l1=0.01)

# Autoencoder branch: one sample is a vector of 6 features, viewed as a
# 3x2 single-channel grid so that 2-D convolutions can be applied to it.
# `shape` is given as a tuple: Keras documents it as a shape tuple, and newer
# Keras releases reject a bare integer.
cnn_input = k.Input(shape=(6,))
cnn_reshape = k.layers.Reshape((3, 2, 1))(cnn_input)
cnn_conv_e_1 = k.layers.Conv2D(4, kernel_size=(2,1), activation=activation, padding='same')(cnn_reshape)
# Encoder output (the code); the L1 activity regularizer is applied here.
cnn_conv_e_2 = k.layers.Conv2D(13, kernel_size=(2,1), activation=activation, activity_regularizer=regularizer, padding='same')(cnn_conv_e_1)
cnn_conv_d_2 = k.layers.Conv2D(13, kernel_size=(2,1), activation=activation, padding='same')(cnn_conv_e_2)
cnn_conv_d_1 = k.layers.Conv2D(4, kernel_size=(2,1), activation=activation, padding='same')(cnn_conv_d_2)
# NOTE(review): the reconstruction goes through a sigmoid while the model is
# fit against standardized features -- confirm the value ranges are compatible.
cnn_output = k.layers.Dense(1, activation='sigmoid')(cnn_conv_d_1)
cnn_flatten = k.layers.Flatten()(cnn_output)

# Classification head attached to the encoder output.
cnn_sae_flatten = k.layers.Flatten()(cnn_conv_e_2)
cnn_sae_fcnn_1 = k.layers.Dense(64, activation=activation)(cnn_sae_flatten)
cnn_sae_dropout_1 = k.layers.Dropout(rate=0.5)(cnn_sae_fcnn_1)
cnn_sae_fcnn_2 = k.layers.Dense(24, activation=activation)(cnn_sae_dropout_1)
cnn_sae_dropout_2 = k.layers.Dropout(rate=0.5)(cnn_sae_fcnn_2)
cnn_sae_output = k.layers.Dense(4, activation='softmax')(cnn_sae_dropout_2)

# Autoencoder model: input -> flattened reconstruction, trained with MSE.
cnn_sae = k.Model(inputs=cnn_input, outputs=cnn_flatten, name='cnn_sae')
cnn_sae.compile(loss='mse', optimizer='adam')

# Stand-alone encoder, and a decoder rebuilt by re-applying the last four
# layers of `cnn_sae` to a fresh input with the encoder's output shape.
# Presumably those four layers are the two decoder convolutions, the Dense
# output and the Flatten -- TODO confirm the layer ordering.
cnn_sae_encoder = k.Model(inputs=cnn_input, outputs=cnn_conv_e_2)
cnn_sae_encoded_input = k.Input(shape=(3, 2, 13))
_cnn_sae_decoder_layers = compose(cnn_sae.layers[-4:], cnn_sae_encoded_input)
cnn_sae_decoder = k.Model(cnn_sae_encoded_input, _cnn_sae_decoder_layers)


In [None]:
# Pre-train the autoencoder to reconstruct its own input. A checkpoint is
# written after every epoch and training stops early based on the training loss.
cnn_sae_callbacks = [
    k.callbacks.ModelCheckpoint(filepath='cnn_sae.{epoch:02d}.keras'),
    k.callbacks.EarlyStopping(monitor='loss'),
]
cnn_sae.fit(train_data, train_data, epochs=10, batch_size=32,
            callbacks=cnn_sae_callbacks)

# Mark every layer of the autoencoder as non-trainable for the models built
# afterwards.
for sae_layer in cnn_sae.layers:
    sae_layer.trainable = False

In [None]:
# Classifier built on the CNN encoder: same input as the autoencoder, output
# taken from the softmax head. The definition and training were commented out
# while the evaluation below was not, so the cell raised NameError on a fresh
# kernel; they are restored so the notebook runs top to bottom.
# The layers shared with `cnn_sae` were marked non-trainable in the previous
# cell.
cnn_sae_full = k.Model(inputs=cnn_input, outputs=cnn_sae_output, name='cnn_sae_full')
cnn_sae_full.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

cnn_sae_full.fit(train_data, train_labels, epochs=100, batch_size=32,
                 callbacks=[k.callbacks.ModelCheckpoint(filepath='cnn_sae_full.{epoch:02d}.keras'),
                            k.callbacks.EarlyStopping(monitor='loss')])

# Evaluate on the held-out rows and report the second reported value (the
# first compiled metric after the loss) as a percentage.
cnn_sae_full_scores = cnn_sae_full.evaluate(test_data, test_labels)
print("\n%s: %.2f%%" % (cnn_sae_full.metrics_names[1], cnn_sae_full_scores[1]*100))

In [None]:
## LSTM SAE

# Input: sequences of 5 rows with 6 features each.
lstm_input = tf.keras.layers.Input(shape=(5, 6))
# Each row is reshaped to the 3x2x1 grid and passed through two layers taken
# from `cnn_sae` by position, applied to every row of the sequence.
# Presumably layers[2] and layers[3] are the two encoder convolutions --
# TODO confirm the indices against cnn_sae.summary().
lstm_cnn_encoded_input_0 = tf.keras.layers.TimeDistributed(tf.keras.layers.Reshape((3, 2, 1)))(lstm_input)
lstm_cnn_encoded_input_1 = tf.keras.layers.TimeDistributed(cnn_sae.layers[2])(lstm_cnn_encoded_input_0)
lstm_cnn_encoded_input_2 = tf.keras.layers.TimeDistributed(cnn_sae.layers[3])(lstm_cnn_encoded_input_1)
# Flatten the per-row encoding to 78 values (assumes a 3x2x13 encoding per
# row, i.e. 3*2*13 = 78 -- TODO confirm).
lstm_cnn_encoded_input_3 = tf.keras.layers.Reshape((5, 78))(lstm_cnn_encoded_input_2)
# LSTM encoder: the second LSTM returns only its final output and carries the
# shared activity regularizer.
lstm_e_1 = tf.keras.layers.LSTM(16, return_sequences=True)(lstm_cnn_encoded_input_3)
lstm_e_2 = tf.keras.layers.LSTM(136, return_sequences=False, activity_regularizer=regularizer)(lstm_e_1)
# LSTM decoder: repeat the code 5 times and decode back to 5 rows of 6 values.
lstm_d_repeat = tf.keras.layers.RepeatVector(5)(lstm_e_2)
lstm_d_1 = tf.keras.layers.LSTM(136, return_sequences=True)(lstm_d_repeat)
lstm_d_2 = tf.keras.layers.LSTM(16, return_sequences=True)(lstm_d_1)
lstm_output = k.layers.Dense(6, activation='sigmoid')(lstm_d_2)
# Classification head on the LSTM code; in the visible cells it is only
# referenced by the commented-out `lstm_sae_full` model.
lstm_sae_fcnn_1 = tf.keras.layers.Dense(64, activation=activation)(lstm_e_2)
lstm_sae_fcnn_2 = tf.keras.layers.Dense(24, activation=activation)(lstm_sae_fcnn_1)
lstm_sae_output = tf.keras.layers.Dense(4, activation='softmax')(lstm_sae_fcnn_2)

# Sequence autoencoder (trained with MSE) and a model exposing its code.
lstm_sae = tf.keras.Model(inputs=lstm_input, outputs=lstm_output, name='lstm_sae')
lstm_sae.compile(loss='mse', optimizer='adam')
lstm_sae_encoder = k.Model(lstm_input, lstm_e_2)

# lstm_sae_full = k.Model(inputs=lstm_input, outputs=lstm_sae_output, name='lstm_sae_full')
# lstm_sae_full.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [None]:
# Pre-train the sequence autoencoder to reconstruct its own input. A
# checkpoint is written after every epoch and training stops early based on
# the training loss.
lstm_sae_callbacks = [
    k.callbacks.ModelCheckpoint(filepath='lstm_sae.{epoch:02d}.keras'),
    k.callbacks.EarlyStopping(monitor='loss'),
]
lstm_sae.fit(lstm_train_data, lstm_train_data, epochs=10, batch_size=32,
             callbacks=lstm_sae_callbacks)

# Mark every layer of the autoencoder as non-trainable for the models built
# afterwards.
for sae_layer in lstm_sae.layers:
    sae_layer.trainable = False


In [None]:
# lstm_sae_full.fit(lstm_train_data, lstm_train_labels, epochs=100, batch_size=2048,
#                   callbacks=[k.callbacks.ModelCheckpoint(filepath='lstm_sae_full.{epoch:02d}.keras'),
#                              k.callbacks.EarlyStopping(monitor='loss')])
# lstm_sae_full_scores = lstm_sae_full.evaluate(lstm_test_data, lstm_test_labels)
# print("\n%s: %.2f%%" % (lstm_sae_full.metrics_names[1], lstm_sae_full_scores[1]*100))

In [None]:
# Stacked classifier: layers taken from `cnn_sae` and `lstm_sae` followed by a
# new dense classification head.
stacked_cnn_lstm_sae_input = tf.keras.layers.Input(shape=(5, 6))
# Per-row reshape and two `cnn_sae` layers selected by position (presumably
# the encoder convolutions -- TODO confirm the indices).
stacked_cnn_lstm_sae_encoded_input_0 = tf.keras.layers.TimeDistributed(tf.keras.layers.Reshape((3, 2, 1)))(stacked_cnn_lstm_sae_input)
stacked_cnn_lstm_sae_encoded_input_1 = tf.keras.layers.TimeDistributed(cnn_sae.layers[2])(stacked_cnn_lstm_sae_encoded_input_0)
stacked_cnn_lstm_sae_encoded_input_2 = tf.keras.layers.TimeDistributed(cnn_sae.layers[3])(stacked_cnn_lstm_sae_encoded_input_1)
stacked_cnn_lstm_sae_encoded_input_3 =  tf.keras.layers.Reshape((5, 78))(stacked_cnn_lstm_sae_encoded_input_2)
# Re-apply two layers of `lstm_sae` selected by position (presumably the two
# encoder LSTMs -- TODO confirm against lstm_sae.summary()).
stacked_cnn_lstm_sae_e2 = compose(lstm_sae.layers[5:7], stacked_cnn_lstm_sae_encoded_input_3)
# New dense head ending in a 4-way softmax.
stacked_cnn_lstm_sae_fcnn_1 = tf.keras.layers.Dense(64, activation=activation)(stacked_cnn_lstm_sae_e2)
stacked_cnn_lstm_sae_fcnn_2 = tf.keras.layers.Dense(24, activation=activation)(stacked_cnn_lstm_sae_fcnn_1)
stacked_cnn_lstm_sae_output = tf.keras.layers.Dense(4, activation='softmax')(stacked_cnn_lstm_sae_fcnn_2)
stacked_cnn_lstm_sae = tf.keras.Model(stacked_cnn_lstm_sae_input,
                                     stacked_cnn_lstm_sae_output,
                                     name='stacked_cnn_lstm_sae')
stacked_cnn_lstm_sae.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [None]:
# Train the stacked classifier on the windowed training data. A checkpoint is
# written after every epoch and training stops early based on the training loss.
stacked_callbacks = [
    k.callbacks.ModelCheckpoint(filepath='stacked_cnn_lstm_sae.{epoch:02d}.keras'),
    k.callbacks.EarlyStopping(monitor='loss'),
]
stacked_cnn_lstm_sae.fit(lstm_train_data, lstm_train_labels, epochs=100, batch_size=32,
                         callbacks=stacked_callbacks)

# Evaluate on the windowed test data and report the second reported value as
# a percentage.
stacked_cnn_lstm_sae_scores = stacked_cnn_lstm_sae.evaluate(lstm_test_data, lstm_test_labels)
stacked_metric_name = stacked_cnn_lstm_sae.metrics_names[1]
print("\n%s: %.2f%%" % (stacked_metric_name, stacked_cnn_lstm_sae_scores[1]*100))