# Import packages

In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten
from keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dense, Flatten


import sklearn
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
import random

# Utility functions

In [2]:
def uncompressArray(file_dir, confirm=True):
  """Load every array stored in a NumPy ``.npz`` archive.

  The archive's keys are printed first. When ``confirm`` is true (the
  default, matching the original behaviour) the user is asked to type
  ``'y'`` before anything is loaded; pass ``confirm=False`` to skip the
  prompt so the notebook can run unattended (e.g. "Restart & Run All").

  Parameters
  ----------
  file_dir : str or path-like
      Location of the ``.npz`` file to read.
  confirm : bool, optional
      Whether to ask for interactive confirmation before loading.

  Returns
  -------
  list
      The stored arrays in the archive's key order, or an empty list when
      the prompt is answered with anything other than ``'y'``.
  """
  uncompressed_data = []
  with open(file_dir, 'rb') as f:
    # The NpzFile wraps a zip handle of its own; the context manager
    # releases it as soon as the arrays have been read.
    with np.load(f) as loaded_file:
      ks = list(loaded_file.files)
      print("First, check the data!")
      print(f"Keys: {ks}")
      if confirm:
        ans = input("Please enter 'y' if you want to proceed: ")
      else:
        ans = 'y'
      if ans == 'y':
        print("\nloading data !")
        for k in ks:
          # Indexing an NpzFile already materialises a fresh in-memory
          # array, so no extra copy is required.
          uncompressed_data.append(loaded_file[k])
          print(f"load: {k}")
      else:
        print("data is not loaded!")
  return uncompressed_data

def reshape_data(X):
  """Return ``X`` reshaped with a trailing axis of length 1.

  The first three axis lengths of ``X`` are kept and a fourth axis of
  size one is appended, i.e. ``(a, b, c)`` becomes ``(a, b, c, 1)``.
  """
  n_samples = X.shape[0]
  n_rows = X.shape[1]
  n_cols = X.shape[2]
  return X.reshape(n_samples, n_rows, n_cols, 1)

# Data preprocessing

## Import data

In [3]:
# Location of the compressed .npz archive with the cleaned dataset
# (presumably a Colab Google Drive mount - adjust when running elsewhere).
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/clean_data.npz'

# uncompressArray asks for confirmation and returns the arrays in the archive's
# key order; this unpacking relies on that order being train images, test
# images, train labels, test labels. If the prompt is declined an empty list
# comes back and the unpacking below raises ValueError.
X_clean_train, X_clean_test, Y_clean_train, Y_clean_test = uncompressArray(data_file)

First, check the data!
Keys: ['X_clean_train', 'X_clean_test', 'Y_clean_train', 'Y_clean_test']
Please enter 'y' if you want to proceed: y

loading data !
load: X_clean_train
load: X_clean_test
load: Y_clean_train
load: Y_clean_test


In [5]:
# Sanity check: show the shape of every loaded array on a single line.
print(*(arr.shape for arr in (X_clean_train, X_clean_test, Y_clean_train, Y_clean_test)))

(401302, 28, 28, 1) (10000, 28, 28, 1) (401302,) (10000,)


## Split train and validation

In [6]:
# Hold out 20% of the clean training set for validation; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_clean_train,
    Y_clean_train,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, Y_train, X_valid, Y_valid))

((321041, 28, 28, 1), (321041,), (80261, 28, 28, 1), (80261,))

## Normalize data

In [7]:
# Keep channel 0 of every image and divide the pixel values by 255.
X_train = X_train[:, :, :, 0] / 255.0
X_valid = X_valid[:, :, :, 0] / 255.0
X_test = X_clean_test[:, :, :, 0] / 255.0

tuple(images.shape for images in (X_train, X_valid, X_test))

((321041, 28, 28), (80261, 28, 28), (10000, 28, 28))

## Reshape data

In [8]:
# Put the single-channel axis back so each split is (samples, rows, cols, 1).
X_train, X_valid, X_test = map(reshape_data, (X_train, X_valid, X_test))

tuple(images.shape for images in (X_train, X_valid, X_test))

((321041, 28, 28, 1), (80261, 28, 28, 1), (10000, 28, 28, 1))

## One hot encoding

In [9]:
# letters labels start from 11
# Without an explicit num_classes, to_categorical sizes each matrix from the
# largest label present in *that* array, so a split that happens to miss the
# highest label would come out narrower than the others. Derive one shared
# width from the raw (not yet encoded) labels and use it for all three splits.
n_label_classes = int(max(Y_clean_train.max(), Y_clean_test.max())) + 1

Y_train = to_categorical(Y_train, num_classes=n_label_classes, dtype="uint8")
Y_valid = to_categorical(Y_valid, num_classes=n_label_classes, dtype="uint8")
Y_test = to_categorical(Y_clean_test, num_classes=n_label_classes, dtype="uint8")
Y_train.shape, Y_valid.shape, Y_test.shape

((321041, 37), (80261, 37), (10000, 37))

# Basic Model

In [18]:
# def classificationModel(input_shape, num_classes):
#   model = Sequential()

  # model.add(Conv2D(32, (3,3), padding='same', activation="relu", input_shape=input_shape))
  # model.add(Conv2D(32, (3,3), padding='same', activation="relu"))
  # model.add(MaxPooling2D(pool_size=(2,2)))
  # model.add(Dropout(0.25))
  
  # model.add(Conv2D(64, (3,3), padding='same', activation="relu"))
  # model.add(Conv2D(64, (3,3), padding='same', activation="relu"))
  # model.add(MaxPooling2D(pool_size=(2,2)))
  # model.add(Dropout(0.25))
  
  # model.add(Flatten())
  # model.add(Dense(512))
  # model.add(Activation('relu'))
  # model.add(Dropout(0.5))
  # model.add(Dense(num_classes))
  # model.add(Activation('softmax'))

#################################################################################################################
  # model.add(Conv2D(input_shape=input_shape, filters=64, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=64, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
  # model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
  # model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
  # model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))

  # model.add(Flatten())
  # model.add(Dense(units=4096,activation="relu"))
  # model.add(Dense(units=4096,activation="relu"))
  # model.add(Dense(units=num_classes, activation="softmax"))

  # return model

In [34]:
# https://github.com/kkweon/mnist-competition/blob/master/vgg16.py

def vgg(input_shape, no_classes):
    """Build a VGG-style ``Sequential`` classifier with batch normalisation.

    Four convolutional blocks are stacked: two blocks of two convolutions
    (64 and 128 filters) followed by two blocks of three convolutions
    (256 and 512 filters). Every convolution is a 3x3, 'same'-padded
    ``Conv2D`` without built-in activation, followed by
    ``BatchNormalization`` and a ReLU ``Activation``; every block ends with
    a 2x2 max-pool of stride 2. The head is ``Flatten`` -> ``Dense(512,
    relu)`` -> ``Dense(no_classes, softmax)``.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, handed directly to ``Input``.
    no_classes : int
        Number of units in the final softmax layer.

    Returns
    -------
    net : Sequential
        The assembled model; it is not compiled here.
    """
    def add_conv_block(model, filters, name):
        """Append one conv/BN/ReLU stack plus max-pool to ``model`` and return it."""
        for position, n_filters in enumerate(filters, start=1):
            model.add(Conv2D(n_filters, (3, 3), activation=None, padding='same',
                             name='{}_conv{}'.format(name, position)))
            model.add(BatchNormalization())
            model.add(Activation('relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2), name='{}_pool'.format(name)))
        return model

    # (block name, filters of each convolution in the block), in build order.
    block_specs = (
        ("block1", (64, 64)),
        ("block2", (128, 128)),
        ("block3", (256, 256, 256)),
        ("block4", (512, 512, 512)),
    )

    net = Sequential()
    net.add(Input(input_shape))

    for block_name, block_filters in block_specs:
        net = add_conv_block(net, block_filters, block_name)

    # Classification head.
    net.add(Flatten())
    net.add(Dense(512, activation='relu', name='fc'))
    net.add(Dense(no_classes, activation='softmax', name='predictions'))

    return net

# Model training

In [36]:
# Model configuration
# NOTE(review): no random seed is set in this cell, so weight initialisation
# and batch shuffling are presumably not repeatable between runs - confirm.
batch_size = 32
# Unpacks axes 1, 2, 3 of X_train in that order. NOTE(review): for arrays laid
# out as (samples, rows, cols, channels) axis 1 is conventionally the height;
# the naming is harmless here because the images are 28x28.
img_width, img_height, img_num_channels = X_train.shape[1:]
loss_function = 'categorical_crossentropy'
no_epochs = 10
optimizer = 'adam'
verbosity = 1
# Not referenced anywhere else in the visible part of the notebook.
num_folds = 5
# Width of the one-hot label matrix, i.e. the size of the softmax layer.
no_classes = Y_train.shape[1]

# Determine shape of the data
input_shape = (img_width, img_height, img_num_channels)

# Create model
model = vgg(input_shape, no_classes)

# Compile model
# Tracks accuracy plus an AUC metric configured with multi_label=True.
model.compile(loss=loss_function,
              optimizer=optimizer,
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc', multi_label=True)])

# Fit model
# The validation split is only monitored during training; the test split is
# evaluated separately below.
history = model.fit(X_train, Y_train, ###Moda:XXX
                    batch_size=batch_size,
                    epochs=no_epochs,
                    verbose=verbosity,
                    validation_data=(X_valid, Y_valid))

# Evaluate on the test split; per Keras convention the result should be the
# loss followed by the compiled metrics in order (accuracy, auc).
results = model.evaluate(X_test, Y_test, verbose=0)

# Persist the trained model (the .h5 suffix selects the HDF5 format in Keras).
model.save('/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/saved_models/vgg16_classifier.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# Test accuracy as a percentage: index 0 of `results` is the loss and index 1
# is the first compiled metric ('accuracy').
results[1]*100

93.94999742507935