# Import packages

In [None]:
import os
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, load_model
# from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten
from keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dense, Flatten, Dropout

# Utility functions

In [None]:
def uncompressArray(file_dir, confirm=True):
  """Load every array stored in a NumPy ``.npz`` archive.

  The archive keys are printed first so the caller can check them. By default
  the user is then asked to confirm interactively before anything is loaded.

  Args:
    file_dir: Path of the ``.npz`` archive to read.
    confirm: When True (default) prompt with ``input()`` and load only if the
      answer is exactly ``'y'``. Pass False to skip the prompt, e.g. for an
      unattended "Restart & Run All".

  Returns:
    A list with one copied array per archive key, in archive key order.
    The list is empty when the user does not answer ``'y'``.
  """
  uncompressed_data = []
  # NpzFile is a context manager: leaving the block closes the archive and
  # the file handle it owns.
  with np.load(file_dir) as loaded_file:
    ks = list(loaded_file.keys())
    print("First, check the data!")
    print(f"Keys: {ks}")
    if confirm:
      ans = input("Please enter 'y' if you want to proceed: ")
    else:
      ans = 'y'
    if ans == 'y':
      print("\nloading data !")
      for k in ks:
        # Copy so the returned arrays stay valid after the archive is closed.
        uncompressed_data.append(loaded_file[k].copy())
        print(f"load: {k}")
    else:
      print("data is not loaded!")
  return uncompressed_data

def reshape_data(X):
  """Reshape X to (dim0, dim1, dim2, 1), adding a trailing axis of length 1.

  The first three dimensions are read from ``X.shape``; the element count of
  X must match their product.
  """
  n_samples, n_rows, n_cols = X.shape[0], X.shape[1], X.shape[2]
  return X.reshape((n_samples, n_rows, n_cols, 1))

# Data preprocessing

## Import data

In [None]:
# Path of the compressed .npz archive that holds the Arabic dataset splits.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/arabic_data.npz'

# uncompressArray returns the arrays in archive key order, so this unpacking
# relies on the keys being stored as X_train, X_test, Y_train, Y_test.
# If the prompt is not answered with 'y' the returned list is empty and the
# unpacking raises ValueError.
X_clean_train, X_clean_test, Y_clean_train, Y_clean_test = uncompressArray(data_file)

First, check the data!
Keys: ['X_train_arab', 'X_test_arab', 'Y_train_arab', 'Y_test_arab']
Please enter 'y' if you want to proceed: y

loading data !
load: X_train_arab
load: X_test_arab
load: Y_train_arab
load: Y_test_arab


In [None]:
# Sanity check: shapes of the four arrays loaded from the archive.
print(X_clean_train.shape, X_clean_test.shape, Y_clean_train.shape, Y_clean_test.shape)

(40320, 28, 28, 1) (10080, 28, 28, 1) (40320,) (10080,)


## Split train and validation

In [None]:
# Hold out 20% of the training arrays as a validation set; random_state is
# fixed so the same split is produced on every run.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_clean_train, Y_clean_train, test_size=0.2, random_state=42)
# Display the resulting shapes as the cell output.
X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape

((32256, 28, 28, 1), (32256,), (8064, 28, 28, 1), (8064,))

In [None]:
# Work on a copy so that X_clean_test itself is never modified.
X_test = X_clean_test.copy()

## Normalize data

In [None]:
# NOTE(review): normalization is currently disabled - every line below is
# commented out, so this notebook passes X_train / X_valid / X_test on unscaled.
# Presumably the .npz archive already stores scaled single-channel images;
# confirm before relying on it.

# # extract only one channel
# X_train = X_train[:,:,:,0]/255.
# X_valid = X_valid[:,:,:,0]/255.
# X_test = X_clean_test[:,:,:,0]/255.

# X_train.shape, X_valid.shape, X_test.shape

## Reshape data

In [None]:
# NOTE(review): reshaping is disabled. It is only needed together with the
# (also disabled) single-channel extraction, which drops the channel axis;
# the arrays loaded from the archive are already 4-D with a trailing axis of 1.

# X_train = reshape_data(X_train)
# X_valid = reshape_data(X_valid)
# X_test = reshape_data(X_test)

# X_train.shape, X_valid.shape, X_test.shape

## One hot encoding

In [None]:
# Labels are integer class ids (the recorded output has 28 one-hot columns,
# so presumably ids 0..27 - confirm against the dataset).
# The one-hot width is fixed once from the full label set so that the train,
# validation and test targets always have the same number of columns, even if
# one split happens to miss the highest class id.
num_label_classes = int(max(np.max(Y_clean_train), np.max(Y_clean_test))) + 1

Y_train = to_categorical(Y_train, num_classes=num_label_classes, dtype ="uint8")
Y_valid = to_categorical(Y_valid, num_classes=num_label_classes, dtype ="uint8")
Y_test = to_categorical(Y_clean_test, num_classes=num_label_classes, dtype ="uint8")
Y_train.shape, Y_valid.shape, Y_test.shape

((32256, 28), (8064, 28), (10080, 28))

# Basic Model

In [None]:
# https://analyticsindiamag.com/hands-on-guide-to-implementing-alexnet-with-keras-for-multi-class-image-classification/

def AlexNet(input_shape=(28,28,1), no_classes=10):
  """Build an (uncompiled) AlexNet-style Sequential classifier.

  Layout, in order:
    * five Conv2D -> BatchNormalization -> ReLU stages, with 2x2 max pooling
      after stages 1, 2 and 5;
    * Flatten;
    * three Dense -> BatchNormalization -> ReLU -> Dropout(0.4) stages with
      4096, 4096 and 1000 units;
    * Dense(no_classes) -> BatchNormalization -> softmax output.

  Args:
    input_shape: Shape of one input sample, passed to the first Conv2D layer.
    no_classes: Number of units in the output layer.

  Returns:
    The constructed model. Its summary is printed as a side effect.
  """
  model = Sequential()

  # One row per convolutional stage:
  # (filters, kernel_size, strides, followed_by_max_pool)
  conv_stages = [
    (96,  (11, 11), (4, 4), True),
    (256, (5, 5),   (1, 1), True),
    (384, (3, 3),   (1, 1), False),
    (384, (3, 3),   (1, 1), False),
    (256, (3, 3),   (1, 1), True),
  ]
  for stage_idx, (filters, kernel_size, strides, with_pool) in enumerate(conv_stages):
    conv_kwargs = dict(filters=filters, kernel_size=kernel_size,
                       strides=strides, padding='same')
    if stage_idx == 0:
      # Only the first layer of a Sequential model declares the input shape.
      conv_kwargs['input_shape'] = input_shape
    model.add(Conv2D(**conv_kwargs))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    if with_pool:
      model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'))

  # Passing it to the fully connected layers
  model.add(Flatten())
  # The Dense layers infer their input size from the previous layer, so no
  # input_shape is given here.
  for units in (4096, 4096, 1000):
    model.add(Dense(units))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    # Dropout to reduce overfitting
    model.add(Dropout(0.4))

  # Output layer (batch-normalized logits followed by softmax)
  model.add(Dense(no_classes))
  model.add(BatchNormalization())
  model.add(Activation('softmax'))

  # Model summary
  model.summary()

  return model

# Model training

In [None]:
# Callback that stops training as soon as the accuracy measured on a
# held-out dataset exceeds a threshold (default: ACCURACY_THRESHOLD = 0.85).

ACCURACY_THRESHOLD = 0.85

class myCallback(tf.keras.callbacks.Callback):
  """Evaluate the model on a fixed dataset after every epoch and stop early.

  Args:
    test_data: Tuple ``(x, y)`` passed to ``model.evaluate`` after each epoch.
    threshold: Optional accuracy limit for this instance. When None (default)
      the module-level ACCURACY_THRESHOLD is read at the end of each epoch.
  """

  def __init__(self, test_data, threshold=None):
    # Let the Keras base class initialise its own attributes first.
    super().__init__()
    self.test_data = test_data
    self.threshold = threshold

  def on_epoch_end(self, epoch, logs=None):
    """Print the accuracy on ``test_data`` and request a stop above the limit."""
    x, y = self.test_data
    limit = ACCURACY_THRESHOLD if self.threshold is None else self.threshold
    # evaluate() returns [loss, metric_1, ...]; index 1 is taken as accuracy,
    # which assumes 'accuracy' is the first metric given to model.compile().
    acc = self.model.evaluate(x, y, verbose=0)[1]
    print('\nTesting acc: {}\n'.format(acc))
    if acc > limit:
      print("\nReached %2.2f%% accuracy, so stopping training!!" %(limit*100))
      self.model.stop_training = True
In [None]:
# Model configuration
batch_size = 32
# X_train.shape[1:] supplies the three per-sample dimensions.
# NOTE(review): the axes are presumably (rows, cols, channels); the
# width/height names only matter for non-square images - confirm.
img_width, img_height, img_num_channels = X_train.shape[1:]
loss_function = 'categorical_crossentropy'
no_epochs = 100
optimizer = 'adam'
verbosity = 1
# NOTE(review): num_folds is not used anywhere in this cell.
num_folds = 5
# Number of classes = width of the one-hot encoded targets.
no_classes = Y_train.shape[1]

# NOTE(review): the early-stopping callback is evaluated on the TEST set, so
# test data influences when training stops. Consider passing
# (X_valid, Y_valid) instead if the test metric must stay unbiased.
callbacks=myCallback((X_test, Y_test))

# Determine shape of the data
input_shape = (img_width, img_height, img_num_channels)

# Create model
model = AlexNet(input_shape, no_classes)

# Compile model ('accuracy' is listed first, so it is index 1 of evaluate()'s result)
model.compile(loss=loss_function,
              optimizer=optimizer,
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc', multi_label=True)])

# Fit model, monitoring the validation split after every epoch
history = model.fit(X_train, Y_train, ###Moda:XXX
                    batch_size=batch_size,
                    epochs=no_epochs,
                    verbose=verbosity,
                    validation_data=(X_valid, Y_valid),
                    callbacks=[callbacks])

# Final evaluation on the test set: [loss, accuracy, auc]
results = model.evaluate(X_test, Y_test, verbose=0)

# Persist the trained model in HDF5 format
model.save('/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/saved_models/alexNet_classifier.h5')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 7, 7, 96)          11712     
                                                                 
 batch_normalization_18 (Bat  (None, 7, 7, 96)         384       
 chNormalization)                                                
                                                                 
 activation_18 (Activation)  (None, 7, 7, 96)          0         
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 4, 4, 96)         0         
 2D)                                                             
                                                                 
 conv2d_11 (Conv2D)          (None, 4, 4, 256)         614656    
                                                                 
 batch_normalization_19 (Bat  (None, 4, 4, 256)       

In [None]:
# Test accuracy in percent (results[1] is the first compiled metric, 'accuracy').
results[1]*100

81.01190328598022

In [None]:
# Softmax outputs of the trained model for every test sample.
test_preds = model.predict(X_test)

In [None]:
# Expected: (number of test samples, number of classes).
test_preds.shape

(10080, 28)

In [None]:
# Store the test predictions in a compressed archive under the key 'arab_preds'.
# NOTE(review): this reuses the name data_file, which earlier held the path of
# the input dataset; re-running the loading cell afterwards would read this
# predictions file instead. A separate variable name would be safer.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/arabic_alex_preds.npz'
np.savez_compressed(data_file, arab_preds=test_preds)

# Balanced accuracy

In [None]:
# Reload the classifier from the .h5 file written by the training cell.
model = keras.models.load_model('/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/saved_models/alexNet_classifier.h5')

In [None]:
# Predict class probabilities for the test set with the reloaded model.
y_pred = model.predict(X_test)

In [None]:
# Balanced accuracy (mean per-class recall). Both the one-hot targets and the
# predicted probabilities are converted to class indices with argmax.
# The metric is imported explicitly rather than reached through
# `sklearn.metrics`, which is only available when another import happens to
# have loaded that submodule.
balanced_accuracy_score(np.argmax(Y_test, axis=-1), np.argmax(y_pred, axis=-1))

0.8101190476190476