In [1]:
#import
import os
import sys
import numpy as np
import tensorflow as tf
import sklearn
import glob

import matplotlib.pyplot as plt
from PIL import Image
from skimage.transform import resize

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam, RMSprop

from sklearn.metrics import *
from sklearn.preprocessing import OneHotEncoder

from stego_classifier_extended import *

In [2]:
#Utils

#convert to binary
def messageToBinary(message):
    """Render the input as zero-padded 8-bit binary text.

    - str: every character's code point is formatted and the pieces are
      joined into a single string.
    - bytes / numpy array: a list with one binary string per element.
    - int / numpy uint8: a single binary string.

    Dispatch is on the exact type (subclasses are not accepted).

    Raises:
        TypeError: for any other input type.
    """
    kind = type(message)
    if kind == str:
        pieces = [format(ord(char), "08b") for char in message]
        return ''.join(pieces)
    if kind in (bytes, np.ndarray):
        return [format(item, "08b") for item in message]
    if kind in (int, np.uint8):
        return format(message, "08b")
    raise TypeError("Input type not supported")
  
#extract k lsb for each channel
def extract_k_lsb_features(data, k=4):
    """Flatten the k lowest bits of every RGB channel into one row per image.

    Pixels are visited in row-major order. For each pixel the output holds
    k consecutive (R, G, B) bit triplets: bit 0 (the least significant bit)
    of the three channels first, then bit 1, and so on up to bit k-1.

    The extraction is vectorized with NumPy instead of formatting every
    channel value as a binary string in nested Python loops.

    Args:
        data: batch of equally sized RGB images, array-like of shape
            (n_images, height, width, 3) with an integer dtype.
        k: number of low-order bits kept per channel, between 0 and 8.

    Returns:
        Integer array of shape (n_images, height * width * k * 3) containing
        0/1 values. An empty batch yields an empty 1-D array.

    Raises:
        ValueError: if k is outside 0..8 or data is not a batch of
            3-channel images.
        TypeError: if the pixel values are not integers.
    """
    if not 0 <= k <= 8:
        raise ValueError("k must be between 0 and 8, got %r" % (k,))

    pixels = np.asarray(data)
    if pixels.size == 0:
        # Nothing to extract; keep returning an empty array for an empty batch.
        return np.array([])
    if pixels.ndim != 4 or pixels.shape[-1] != 3:
        raise ValueError(
            "expected data of shape (n_images, height, width, 3), got %r" % (pixels.shape,)
        )
    if not np.issubdtype(pixels.dtype, np.integer):
        raise TypeError("Input type not supported")

    # Shape (k, 1): broadcasts against (..., 1, 3) to give (..., k, 3), i.e.
    # for every pixel the bit index varies slower than the channel index.
    shifts = np.arange(k, dtype=np.uint8)[:, np.newaxis]
    bits = (pixels[..., np.newaxis, :] >> shifts) & 1

    n_images = pixels.shape[0]
    # Explicit column count (instead of -1) so that k == 0 still reshapes.
    return bits.reshape(n_images, bits.size // n_images).astype(int)

# load images in the image_path
def load_images(image_path):
    """Load every ``*.png`` file directly inside ``image_path`` as an RGB array.

    Files are processed in sorted filename order so the result is
    deterministic across runs.

    Args:
        image_path: directory containing the PNG files.

    Returns:
        List with one numpy array per image, converted to RGB. The list is
        empty when the directory contains no PNG files.
    """
    images = []
    for f_name in sorted(glob.glob(image_path + '/*.png')):
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been copied, instead of leaving it to the GC.
        with Image.open(f_name) as img:
            images.append(np.asarray(img.convert('RGB')))
    return images

#convert a sequence of images into a single numpy array
def convert_np_array(vector):
    """Materialise the items of ``vector`` and stack them into a numpy array."""
    return np.array(list(vector))

#load data
def load_data(main_data_folder, usage_folder_name, legit_folder_name, stego_folder_names):
    """Load one dataset split: the legit images followed by each stego folder.

    Args:
        main_data_folder: root directory of the dataset.
        usage_folder_name: sub-folder of the split to load.
        legit_folder_name: folder (inside the split) with the legit images.
        stego_folder_names: folders (inside the split) with stego images,
            one per stego class; their order is preserved.

    Returns:
        Tuple ``(images, num_legit_images, stego_counts)``: all images
        concatenated (legit first, then every stego folder in the given
        order), the number of legit images, and a numpy array with the
        number of images found in each stego folder.
    """
    split_dir = os.path.join(main_data_folder, usage_folder_name)

    def _load_folder(folder_name):
        # One stacked array per folder.
        return convert_np_array(load_images(os.path.join(split_dir, folder_name)))

    legit_images = _load_folder(legit_folder_name)
    stego_batches = [_load_folder(folder_name) for folder_name in stego_folder_names]
    stego_counts = [batch.shape[0] for batch in stego_batches]

    num_legit_images = legit_images.shape[0]
    print("#legit images", num_legit_images)
    print("#stego images", stego_counts)

    all_images = np.concatenate([legit_images] + stego_batches)
    print("data shape: ", all_images.shape)

    print("done")

    return all_images, num_legit_images, np.array(stego_counts)

#create target variable with labels
def create_target_labels(legits, stego_type_number):
    """Build the integer class vector matching the order used by the loader.

    The first ``legits`` entries are labelled 0; the n-th stego class
    (1-based) contributes ``stego_type_number[n-1]`` entries labelled n.

    Returns:
        1-D int8 numpy array with all labels concatenated.
    """
    label_blocks = [np.zeros(legits, dtype=np.int8)]
    label_blocks.extend(
        np.ones(count, dtype=np.int8) + offset
        for offset, count in enumerate(stego_type_number)
    )
    return np.concatenate(label_blocks)

In [1]:
#Parameters ---------------------------

# Root of the dataset; the cells below load the "training", "validation",
# "test" and "test_unseen" sub-folders from it via load_data().
data_folder = "/ImageDirectory/dataset"
# Folder (inside each split) with the legit images -> class 0.
legit_folder = 'legit'
# Stego folders (inside each split); create_target_labels() assigns
# labels 1, 2, ... following this order.
stego_folders = ['LSB_stego_php','LSB_stego_url']
# cwd and model_space are joined to build the checkpoint path.
cwd = './'
model_space = 'output/models-separate'
# NOTE(review): seed is not referenced by any visible cell, so no random
# generator is actually seeded here - confirm whether it is used elsewhere.
seed = 230782
# Number of low-order bits per channel passed to extract_k_lsb_features().
k_lsb = 3
#------------------------------------

## Stego Malware Classification task

In [4]:
#loading training set
# load_data returns the images (legit first, then each stego folder in
# order), the number of legit images and the per-folder stego counts.
training_set, num_training_legit, num_training_stego_for_class  = load_data(data_folder, "training", legit_folder, stego_folders)
num_training_stego = np.sum(num_training_stego_for_class)
# one class per stego folder plus the legit class
num_targets = len(num_training_stego_for_class)+1
print("#training shape, neg, pos: ", training_set.shape, num_training_legit, num_training_stego)
print("#num classes: ", num_targets)

#lsb extraction
# training_set is rebound from the raw images to the extracted bit features
training_set = extract_k_lsb_features(training_set, k_lsb)

#create target
# labels are generated in the same order in which load_data concatenated the images
y_train = create_target_labels(num_training_legit, num_training_stego_for_class) 
print("#num examples for class:")
(unique, counts) = np.unique(y_train, return_counts=True)
# two-column table: class label, number of examples
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("Apply OHE")
# the encoder is fitted here on the training labels and reused (transform only)
# by the validation and test cells
ohe_processer = OneHotEncoder(handle_unknown='ignore')
ohe_y_train = ohe_processer.fit_transform(np.reshape(y_train,(-1,1))).toarray()
print("training set ready")

#legit images 29999
#stego images [29999, 119996]
data shape:  (179994, 32, 32, 3)
done
#training shape, neg, pos:  (179994, 32, 32, 3) 29999 149995
#num classes:  3
#num examples for class:
[[     0  29999]
 [     1  29999]
 [     2 119996]]
Apply OHE
training set ready


In [5]:
#loading validation set
# same pipeline as the training cell, applied to the "validation" split
validation, num_val_legit, num_val_stego_for_class  = load_data(data_folder, "validation", legit_folder, stego_folders)
num_val_stego = np.sum(num_val_stego_for_class)
print("#validation shape, neg, pos: ", validation.shape, num_val_legit, num_val_stego)

#lsb extraction
# validation is rebound from the raw images to the extracted bit features
validation = extract_k_lsb_features(validation, k_lsb)

#create target
y_val = create_target_labels(num_val_legit, num_val_stego_for_class)
print("#num examples for class:")
(unique, counts) = np.unique(y_val, return_counts=True)
# two-column table: class label, number of examples
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("Apply OHE")
# reuses the encoder fitted on the training labels (requires the training cell to have run)
ohe_y_val = ohe_processer.transform(np.reshape(y_val,(-1,1))).toarray()
print("validation set ready")

#legit images 15000
#stego images [15000, 60000]
data shape:  (90000, 32, 32, 3)
done
#validation shape, neg, pos:  (90000, 32, 32, 3) 15000 75000
#num examples for class:
[[    0 15000]
 [    1 15000]
 [    2 60000]]
Apply OHE
validation set ready


In [6]:
#init classifier
# arguments: number of features per example (columns of the extracted LSB
# matrix) and number of classes.
# NOTE(review): StegoClassifierExtended is presumably provided by the wildcard
# import of stego_classifier_extended - its architecture is not visible here.
classifier = StegoClassifierExtended(training_set.shape[1], num_targets)

#compile classifier
# categorical cross-entropy matches the one-hot encoded targets built above
classifier.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=1e-3), metrics=['accuracy'])

#init callbacks
model_path = os.path.join(cwd, model_space, 'best_model_stego_classification')
print("best model path: ", model_path)
# keep only the weights of the epoch with the lowest validation loss
check = ModelCheckpoint(model_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')

best model path:  ./output/models-separate/best_model_stego_classification


2022-02-23 22:34:49.745702: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:

#training model
# 20 epochs with batches of 256; the validation split is evaluated after each
# epoch and drives the checkpoint callback defined above
classifier.fit(training_set, ohe_y_train, batch_size=256, epochs=20, validation_data=(validation, ohe_y_val), callbacks=[check], verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x161410f10>

In [8]:
#reload best model
# restore the weights written by the checkpoint callback (best val_loss),
# replacing the weights left by the last training epoch
classifier.load_weights(model_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x16146d050>

In [9]:
#loading test set
# same pipeline as the training cell, applied to the "test" split
test, num_test_legit, num_test_stego_for_class  = load_data(data_folder, "test", legit_folder, stego_folders)
num_test_stego = np.sum(num_test_stego_for_class)
print("#test shape, neg, pos: ", test.shape, num_test_legit, num_test_stego)

#lsb extraction
# test is rebound from the raw images to the extracted bit features
test = extract_k_lsb_features(test, k_lsb)

#create target
y_test = create_target_labels(num_test_legit, num_test_stego_for_class)
print("#num examples for class:")
(unique, counts) = np.unique(y_test, return_counts=True)
# two-column table: class label, number of examples
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("Apply OHE")
# reuses the encoder fitted on the training labels (requires the training cell to have run)
ohe_y_test = ohe_processer.transform(np.reshape(y_test,(-1,1))).toarray()
print("test set ready")

#legit images 15001
#stego images [15001, 60004]
data shape:  (90006, 32, 32, 3)
done
#test shape, neg, pos:  (90006, 32, 32, 3) 15001 75005
#num examples for class:
[[    0 15001]
 [    1 15001]
 [    2 60004]]
Apply OHE
test set ready


In [10]:
# Evaluate the classifier on the test split.
y_pred_score = classifier.predict(test)
# Rounded scores; not used by the metrics below.
y_pred_label = np.around(y_pred_score, 0)
# Predicted class = index of the highest score.
y_pred_class = np.argmax(y_pred_score, axis=1)

report_map = classification_report(y_test, y_pred_class, output_dict=True)
macro_avg = report_map['macro avg']

acc_score = accuracy_score(y_test, y_pred_class)
# The ranking metrics work on the one-hot targets and the raw scores.
auc_score = roc_auc_score(ohe_y_test, y_pred_score, multi_class="ovr", average="macro")
auc_score_pr = average_precision_score(ohe_y_test, y_pred_score)

metric_values = [
    acc_score,
    macro_avg['precision'],
    macro_avg['recall'],
    macro_avg['f1-score'],
    auc_score,
    auc_score_pr,
]
result = ";".join(str(value) for value in metric_values)

print("acc;prec;rec;f1;auc;auc-pr")
print(result)

acc;prec;rec;f1;auc;auc-pr
1.0;1.0;1.0;1.0;1.0;1.0


In [11]:
#loading test set unseen
# same pipeline as the training cell, applied to the "test_unseen" split
test_unseen, num_test_unseen_legit, num_test_unseen_stego_for_class  = load_data(data_folder, "test_unseen", legit_folder, stego_folders)
num_test_unseen_stego = np.sum(num_test_unseen_stego_for_class)
print("#test unseen shape, neg, pos: ", test_unseen.shape, num_test_unseen_legit, num_test_unseen_stego)

#lsb extraction
# test_unseen is rebound from the raw images to the extracted bit features
test_unseen = extract_k_lsb_features(test_unseen, k_lsb)

#create target
y_test_unseen = create_target_labels(num_test_unseen_legit, num_test_unseen_stego_for_class)
print("#num examples for class:")
(unique, counts) = np.unique(y_test_unseen, return_counts=True)
# two-column table: class label, number of examples
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print("Apply OHE")
# reuses the encoder fitted on the training labels (requires the training cell to have run)
ohe_y_test_unseen = ohe_processer.transform(np.reshape(y_test_unseen,(-1,1))).toarray()
print("test set unseen ready")

#legit images 15001
#stego images [15001, 60004]
data shape:  (90006, 32, 32, 3)
done
#test unseen shape, neg, pos:  (90006, 32, 32, 3) 15001 75005
#num examples for class:
[[    0 15001]
 [    1 15001]
 [    2 60004]]
Apply OHE
test set unseen ready


In [12]:
# Evaluate the classifier on the unseen test split.
y_pred_score = classifier.predict(test_unseen)
# Rounded scores; not used by the metrics below.
y_pred_label = np.around(y_pred_score, 0)
# Predicted class = index of the highest score.
y_pred_class = np.argmax(y_pred_score, axis=1)

report_map = classification_report(y_test_unseen, y_pred_class, output_dict=True)
macro_avg = report_map['macro avg']

acc_score = accuracy_score(y_test_unseen, y_pred_class)
# The ranking metrics work on the one-hot targets and the raw scores.
auc_score = roc_auc_score(ohe_y_test_unseen, y_pred_score, multi_class="ovr", average="macro")
auc_score_pr = average_precision_score(ohe_y_test_unseen, y_pred_score)

metric_values = [
    acc_score,
    macro_avg['precision'],
    macro_avg['recall'],
    macro_avg['f1-score'],
    auc_score,
    auc_score_pr,
]
result = ";".join(str(value) for value in metric_values)

print("acc;prec;rec;f1;auc;auc-pr")
print(result)

acc;prec;rec;f1;auc;auc-pr
0.9116614447925694;0.9153099395589858;0.8233228895851387;0.8339557314200716;0.9999430479827444;0.9997218680658473


In [13]:
#print confusion matrix (as text, no figure is drawn) for the unseen test split
# scikit-learn convention: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test_unseen, np.argmax(y_pred_score, axis=1))
print(cm)

[[15001     0     0]
 [ 3360  7050  4591]
 [    0     0 60004]]


In [14]:
# Per-class precision;recall;f1 on the unseen test split for the two stego classes.
report_map = classification_report(y_test_unseen, np.argmax(y_pred_score, axis=1), output_dict=True)

class_1_report = report_map['1']
result_c1 = ";".join([
    str(class_1_report['precision']),
    str(class_1_report['recall']),
    str(class_1_report['f1-score']),
])

class_2_report = report_map['2']
result_c2 = ";".join([
    str(class_2_report['precision']),
    str(class_2_report['recall']),
    str(class_2_report['f1-score']),
])

print(result_c1)
print(result_c2)

1.0;0.4699686687554163;0.6394267833658338
0.9289263874912919;1.0;0.9631537973820015


In [15]:
# dump the full classification report dict (per-class, accuracy, macro and weighted averages)
print(report_map)

{'0': {'precision': 0.8170034311856653, 'recall': 1.0, 'f1-score': 0.8992866135123794, 'support': 15001}, '1': {'precision': 1.0, 'recall': 0.4699686687554163, 'f1-score': 0.6394267833658338, 'support': 15001}, '2': {'precision': 0.9289263874912919, 'recall': 1.0, 'f1-score': 0.9631537973820015, 'support': 60004}, 'accuracy': 0.9116614447925694, 'macro avg': {'precision': 0.9153099395589858, 'recall': 0.8233228895851387, 'f1-score': 0.8339557314200716, 'support': 90006}, 'weighted avg': {'precision': 0.9221181635251388, 'recall': 0.9116614447925694, 'f1-score': 0.8985547644010364, 'support': 90006}}
