In [19]:
import matplotlib
import matplotlib.pyplot as plt
import wisardpkg as wp
import numpy as np
import pandas as pd
import time
import cv2
import gc
import os
import pickle
%matplotlib inline

import itertools
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

def plot_confusion_matrix(name, cm, classes,
                          normalize=True,
                          title=' ',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and save it to `name`.

    Parameters
    ----------
    name : str
        Path of the image file written with `plt.savefig`.
    cm : array-like
        2-D matrix of counts; rows are labelled "True label" and columns
        "Predicted label".
    classes : sequence
        Tick labels used on both axes.
    normalize : bool
        When True every row is divided by its sum before plotting, so the
        colours and the printed numbers both show per-row rates.
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the annotations and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero; keep it as zeros.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap, aspect='auto')
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "{:.2f}".format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(name, pad_inches=0.2, bbox_inches='tight', dpi=300)
    plt.close()
    
def clf_eval(name, y_true, y_pred, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']):
    """
    Compute the confusion matrix of `y_pred` against `y_true` and save its
    plot to `name`, using `classes` as the axis tick labels.
    """
    plot_confusion_matrix(name,
                          confusion_matrix(y_true, y_pred),
                          classes=classes)
    
def load_images_from_folder(folder):
    """
    Read every image file in `folder` with OpenCV.

    Entries are visited in sorted file-name order so the result is the same
    on every run and every machine (`os.listdir` alone gives no ordering
    guarantee, which would make the seeded train/test split irreproducible).
    Entries for which `cv2.imread` returns None are skipped.

    Returns a list of the images that were read.
    """
    images = []
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
    return images

In [16]:
# Load a three-class subset from malevis_train_val_224x224/train.
# Each image is flattened to one row of `data`; `y` holds the matching class
# name per row. Row counts come from the number of images actually read
# rather than a hard-coded 350, so a skipped/unreadable file no longer
# breaks the reshape or misaligns the labels.
subset_dir = 'malevis_train_val_224x224/train'
subset_classes = ['Adposhel', 'Agent', 'Other']

subset_parts = []
subset_labels = []
for class_name in subset_classes:
    img = load_images_from_folder(subset_dir + '/' + class_name)
    subset_parts.append(np.array(img).reshape(len(img), -1))
    subset_labels.extend([class_name] * len(img))

data = np.concatenate(subset_parts, axis=0)
y = np.array(subset_labels)


In [17]:
# Binarise the pixel values: anything above 127 becomes 1, the rest 0.
print(np.shape(data))
X = np.where(data <= 127, 0, 1)
print(np.shape(X))

(1050, 150528)
(1050, 150528)


In [20]:
# Single hold-out run: split X/y, train one WiSARD, classify the held-out
# part, save the confusion matrix and report accuracy and timings.
SPLIT_SIZE = 0.3

# Candidate sweep was [14,28,42,56,70,84,98]; only 50 is active here.
ADDRESS_SIZE = [50]
addressSize = 50

NUM_EXPERIMENTS = 1

wsd = wp.Wisard(addressSize, verbose=True)

print("Splitting...")
X_traincv, X_testcv, y_traincv, y_testcv = model_selection.train_test_split(
    X, y, test_size=SPLIT_SIZE, random_state=0)
print("Split done")

print("Training...")
start_train = time.time()
wsd.train(X_traincv, y_traincv)
finish_train = time.time()
print("Netword trained")

print("Testing...")
start_classify = time.time()
out = wsd.classify(X_testcv)
finish_classify = time.time()
print("Tests done")

# classify() returns one predicted label per input, in input order
total = len(y_testcv)
corrects = sum(1 for expected, predicted in zip(y_testcv, out)
               if expected == predicted)

clf_eval('confusion ' + str(int(SPLIT_SIZE*100)) + ' ' + str(addressSize) + '.png',
         y_testcv,
         out,
         classes=list(dict.fromkeys(y)))
print(float(corrects)/total)
print(finish_train - start_train)
print(finish_classify - start_classify)


Splitting...
Split done
Training...
Netword trained
Testing...
Tests done
0.6761904761904762
16.918784618377686
7.640617609024048


In [62]:
# Pixel-intensity cut-off used by threshold(): values above it become 1, all others 0.

THRESHOLD = 127

def my_reset(*varnames):
    """
    Clear the interactive namespace, keeping only the requested globals.

    varnames are what you want to keep: each must name an existing global,
    otherwise KeyError is raised before anything is cleared. `my_reset`
    itself is always kept.
    """
    namespace = globals()
    to_save = {v: namespace[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del namespace
    # run_line_magic is the supported replacement for the deprecated
    # InteractiveShell.magic("reset -f") call.
    get_ipython().run_line_magic("reset", "-f")
    globals().update(to_save)

def load_data():
    """
    Load every class folder under malevis_train_val_224x224/train.

    Each image is flattened to one row. The number of rows per class is the
    number of images actually read (not a hard-coded 350), so labels stay
    aligned with the data even if a file is skipped by the loader.

    Returns
    -------
    data : numpy.ndarray
        One flattened image per row, classes in the order listed below.
    y : numpy.ndarray
        Class name for each row of `data`.

    Raises
    ------
    ValueError
        If a class folder yields no readable image.
    """
    train_dir = 'malevis_train_val_224x224/train'
    class_names = [
        'Adposhel', 'Agent', 'Allaple', 'Amonetize',
        'Androm', 'Autorun', 'BrowseFox', 'Dinwod',
        'Elex', 'Expiro', 'Fasong', 'HackKMS', 'Hlux',
        'Injector', 'InstallCore', 'MultiPlug', 'Neoreklami',
        'Neshta', 'Other', 'Regrun', 'Sality',
        'Snarasite', 'Stantinko', 'VBA', 'VBKrypt', 'Vilsel',
    ]

    chunks = []
    labels = []
    for class_name in class_names:
        images = load_images_from_folder(train_dir + '/' + class_name)
        if not images:
            raise ValueError('no readable images found for class ' + class_name)
        chunks.append(np.array(images).reshape(len(images), -1))
        labels.extend([class_name] * len(images))
        del images

    # Concatenate once instead of re-copying the growing array per class.
    data = np.concatenate(chunks, axis=0)
    del chunks
    gc.collect()
    y = np.array(labels)
    return data, y

def threshold(data):
    """
    Binarise `data` against the module-level THRESHOLD.

    Parameters
    ----------
    data : sequence of 1-D sequences of numbers

    Returns
    -------
    list of lists of int
        Same layout as `data`, with 1 where the value is greater than
        THRESHOLD and 0 elsewhere.

    The function no longer resets the interactive namespace or forces a
    garbage collection after every row; it has no side effects.
    """
    # Imported locally because callers may have cleared the module-level
    # imports with my_reset() before calling this function.
    import numpy as np

    return [np.where(np.asarray(row) > THRESHOLD, 1, 0).tolist() for row in data]

In [63]:
# Load all classes, then binarise them. my_reset() is called between the
# steps to clear every global except the names listed, so the statement
# order matters: each call must keep what the next line still needs.
data, y = load_data()
print("Data imported")
# Keep only the inputs and the pieces threshold() reads from the globals.
my_reset('data','y','threshold','THRESHOLD','gc')
X = threshold(data)
print("Data processed")
# From here on only X and y (plus my_reset itself) remain defined.
my_reset('X','y')

Data imported


KeyboardInterrupt: 

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import wisardpkg as wp
import numpy as np
import pandas as pd
import time
import cv2
import gc
import os
%matplotlib inline

import itertools
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

def plot_confusion_matrix(name, cm, classes,
                          normalize=True,
                          title=' ',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and save it to `name`.

    Parameters
    ----------
    name : str
        Path of the image file written with `plt.savefig`.
    cm : array-like
        2-D matrix of counts; rows are labelled "True label" and columns
        "Predicted label".
    classes : sequence
        Tick labels used on both axes.
    normalize : bool
        When True every row is divided by its sum before plotting, so the
        colours and the printed numbers both show per-row rates.
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the annotations and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero; keep it as zeros.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap, aspect='auto')
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "{:.2f}".format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(name, pad_inches=0.2, bbox_inches='tight')
    plt.close()
    
def clf_eval(name, y_true, y_pred, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']):
    """
    Compute the confusion matrix of `y_pred` against `y_true` and save its
    plot to `name`, using `classes` as the axis tick labels.
    """
    plot_confusion_matrix(name,
                          confusion_matrix(y_true, y_pred),
                          classes=classes)
    
def load_images_from_folder(folder):
    """
    Read every image file in `folder` with OpenCV.

    Entries are visited in sorted file-name order so the result is the same
    on every run and every machine (`os.listdir` alone gives no ordering
    guarantee, which would make the seeded train/test split irreproducible).
    Entries for which `cv2.imread` returns None are skipped.

    Returns a list of the images that were read.
    """
    images = []
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
    return images

In [None]:
# Hold-out experiments: for every address size, train and evaluate
# NUM_EXPERIMENTS WiSARD networks on the same seeded split and collect the
# mean/std accuracy and the mean train/classify times.
SPLIT_SIZE = 0.3

# Candidate sweep was [14,28,42,56,70,84,98]; only 50 is active here.
ADDRESS_SIZE = [50]

NUM_EXPERIMENTS = 1

prec = []
sd_prec = []
t_train = []
t_test = []

# random_state is fixed, so the split is identical for every run; do it once.
X_traincv, X_testcv, y_traincv, y_testcv = model_selection.train_test_split(
    X, y, test_size=SPLIT_SIZE, random_state=0)

for addressSize in ADDRESS_SIZE:

    print(ADDRESS_SIZE)

    precision = []
    time_train = 0
    time_test = 0
    for j in range(NUM_EXPERIMENTS):
        print("Experiment "+str(j))
        ignoreZero = False  # optional; causes the rams to ignore the address 0

        # False by default for performance reasons,
        # when True, WiSARD prints the progress of train() and classify()
        verbose = False

        wsd = wp.Wisard(addressSize, ignoreZero=ignoreZero, verbose=verbose)

        start_train = time.time()
        wsd.train(X_traincv, y_traincv)
        finish_train = time.time()

        print("Netword trained")

        # classify some data
        start_classify = time.time()
        out = wsd.classify(X_testcv)
        finish_classify = time.time()

        print("Tests done")

        # the output of classify is a string list in the same sequence as the input
        total = len(y_testcv)
        corrects = sum(1 for expected, predicted in zip(y_testcv, out)
                       if expected == predicted)

        # confusion_matrix orders rows/columns by sorted label, so pass the
        # matching names instead of relying on clf_eval's '0'..'9' default.
        class_names = sorted(set(y_testcv) | set(out))
        clf_eval('confusion ' + str(int(SPLIT_SIZE*100)) + ' ' + str(addressSize) + '.png',
                 y_testcv, out, classes=class_names)

        # Accumulate BOTH timers; they are averaged over the runs below.
        time_train = time_train + (finish_train - start_train)
        time_test = time_test + (finish_classify - start_classify)
        precision.append(float(corrects)/total)

    prec.append(np.mean(precision))
    sd_prec.append(np.std(precision))
    t_train.append(time_train/NUM_EXPERIMENTS)
    t_test.append(time_test/NUM_EXPERIMENTS)

[50]
Experiment 0
Data imported


In [None]:
# Mean accuracy and its standard deviation per address size, one list per line.
print(prec, sd_prec, sep='\n')

In [None]:
# Incremental training over the whole dataset: classes are loaded in small
# groups, each group is binarised and split, the network is trained on the
# group's training part, and the held-out parts are pooled for one final
# classification.
SPLIT_SIZE = 0.3
THRESHOLD = 127

# Candidate sweep was [14,28,42,56,70,84,98]; only 50 is active here.
ADDRESS_SIZE = [50]

NUM_EXPERIMENTS = 1

TRAIN_DIR = 'malevis_train_val_224x224/train'
# One inner list per group that is loaded, split and trained together.
CLASS_GROUPS = [
    ['Adposhel', 'Agent', 'Allaple', 'Amonetize'],
    ['Androm', 'Autorun', 'BrowseFox', 'Dinwod'],
    ['Elex', 'Expiro', 'Fasong', 'HackKMS', 'Hlux'],
    ['Injector', 'InstallCore', 'MultiPlug', 'Neoreklami'],
    ['Neshta', 'Other', 'Regrun', 'Sality'],
    ['Snarasite', 'Stantinko', 'VBA', 'VBKrypt', 'Vilsel'],
]

for addressSize in ADDRESS_SIZE:

    print(ADDRESS_SIZE)

    for j in range(NUM_EXPERIMENTS):
        print("Experiment "+str(j))
        ignoreZero = False  # optional; causes the rams to ignore the address 0

        # False by default for performance reasons,
        # when True, WiSARD prints the progress of train() and classify()
        verbose = False

        wsd = wp.Wisard(addressSize, ignoreZero=ignoreZero, verbose=verbose)

        held_out_inputs = []
        held_out_labels = []
        for group in CLASS_GROUPS:
            group_parts = []
            group_labels = []
            for class_name in group:
                img = load_images_from_folder(TRAIN_DIR + '/' + class_name)
                # Row count follows the images actually read, not a fixed 350.
                group_parts.append(np.array(img).reshape(len(img), -1))
                group_labels.extend([class_name] * len(img))
            data = np.concatenate(group_parts, axis=0)
            y = np.array(group_labels)

            # One binary row per image. (The previous loop reused a single
            # list for every image, so all rows of X were the same object.)
            X = np.where(data > THRESHOLD, 1, 0)

            X_traincv, X_testcv, y_traincv, y_testcv = model_selection.train_test_split(
                X, y, test_size=SPLIT_SIZE, random_state=0)
            held_out_inputs.append(X_testcv)
            held_out_labels.append(y_testcv)

            wsd.train(X_traincv, y_traincv)

            # Release the group's raw pixels before loading the next group.
            del group_parts, data
            gc.collect()

        X_test = np.concatenate(held_out_inputs, axis=0)
        y_test = np.concatenate(held_out_labels, axis=0)
        del held_out_inputs, held_out_labels

        # classify some data
        out = wsd.classify(X_test)

        # the output of classify is a string list in the same sequence as the input
        total = len(y_test)
        corrects = sum(1 for expected, predicted in zip(y_test, out)
                       if expected == predicted)

        # confusion_matrix orders rows/columns by sorted label, so pass the
        # matching names instead of relying on clf_eval's '0'..'9' default.
        class_names = sorted(set(y_test) | set(out))
        clf_eval('confusion ' + str(int(SPLIT_SIZE*100)) + ' ' + str(addressSize) + '.png',
                 y_test, out, classes=class_names)

[50]
Experiment 0


In [4]:
# Number of classified samples, then how many of them were correct.
print(total, corrects, sep='\n')

2730
1863


In [46]:
# Repeat the hold-out evaluation NUM_RUNS times per address size.
# Relies on X_traincv / X_testcv / y_traincv / y_testcv and SPLIT_SIZE left
# behind by an earlier cell.
# NOTE(review): the recorded output of this cell is a RuntimeError about an
# input value exceeding the addressing base, which suggests the inputs were
# not 0/1 when it ran -- confirm the split comes from binarised data.

# Candidate sweep was [14,28,42,56,70,84,98]; only 98 is active here.
ADDRESS_SIZE = [98]
NUM_RUNS = 10

prec = []
sd_prec = []
t_train = []
t_test = []

for addressSize in ADDRESS_SIZE:

    print(ADDRESS_SIZE)

    precision = []
    time_train = 0
    time_test = 0
    for j in range(NUM_RUNS):
        # j is an int; it must be converted before concatenation.
        print("Experiment "+str(j))
        ignoreZero = False  # optional; causes the rams to ignore the address 0

        # False by default for performance reasons,
        # when True, WiSARD prints the progress of train() and classify()
        verbose = False

        wsd = wp.Wisard(addressSize, ignoreZero=ignoreZero, verbose=verbose)

        # train using the input data
        start_train = time.time()
        wsd.train(X_traincv, y_traincv)
        finish_train = time.time()

        # classify some data
        start_classify = time.time()
        out = wsd.classify(X_testcv)
        finish_classify = time.time()

        # the output of classify is a string list in the same sequence as the input
        total = len(y_testcv)
        corrects = sum(1 for expected, predicted in zip(y_testcv, out)
                       if expected == predicted)

        # confusion_matrix orders rows/columns by sorted label, so pass the
        # matching names instead of relying on clf_eval's '0'..'9' default.
        class_names = sorted(set(y_testcv) | set(out))
        clf_eval('confusion ' + str(int(SPLIT_SIZE*100)) + ' ' + str(addressSize) + '.png',
                 y_testcv, out, classes=class_names)

        # Accumulate BOTH timers; they are averaged over the runs below.
        time_train = time_train + (finish_train - start_train)
        time_test = time_test + (finish_classify - start_classify)
        precision.append(float(corrects)/total)

    prec.append(np.mean(precision))
    sd_prec.append(np.std(precision))
    t_train.append(time_train/NUM_RUNS)
    t_test.append(time_test/NUM_RUNS)

RuntimeError: The input data has a value bigger than base of addresing!

In [228]:
# Bar chart of the mean training time per address size (values in t_train).
title = "Split 90/10 - Train Time"
name = "Split 90_10 - Train Time"

x_tick_positions = np.arange(5, 55, step=10)

plt.figure(figsize=(15, 8))
plt.grid(zorder=0)  # grid below the bars, which use zorder=3
plt.xticks(x_tick_positions, fontsize=24)
plt.xlabel("Address Size", fontsize=26)
plt.ylabel("Time (s)", fontsize=26)
plt.title(title, fontsize=30)
plt.bar(ADDRESS_SIZE, t_train, width=5, zorder=3)
plt.savefig(name + 'accuracy.png')
plt.close()

In [229]:
# Bar chart of the mean classification time per address size (values in t_test).
title = "Split 90/10 - Test Time"
name = "Split 90_10 - Test Time"

x_tick_positions = np.arange(5, 55, step=10)

plt.figure(figsize=(15, 8))
plt.grid(zorder=0)  # grid below the bars, which use zorder=3
plt.xticks(x_tick_positions, fontsize=24)
plt.xlabel("Address Size", fontsize=26)
plt.ylabel("Time (s)", fontsize=26)
plt.title(title, fontsize=30)
plt.bar(ADDRESS_SIZE, t_test, width=5, zorder=3)
plt.savefig(name + 'accuracy.png')
plt.close()

In [230]:
# Mean accuracy and its standard deviation per address size, one list per line.
print(prec, sd_prec, sep='\n')

[0.8357857142857142, 0.9577285714285715, 0.9429857142857141, 0.8146857142857143, 0.5583142857142857]
[0.00517460439185884, 0.0009803164836741552, 0.0024180022113053637, 0.003216538892813137, 0.005197644678513678]


In [231]:
from sklearn.model_selection import KFold

In [238]:
# Convert inputs and labels to arrays so they can be indexed with the
# integer index arrays produced by KFold.split.
X, y = np.array(X), np.array(y)

In [246]:
# K-fold evaluation: for every address size, train and test one WiSARD per
# fold and collect mean/std accuracy and mean train/classify times.
N_SPLITS = 10
# NOTE(review): KFold is used without shuffling; if the rows of X are ordered
# by class the folds will be unbalanced -- confirm the data order.
kf = KFold(n_splits=N_SPLITS)

ADDRESS_SIZE = [5,15,25,35,45]

prec = []
sd_prec = []
t_train = []
t_test = []
# JSON description of each network, captured before it is trained.
# (This list was previously appended to without ever being created.)
jsonConfig = []

for addressSize in ADDRESS_SIZE:

    precision = []
    time_train = 0
    time_test = 0

    for train_index, test_index in kf.split(X):
        print(train_index, test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ignoreZero = False
        verbose = False

        wsd = wp.Wisard(addressSize, ignoreZero=ignoreZero, verbose=verbose)
        jsonConfig.append(wsd.json())

        start_train = time.time()
        wsd.train(X_train, y_train)
        finish_train = time.time()

        start_classify = time.time()
        out = wsd.classify(X_test)
        finish_classify = time.time()

        # the output of classify is a string list in the same sequence as the input
        total = len(y_test)
        corrects = sum(1 for expected, predicted in zip(y_test, out)
                       if expected == predicted)

        # Evaluate against THIS fold's labels (y_test), not the stale
        # y_testcv left over from the hold-out cells. confusion_matrix orders
        # rows/columns by sorted label, so pass the matching names.
        # Note: the file name does not include the fold, so each fold
        # overwrites the previous image for the same address size.
        class_names = sorted(set(y_test) | set(out))
        clf_eval('kfold ' + str(int(SPLIT_SIZE*100)) + ' ' + str(addressSize) + '.png',
                 y_test, out, classes=class_names)

        # Accumulate BOTH timers; they are averaged over the folds below.
        time_train = time_train + (finish_train - start_train)
        time_test = time_test + (finish_classify - start_classify)
        precision.append(float(corrects)/total)

    prec.append(np.mean(precision))
    sd_prec.append(np.std(precision))
    t_train.append(time_train/N_SPLITS)
    t_test.append(time_test/N_SPLITS)
    print("=========================")

[ 7000  7001  7002 ... 69997 69998 69999] [   0    1    2 ... 6997 6998 6999]
[    0     1     2 ... 69997 69998 69999] [ 7000  7001  7002 ... 13997 13998 13999]
[    0     1     2 ... 69997 69998 69999] [14000 14001 14002 ... 20997 20998 20999]
[    0     1     2 ... 69997 69998 69999] [21000 21001 21002 ... 27997 27998 27999]
[    0     1     2 ... 69997 69998 69999] [28000 28001 28002 ... 34997 34998 34999]
[    0     1     2 ... 69997 69998 69999] [35000 35001 35002 ... 41997 41998 41999]
[    0     1     2 ... 69997 69998 69999] [42000 42001 42002 ... 48997 48998 48999]
[    0     1     2 ... 69997 69998 69999] [49000 49001 49002 ... 55997 55998 55999]
[    0     1     2 ... 69997 69998 69999] [56000 56001 56002 ... 62997 62998 62999]
[    0     1     2 ... 62997 62998 62999] [63000 63001 63002 ... 69997 69998 69999]
[ 7000  7001  7002 ... 69997 69998 69999] [   0    1    2 ... 6997 6998 6999]
[    0     1     2 ... 69997 69998 69999] [ 7000  7001  7002 ... 13997 13998 13999]
[   

In [247]:
# Bar chart of the mean training time per address size (values in t_train).
title = "10-fold - Train Time"
name = "10-fold - Train Time"

x_tick_positions = np.arange(5, 55, step=10)

plt.figure(figsize=(15, 8))
plt.grid(zorder=0)  # grid below the bars, which use zorder=3
plt.xticks(x_tick_positions, fontsize=24)
plt.xlabel("Address Size", fontsize=26)
plt.ylabel("Time (s)", fontsize=26)
plt.title(title, fontsize=30)
plt.bar(ADDRESS_SIZE, t_train, width=5, zorder=3)
plt.savefig(name + 'accuracy.png')
plt.close()

In [248]:
# Bar chart of the mean classification time per address size (values in t_test).
title = "10-fold - Test Time"
name = "10-fold - Test Time"

x_tick_positions = np.arange(5, 55, step=10)

plt.figure(figsize=(15, 8))
plt.grid(zorder=0)  # grid below the bars, which use zorder=3
plt.xticks(x_tick_positions, fontsize=24)
plt.xlabel("Address Size", fontsize=26)
plt.ylabel("Time (s)", fontsize=26)
plt.title(title, fontsize=30)
plt.bar(ADDRESS_SIZE, t_test, width=5, zorder=3)
plt.savefig(name + 'accuracy.png')
plt.close()

In [249]:
# Mean accuracy and its standard deviation per address size, one list per line.
print(prec, sd_prec, sep='\n')

[0.8338000000000001, 0.9560571428571428, 0.9402285714285714, 0.8124714285714285, 0.5533714285714286]
[0.008732580327925617, 0.003352854403797044, 0.005225213597821349, 0.012707710980634824, 0.01718604084715267]
