In [104]:
import numpy as np
from os import listdir
from os.path import join

in_dir = "preprocessed_data"

labels = listdir(in_dir)

count = 0
for label in labels:
    count += len(listdir(join(in_dir, label)))

mat = np.zeros((count, 256), dtype=np.int16)
targets = np.zeros(count, dtype=np.int8)

print(f"Shape of Matrix: {np.shape(mat)}, Targets: {np.shape(targets)}")

Shape of Matrix: (20565, 256), Targets: (20565,)


In [105]:
index = 0

for i, label in enumerate(labels):
    print(f"Loading '{label}' ", end="")
    
    for filename in listdir(join(in_dir, label)):
        with open(join(in_dir, label, filename)) as file:
            for line in file.readlines():
                tokens = line.strip().split(":")
                mat[index][int(tokens[0])] = int(tokens[1])
        
        targets[index] = i
        index += 1
    
    print("-> Done")

Loading 'image' -> Done
Loading 'text' -> Done
Loading 'audio' -> Done
Loading 'executable' -> Done


In [106]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data_train, data_test, targets_train, targets_test = train_test_split(mat, targets, test_size=0.3, random_state=10)

scaler = StandardScaler()
scaler.fit(data_train)

data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

print(f"Shape of Training Data: {np.shape(data_train)}, Testing Data: {np.shape(data_test)}")
print(f"Shape of Training Targets: {np.shape(targets_train)}, Testing Targets: {np.shape(targets_test)}")

Shape of Training Data: (14395, 256), Testing Data: (6170, 256)
Shape of Training Targets: (14395,), Testing Targets: (6170,)


In [107]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(128,64,32,16), alpha=1e-5, activation='relu', solver='adam')

print("Training MLPClassifier ", end="")
clf.fit(data_train, targets_train)
print("-> Done")

Training MLPClassifier -> Done


In [108]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

predictions = clf.predict(data_test)
accuracy = 100.0 * accuracy_score(targets_test, predictions)
percision = 100.0 * precision_score(targets_test, predictions, average="weighted")
recall = 100.0 * recall_score(targets_test, predictions, average="weighted")
f1_score = 100.0 * f1_score(targets_test, predictions, average="weighted")
cmat = confusion_matrix(targets_test, predictions)

print(f"Labels: {labels}")
print(f"Accuracy: {accuracy}")
print(f"Percision: {percision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(cmat)

Labels: ['image', 'text', 'audio', 'executable']
Accuracy: 98.91410048622366
Percision: 98.91589481346284
Recall: 98.91410048622366
F1 Score: 98.91448506509334
[[1650    1   19    1]
 [   0 1572    0    1]
 [  23    0 1399    4]
 [   2    4   12 1482]]


In [109]:
from joblib import dump

model_dir = "model"

print("Saving Model")
dump(clf, join(model_dir, "model.dat"))

print("Saving Scaler")
dump(scaler, join(model_dir, "scaler.dat"))

print("Writing Labels")
with open(join(model_dir, "labels.csv"), "w") as file:
    file.write(",".join(labels))

Saving Model
Saving Scaler
Writing Labels
