In [1]:
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import os,sys
import tqdm
import pickle
import copy

In [2]:
rootPath ='D:/git/IoT_Sensors_Security_Analysis/data/perf/'
tw = 50
resultsPath = 'D:/git/IoT_Sensors_Security_Analysis/results/pi4_2G/tw_{}_turn_1/'.format(tw)
# resultsPath = 'D:/git/IoT_Sensors_Security_Analysis/results/tw_50_turn_1/'

In [3]:
encoded_trace_df = pd.read_pickle(resultsPath+'encoded_bow.pkl')

In [4]:
import torch as T
import torch.utils.data as data
import tqdm
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

In [5]:
normal = encoded_trace_df[encoded_trace_df.maltype=='normal']

In [6]:
def reshape_matrix(matrix_list):
    new_list = [np.array(i).reshape(-1) for i in matrix_list]
    return new_list

In [7]:
def padding_onehot(onehot_list, padding):
    new_list = []
    for onehot in onehot_list:
        if len(onehot) > padding:
            onehot = np.array(onehot[0:padding])
            new_list.append(onehot)
        else:
            onehot =np.pad(onehot, [(0, padding-len(onehot)), (0, 0)], mode='constant', constant_values=0)
            new_list.append(onehot)
    new_list = reshape_matrix(new_list)
    return new_list

In [8]:
def padding_dictencoding(dictencoding_list, padding):
    new_list = []
    for onehot in dictencoding_list:
        if len(onehot) > padding:
            onehot = np.array(onehot[0:padding])
            new_list.append(onehot)
        else:
            onehot =np.pad(onehot, [(0, padding-len(onehot))], mode='constant', constant_values=0)
            new_list.append(onehot)
    return new_list

In [65]:
# X = normal['frequency_features'].tolist()
# y = normal['maltype'].tolist()
# mlb = LabelBinarizer()

# y = mlb.fit_transform(y).reshape(len(y))
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.3, random_state=42)

In [9]:
class Autoencoder_DNN(T.nn.Module):  # input_len-32-8-32-input_len
  def __init__(self, input_len):
    super(Autoencoder_DNN, self).__init__()
    self.fc1 = T.nn.Linear(input_len, 64)
    self.fc2 = T.nn.Linear(64, 16)
    self.fc3 = T.nn.Linear(16, 8)
    self.fc4 = T.nn.Linear(8, 16)
    self.fc5 = T.nn.Linear(16, 64)
    self.fc6 = T.nn.Linear(64, input_len)

  def encode(self, x):  # input_len-32-8
    z = T.relu(self.fc1(x))
    z = T.relu(self.fc2(z)) 
    z = T.relu(self.fc3(z)) 
    return z  

  def decode(self, x):  # 8-32-input_len
    z = T.relu(self.fc4(x))
    z = T.relu(self.fc5(z)) 
    z = T.relu(self.fc6(z)) 
    return z
    
  def forward(self, x):  # 65-32-8-32-65
    z = self.encode(x) 
    z = self.decode(z) 
    return z  # in [0.0, 1.0]

In [10]:
def train(train_loader, val_loader, input_len):
    net = Autoencoder_DNN(input_len)
    net = net.train() 
    loss_func = T.nn.MSELoss()
    optimizer = T.optim.Adam(net.parameters(), lr=0.01)
    max_epochs = 100
    print("Starting training")
    last_val_loss = float("inf")
    patience = 0
    last_loss = 0
    for epoch in range(0, max_epochs):
        loss = 0
        if epoch > 0 and epoch % (max_epochs/10) == 0:
            print("epoch = %6d" % epoch, end="")
            print(" prev total loss = %7.4f, perv total val-loss = %7.4f" %( last_loss,val_loss))
        for curr_bat in train_loader:
            X = T.Tensor(curr_bat)
            optimizer.zero_grad()
            oupt = net(X)
            loss_obj = loss_func(oupt, X)  # note X not Y
            loss += loss_obj.item()
            loss_obj.backward()
            optimizer.step()
        last_loss = loss
        val_loss = 0
        with T.no_grad():
            for curr_bat in val_loader:
                X = T.Tensor(curr_bat)
                oupt = net(X)
                val_loss_obj = loss_func(oupt, X)  # note X not Y
                val_loss += val_loss_obj.item()
            # print(loss, val_loss)
            if val_loss < last_val_loss:
                last_val_loss = val_loss
                patience = 0
            else:
                patience += 1
        if patience >= 50:
            break               
    print("Training stop at epoch： %d" %epoch)
    return net


In [11]:
def find_threshold(net, X_train):
    net = net.eval()
    loss_func = T.nn.MSELoss()
    with T.no_grad():
        x_t = T.Tensor(X_train)
        y_t = net(x_t)
        y_pred = np.array([loss_func(y_t[i],x_t[i]).item() for i in range(0,len(x_t))])
    down_threshold = np.percentile(y_pred, 2.5)
    up_threshold = np.percentile(y_pred, 97.5)
    return down_threshold, up_threshold

In [12]:
def test(net, X, down_threshold, up_threshold):
    net = net.eval()  
    loss_func = T.nn.MSELoss()
    y_pred = []
    with T.no_grad():
        for i in range(0,len(X)):
            x = T.Tensor(X[i])
            y_t = net(x)
            t = loss_func(y_t,x).item()
            if t > down_threshold and t < up_threshold:
                y_pred.append(0)
            else:
                y_pred.append(1)
    return y_pred

In [70]:
# def find_threshold(net, X_train):
#     net = net.eval()
#     x_t = X_train
#     y_t = net(x_t)
#     y_pred = np.array([float(T.sum((x_t[i]-y_t[i])*(x_t[i]-y_t[i]))) for i in range(0,len(x_t))])
#     down_threshold = np.percentile(y_pred, 5)
#     up_threshold = np.percentile(y_pred, 95)
#     return down_threshold, up_threshold

In [71]:
# def test(net, X, down_threshold, up_threshold):
#     net = net.eval()  
#     encoded = net(X)
#     y_pred = []
#     for i in range(0,len(X)):
#         t = T.sum((X[i]-encoded[i])*(X[i]-encoded[i]))
#         if t > down_threshold and t < up_threshold:
#             y_pred.append(0)
#         else:
#             y_pred.append(1)
#     return y_pred

In [13]:
def train_models(feature, normal):
    X = normal[feature].tolist()
    y = normal['maltype'].tolist()
    y = np.zeros(len(X))
    # mlb = LabelBinarizer()
    
    # y = mlb.fit_transform(y).reshape(len(y))
    # h = .02  # step size in the mesh
    outliers_fraction = 0.15
    nu = 0.05
    clfs = []
    results = []
    preds = []
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.3, random_state=42)
    y_val_2 = [1 if i==0 else -1 for i in y_val]

    X_tv = []
    pca = PCA(n_components=100)
    if feature =='system calls dependency graph':
        X_train = reshape_matrix(X_train)
        X_val = reshape_matrix(X_val)    
        X_tv=[(X_train,X_val)]
        features = [feature]
    elif feature =='one hot encoding':
        X_train_150000 = padding_onehot(X_train, 150000)
        X_val_150000 = padding_onehot(X_val, 150000)
        X_train_pca = pca.fit_transform(X_train_150000)
        X_val_pca = pca.transform(X_val_150000)
        X_tv = [(X_train_pca,X_val_pca), (X_train_150000, X_val_150000)]
        features = [feature+'-pca', feature]
    elif feature =='dict index encoding':
        X_train_150000 = padding_dictencoding(X_train, 150000)
        X_val_150000 = padding_dictencoding(X_val, 150000)
        X_train_pca = pca.fit_transform(X_train_150000)
        X_val_pca = pca.transform(X_val_150000)
        # X_train_10000 = padding_dictencoding(X_train, 55000)
        # X_val_10000 = padding_dictencoding(X_val, 55000)
        X_tv = [(X_train_pca,X_val_pca),(X_train_150000,X_val_150000)]
        features = [feature+'-pca', feature]
    elif feature =='system calls hashing':
        X_tv=[(X_train,X_val)]
        features = [feature]
    else:
        X_train_pca = pca.fit_transform(X_train)
        X_val_pca = pca.transform(X_val)
        X_tv = [(X_train_pca,X_val_pca),(X_train,X_val)]
        features = [feature+'-pca', feature]

    for i,(X_train,X_val)  in enumerate(X_tv):
        feature = features[i]
        X_train = T.FloatTensor(X_train)
        X_val = T.FloatTensor(X_val)
        train_loader = data.DataLoader(X_train, batch_size=256, shuffle=False, drop_last=False, pin_memory=True, num_workers=4)
        val_loader = data.DataLoader(X_val, batch_size=256, shuffle=False, drop_last=False, num_workers=4)
        input_len = len(X_train[0])
        pred = dict()
        t1 =time.time()
        net = train(train_loader,val_loader, input_len)
        t2 =time.time()
        down_threshold, up_threshold = find_threshold(net, X_train)
        name = 'Autoencoder_DNN'
        y_pred = test(net, X_val,down_threshold, up_threshold)
        score = metrics.accuracy_score(y_val,y_pred)
        pred['valid_' + feature + '_' + name] = y_pred
        t = t2 -t1
        res = dict()
        res['Model'] = 'valid_' + feature + '_' + name
        res['Accuracy'] = score
        res['Training time'] = t
        preds.append(pred)          
        results.append(res)
        classifier = dict()
        classifier['Autoencoder_DNN'] = net
        classifier['down_threshold'] = down_threshold 
        classifier['up_threshold'] = up_threshold 
        print('Model: {}, accuracy score: {}, training time is: {} seconds'.format(res['Model'], score, t))
        clfs.append(classifier)
    return clfs, results, preds, pca

In [14]:
def test_models(encoded_trace_df, malware, feature, clfs, selectkmodels):
    dfs = encoded_trace_df[encoded_trace_df.maltype==malware]
    X_test = dfs[feature].tolist()
    X_tv = [] 
    if feature =='system calls dependency graph':
        X_test = reshape_matrix(X_test)    
        X_tv=[X_test]
        features = [feature]
    elif feature =='one hot encoding':
        X_test_55000 = padding_onehot(X_test, 150000)
        X_test_pca = pca.transform(X_test_55000)
        X_tv = [X_test_pca, X_test_55000]
        features = [feature+'-pca', feature]
    elif feature =='dict index encoding':
        X_test_55000 = padding_dictencoding(X_test, 150000)       
        X_test_pca = pca.transform(X_test_55000)
        X_test_10000 = padding_dictencoding(X_test, 150000)       
        X_tv = [X_test_pca, X_test_10000]
        features = [feature+'-pca', feature]
    elif feature =='system calls hashing':
        X_tv=[X_test]
        features = [feature]
    else:

        X_test_pca = pca.transform(X_test)
        X_tv = [X_test_pca, X_test]
        features = [feature+'-pca', feature]

    results = []
    preds = []
    y_test = np.ones(len(X_test))

    for i,classifier in enumerate(clfs):
        feature = features[i]
        X_test = X_tv[i]
        X_test = T.FloatTensor(X_test)
        result = []
        pred = dict()    
        name = 'Autoencoder_DNN'
        res = dict()
        net = classifier[name]
        down_threshold = classifier['down_threshold']
        up_threshold = classifier['up_threshold']
        t1 =time.time()
        y_pred = test(net, X_test, down_threshold, up_threshold)
        t2 =time.time()
        score = metrics.accuracy_score(y_test,y_pred)
        t = t2 -t1
        pred[malware +'_' + feature + '_' + name] = y_pred
        res['Model'] =malware +'_' + feature + '_' + name
        res['Accuracy'] = score
        res['Testing time'] = t
        result.append(res)
        print('Model: {}, accuracy score: {}, testing time is: {} seconds'.format( res['Model'], score, t))
        results.append(result)
        preds.append(pred)
    return results, preds       

In [15]:
def run(device, tw):
    resultsPath = 'D:/git/IoT_Sensors_Security_Analysis/results/{}/tw_{}_turn_1/'.format(device, tw)
    encoded_trace_df = pd.read_pickle(resultsPath+'encoded_bow.pkl')
    normal = encoded_trace_df[encoded_trace_df.maltype=='normal']
    resultsdict = dict()
    predsdict = dict()
    classifiersdict = dict()
    pcas = dict()
    for feature in features:
        #train stage
        clfs, results, preds, pca = train_models(feature, normal)
        
        resultsdict[feature+'_validation'] = results
        predsdict[feature+'_validation'] = preds
        classifiersdict[feature] = clfs
        pcas[feature] = pca
        # testing stage
        for malware in malwares:
            results, preds = test_models(encoded_trace_df, malware, feature, clfs, pca)
            resultsdict[malware +'_' + feature] = results
            predsdict[malware +'_' + feature] = preds
            
    loc=open(resultsPath+'dnn_classifiers.pk','wb')
    pickle.dump(classifiersdict,loc)
    loc=open(resultsPath+'dnn_results.pk','wb')
    pickle.dump(resultsdict,loc)
    loc=open(resultsPath+'dnn_preds.pk','wb')
    pickle.dump(predsdict,loc)   
    loc=open(resultsPath+'dnn_selectkmodels.pk','wb')
    pickle.dump(pcas,loc)   

    rd = []
    for rs in resultsdict:
        if 'validation' in rs:
            for r in resultsdict[rs]:
                rd.append(r)
        else:
            for r in resultsdict[rs]:
                for s in r:
                    rd.append(s)
    rd = pd.DataFrame(rd)
    md = [i.split('_') for i in rd['Model']]
    md = pd.DataFrame(md)
    md.columns  = ['Dataset','Features','Model','Architecture']
    nrd=pd.DataFrame([md['Dataset'],md['Features'],md['Model'],md['Architecture'], rd['Accuracy']]).transpose()
    nrd.to_csv(resultsPath+'Autoencoder_DNN_results.csv',index=None)

In [16]:
features = ['system calls frequency' ,'system calls tfidf','system calls hashing', 'system calls dependency graph', 'one hot encoding', 'dict index encoding' ]
malwares=["delay", "disorder", "freeze", "hop", "mimic", "noise", "repeat", "spoof"]

In [17]:
devices = ['pi4_2G', 'pi4_4G']
tws = [50, 60, 70]
for device in devices:
    for tw in tws:
        if devices == 'pi4_2G' and tw == 50:
            continue
        else:
            run(device, tw)

Starting training
epoch =     30 prev total loss = 828920.7812, perv total val-loss = 337321.2500
epoch =     60 prev total loss = 827474.6875, perv total val-loss = 334745.7812
Training stop at epoch： 78
Model: valid_system calls frequency-pca_Autoencoder_DNN, accuracy score: 0.8359375, training time is: 182.56922817230225 seconds
Starting training
epoch =     30 prev total loss = 12822634.5000, perv total val-loss = 6146123.5000
epoch =     60 prev total loss = 12774234.5000, perv total val-loss = 6101868.0000
epoch =     90 prev total loss = 12773858.5000, perv total val-loss = 6101629.5000
epoch =    120 prev total loss = 12773726.0000, perv total val-loss = 6101546.5000
epoch =    150 prev total loss = 12773620.0000, perv total val-loss = 6101477.0000
epoch =    180 prev total loss = 12773539.5000, perv total val-loss = 6101423.5000
epoch =    210 prev total loss = 12773481.0000, perv total val-loss = 6101383.5000
epoch =    240 prev total loss = 12773440.0000, perv total val-loss

NameError: name 'pca' is not defined

In [16]:
resultsdict = dict()
predsdict = dict()
classifiersdict = dict()
pcas = dict()
for feature in features:
    #train stage
    clfs, results, preds, pca = train_models(feature, normal)
    
    resultsdict[feature+'_validation'] = results
    predsdict[feature+'_validation'] = preds
    classifiersdict[feature] = clfs
    pcas[feature] = pca
    # testing stage
    for malware in malwares:
        results, preds = test_models(malware, feature, clfs, pca)
        resultsdict[malware +'_' + feature] = results
        predsdict[malware +'_' + feature] = preds
        
loc=open(resultsPath+'dnn_classifiers.pk','wb')
pickle.dump(classifiersdict,loc)
loc=open(resultsPath+'dnn_results.pk','wb')
pickle.dump(resultsdict,loc)
loc=open(resultsPath+'dnn_preds.pk','wb')
pickle.dump(predsdict,loc)   
loc=open(resultsPath+'dnn_selectkmodels.pk','wb')
pickle.dump(pcas,loc)   

Starting training
epoch =     30 prev total loss = 835459.3750, perv total val-loss = 335393.2812
epoch =     60 prev total loss = 827658.4688, perv total val-loss = 334605.1875
epoch =     90 prev total loss = 827652.0625, perv total val-loss = 334604.4062
epoch =    120 prev total loss = 827651.7500, perv total val-loss = 334603.9375
epoch =    150 prev total loss = 827651.5938, perv total val-loss = 334603.6250
epoch =    180 prev total loss = 827651.5000, perv total val-loss = 334603.4062
epoch =    210 prev total loss = 827651.4688, perv total val-loss = 334603.2812
epoch =    240 prev total loss = 827651.4062, perv total val-loss = 334603.1562
epoch =    270 prev total loss = 827651.4062, perv total val-loss = 334603.0938
Training stop at epoch： 299
Model: valid_system calls frequency-pca_Autoencoder_DNN, accuracy score: 0.8359375, training time is: 458.4078722000122 seconds
Starting training
epoch =     30 prev total loss = 13168792.5000, perv total val-loss = 6295819.5000
epoch

In [17]:
rd = []
for rs in resultsdict:
    if 'validation' in rs:
        for r in resultsdict[rs]:
            rd.append(r)
    else:
        for r in resultsdict[rs]:
            for s in r:
                rd.append(s)

In [18]:
rd = pd.DataFrame(rd)

In [19]:
md = [i.split('_') for i in rd['Model']]
md = pd.DataFrame(md)


In [20]:
md

Unnamed: 0,0,1,2,3
0,valid,system calls frequency-pca,Autoencoder,DNN
1,valid,system calls frequency,Autoencoder,DNN
2,delay,system calls frequency-pca,Autoencoder,DNN
3,delay,system calls frequency,Autoencoder,DNN
4,disorder,system calls frequency-pca,Autoencoder,DNN
...,...,...,...,...
85,noise,dict index encoding,Autoencoder,DNN
86,repeat,dict index encoding-pca,Autoencoder,DNN
87,repeat,dict index encoding,Autoencoder,DNN
88,spoof,dict index encoding-pca,Autoencoder,DNN


In [21]:
md.columns  = ['Dataset','Features','Model','Architecture']

In [22]:
nrd=pd.DataFrame([md['Dataset'],md['Features'],md['Model'],md['Architecture'], rd['Accuracy']]).transpose()

In [23]:
nrd

Unnamed: 0,Dataset,Features,Model,Architecture,Accuracy
0,valid,system calls frequency-pca,Autoencoder,DNN,0.835938
1,valid,system calls frequency,Autoencoder,DNN,0.90625
2,delay,system calls frequency-pca,Autoencoder,DNN,0.096296
3,delay,system calls frequency,Autoencoder,DNN,0.133333
4,disorder,system calls frequency-pca,Autoencoder,DNN,0.192308
...,...,...,...,...,...
85,noise,dict index encoding,Autoencoder,DNN,0.109264
86,repeat,dict index encoding-pca,Autoencoder,DNN,0.835681
87,repeat,dict index encoding,Autoencoder,DNN,0.011737
88,spoof,dict index encoding-pca,Autoencoder,DNN,0.823834


In [24]:
nrd.to_csv(resultsPath+'Autoencoder_DNN_results.csv',index=None)