In [1]:
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import os,sys

In [2]:
def get_systemcall_name_with_fork(line):
    """Parse one line of a pid-prefixed, strace-style trace.

    The line is split on single spaces. The first token is the pid. The first
    token that starts with ``HH:MM:SS`` is the timestamp, and the token(s)
    after it decide the syscall name:

    * ``name(args...``                      -> ``name``
    * ``<...``, ``+++`` or ``---`` marker   -> the token after the marker
    * anything else                         -> ``'!!' + token`` (unparseable
      marker; callers test for the ``'!!'`` prefix)

    The elapsed time is read from a trailing ``<seconds>`` token.

    Args:
        line: raw text line; a trailing line terminator is ignored.

    Returns:
        ``[pid, timestamp, syscall, time_cost]``. ``timestamp`` and ``syscall``
        are empty strings when the line has no timestamp token; ``time_cost``
        is 0 when no ``<seconds>`` suffix can be parsed.

    Raises:
        ValueError: if the first token is not an integer pid.
    """
    # Drop the line terminator so it can never end up inside a token.
    tokens = line.rstrip('\r\n').split(' ')
    pid = int(tokens[0])

    # "<0.000123>" splits into ['', '0.000123', '']; anything else has no
    # element 1 or is not a float.
    try:
        time_cost = float(re.split('<|>', tokens[-1])[1])
    except (IndexError, ValueError):
        time_cost = 0

    timestamp = ''
    syscall = ''
    for i, token in enumerate(tokens):
        if re.match(r'\d{2}:\d{2}:\d{2}', token) is None:
            continue
        timestamp = token
        # The timestamp may be the last token; treat the missing one as ''.
        following = tokens[i + 1] if i + 1 < len(tokens) else ''
        has_marker = any(marker in following for marker in ('<...', '+++', '---'))
        if '(' in following:
            syscall = following.split('(')[0]
        elif has_marker and i + 2 < len(tokens):
            syscall = tokens[i + 2]
        else:
            syscall = '!!' + following
        break
    return [pid, timestamp, syscall, time_cost]

In [3]:
def get_systemcall_name_without_fork(line):
    """Parse one line of a strace-style trace that has no pid prefix.

    The line is split on single spaces. The first token that starts with
    ``HH:MM:SS`` is the timestamp, and the token(s) after it decide the
    syscall name:

    * ``name(args...``                      -> ``name``
    * ``<...``, ``+++`` or ``---`` marker   -> the token after the marker
    * anything else                         -> ``'!!' + token`` (unparseable
      marker; callers test for the ``'!!'`` prefix)

    The elapsed time is read from a trailing ``<seconds>`` token.

    Args:
        line: raw text line; a trailing line terminator is ignored.

    Returns:
        ``[timestamp, syscall, time_cost]``. ``timestamp`` and ``syscall`` are
        empty strings when the line has no timestamp token; ``time_cost`` is 0
        when no ``<seconds>`` suffix can be parsed.
    """
    # Drop the line terminator so it can never end up inside a token.
    tokens = line.rstrip('\r\n').split(' ')

    # "<0.000123>" splits into ['', '0.000123', '']; anything else has no
    # element 1 or is not a float.
    try:
        time_cost = float(re.split('<|>', tokens[-1])[1])
    except (IndexError, ValueError):
        time_cost = 0

    timestamp = ''
    syscall = ''
    for i, token in enumerate(tokens):
        if re.match(r'\d{2}:\d{2}:\d{2}', token) is None:
            continue
        timestamp = token
        # The timestamp may be the last token; treat the missing one as ''.
        following = tokens[i + 1] if i + 1 < len(tokens) else ''
        has_marker = any(marker in following for marker in ('<...', '+++', '---'))
        if '(' in following:
            syscall = following.split('(')[0]
        elif has_marker and i + 2 < len(tokens):
            syscall = tokens[i + 2]
        else:
            syscall = '!!' + following
        break
    return [timestamp, syscall, time_cost]

In [4]:
def create_onehot_encoding(total, index):
    """Return a list of ``total`` zeros with a single 1 at position ``index``.

    When ``index`` is outside ``range(total)`` the result is all zeros.
    """
    return [1 if position == index else 0 for position in range(total)]

In [5]:
def create_vectorizers(corpus, ngram, n_hash_features=2**7):
    """Fit the bag-of-words vectorizers on ``corpus``.

    Args:
        corpus: iterable of text documents (in this notebook, one
            space-separated syscall string per trace).
        ngram: largest n-gram length; the n-gram vectorizers use
            ``ngram_range=(1, ngram)``.
        n_hash_features: number of output columns of the hashing vectorizer.
            Defaults to 2**7, the value that used to be hard-coded.

    Returns:
        Tuple ``(syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer,
        hashingvectorizer)``: the unigram vocabulary, the 1..ngram vocabulary,
        and the fitted n-gram count, TF-IDF and hashing vectorizers.
    """
    # Unigram vocabulary only; the vectorizer that produced it is not returned.
    syscall_dict = CountVectorizer().fit(corpus).vocabulary_

    countvectorizer = CountVectorizer(ngram_range=(1, ngram)).fit(corpus)
    ngrams_dict = countvectorizer.vocabulary_

    # Sharing the vocabulary keeps TF-IDF columns aligned with the count columns.
    tfidfvectorizer = TfidfVectorizer(ngram_range=(1, ngram), vocabulary=ngrams_dict).fit(corpus)

    hashingvectorizer = HashingVectorizer(n_features=n_hash_features).fit(corpus)
    return syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer


In [6]:
def read_dict_from_file(dictfilepath):
    """Load a syscall vocabulary from a header-less two-column CSV.

    Each row is read as ``syscall,index``.

    Returns:
        Three dicts: syscall -> index, syscall -> one-hot list (as long as the
        number of rows), and index -> syscall.
    """
    table = pd.read_csv(dictfilepath, header=None)
    table.columns = ['syscall', 'index']
    names = table['syscall']
    positions = table['index']
    vocab_size = len(names)

    name_to_index = {}
    name_to_onehot = {}
    index_to_name = {}
    for row, name in enumerate(names):
        idx = positions[row]
        name_to_index[name] = idx
        name_to_onehot[name] = create_onehot_encoding(vocab_size, idx)
        index_to_name[idx] = name
    return name_to_index, name_to_onehot, index_to_name

In [7]:
def trace_onehot_encoding(trace, syscall_dict_onehot):
    """Map every syscall of ``trace`` to its one-hot vector.

    Syscall names are lower-cased before the lookup. Names missing from the
    table fall back to the unknown-token entry, which may be stored under
    ``'UNK'`` or ``'unk'``; both spellings are accepted.

    Args:
        trace: iterable of syscall name strings.
        syscall_dict_onehot: dict mapping syscall name -> one-hot list.

    Returns:
        List with one one-hot vector per element of ``trace``.

    Raises:
        KeyError: if a syscall is unknown and the table has no unknown-token
            entry.
    """
    # Prefer the historical upper-case key, otherwise use the lower-case one.
    unknown_key = 'UNK' if 'UNK' in syscall_dict_onehot else 'unk'

    encoded_trace = []
    for syscall in trace:
        key = syscall.lower()
        if key not in syscall_dict_onehot:
            key = unknown_key
        encoded_trace.append(syscall_dict_onehot[key])
    return encoded_trace

In [8]:
def get_distance(trace,head,tail):
    """Score how closely ``tail`` follows ``head`` inside ``trace``.

    For every occurrence of ``head``, each ``tail`` seen before the next
    ``head`` (or the end of the trace) adds ``1 / gap``, where ``gap`` is the
    number of positions separating the two (1 means immediately after).
    Elements before the first ``head`` are ignored.

    Args:
        trace: iterable of syscall names.
        head: the syscall that opens a window.
        tail: the syscall counted inside each window.

    Returns:
        The summed score as a float; 0.0 when ``head`` never occurs or when
        ``head == tail``.
    """
    distance = 0.0
    gap = None  # positions since the most recent head; None before the first one
    # Single pass: a new head resets the window, so no re-scanning is needed.
    for item in trace:
        if item == head:
            gap = 0
        elif gap is not None:
            gap += 1
            if item == tail:
                distance += 1 / gap
    return distance

In [9]:
def get_dependency_graph(trace,term_dict):
    """Build the square head-by-tail matrix of ``get_distance`` scores.

    Rows and columns both follow the iteration order of ``term_dict``. The
    diagonal (head equal to tail) is filled with 0.
    """
    return [
        [0 if head == tail else get_distance(trace, head, tail) for tail in term_dict]
        for head in term_dict
    ]

In [10]:
def get_frequency_vector(trace, syscall_dict):
    """Count how often each entry of ``syscall_dict`` occurs in ``trace``.

    The result follows the iteration order of ``syscall_dict``.
    """
    return [trace.count(term) for term in syscall_dict]

In [11]:
def get_bigram_dict(syscall_dict):
    """Return every ordered pair of entries of ``syscall_dict`` as a list of tuples.

    The first element varies slowest; pairs of an entry with itself are included.
    """
    return [(first, second) for first in syscall_dict for second in syscall_dict]

In [12]:
def get_trigram_dict(syscall_dict):
    """Return every ordered triple of entries of ``syscall_dict`` as a list of tuples.

    The first element varies slowest and the last fastest; repeats are included.
    """
    return [
        (first, second, third)
        for first in syscall_dict
        for second in syscall_dict
        for third in syscall_dict
    ]

In [13]:
def get_ngram_trace(syscall_list, n):
    """Return the n-grams of ``syscall_list`` (via ``nltk.ngrams``) as a list."""
    return list(nltk.ngrams(syscall_list, n))

In [14]:
def from_trace_to_longstr(syscall_trace):
    """Concatenate the syscall names into one string, each followed by a space.

    The result therefore ends with a space; an empty trace gives ``''``.
    """
    return ''.join(name + ' ' for name in syscall_trace)

In [15]:
def from_raw_to_features(inputFilePath, syscall_dict, bigram_dict, syscall_dict_onehot):
    """Parse one raw trace file and derive all per-trace features.

    Args:
        inputFilePath: path of the raw trace file (no pid prefix on the lines).
        syscall_dict: vocabulary of known syscalls; its iteration order defines
            the order of the frequency vector and of the dependency graph.
        bigram_dict: iterable of bigram tuples to count.
        syscall_dict_onehot: dict mapping syscall name -> one-hot list.

    Returns:
        Tuple ``(syscall_trace, tracestr, syscall_frequency, bigram_frequency,
        syscall_one_hot, dependency_graph)``.
    """
    # Turn the raw lines into [timestamp, syscall, timecost] rows.
    rows = []
    with open(inputFilePath, 'r') as inputfile:
        for line in inputfile:
            if not line.strip():
                continue  # blank lines carry no syscall
            record = get_systemcall_name_without_fork(line)
            if record[1].startswith('!!'):
                # Unparseable line: show it and leave it out of the trace.
                print(line)
            else:
                rows.append(record)
    # Passing the columns here also works when no line could be parsed.
    trace = pd.DataFrame(rows, columns=['timestamp', 'syscall', 'timecost'])

    # This name used to be read without ever being assigned in the function.
    # Build it from the parsed file the same way the feature-building cell of
    # this notebook does: unknown syscalls become 'unk'.
    syscall_trace = replace_with_unk(trace['syscall'].to_list(), syscall_dict)

    bigram_trace = get_ngram_trace(syscall_trace, 2)
    tracestr = from_trace_to_longstr(syscall_trace)

    # Frequency features.
    syscall_frequency = get_frequency_vector(syscall_trace, syscall_dict)
    bigram_frequency = get_frequency_vector(bigram_trace, bigram_dict)

    # One-hot encoding, one vector per syscall of the trace.
    syscall_one_hot = trace_onehot_encoding(syscall_trace, syscall_dict_onehot)

    # Pairwise dependency graph over the vocabulary.
    dependency_graph = get_dependency_graph(syscall_trace, syscall_dict)

    return syscall_trace, tracestr, syscall_frequency, bigram_frequency, syscall_one_hot, dependency_graph


In [16]:
# CSV vocabulary file. Presumably the input for read_dict_from_file; it is not
# used by any other visible cell.
dictfilepath = 'syscalls_to_index.csv'
# Directory holding the raw trace files. Set the IOT_TRACE_DATA_DIR environment
# variable to point somewhere else; the default is the original local checkout.
rootPath = os.environ.get('IOT_TRACE_DATA_DIR', 'D:/git/IoT_Sensors_Security_Analysis/data/')
# File paths are formed by plain concatenation (see inputFilePath below), so
# make sure the directory ends with a separator.
if not rootPath.endswith(('/', '\\')):
    rootPath += '/'
inputFilePath = rootPath + '0.txt'

In [17]:
def read_file(inputFilePath):
    """Parse one raw trace file into a DataFrame and a corpus string.

    Lines whose syscall cannot be recognised (marked with a ``'!!'`` prefix by
    the line parser) are printed and skipped; blank lines are skipped silently.

    Args:
        inputFilePath: path of the raw trace file (no pid prefix on the lines).

    Returns:
        ``(trace, tracestr)``: a DataFrame with the columns ``timestamp``,
        ``syscall`` and ``timecost``, and the syscall names concatenated into
        one string. An input without parseable lines gives an empty DataFrame
        and ``''``.
    """
    rows = []
    with open(inputFilePath, 'r') as inputfile:
        for line in inputfile:
            if not line.strip():
                continue  # blank lines carry no syscall
            record = get_systemcall_name_without_fork(line)
            if record[1].startswith('!!'):
                print(line)
            else:
                rows.append(record)
    # Passing the columns here also works when no line could be parsed.
    trace = pd.DataFrame(rows, columns=['timestamp', 'syscall', 'timecost'])
    tracestr = from_trace_to_longstr(trace['syscall'])
    return trace, tracestr

In [18]:
def read_all_rawdata(rootPath):
    """Parse every regular file directly inside ``rootPath``.

    Files are processed in sorted name order so that repeated runs produce the
    rows in the same order. Sub-directories are skipped.

    Args:
        rootPath: directory containing the raw trace files; a trailing
            separator is optional.

    Returns:
        ``(corpus_dataframe, corpus)``: a list with one DataFrame per file and
        the parallel list of corpus strings, both as returned by ``read_file``.
    """
    corpus_dataframe = []
    corpus = []
    # os.listdir makes no ordering promise, hence the explicit sort.
    for fn in sorted(os.listdir(rootPath)):
        inputFilePath = os.path.join(rootPath, fn)
        if not os.path.isfile(inputFilePath):
            continue  # open() would fail on a directory
        trace, tracestr = read_file(inputFilePath)
        corpus_dataframe.append(trace)
        corpus.append(tracestr)
    return corpus_dataframe, corpus
    

In [19]:
# Parse every trace file under rootPath: corpus_dataframe holds one DataFrame per
# file and corpus the matching syscall string for each of them.
corpus_dataframe, corpus = read_all_rawdata(rootPath)

In [20]:
# Fit all vectorizers on the full corpus; 3 is the largest n-gram length.
syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer = create_vectorizers(corpus, 3)

In [21]:
def add_unk_to_dict(syscall_dict):
    """Return the vocabulary extended with an ``'unk'`` entry plus its one-hot table.

    The input mapping is copied rather than modified, and an existing ``'unk'``
    entry is kept as is. Calling the function again on its own result therefore
    returns the same vocabulary instead of moving ``'unk'`` to a new index.

    Args:
        syscall_dict: dict mapping syscall name -> integer index. The one-hot
            width equals the number of entries, so indices are assumed to be
            0..len-1 (TODO confirm for vocabularies not built by a vectorizer).

    Returns:
        ``(syscall_dict, syscall_dict_onehot)``: the extended copy of the
        vocabulary and a dict mapping each name to its one-hot list.
    """
    vocab = dict(syscall_dict)
    if 'unk' not in vocab:
        vocab['unk'] = len(vocab)
    width = len(vocab)
    syscall_dict_onehot = {
        sc: create_onehot_encoding(width, idx) for sc, idx in vocab.items()
    }
    return vocab, syscall_dict_onehot



In [22]:
# Extend the unigram vocabulary with the 'unk' entry and build the one-hot table.
syscall_dict, syscall_dict_onehot = add_unk_to_dict(syscall_dict)

In [23]:
def replace_with_unk(syscall_trace, syscall_dict):
    """Overwrite, in place, every syscall missing from ``syscall_dict`` with ``'unk'``.

    Membership is tested on the lower-cased name. Known entries keep their
    original casing. The same (mutated) sequence object is returned.
    """
    for pos in range(len(syscall_trace)):
        known = syscall_trace[pos].lower() in syscall_dict
        if known:
            continue
        syscall_trace[pos] = 'unk'
    return syscall_trace


In [24]:
# Vectorize the whole corpus with each fitted vectorizer (one row per trace).
# The results are converted with .toarray() when the feature table is assembled.
frequency_features = countvectorizer.transform(corpus)
tfidf_features = tfidfvectorizer.transform(corpus)
hashing_features = hashingvectorizer.transform(corpus)

In [25]:
# Sequence features, one entry per trace: the list of one-hot vectors of its
# syscalls and the pairwise dependency graph over the vocabulary.
one_hot_features = []
dependency_graph_features = []
for trace in corpus_dataframe:
    one_hot = []  # NOTE(review): not read anywhere in the visible cells
    dependency_graph = []  # placeholder, overwritten two lines below
    # Unknown syscalls are mapped to 'unk' before encoding.
    syscall_trace = replace_with_unk(trace['syscall'].to_list(), syscall_dict)
    syscall_one_hot =  trace_onehot_encoding(syscall_trace, syscall_dict_onehot)
    dependency_graph = get_dependency_graph(syscall_trace,syscall_dict)
    one_hot_features.append(syscall_one_hot)
    dependency_graph_features.append(dependency_graph)    

In [27]:
import random

In [28]:
# One start timestamp and one label per trace.
# The labels are random 0/1 placeholders. They come from a dedicated, seeded
# generator so that re-running the notebook rebuilds the same table and scores.
label_rng = random.Random(42)
timestamp = []
ys = []
for trace in corpus_dataframe:
    # Timestamp of the first parsed syscall of the trace.
    t = trace['timestamp'][0]
    timestamp.append(t)
    y = label_rng.randint(0,1)
    ys.append(y)

In [29]:
# Assemble the feature table: every list/array below is stacked as one row and
# the transpose turns it into one column, giving one table row per trace.
# The vectorizer outputs are converted with .toarray() first.
# Column names are assigned in the following cell, in this same order.
encoded_trace_df = pd.DataFrame([timestamp, corpus_dataframe,corpus,frequency_features.toarray() ,tfidf_features.toarray(),hashing_features.toarray(), dependency_graph_features, one_hot_features,ys] ).transpose()

In [30]:
# Name the columns in the order the per-trace lists were stacked when the table
# was built.
encoded_trace_df.columns = ['timestamp', 'corpus_raw', 'corpus_str', 'frequency_features' ,'tfidf_features','hashing_features', 'dependency_graph_features', 'one_hot_features','y']

In [31]:
# Persist the feature table; the modelling cells reload it from this file.
encoded_trace_df.to_pickle('encoded_bow.pkl')

In [32]:
import random

In [33]:
# Reload the feature table from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
encoded_trace_df = pd.read_pickle('encoded_bow.pkl')

In [34]:
# Display the reloaded feature table.
encoded_trace_df

Unnamed: 0,timestamp,corpus_raw,corpus_str,frequency_features,tfidf_features,hashing_features,dependency_graph_features,one_hot_features,y
0,11:18:49,timestamp syscall timecost 0 11:...,wait4 rt_sigaction rt_sigprocmask SIGCHLD wait...,"[5, 5, 4, 1, 0, 8, 4, 4, 0, 1, 1, 2, 2, 4, 4, ...","[0.09223390905526405, 0.09223390905526405, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 4.027651515151515, 7.709931734931733, 0.0...","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, ...",0
1,11:19:02,timestamp syscall timecost 0 11:...,rt_sigprocmask rt_sigaction rt_sigaction rt_si...,"[5, 5, 3, 2, 0, 6, 3, 3, 0, 1, 1, 2, 2, 4, 3, ...","[0.08609533342124387, 0.08609533342124387, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.944318181818182, 7.2400543900543886, 0....","[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, ...",0
2,11:21:44,timestamp syscall timecost 0 11:...,rt_sigprocmask clone rt_sigprocmask rt_sigacti...,"[6, 6, 4, 2, 0, 8, 4, 4, 0, 2, 2, 2, 2, 3, 3, ...","[0.10831094520026928, 0.10831094520026928, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.3867424242424247, 6.659870684870684, 0....","[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, ...",0
3,11:21:54,timestamp syscall timecost 0 11...,rt_sigprocmask rt_sigaction wait4 rt_sigaction...,"[5, 5, 4, 1, 0, 7, 4, 4, 0, 1, 1, 2, 2, 4, 4, ...","[0.09103370650132216, 0.09103370650132216, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 4.027651515151515, 7.709931734931733, 0.0...","[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, ...",0
4,11:22:14,timestamp syscall timecost 0 11:...,sigreturn close rt_sigprocmask rt_sigaction rt...,"[5, 5, 3, 2, 0, 7, 3, 3, 0, 2, 2, 2, 2, 3, 3, ...","[0.0871722968778249, 0.0871722968778249, 0.052...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.944318181818182, 7.040054390054388, 0.0...","[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, ...",0
5,11:22:34,timestamp syscall timecost 0 11:...,rt_sigprocmask pipe rt_sigprocmask rt_sigprocm...,"[5, 5, 3, 2, 0, 6, 3, 3, 0, 2, 2, 1, 1, 4, 4, ...","[0.08390713451361656, 0.08390713451361656, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.7825757575757577, 7.574517149517148, 0....","[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, ...",0
6,11:22:55,timestamp syscall timecost 0 11:...,clone rt_sigprocmask rt_sigaction close read r...,"[6, 6, 4, 2, 0, 8, 4, 4, 0, 2, 2, 2, 2, 3, 3, ...","[0.1077506020105247, 0.1077506020105247, 0.071...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.3867424242424247, 6.726537351537351, 0....","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, ...",0
7,11:23:05,timestamp syscall timecost 0 11...,rt_sigaction wait4 rt_sigaction rt_sigprocmask...,"[5, 5, 4, 1, 0, 7, 4, 4, 0, 3, 3, 0, 0, 4, 4, ...","[0.09064689219315002, 0.09064689219315002, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 5.061965240641711, 8.347557997557995, 0.0...","[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, ...",0
8,11:23:25,timestamp syscall timecost 0 11:...,close rt_sigprocmask rt_sigaction rt_sigaction...,"[5, 5, 3, 2, 0, 7, 3, 3, 0, 2, 2, 2, 2, 3, 3, ...","[0.08608029346138814, 0.08608029346138814, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.944318181818182, 7.2400543900543886, 0....","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, ...",1
9,11:23:45,timestamp syscall timecost 0 11:...,pipe rt_sigprocmask rt_sigprocmask rt_sigprocm...,"[6, 5, 3, 2, 0, 6, 3, 3, 0, 2, 2, 1, 1, 4, 4, ...","[0.1016667211300969, 0.08472226760841409, 0.05...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0, 3.7825757575757577, 7.574517149517148, 0....","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, ...",0


In [35]:
from sklearn import svm
from sklearn import metrics

In [36]:
# Features: the per-trace n-gram count vectors; target: the 'y' column.
# NOTE: 'y' was filled with random 0/1 placeholders when the table was built,
# so the scores computed from it do not measure real detection quality.
X = encoded_trace_df['frequency_features']
y = encoded_trace_df['y']

In [None]:
# Turn the Series of per-trace vectors into a plain list of rows.
X = X.to_list()

In [None]:
# Turn the label Series into a plain list.
y = y.to_list()

In [None]:
# Fit a support-vector classifier with default settings on all rows.
clf = svm.SVC()
clf.fit(X,y)


SVC()

In [94]:
# Predict on the same rows the model was fitted on, so `acc` is the training
# accuracy rather than a held-out estimate.
pre = clf.predict(X)
# NOTE(review): accuracy_score is documented as (y_true, y_pred); the arguments
# are swapped here, which does not change plain accuracy.
acc = metrics.accuracy_score(pre, y)

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import metrics
import time

h = .02  # step size in the mesh; NOTE(review): not used by any visible cell

# One seed for the split and for every estimator that draws random numbers, so
# that repeated runs print the same scores.
RANDOM_STATE = 42

# Display names, in the same order as `classifiers` below.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]


# Reload the feature table written earlier in the notebook.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
encoded_trace_df = pd.read_pickle('encoded_bow.pkl')

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]


# Features: per-trace n-gram count vectors; target: the 'y' column.
X = encoded_trace_df['frequency_features'].to_list()
y = encoded_trace_df['y'].to_list()

# Hold out 30% of the traces for scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=RANDOM_STATE)

for name, clf in zip(names, classifiers):
    # The reported time covers both fitting and scoring.
    t1 =time.time()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    t2 =time.time()
    t = t2 -t1
    print('Model: {}, accuracy score: {}, costed {} seconds'.format(name, score, t))

Model: Nearest Neighbors, accuracy score: 0.7777777777777778, costed 0.0020008087158203125 seconds
Model: Linear SVM, accuracy score: 0.6111111111111112, costed 0.0009992122650146484 seconds
Model: RBF SVM, accuracy score: 0.5555555555555556, costed 0.0 seconds
Model: Gaussian Process, accuracy score: 0.6111111111111112, costed 0.03000640869140625 seconds
Model: Decision Tree, accuracy score: 0.5, costed 0.0020008087158203125 seconds
Model: Random Forest, accuracy score: 0.4444444444444444, costed 0.009002208709716797 seconds




Model: Neural Net, accuracy score: 0.5555555555555556, costed 0.5551245212554932 seconds
Model: AdaBoost, accuracy score: 0.5, costed 0.04100918769836426 seconds
Model: Naive Bayes, accuracy score: 0.3888888888888889, costed 0.0010006427764892578 seconds
Model: QDA, accuracy score: 0.3333333333333333, costed 0.0010001659393310547 seconds


