In [29]:
import re
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import os,sys
import tqdm
import pickle
import time
from concurrent.futures import ThreadPoolExecutor  
import concurrent.futures

In [30]:
def get_syscall_dict(ngrams_dict):
    """Build an index for the single-token entries of an n-gram vocabulary.

    Args:
        ngrams_dict: Iterable of n-gram strings (iterating a dict yields its
            keys). An entry counts as a single token when splitting it on
            whitespace produces exactly one piece.

    Returns:
        dict mapping each single-token entry to a consecutive integer,
        numbered from 0 in iteration order.
    """
    unigrams = (term for term in ngrams_dict if len(term.split()) == 1)
    return {term: position for position, term in enumerate(unigrams)}

In [31]:
def create_vectorizers(corpus, ngram):
    """Fit count, TF-IDF and hashing vectorizers on ``corpus``.

    Args:
        corpus: Documents handed to each vectorizer's ``fit``.
        ngram: Upper bound of the n-gram range; the count and TF-IDF
            vectorizers are built with ``ngram_range=(1, ngram)``.

    Returns:
        Tuple ``(syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer,
        hashingvectorizer)``. ``ngrams_dict`` is the fitted count vectorizer's
        ``vocabulary_`` and ``syscall_dict`` is what ``get_syscall_dict``
        derives from it.
    """
    def timed_fit(make_vectorizer, message):
        # Build and fit one vectorizer, then report the elapsed wall-clock time.
        started = time.time()
        fitted = make_vectorizer().fit(corpus)
        finished = time.time()
        print(message.format(ngram, finished - started))
        return fitted

    countvectorizer = timed_fit(
        lambda: CountVectorizer(ngram_range=(1, ngram)),
        'create ngram {} count vectorizer finished, fitting time is {}',
    )
    ngrams_dict = countvectorizer.vocabulary_
    syscall_dict = get_syscall_dict(ngrams_dict)

    # The TF-IDF vectorizer reuses the vocabulary learned above so both
    # vectorizers share the same feature indices.
    tfidfvectorizer = timed_fit(
        lambda: TfidfVectorizer(ngram_range=(1, ngram), vocabulary=ngrams_dict),
        'create ngram {} tf-idf vectorizer finished, fitting time is {}',
    )

    # NOTE(review): this vectorizer is built without ``ngram_range``, so it
    # does not depend on ``ngram`` even though the log line mentions it, and
    # it is limited to 2**5 output features -- confirm this is intended.
    hashingvectorizer = timed_fit(
        lambda: HashingVectorizer(n_features=2**5),
        'create ngram {} hashing vectorizer finished, fitting time is {}',
    )
    return syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer

In [32]:
def from_trace_to_longstr(syscall_trace):
    """Flatten a sequence of syscall names into one space-delimited string.

    Every name is followed by a single space, so a non-empty result ends with
    a trailing space; an empty trace gives ``''``.

    Args:
        syscall_trace: Iterable of strings.

    Returns:
        The concatenated string.
    """
    return ''.join(name + ' ' for name in syscall_trace)

In [33]:
def read_rawdata(corpus_dataframe,corpus, rawdataPath, rawFileNames):
    """Read trace files with a thread pool and append the results to the given lists.

    Each file is parsed with ``pandas.read_csv``. Paths containing ``'.txt'``
    are read with their own header; paths containing ``'.csv'`` are read
    without a header and given the columns
    ``['pid', 'timestamp', 'syscall', 'time_cost']``. The row labelled
    ``len(trace) - 1`` (the last row under the default index) is dropped,
    then the ``'syscall'`` column is flattened with ``from_trace_to_longstr``.

    Files are processed in batches of 16 and results are appended in the
    order of ``rawFileNames``, so the output order is reproducible. A file
    that fails to load is reported on stdout and skipped; it does not abort
    the run.

    Args:
        corpus_dataframe: List that receives one DataFrame per loaded file
            (mutated in place).
        corpus: List that receives one trace string per loaded file
            (mutated in place, kept aligned with ``corpus_dataframe``).
        rawdataPath: Directory prefix concatenated directly with each file
            name, so it must already end with a path separator.
        rawFileNames: Sliceable sequence of file names inside ``rawdataPath``.

    Returns:
        The same ``(corpus_dataframe, corpus)`` objects that were passed in.
    """
    batch_size = 16

    def read_file(inputFilePath):
        # Substring (not suffix) matching is kept so the same files are
        # selected as before.
        if '.txt' in inputFilePath:
            trace = pd.read_csv(inputFilePath)
        elif '.csv' in inputFilePath:
            trace = pd.read_csv(inputFilePath, header=None)
            trace.columns = ['pid','timestamp','syscall','time_cost']
        else:
            # Without this branch `trace` would be unbound and the failure
            # would surface as a confusing UnboundLocalError.
            raise ValueError('unsupported trace file type: %r' % inputFilePath)
        # NOTE(review): presumably the final row is an incomplete record --
        # read_rawdata_1 does not drop it; confirm which behaviour is wanted.
        trace = trace.drop(len(trace)-1)
        longstr = from_trace_to_longstr(trace['syscall'].tolist())
        return trace, longstr

    par = tqdm.tqdm(total = len(rawFileNames), ncols=100)
    try:
        # The context manager shuts the pool down even if an error escapes.
        with ThreadPoolExecutor(max_workers = batch_size) as pool:
            for start in range(0, len(rawFileNames), batch_size):
                paths = [rawdataPath + name for name in rawFileNames[start:start + batch_size]]
                submitted = [(path, pool.submit(read_file, path)) for path in paths]
                # Collect in submission order so corpus order is deterministic.
                for path, future in submitted:
                    try:
                        trace, longstr = future.result()
                    except Exception as exc:
                        # Report the single file that failed, not the batch.
                        print('%r generated an exception: %s' % (path, exc))
                    else:
                        corpus_dataframe.append(trace)
                        corpus.append(longstr)
                    # Advance by one per file so the bar never exceeds total.
                    par.update(1)
    finally:
        par.close()
    print("Sub-process(es) done.")
    return corpus_dataframe, corpus

In [34]:
def read_rawdata_1(corpus_dataframe,corpus, rawdataPath, rawFileNames):
    """Sequentially read trace files and append the results to the given lists.

    Paths containing ``'.txt'`` are read with ``pandas.read_csv`` using the
    file's own header; paths containing ``'.csv'`` are read without a header
    and given the columns ``['pid', 'timestamp', 'syscall', 'time_cost']``.
    The ``'syscall'`` column is then flattened with ``from_trace_to_longstr``.

    NOTE(review): unlike ``read_rawdata``, this variant keeps the last row of
    every trace -- confirm which behaviour is intended.

    Args:
        corpus_dataframe: List that receives one DataFrame per file
            (mutated in place).
        corpus: List that receives one trace string per file (mutated in
            place, kept aligned with ``corpus_dataframe``).
        rawdataPath: Directory prefix concatenated directly with each file
            name, so it must already end with a path separator.
        rawFileNames: Iterable of file names inside ``rawdataPath``.

    Returns:
        The same ``(corpus_dataframe, corpus)`` objects that were passed in.

    Raises:
        ValueError: If a path contains neither ``'.txt'`` nor ``'.csv'``.
    """
    for rawFileName in rawFileNames:
        inputFilePath = rawdataPath + rawFileName
        if '.txt' in inputFilePath:
            trace = pd.read_csv(inputFilePath)
        elif '.csv' in inputFilePath:
            trace = pd.read_csv(inputFilePath, header=None)
            trace.columns = ['pid','timestamp','syscall','time_cost']
        else:
            # Falling through would either hit an unbound `trace` (first
            # file) or silently re-append the previous file's data.
            raise ValueError('unsupported trace file type: %r' % inputFilePath)
        # Build the string before appending so the two lists stay aligned
        # even if the conversion fails.
        longstr = from_trace_to_longstr(trace['syscall'].tolist())
        corpus_dataframe.append(trace)
        corpus.append(longstr)
    return corpus_dataframe, corpus

In [35]:
def create_onehot_encoding(total, index):
    """Return a list of ``total`` integers that is 1 at ``index`` and 0 elsewhere.

    If ``index`` does not match any position in ``range(total)`` the result
    is all zeros.

    Args:
        total: Length of the encoding.
        index: Position to mark with 1.

    Returns:
        list of ints of length ``total``.
    """
    return [1 if position == index else 0 for position in range(total)]

In [36]:
def add_unk_to_dict(syscall_dict):
    """Add an ``'unk'`` entry to ``syscall_dict`` and build one-hot encodings.

    ``'unk'`` is assigned the index ``len(syscall_dict)`` as measured before
    insertion. The input dict is modified in place.

    Args:
        syscall_dict: dict mapping names to integer indices.

    Returns:
        Tuple ``(syscall_dict, syscall_dict_onehot)``: the (mutated) input
        dict, and a dict mapping every key, including ``'unk'``, to the list
        produced by ``create_onehot_encoding`` with length
        ``original size + 1``.
    """
    unk_index = len(syscall_dict)
    syscall_dict['unk'] = unk_index
    width = unk_index + 1
    syscall_dict_onehot = {
        name: create_onehot_encoding(width, position)
        for name, position in syscall_dict.items()
    }
    return syscall_dict, syscall_dict_onehot

In [37]:
def fit_vectorizators(corpus, device, tw, n_gram):
    """Fit the vectorizers for one n-gram size and pickle them to disk.

    The artefacts are written under ``rootPath + '{tw}/'`` (``rootPath`` is a
    notebook-level global), one ``*_ngram{n_gram}.pk`` file each for the count,
    TF-IDF and hashing vectorizers, the syscall dictionary (with ``'unk'``
    added), its one-hot form, and the full n-gram vocabulary. The directory is
    created if it does not exist.

    Args:
        corpus: Documents passed on to ``create_vectorizers``.
        device: Not used by this function; kept for call compatibility.
        tw: Value used as the output sub-directory name.
        n_gram: Upper n-gram bound passed to ``create_vectorizers`` and
            embedded in the output file names.

    Returns:
        None.
    """
    syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer = create_vectorizers(corpus, n_gram)
    syscall_dict, syscall_dict_onehot = add_unk_to_dict(syscall_dict)

    dictPath = rootPath +'{}/'.format(tw)
    os.makedirs(dictPath, exist_ok=True)

    # File-name stem -> object to pickle; insertion order is the write order.
    artifacts = {
        'countvectorizer': countvectorizer,
        'tfidfvectorizer': tfidfvectorizer,
        'hashingvectorizer': hashingvectorizer,
        'syscall_dict': syscall_dict,
        'syscall_dict_onehot': syscall_dict_onehot,
        'ngrams_dict': ngrams_dict,
    }
    for stem, artifact in artifacts.items():
        # `with` closes (and therefore flushes) every file, not only the last.
        with open(dictPath + '{}_ngram{}.pk'.format(stem, n_gram), 'wb') as loc:
            pickle.dump(artifact, loc)

In [21]:
def get_data(device, tw, cf, co):
    """Load every trace file of one device and add the results to ``cf`` / ``co``.

    File names are listed from ``rootPath + '{device}/'`` (``rootPath`` is a
    notebook-level global) and handed to ``read_rawdata`` in chunks of 1000.
    Nothing is added to ``cf`` / ``co`` until all chunks have been read.

    Args:
        device: Sub-directory name under ``rootPath``.
        tw: Not used by this function; kept for call compatibility.
        cf: Accumulator extended via ``+=`` with the loaded DataFrames.
        co: Accumulator extended via ``+=`` with the trace strings.

    Returns:
        Tuple ``(cf, co)`` after extension.
    """
    # rawdataPath =rootPath+'{}/splited_1/{}/'.format(device, tw)
    rawdataPath = rootPath + '{}/'.format(device)
    rawFileNames = os.listdir(rawdataPath)

    chunk_size = 1000
    loaded_chunks = []
    for offset in range(0, len(rawFileNames), chunk_size):
        chunk_names = rawFileNames[offset:offset + chunk_size]
        loaded_chunks.append(read_rawdata([], [], rawdataPath, chunk_names))

    for frames, trace_strings in loaded_chunks:
        cf += frames
        co += trace_strings
    return cf, co

In [38]:
# Root directory holding the per-device trace folders and the output folder.
# Override it with the MT_DATA_ROOT environment variable instead of editing
# the notebook. The rest of the notebook appends sub-paths by plain string
# concatenation, so os.path.join(..., '') guarantees a trailing separator
# (a value that already ends with one is left unchanged).
rootPath = os.path.join(os.environ.get('MT_DATA_ROOT', 'D:/mt_data/1126_withrw/'), '')

In [39]:
# Device sub-directories (under rootPath) whose traces are loaded.
devices = [
    'pi3',
    'pi4_2G',
    'pi4_4G',
]
# Values used to name the output directory for the pickled artefacts.
tws = [
    60,
]

In [40]:
# For every tw value: pool the traces of all devices into one corpus, report
# how many traces were loaded, then fit and pickle vectorizers for n-gram
# sizes 1 through 5.
for tw in tws:
    cf, co = [], []
    for device in devices:
        cf, co = get_data(device, tw, cf, co)
    print(len(cf))
    # NOTE(review): `device` below is whatever the loop above left behind
    # (the last entry of `devices`); fit_vectorizators does not use it.
    for n_gram in range(1, 6):
        fit_vectorizators(co, device, tw, n_gram)

  result = self.fn(*self.args, **self.kwargs)
112it [00:13,  8.51it/s]


Sub-process(es) done.


1008it [03:12,  5.24it/s]


Sub-process(es) done.


1008it [03:17,  5.11it/s]


Sub-process(es) done.


1008it [03:24,  4.94it/s]


Sub-process(es) done.


608it [02:05,  4.85it/s]


Sub-process(es) done.


1008it [03:40,  4.58it/s]


Sub-process(es) done.


1008it [03:25,  4.91it/s]


Sub-process(es) done.


1008it [03:37,  4.63it/s]


Sub-process(es) done.


608it [02:03,  4.91it/s]


Sub-process(es) done.
7300
create ngram 1 count vectorizer finished, fitting time is 513.5494050979614
create ngram 1 tf-idf vectorizer finished, fitting time is 522.5125312805176
create ngram 1 hashing vectorizer finished, fitting time is 0.0
create ngram 2 count vectorizer finished, fitting time is 974.5562417507172
create ngram 2 tf-idf vectorizer finished, fitting time is 947.1551079750061
create ngram 2 hashing vectorizer finished, fitting time is 0.0
create ngram 3 count vectorizer finished, fitting time is 1458.9337661266327
create ngram 3 tf-idf vectorizer finished, fitting time is 1463.871610879898
create ngram 3 hashing vectorizer finished, fitting time is 0.0010001659393310547
create ngram 4 count vectorizer finished, fitting time is 1840.4847145080566
create ngram 4 tf-idf vectorizer finished, fitting time is 1829.791666984558
create ngram 4 hashing vectorizer finished, fitting time is 0.0
create ngram 5 count vectorizer finished, fitting time is 2332.824594974518
create ng

In [41]:
# Directory holding the pickled artefacts; 60 is the only value listed in `tws`.
dictPath = '{}{}/'.format(rootPath, 60)

In [42]:
# Reload the syscall dictionary that was pickled for n-gram size 1.
# `with` closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open(dictPath+'syscall_dict_ngram{}.pk'.format(1),'rb') as loc:
    syscall_dict = pickle.load(loc)

In [43]:
# Show the loaded syscall dictionary as this cell's output.
syscall_dict

{'ioctl': 0,
 'timerfd_settime': 1,
 'poll': 2,
 'getpid': 3,
 'write': 4,
 'futex': 5,
 'open': 6,
 'close': 7,
 'read': 8,
 'mkdir': 9,
 'fstat64': 10,
 'getdents': 11,
 'unlink': 12,
 'mprotect': 13,
 'clock_gettime': 14,
 'gettimeofday': 15,
 'madvise': 16,
 'unk': 17}