In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import os,sys
import tqdm
import pickle
import time
from concurrent.futures import ThreadPoolExecutor  
import concurrent.futures
import numpy as np
from sklearn.decomposition import PCA

In [40]:
def read_dicts(base_dict_path, tw):
    vectorizers = {}
    dicts ={}
    dictPath = base_dict_path + str(tw) + '/'
    for i in range(1, 6):
        cvName = 'countvectorizer_ngram{}'.format(i)
        tvName = 'tfidfvectorizer_ngram{}'.format(i)
        hvName = 'hashingvectorizer_ngram{}'.format(i)
        ndName = 'ngrams_dict_ngram{}'.format(i)
        sdName = 'syscall_dict_ngram{}'.format(i)
        shdName = 'syscall_dict_onehot_ngram{}'.format(i)

        loc=open(dictPath + cvName+'.pk','rb')
        cv = pickle.load(loc)
        vectorizers[cvName] = cv

        loc=open(dictPath + tvName+'.pk','rb')
        tv = pickle.load(loc)
        vectorizers[tvName] = tv

        loc=open(dictPath + hvName+'.pk','rb')
        hv = pickle.load(loc)
        vectorizers[hvName] = hv

        loc=open(dictPath + ndName+'.pk','rb')
        nd = pickle.load(loc)
        dicts[ndName] = nd

        loc=open(dictPath + sdName+'.pk','rb')
        sd = pickle.load(loc)
        dicts[sdName] = sd

        loc=open(dictPath + shdName+'.pk','rb')
        shd = pickle.load(loc)
        dicts[shdName] = shd
    
    return vectorizers, dicts

In [14]:
rootPath ='D:/git/IoT_Sensors_Security_Analysis/data/perf/'
rawdataPath =rootPath+'{}/splited_1/{}/'.format('pi3', 60)
rawFileNames = os.listdir(rawdataPath)
rawdatas = dict()

In [3]:
def from_trace_to_longstr(syscall_trace):
    tracestr = ''
    for syscall in syscall_trace:
        tracestr += syscall + ' '
    # print(tracestr)
    return tracestr

In [4]:
def read_all_rawdata( rawdataPath, rawFileNames):    
    corpus_dataframe, corpus = [],[]
    par = tqdm.tqdm(total=len(rawFileNames), ncols=100)
    for fn in rawFileNames:
        if '.csv' in fn:
            par.update(1)
            fp = rawdataPath + fn
            trace = pd.read_csv(fp)
            tr = trace['syscall'].tolist()             
            longstr = from_trace_to_longstr(tr)
            corpus_dataframe.append(trace)
            corpus.append(longstr)
    par.close()
    return corpus_dataframe, corpus

In [17]:
corpus_dataframe, corpus = read_all_rawdata( rawdataPath, rawFileNames)

  corpus_dataframe, corpus = read_all_rawdata( rawdataPath, rawFileNames)
  0%|▏                                                            | 12/3219 [00:10<36:23,  1.47it/s]

KeyboardInterrupt: 

In [5]:
def get_syscall_dict(ngrams_dict):
    syscall_dict = {}
    i = 0
    for ngram in ngrams_dict:
        if len(ngram.split()) == 1:
            syscall_dict[ngram] = i
            i+=1
    return syscall_dict

In [6]:
def create_vectorizers(corpus, ngram):
    syscall_dict = {}
    ngrams_dict = {}
    # countvectorizer = CountVectorizer().fit(corpus)
    # syscall_dict = countvectorizer.vocabulary_
    countvectorizer = CountVectorizer(ngram_range=(1, ngram)).fit(corpus)
    print('create count vectorizer finished')
    ngrams_dict = countvectorizer.vocabulary_
    syscall_dict = get_syscall_dict(ngrams_dict)
    tfidfvectorizer = TfidfVectorizer(ngram_range=(1, ngram), vocabulary=ngrams_dict).fit(corpus)
    print('create tf-idf vectorizer finished')
    hashingvectorizer = HashingVectorizer(n_features=2**5).fit(corpus)  
    print('create hashing vectorizer finished')
    return syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer

In [7]:
def read_all_rawdata(corpus_dataframe,corpus, rawdataPath, rawFileNames):    
    pool = ThreadPoolExecutor  (max_workers = 16)
    def read_file(inputFilePath):
        trace = pd.read_csv(inputFilePath)
        tr = trace['syscall'].tolist()             
        longstr = from_trace_to_longstr(tr)
        return (trace,longstr)
        # print(inputFilePath)
    def asyn_page(filenames):
        future_to_url  = dict()
        for i, url in enumerate(filenames):
            t = pool.submit(read_file, url)
            future_to_url[t] = url               
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                (trace,longstr) = data
                corpus_dataframe.append(trace)
                corpus.append(longstr)
            except Exception as exc:
                print('%r generated an exception: %s' % (filenames, exc))
    
    par = tqdm.tqdm(total = len(rawFileNames), ncols=100)
    # i = 0
    start, end = 0,0
    for n in range(0,len(rawFileNames),16):
        par.update(16)
        start = n
        if start + 16 < len(rawFileNames):
            end = start + 16
        else:
            end = len(rawFileNames)
        filenames = [rawdataPath + rawFileNames[i] for i in range(start, end)]
        asyn_page(filenames)
    par.close()
    pool.shutdown()
    print("Sub-process(es) done.")
    return corpus_dataframe, corpus

In [8]:
def create_onehot_encoding(total, index):
    onehot = []
    for i in range(0, total):
        if i == index:
            onehot.append(1)
        else:
            onehot.append(0)
    return onehot

In [9]:
def add_unk_to_dict(syscall_dict):
    total = len(syscall_dict)
    syscall_dict['unk'] = total
    syscall_dict_onehot = dict()
    for sc in syscall_dict:
        syscall_dict_onehot[sc] = create_onehot_encoding(total+1, syscall_dict[sc])
    return syscall_dict, syscall_dict_onehot

In [10]:
def replace_with_unk(syscall_trace, syscall_dict):
    for i, sc in enumerate(syscall_trace):
        if sc.lower() not in syscall_dict:
            syscall_trace[i] = 'unk'
    return syscall_trace

In [11]:
def trace_onehot_encoding(trace, syscall_dict_onehot):
    encoded_trace = []
    for syscall in trace:
        syscall = syscall.lower()
        if syscall.lower() in syscall_dict_onehot:
            one_hot = syscall_dict_onehot[syscall]
        else:
            syscall = 'UNK'
            one_hot = syscall_dict_onehot[syscall]
        encoded_trace.append(one_hot)
    return encoded_trace

In [12]:
def find_all_head(trace, head):
    starts, ends,se = [], [], []

    for i,s in enumerate(trace):
        if s == head:
            start=i
            starts.append(start)
            if len(starts) > 1:
                end = starts[-1] 
                ends.append(end)
        if i == len(trace)-1:
            end = len(trace)
            ends.append(end)
    se = [(starts[i], ends[i]) for i in range(0, len(starts))]
    return se

In [13]:
def get_distance( trace, head, tails):
    se = find_all_head(trace, head)
    mw = 12
    pool = ThreadPoolExecutor  (max_workers = mw)
    distances = []
    res = dict()
    def return_distance(tails, sort):
        distance = dict()
        for tail in tails:
            d = 0
            for j,t in enumerate(sort):
                if t==tail:
                    d += 1/(j)
            distance[tail] = d
        return distance

    def asyn_page(tails, sorts):
        future_to_url  = dict()
        for i, url in enumerate(sorts):
            t = pool.submit(return_distance, tails=tails, sort=url)
            future_to_url[t] = url               
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                distance = data
                distances.append(distance)
            except Exception as exc:
                print('generated an exception: %s' % (exc))
    start, end = 0,0
    for n in range(0, len(se), mw):
        start = n
        if start + mw < len(trace):
            end = start + mw
        else:
            end = len(trace)            
        sorts = [trace[s:e] for (s, e) in se[start:end]]
        asyn_page(tails, sorts)
    pool.shutdown()
    # print("Sub-process(es) done.")
    ds = pd.DataFrame(distances)
    for tail in tails:
        if tail in ds:
            res[(head, tail)] = sum(ds[tail])
        else:
            res[(head, tail)] = 0
    return res

In [14]:
def get_dependency_graph( trace, term_dict):
    dp = []
    dp_ = {}
    for head in term_dict:
        tails = list(term_dict.keys())
        tails.remove(head)  
        p = get_distance( trace, head,tails)
        dp_ = {**dp_, **p}
    
    for head in term_dict:
        d_ =  []
        for tail in term_dict:
            if head == tail:
                d_.append(0)
            else:
                d_.append(dp_[(head, tail)])
        dp.append(d_)
    return dp

In [15]:
def get_dict_sequence(trace,term_dict):
    dict_sequence = []
    for syscall in trace:
        if syscall in term_dict:
            dict_sequence.append(term_dict[syscall])
        else:
            dict_sequence.append(term_dict['unk'])
    return dict_sequence

In [16]:
def get_sequence_features(rawFileNames, corpus_dataframe, syscall_dict, syscall_dict_onehot):
    one_hot_features = []
    dependency_graph_features = []
    dict_sequence_features = []
    pool = ThreadPoolExecutor  (max_workers = 16)

    def get_features(trace, syscall_dict, syscall_dict_onehot):
        syscall_one_hot = []
        dependency_graph = []
        dict_sequence = []
        syscall_trace = replace_with_unk(trace['syscall'].to_list(), syscall_dict)
        syscall_one_hot =  trace_onehot_encoding(syscall_trace, syscall_dict_onehot)
        dependency_graph = get_dependency_graph(rawFileNames, syscall_trace,syscall_dict)
        dict_sequence=get_dict_sequence(syscall_trace,syscall_dict)
        return (syscall_one_hot, dependency_graph, dict_sequence)

    def asyn_page(cf):
        future_to_url  = dict()
        for i, url in enumerate(cf):
            t = pool.submit(get_features, url, syscall_dict, syscall_dict_onehot)
            future_to_url[t] = url               
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                (syscall_one_hot, dependency_graph, dict_sequence) = data
                one_hot_features.append(syscall_one_hot)
                dependency_graph_features.append(dependency_graph)
                dict_sequence_features.append(dict_sequence)
            except Exception as exc:
                print('generated an exception: %s' % ( exc))


    par = tqdm.tqdm(total = len(corpus_dataframe), ncols=100)
    # i = 0
    start, end = 0,0
    for n in range(0,len(corpus_dataframe),16):
        par.update(16)
        start = n
        if start + 16 < len(corpus_dataframe):
            end = start + 16
        else:
            end = len(corpus_dataframe)
        cf = corpus_dataframe[start: end]
        asyn_page(cf)
    par.close()   
    return one_hot_features, dependency_graph_features, dict_sequence_features

In [30]:
rootPath ='D:/git/IoT_Sensors_Security_Analysis/data/perf/'

In [17]:
def run(device, tw):
    rawdataPath =rootPath+'{}/splited_1/{}/'.format(device, tw)
    rawFileNames = os.listdir(rawdataPath)
    rawdatas = dict()
    for n in range(0,len(rawFileNames),1000):
        start = n
        if start + 1000 < len(rawFileNames):
            end = start + 1000
        else:
            end = len(rawFileNames)
        rawdatas[n] = rawFileNames[start:end]
    corpus_dataframes, corpuses = {}, {}

    for rfs in rawdatas:
        corpus_dataframe, corpus = [],[]
        corpus_dataframe, corpus = read_all_rawdata(corpus_dataframe,corpus, rawdataPath, rawdatas[rfs])
        corpus_dataframes[rfs] = corpus_dataframe
        corpuses[rfs] = corpus
    
    corpus_dataframe, corpus = [],[]
    for rfs in corpus_dataframes:
        corpus_dataframe += corpus_dataframes[rfs]
        corpus += corpuses[rfs]
    
    syscall_dict, ngrams_dict, countvectorizer, tfidfvectorizer, hashingvectorizer = create_vectorizers(corpus, 3)
    syscall_dict, syscall_dict_onehot = add_unk_to_dict(syscall_dict)

    frequency_features = countvectorizer.transform(corpus)
    tfidf_features = tfidfvectorizer.transform(corpus)
    hashing_features = hashingvectorizer.transform(corpus)
    one_hot_features, dependency_graph_features, dict_sequence_features = get_sequence_features(rawFileNames, corpus_dataframe, syscall_dict, syscall_dict_onehot)

    maltype = []
    ids = []
    
    for fi in rawFileNames:
        fis = fi.split('_')
        fn = fis[0]
        i = '{}_{}_{}'.format(fis[0], fis[2], fis[3])
        maltype.append(fn)
        ids.append(i)
    
    dictPath = rootPath +'dicts/{}/{}/'.format(device, tw)
    loc=open(dictPath+'countvectorizer.pk','wb')
    pickle.dump(countvectorizer,loc)
    loc=open(dictPath+'tfidfvectorizer.pk','wb')
    pickle.dump(tfidfvectorizer,loc)
    loc=open(dictPath+'hashingvectorizer.pk','wb')
    pickle.dump(hashingvectorizer,loc)
    loc=open(dictPath+'syscall_dict.pk','wb')
    pickle.dump(syscall_dict,loc)
    loc=open(dictPath+'syscall_dict_onehot.pk','wb')
    pickle.dump(syscall_dict_onehot,loc)
    loc=open(dictPath+'ngrams_dict.pk','wb')
    pickle.dump(ngrams_dict,loc)
    loc.close()

    encoded_trace_df = pd.DataFrame([ids, maltype,frequency_features.toarray() ,tfidf_features.toarray(),hashing_features.toarray(), dependency_graph_features, one_hot_features, dict_sequence_features] ).transpose()
    encoded_trace_df.columns = ['ids', 'maltype',  'system calls frequency' ,'system calls tfidf','system calls hashing', 'system calls dependency graph', 'one hot encoding', 'dict index encoding']
    resultsPath = 'D:/git/IoT_Sensors_Security_Analysis/results/{}/tw_{}_turn_1/'.format(device, tw)
    encoded_trace_df.to_pickle(resultsPath+'encoded_bow.pkl')

In [17]:
def reshape_matrix(matrix_list):
    new_list = [np.array(i).reshape(-1) for i in matrix_list]
    return new_list

In [18]:
def padding_onehot(onehot_list, padding):
    new_list = []
    for onehot in onehot_list:
        if len(onehot) > padding:
            onehot = np.array(onehot[0:padding])
            new_list.append(onehot)
        else:
            onehot =np.pad(onehot, [(0, padding-len(onehot)), (0, 0)], mode='constant', constant_values=0)
            new_list.append(onehot)
    new_list = reshape_matrix(new_list)
    return new_list

In [19]:
def padding_dictencoding(dictencoding_list, padding):
    new_list = []
    for onehot in dictencoding_list:
        if len(onehot) > padding:
            onehot = np.array(onehot[0:padding])
            new_list.append(onehot)
        else:
            onehot =np.pad(onehot, [(0, padding-len(onehot))], mode='constant', constant_values=0)
            new_list.append(onehot)
    return new_list

In [20]:
def get_pca_feature(input):
    pca = PCA(n_components=100)
    X_pca = pca.fit_transform(input)
    return X_pca, pca

In [22]:
def read_dfs(devices, tws):
    for tw in tws:
        for device in devices:
            # rawdataPath =rootPath+'{}/splited_1/{}/'.format(device, tw)
             rawdataPath =rootPath+'{}/'.format(device)
            rawFileNames = os.listdir(rawdataPath)
            rawdatas = dict()
            for n in range(0,len(rawFileNames),1000):
                start = n
                if start + 1000 < len(rawFileNames):
                    end = start + 1000
                else:
                    end = len(rawFileNames)
                rawdatas[n] = rawFileNames[start:end]
            corpus_dataframes, corpuses = {}, {}

            for rfs in rawdatas:
                corpus_dataframe, corpus = [],[]
                corpus_dataframe, corpus = read_all_rawdata(corpus_dataframe,corpus, rawdataPath, rawdatas[rfs])
                corpus_dataframes[rfs] = corpus_dataframe
                corpuses[rfs] = corpus
            
            corpus_dataframe, corpus = [],[]
            for rfs in corpus_dataframes:
                corpus_dataframe += corpus_dataframes[rfs]
                corpus += corpuses[rfs]

            loc=open(rawdataPath+'corpus_dataframe.pk','wb')
            pickle.dump(corpus_dataframe,loc)
            loc=open(rawdataPath+'corpus.pk','wb')
            pickle.dump(corpus,loc)
                

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [31]:
devices = [ 'pi4_2G', 'pi4_4G']
tws=[60]
read_dfs(devices, tws)

  result = self.fn(*self.args, **self.kwargs)
1008it [05:03,  3.33it/s]


Sub-process(es) done.


1008it [03:30,  4.80it/s]


Sub-process(es) done.


1008it [03:28,  4.83it/s]


Sub-process(es) done.


1008it [05:08,  3.27it/s]


Sub-process(es) done.


1008it [03:30,  4.79it/s]


Sub-process(es) done.


1008it [03:27,  4.87it/s]


Sub-process(es) done.


In [29]:
rootPath = 'd:/mt_data/1126_withrw/'

In [39]:
def get_features(devices, tws):
    times = {} 
    # pcas = {}
    for tw in tws:
        base_dict_path = rootPath
        vectorizers, dicts = read_dicts(base_dict_path, tw)

        for device in devices:
            features = []
            rawdataPath =rootPath+'{}/'.format(device)
            rawFileNames = os.listdir(rawdataPath)
            ids, maltype = [], []
            corpus_dataframe, corpus=[], []
            par = tqdm.tqdm(total=len(rawFileNames), ncols=80)
            for fi in rawFileNames:    
                par.update(1)            
                if '.csv' or '.txt' in fi:
                    fis = fi.split('_')
                    if('_v2') in fi:
                        fn = fis[0] + fis[1]
                    else:
                        fn = fis[0]
                    i = '{}_{}_{}'.format(fis[0], fis[2], fis[3])
                    maltype.append(fn)
                    ids.append(i)
                fi = rawdataPath + fi
                if '.txt' in fi:
                    trace = pd.read_csv(fi)
                elif '.csv'in fi:
                    trace = pd.read_csv(fi, header=None)
                    trace.columns = ['pid','timestamp','syscall','time_cost']
                trace = trace.drop(len(trace)-1)
                tr = trace['syscall'].tolist()             
                longstr = from_trace_to_longstr(tr)
                corpus_dataframe.append(trace)
                corpus.append(longstr)
            par.close()
            features.append(ids)
            features.append(maltype)

            # loc=open(rawdataPath +'corpus_dataframe.pk','rb')
            # corpus_dataframe = pickle.load(loc)
            # loc=open(rawdataPath +'corpus.pk','rb')
            # corpus = pickle.load(loc)
            print('got rawdata')
            ndName = 'ngrams_dict_ngram{}'.format(1)
            sdName = 'syscall_dict_ngram{}'.format(1)
            shdName = 'syscall_dict_onehot_ngram{}'.format(1)
            nd = dicts[ndName]
            sd = dicts[sdName]
            shd = dicts[shdName]
            
            # one_hot_features = []
            # dependency_graph_features = []
            # dict_sequence_features = []

            # t1 = time.time()
            # par = tqdm.tqdm(total=len(corpus_dataframe), ncols=100)
            # for trace in corpus_dataframe:
            #     syscall_trace = replace_with_unk(trace['syscall'].to_list(), sd)
            #     syscall_one_hot =  trace_onehot_encoding(syscall_trace, shd)
            #     one_hot_features.append(syscall_one_hot)
            #     par.update(1)
            # t2 = time.time()
            # par.close()
            # key = 'syscall_one_hot'+'_'+device+'_'+str(tw)
            # t = t2 - t1
            # times[key] = t
            # print(key+": "+str(t))
            # pca_name = key+'_pca'
            # inputs = padding_onehot(one_hot_features, 160000)
            # one_hot_features_pca, pca = get_pca_feature(inputs)
            # pcas[pca_name] = pca
            # features.append(one_hot_features)
            # features.append(one_hot_features_pca)
            # par = tqdm.tqdm(total=len(corpus_dataframe), ncols=100)
            # t1 = time.time()
            # for trace in corpus_dataframe:
            #     syscall_trace = replace_with_unk(trace['syscall'].to_list(), sd)
            #     dict_sequence = get_dict_sequence(syscall_trace,sd)
            #     dict_sequence_features.append(dict_sequence)
            #     par.update(1)
            # t2 = time.time()
            # par.close()
            # t = t2 - t1
            # key = 'dict_sequence'+'_'+device+'_'+str(tw)
            # times[key] = t
            # print(key+": "+str(t))
            # pca_name = key+'_pca'
            # inputs = padding_dictencoding(dict_sequence_features, 160000)
            # dict_sequence_pca, pca = get_pca_feature(inputs)
            # pcas[pca_name] = pca
            # features.append(dict_sequence_features)
            # features.append(dict_sequence_pca)


            # par = tqdm.tqdm(total=len(corpus_dataframe), ncols=100)
            # t1 = time.time()    
            # for trace in corpus_dataframe:
            #     syscall_trace = replace_with_unk(trace['syscall'].to_list(), sd)
            #     dependency_graph = get_dependency_graph(syscall_trace,sd)
            #     dependency_graph_features.append(dependency_graph)
            #     par.update(1)
            # t2 = time.time()
            # par.close()
            # t = t2 - t1
            # key = 'dict_sequence'+'_'+device+'_'+str(tw)
            # times[key] = t   
            # print(key+": "+str(t))
            # features.append(dependency_graph_features)

            for i in range(1, 6):
                cvName = 'countvectorizer_ngram{}'.format(i)
                tvName = 'tfidfvectorizer_ngram{}'.format(i)
                hvName = 'hashingvectorizer_ngram{}'.format(i)             

                cv = vectorizers[cvName]
                tv = vectorizers[tvName]
                hv = vectorizers[hvName]

                t1 = time.time()
                frequency_features = cv.transform(corpus)
                t2 = time.time()
                key = cvName+'_'+device+'_'+str(tw)
                t = t2 - t1
                times[key] = t
                print(key+": "+str(t))
                frequency_features = frequency_features.toarray()
                # frequency_pca_name = key+'_pca'
                # frequency_pca,pca = get_pca_feature(frequency_features)
                # pcas[frequency_pca_name] = pca               

                t1 = time.time()
                tfidf_features = tv.transform(corpus)
                t2 = time.time()
                t = t2 - t1
                key = tvName+'_'+device+'_'+str(tw)
                times[key] = t
                print(key+": "+str(t))
                tfidf_features = tfidf_features.toarray()
                # tfidf_pca_name = key+'_pca'
                # tfidf_pca,pca = get_pca_feature(tfidf_features)
                # pcas[tfidf_pca_name] = pca

                t1 = time.time()
                hashing_features = hv.transform(corpus)
                t2 = time.time()
                t = t2 - t1
                key = hvName+'_'+device+'_'+str(tw)
                times[key] = t
                print(key+": "+str(t))
                hashing_features = hashing_features.toarray()

                features.append(frequency_features)
                # features.append(frequency_pca)
                features.append(tfidf_features)
                # features.append(tfidf_pca)
                features.append(hashing_features)           
            
            encoded_trace_df = pd.DataFrame(features).transpose()
            encoded_trace_df.columns = ['ids', 'maltype',#'one hot encoding', 'dict index encoding', 'system calls dependency graph', 
            'system calls frequency_1gram', 'system calls tfidf_1gram', 'system calls hashing_1gram',
            'system calls frequency_2gram', 'system calls tfidf_2gram', 'system calls hashing_2gram',
            'system calls frequency_3gram', 'system calls tfidf_3gram', 'system calls hashing_3gram',
            'system calls frequency_4gram', 'system calls tfidf_4gram', 'system calls hashing_4gram',
            'system calls frequency_5gram', 'system calls tfidf_5gram', 'system calls hashing_5gram'
            ]
            
            resultsPath = rootPath+'encoded/t1/'
            encoded_trace_df.to_pickle(resultsPath+'encoded_bow{}_{}.pkl'.format(device, tw))  
    return times         

In [41]:
devices = [ 'pi3', 'pi4_2G', 'pi4_4G']
tws = [60]

In [42]:
times =  get_features(devices, tws)

  times =  get_features(devices, tws)


In [66]:
resultsPath = rootPath+'encoded/t1/'
loc=open(resultsPath +'encoded_bowpi4_4G_60.pkl','rb')
corpus_dataframe = pickle.load(loc)

In [67]:
rawdataPath =rootPath+'{}/'.format('pi4_4G')
rawFileNames = os.listdir(rawdataPath)

In [68]:
ids, maltype = [], []
for fi in rawFileNames:              
    if '.csv' or '.txt' in fi:
        fis = fi.split('_')
        if('_v2') in fi:
            fn = fis[0] + fis[1]
        else:
            fn = fis[0]
        i = '{}_{}_{}'.format(fis[0], fis[2], fis[3])
        maltype.append(fn)
        ids.append(i)

In [69]:
corpus_dataframe['maltype'] = maltype

In [70]:
corpus_dataframe.to_pickle(resultsPath+'encoded_bow{}_{}.pkl'.format('pi4_4G', 60))