In [1]:
import json
from pprint import pprint
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so any deprecation or merge warning
# that later cells might emit is hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

### JSON FEATURES FILE IMPORT

In [2]:
# Parse the feature catalogue stored in features.json into `main_results`.
with open('features.json', mode='r') as features_file:
    main_results = json.load(features_file)

### JSON FEATURES DATA STRUCTURES GENERATION

* **json_features_main**

    * **call_feature_keys**

        * call_feature_function
        * call_feature_call_file

    * **registry_feature_keys**
    
    * **file_feature_keys**
    
        * file_feature_file
        * file_feature_directory
    
    * **misc_feature_keys**
    
        * misc_feature_misc
        * misc_feature_com_sign

In [3]:
# Names of the top-level feature groups, in the mapping's iteration order.
json_features_main = list(main_results.keys())

json_features_main

['call_feature', 'registry_feature', 'file_feature', 'misc_feature']

In [4]:
# Sub-group names found under the 'call_feature' group.
call_feature_keys = list(main_results['call_feature'].keys())

call_feature_keys

['function', 'call_file']

In [5]:
# Copy of the entries listed under call_feature -> function, in iteration order.
call_feature_function = list(main_results['call_feature']['function'])
    
#print(call_feature_function)

In [6]:
# Copy of the entries listed under call_feature -> call_file, in iteration order.
call_feature_call_file = list(main_results['call_feature']['call_file'])
    
#print(call_feature_call_file)

In [7]:
# Feature names found directly under the 'registry_feature' group.
registry_feature_keys = list(main_results['registry_feature'])

registry_feature_keys

['regkey_written', 'regkey_opened', 'regkey_read', 'regkey_deleted']

In [8]:
# Sub-group names found under the 'file_feature' group.
file_feature_keys = list(main_results['file_feature'].keys())

file_feature_keys

['file', 'directory']

In [9]:
# Copy of the entries listed under file_feature -> file, in iteration order.
file_feature_file = list(main_results['file_feature']['file'])

print(file_feature_file)

['file_opened', 'file_written', 'file_exists', 'file_moved', 'file_read', 'file_deleted', 'file_failed', 'file_copied']


In [10]:
# Copy of the entries listed under file_feature -> directory, in iteration order.
file_feature_directory = list(main_results['file_feature']['directory'])

file_feature_directory

['directory_enumerated', 'directory_created', 'directory_removed']

In [11]:
# Sub-group names found under the 'misc_feature' group.
misc_feature_keys = list(main_results['misc_feature'].keys())

misc_feature_keys

['misc', 'com_sign']

In [12]:
# Copy of the entries listed under misc_feature -> misc, in iteration order.
misc_feature_misc = list(main_results['misc_feature']['misc'])

misc_feature_misc

['mutex', 'processes', 'processtree']

In [13]:
# Copy of the entries listed under misc_feature -> com_sign, in iteration order.
misc_feature_com_sign = list(main_results['misc_feature']['com_sign'])

#print(misc_feature_com_sign)

### DATASETS DYNAMIC IMPORT AND MERGE

In [14]:
from pprint import pprint
import numpy as np
import os
import glob

In [15]:
# Read every `*.txt` file under `path`. `raw_dataset[n]` is the list of raw
# lines (trailing newline included) of the n-th file.
path = './dataset/'

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of everything derived from `raw_dataset` reproducible.
dataset_files = sorted(glob.glob(os.path.join(path, '*.txt')))
if not dataset_files:
    # Fail with a clear message instead of an IndexError on the preview below.
    raise FileNotFoundError('no *.txt dataset files found in ' + path)

raw_dataset = []
for filename in dataset_files:
    with open(filename, 'r') as file:
        raw_dataset.append(file.readlines())

# Preview the first line of the first file.
pprint(raw_dataset[0][0], compact=True)

('0.0740740740741 0:3 19:134 22:31 24:12 27:19 34:24 36:2 50:7 52:384 55:11 '
 '64:1 66:4 67:78 71:109 80:33 84:12 87:3 88:58 90:1 100:17 107:132 115:30 '
 '121:105 141:95 151:45 160:1 165:13 177:1 178:4 181:1 193:12 197:1 198:3 '
 '203:455 209:1 225:7 230:4 231:12 232:2 238:4 240:17 245:30 354:61 355:96 '
 '357:6 359:4 361:2 363:1 368:2 369:2 370:1 \n')


In [16]:
# Split every raw line on whitespace. The nesting (file -> line -> tokens)
# mirrors the nesting of `raw_dataset`.
raw_results = [
    [raw_line.split() for raw_line in file_lines]
    for file_lines in raw_dataset
]

pprint(raw_results[0][0], compact=True)

['0.0740740740741', '0:3', '19:134', '22:31', '24:12', '27:19', '34:24', '36:2',
 '50:7', '52:384', '55:11', '64:1', '66:4', '67:78', '71:109', '80:33', '84:12',
 '87:3', '88:58', '90:1', '100:17', '107:132', '115:30', '121:105', '141:95',
 '151:45', '160:1', '165:13', '177:1', '178:4', '181:1', '193:12', '197:1',
 '198:3', '203:455', '209:1', '225:7', '230:4', '231:12', '232:2', '238:4',
 '240:17', '245:30', '354:61', '355:96', '357:6', '359:4', '361:2', '363:1',
 '368:2', '369:2', '370:1']


In [17]:
# Flatten the tokenised lines of all files into one list that alternates:
#   results[2*n]     -> first token of sample n, converted to float
#   results[2*n + 1] -> {feature index: count} for sample n (both still strings)
results = []

# Iterate `raw_results` directly rather than indexing it with lengths taken
# from `raw_dataset`.
for file_rows in raw_results:
    for tokens in file_rows:
        if not tokens:
            # A blank line yields no tokens; skip it instead of raising
            # IndexError on tokens[0].
            continue
        results.append(float(tokens[0]))
        results.append(dict(item.split(':') for item in tokens[1:]))

len(results), results[0], type(results[2])

(215712, 0.0740740740741, float)

In [18]:
# Peek at the first sample: its leading float followed by its sparse
# {feature index: count} dict (keys and values are still strings here).
results[0], results[1]

(0.0740740740741,
 {'0': '3',
  '19': '134',
  '22': '31',
  '24': '12',
  '27': '19',
  '34': '24',
  '36': '2',
  '50': '7',
  '52': '384',
  '55': '11',
  '64': '1',
  '66': '4',
  '67': '78',
  '71': '109',
  '80': '33',
  '84': '12',
  '87': '3',
  '88': '58',
  '90': '1',
  '100': '17',
  '107': '132',
  '115': '30',
  '121': '105',
  '141': '95',
  '151': '45',
  '160': '1',
  '165': '13',
  '177': '1',
  '178': '4',
  '181': '1',
  '193': '12',
  '197': '1',
  '198': '3',
  '203': '455',
  '209': '1',
  '225': '7',
  '230': '4',
  '231': '12',
  '232': '2',
  '238': '4',
  '240': '17',
  '245': '30',
  '354': '61',
  '355': '96',
  '357': '6',
  '359': '4',
  '361': '2',
  '363': '1',
  '368': '2',
  '369': '2',
  '370': '1'})

In [19]:
# Peek at the last sample. Negative indices replace the hard-coded
# 215710 / 215711 so the cell keeps working when the dataset size changes.
results[-2], results[-1]

(0.40350877193,
 {'0': '1',
  '5': '4',
  '12': '1',
  '15': '3',
  '19': '7',
  '20': '11',
  '22': '5',
  '24': '32',
  '33': '1',
  '34': '7',
  '36': '9',
  '43': '10',
  '61': '51',
  '64': '4',
  '67': '44',
  '68': '15',
  '71': '6',
  '80': '5',
  '81': '2',
  '83': '84',
  '84': '11',
  '87': '3',
  '90': '4',
  '93': '1',
  '97': '132',
  '100': '11',
  '101': '104',
  '107': '457',
  '108': '3',
  '111': '7',
  '115': '50',
  '121': '31',
  '123': '2',
  '132': '3',
  '140': '2',
  '141': '44',
  '142': '9',
  '146': '6',
  '151': '217',
  '153': '6',
  '159': '1',
  '160': '2',
  '161': '1',
  '173': '3',
  '178': '1',
  '182': '1',
  '183': '2',
  '186': '1',
  '190': '1',
  '193': '2',
  '196': '2',
  '197': '4',
  '198': '1',
  '203': '68',
  '205': '2',
  '209': '108',
  '218': '2',
  '223': '3',
  '225': '6',
  '230': '1',
  '232': '2',
  '236': '3',
  '239': '4',
  '240': '16',
  '245': '5',
  '254': '1',
  '353': '5',
  '354': '30',
  '355': '78',
  '356': '3',
  '35

In [20]:
# Expand every sparse sample into a dense row:
#   row[0]     -> the sample's leading float (results[j - 1])
#   row[1 + i] -> integer count stored for feature index i, or NaN when absent
N_FEATURES = 483  # number of feature slots per row

dataset_array = []

# `results` alternates float / dict, so the dicts sit at the odd positions.
for j in range(1, len(results), 2):
    # Index the sparse dict by integer key once, then do O(1) lookups, instead
    # of re-scanning the whole dict for each of the N_FEATURES indices.
    sparse_counts = {int(key): value for key, value in results[j].items()}
    row = [results[j - 1]]
    row.extend(
        int(sparse_counts[i]) if i in sparse_counts else np.nan
        for i in range(N_FEATURES)
    )
    dataset_array.append(row)

In [21]:
# Spot check: entry 367 of the last dense row, i.e. feature index 366
# (entry 0 of each row holds the sample's leading float).
dataset_array[-1][367]

5

### DATAFRAME COLUMNS PROTOTYPING 

In [22]:
import pandas as pd

### INDEX PROTOTYPE GENERATION

In [23]:
# Three-row, single-level MultiIndex named 'Keys', used as the row index of
# the prototype frames.
index = pd.MultiIndex.from_product([list(range(3))], names=['Keys'])

index

MultiIndex(levels=[[0, 1, 2]],
           labels=[[0, 1, 2]],
           names=['Keys'])

### CALL_FEATURE -> FUNCTION

In [24]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `call_feature_function`.
call_feature_function_columns = pd.MultiIndex.from_product(
    [[json_features_main[0]], [call_feature_keys[0]], call_feature_function]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 257) so the cell keeps working if features.json changes.
call_feature_function_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(call_feature_function_columns))),
    index=index,
    columns=call_feature_function_columns,
)

call_feature_function_df

Unnamed: 0_level_0,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature
Unnamed: 0_level_1,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function,function
Unnamed: 0_level_2,NtOpenSection,NtWaitForSingleObject,GetAsyncKeyState,NtDeleteValueKey,WSARecv,getaddrinfo,InternetGetConnectedState,NtCreateEvent,GetFileVersionInfoSizeW,GetAdaptersAddresses,...,DnsQuery_UTF8,CoInternetSetFeatureEnabled,NtResumeProcess,OpenSCManagerA,GetFileVersionInfoW,CryptDecodeObjectEx,InternetOpenUrlW,OpenSCManagerA,WinHttpOpenRequest,SetupDiGetDeviceRegistryPropertyA
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,27,27,67,0,61,12,23,24,26,62,...,91,33,86,70,85,74,2,37,45,34
1,80,53,60,14,23,35,13,78,96,80,...,37,14,80,53,99,26,8,51,14,34
2,3,39,27,86,60,36,64,42,68,51,...,81,51,92,52,16,38,74,89,20,45


### CALL_FEATURE -> CALL_FILE

In [25]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `call_feature_call_file`.
call_feature_call_file_columns = pd.MultiIndex.from_product(
    [[json_features_main[0]], [call_feature_keys[1]], call_feature_call_file]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 96) so the cell keeps working if features.json changes.
call_feature_call_file_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(call_feature_call_file_columns))),
    index=index,
    columns=call_feature_call_file_columns,
)

call_feature_call_file_df

Unnamed: 0_level_0,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature
Unnamed: 0_level_1,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file
Unnamed: 0_level_2,msxml3.dll,winsta.dll,icm32.dll,sqlite3.dll,msls31.dll,pstorec.dll,mpr.dll,iertutil.dll,crypt32.dll,clbcatq.dll,...,scrrun.dll,winhttp.dll,fastprox.dll,version.dll,wininet.dll,shfolder.dll,cscdll.dll,sxs.dll,msvfw32.dll,secur32.dll
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,14,46,85,34,39,54,79,91,19,46,...,68,89,0,95,6,52,68,34,69,52
1,36,42,11,57,1,31,81,67,11,2,...,30,75,1,7,20,32,11,9,97,67
2,4,69,60,78,66,13,28,92,95,8,...,20,27,37,34,79,76,10,69,60,0


### CALL_FEATURE -> (FUNCTION, CALL_FILE) [MERGE]

In [26]:
# Join the two call_feature prototypes side by side on their shared row index.
call_feature_df = call_feature_function_df.merge(
    call_feature_call_file_df,
    left_index=True,
    right_index=True,
)

display(call_feature_df)

Unnamed: 0_level_0,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature,call_feature
Unnamed: 0_level_1,function,function,function,function,function,function,function,function,function,function,...,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file,call_file
Unnamed: 0_level_2,NtOpenSection,NtWaitForSingleObject,GetAsyncKeyState,NtDeleteValueKey,WSARecv,getaddrinfo,InternetGetConnectedState,NtCreateEvent,GetFileVersionInfoSizeW,GetAdaptersAddresses,...,scrrun.dll,winhttp.dll,fastprox.dll,version.dll,wininet.dll,shfolder.dll,cscdll.dll,sxs.dll,msvfw32.dll,secur32.dll
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,27,27,67,0,61,12,23,24,26,62,...,68,89,0,95,6,52,68,34,69,52
1,80,53,60,14,23,35,13,78,96,80,...,30,75,1,7,20,32,11,9,97,67
2,3,39,27,86,60,36,64,42,68,51,...,20,27,37,34,79,76,10,69,60,0


In [27]:
# The merged frame should have exactly the columns of both inputs.
call_feature_df.shape[1] == call_feature_function_df.shape[1] + call_feature_call_file_df.shape[1]

True

In [28]:
# Number of columns in the merged call_feature prototype.
call_feature_df.shape[1]

353

### REGISTRY_FEATURE

In [29]:
# 2-level column labels (group, feature name), one per entry of
# `registry_feature_keys`.
registry_feature_columns = pd.MultiIndex.from_product(
    [[json_features_main[1]], registry_feature_keys]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 4) so the cell keeps working if features.json changes.
registry_feature_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(registry_feature_columns))),
    index=index,
    columns=registry_feature_columns,
)

registry_feature_df

Unnamed: 0_level_0,registry_feature,registry_feature,registry_feature,registry_feature
Unnamed: 0_level_1,regkey_written,regkey_opened,regkey_read,regkey_deleted
Keys,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,21,83,66,71
1,66,73,35,70
2,58,25,34,62


In [30]:
# Number of columns in the registry_feature prototype.
registry_feature_df.shape[1]

4

### CALL_FEATURE + REGISTRY_FEATURE [DATAFRAME MERGE]

In [31]:
# Append the registry_feature columns to the call_feature columns, aligned on
# the shared row index.
call_feature_registry_feature_merge_df = call_feature_df.merge(
    registry_feature_df,
    left_index=True,
    right_index=True,
)

display(call_feature_registry_feature_merge_df)

Unnamed: 0_level_0,"(call_feature, function, NtOpenSection)","(call_feature, function, NtWaitForSingleObject)","(call_feature, function, GetAsyncKeyState)","(call_feature, function, NtDeleteValueKey)","(call_feature, function, WSARecv)","(call_feature, function, getaddrinfo)","(call_feature, function, InternetGetConnectedState)","(call_feature, function, NtCreateEvent)","(call_feature, function, GetFileVersionInfoSizeW)","(call_feature, function, GetAdaptersAddresses)",...,"(call_feature, call_file, wininet.dll)","(call_feature, call_file, shfolder.dll)","(call_feature, call_file, cscdll.dll)","(call_feature, call_file, sxs.dll)","(call_feature, call_file, msvfw32.dll)","(call_feature, call_file, secur32.dll)","(registry_feature, regkey_written)","(registry_feature, regkey_opened)","(registry_feature, regkey_read)","(registry_feature, regkey_deleted)"
Keys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27,27,67,0,61,12,23,24,26,62,...,6,52,68,34,69,52,21,83,66,71
1,80,53,60,14,23,35,13,78,96,80,...,20,32,11,9,97,67,66,73,35,70
2,3,39,27,86,60,36,64,42,68,51,...,79,76,10,69,60,0,58,25,34,62


In [32]:
# The merged frame should have exactly the columns of both inputs.
print(
    call_feature_registry_feature_merge_df.shape[1]
    == call_feature_df.shape[1] + registry_feature_df.shape[1]
)

True


In [33]:
# Number of columns after adding the registry features.
call_feature_registry_feature_merge_df.shape[1]

357

### FILE_FEATURE -> FILE

In [34]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `file_feature_file`.
file_feature_file_columns = pd.MultiIndex.from_product(
    [[json_features_main[2]], [file_feature_keys[0]], file_feature_file]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 8) so the cell keeps working if features.json changes.
file_feature_file_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(file_feature_file_columns))),
    index=index,
    columns=file_feature_file_columns,
)

file_feature_file_df

Unnamed: 0_level_0,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature
Unnamed: 0_level_1,file,file,file,file,file,file,file,file
Unnamed: 0_level_2,file_opened,file_written,file_exists,file_moved,file_read,file_deleted,file_failed,file_copied
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
0,12,66,93,96,48,59,69,0
1,42,99,66,0,59,44,80,46
2,73,35,71,29,38,83,0,22


### FILE_FEATURE -> DIRECTORY

In [35]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `file_feature_directory`.
file_feature_directory_columns = pd.MultiIndex.from_product(
    [[json_features_main[2]], [file_feature_keys[1]], file_feature_directory]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 3) so the cell keeps working if features.json changes.
file_feature_directory_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(file_feature_directory_columns))),
    index=index,
    columns=file_feature_directory_columns,
)

file_feature_directory_df

Unnamed: 0_level_0,file_feature,file_feature,file_feature
Unnamed: 0_level_1,directory,directory,directory
Unnamed: 0_level_2,directory_enumerated,directory_created,directory_removed
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
0,30,65,79
1,16,84,72
2,6,35,28


### FILE_FEATURE -> (FILE, DIRECTORY) [MERGE]

In [36]:
# Join the two file_feature prototypes side by side on their shared row index.
file_feature_df = file_feature_file_df.merge(
    file_feature_directory_df,
    left_index=True,
    right_index=True,
)

display(file_feature_df)

Unnamed: 0_level_0,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature,file_feature
Unnamed: 0_level_1,file,file,file,file,file,file,file,file,directory,directory,directory
Unnamed: 0_level_2,file_opened,file_written,file_exists,file_moved,file_read,file_deleted,file_failed,file_copied,directory_enumerated,directory_created,directory_removed
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
0,12,66,93,96,48,59,69,0,30,65,79
1,42,99,66,0,59,44,80,46,16,84,72
2,73,35,71,29,38,83,0,22,6,35,28


In [37]:
# The merged frame should have exactly the columns of both inputs.
file_feature_df.shape[1] == file_feature_file_df.shape[1] + file_feature_directory_df.shape[1]

True

In [38]:
# Number of columns in the merged file_feature prototype.
file_feature_df.shape[1]

11

### CALL_FEATURE + REGISTRY_FEATURE + FILE_FEATURE [DATAFRAME MERGE]

In [39]:
# Append the file_feature columns to the call + registry columns, aligned on
# the shared row index.
call_feature_registry_feature_file_feature_merge_df = call_feature_registry_feature_merge_df.merge(
    file_feature_df,
    left_index=True,
    right_index=True,
)

display(call_feature_registry_feature_file_feature_merge_df)

Unnamed: 0_level_0,"(call_feature, function, NtOpenSection)","(call_feature, function, NtWaitForSingleObject)","(call_feature, function, GetAsyncKeyState)","(call_feature, function, NtDeleteValueKey)","(call_feature, function, WSARecv)","(call_feature, function, getaddrinfo)","(call_feature, function, InternetGetConnectedState)","(call_feature, function, NtCreateEvent)","(call_feature, function, GetFileVersionInfoSizeW)","(call_feature, function, GetAdaptersAddresses)",...,"(file_feature, file, file_written)","(file_feature, file, file_exists)","(file_feature, file, file_moved)","(file_feature, file, file_read)","(file_feature, file, file_deleted)","(file_feature, file, file_failed)","(file_feature, file, file_copied)","(file_feature, directory, directory_enumerated)","(file_feature, directory, directory_created)","(file_feature, directory, directory_removed)"
Keys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27,27,67,0,61,12,23,24,26,62,...,66,93,96,48,59,69,0,30,65,79
1,80,53,60,14,23,35,13,78,96,80,...,99,66,0,59,44,80,46,16,84,72
2,3,39,27,86,60,36,64,42,68,51,...,35,71,29,38,83,0,22,6,35,28


In [40]:
# The merged frame should have exactly the columns of its three components.
print(
    call_feature_registry_feature_file_feature_merge_df.shape[1]
    == sum(part.shape[1] for part in (call_feature_df, registry_feature_df, file_feature_df))
)

True


In [41]:
# Number of columns after adding the file features.
call_feature_registry_feature_file_feature_merge_df.shape[1]

368

### MISC_FEATURE -> MISC

In [42]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `misc_feature_misc`.
misc_feature_misc_columns = pd.MultiIndex.from_product(
    [[json_features_main[3]], [misc_feature_keys[0]], misc_feature_misc]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 3) so the cell keeps working if features.json changes.
misc_feature_misc_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(misc_feature_misc_columns))),
    index=index,
    columns=misc_feature_misc_columns,
)

misc_feature_misc_df

Unnamed: 0_level_0,misc_feature,misc_feature,misc_feature
Unnamed: 0_level_1,misc,misc,misc
Unnamed: 0_level_2,mutex,processes,processtree
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
0,36,21,33
1,90,64,65
2,58,8,0


### MISC_FEATURE -> COM_SIGN

In [43]:
# 3-level column labels (group, sub-group, feature name), one per entry of
# `misc_feature_com_sign`.
misc_feature_com_sign_columns = pd.MultiIndex.from_product(
    [[json_features_main[3]], [misc_feature_keys[1]], misc_feature_com_sign]
)

# Random placeholder values. The width is derived from the column index rather
# than hard-coded (was 112) so the cell keeps working if features.json changes.
misc_feature_com_sign_df = pd.DataFrame(
    np.random.randint(0, 100, (len(index), len(misc_feature_com_sign_columns))),
    index=index,
    columns=misc_feature_com_sign_columns,
)

misc_feature_com_sign_df

Unnamed: 0_level_0,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature
Unnamed: 0_level_1,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign
Unnamed: 0_level_2,recon_beacon,recon_checkip,mimics_agent,antiav_detectreg,packer_upx,packer_vmprotect,packer_armadillo_regkey,removes_zoneid_ads,antiemu_wine_func,network_tor,...,infostealer_keylog,multiple_useragents,bypass_firewall,origin_langid,process_needed,infostealer_ftp,bot_russkill,rat_fynloski_mutexes,antiemu_wine_reg,stealth_timeout
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,22,70,84,30,11,10,63,56,21,87,...,98,39,74,14,33,10,84,51,26,49
1,96,23,73,11,47,91,39,19,88,49,...,85,71,45,92,23,84,91,2,2,6
2,95,62,48,74,48,3,53,8,59,12,...,36,60,74,48,48,21,5,16,87,90


### MISC_FEATURE -> (MISC, COM_SIGN) [MERGE]

In [44]:
# Join the two misc_feature prototypes side by side on their shared row index.
misc_feature_df = misc_feature_misc_df.merge(
    misc_feature_com_sign_df,
    left_index=True,
    right_index=True,
)

display(misc_feature_df)

Unnamed: 0_level_0,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature,misc_feature
Unnamed: 0_level_1,misc,misc,misc,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign,com_sign
Unnamed: 0_level_2,mutex,processes,processtree,recon_beacon,recon_checkip,mimics_agent,antiav_detectreg,packer_upx,packer_vmprotect,packer_armadillo_regkey,...,infostealer_keylog,multiple_useragents,bypass_firewall,origin_langid,process_needed,infostealer_ftp,bot_russkill,rat_fynloski_mutexes,antiemu_wine_reg,stealth_timeout
Keys,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,36,21,33,22,70,84,30,11,10,63,...,98,39,74,14,33,10,84,51,26,49
1,90,64,65,96,23,73,11,47,91,39,...,85,71,45,92,23,84,91,2,2,6
2,58,8,0,95,62,48,74,48,3,53,...,36,60,74,48,48,21,5,16,87,90


In [45]:
# The merged frame should have exactly the columns of both inputs.
misc_feature_df.shape[1] == misc_feature_misc_df.shape[1] + misc_feature_com_sign_df.shape[1]

True

In [46]:
# Number of columns in the merged misc_feature prototype.
misc_feature_df.shape[1]

115

### CALL_FEATURE + REGISTRY_FEATURE + FILE_FEATURE + MISC_FEATURE [FINAL MERGE]

In [47]:
# Append the misc_feature columns to complete the prototype: call + registry +
# file + misc, aligned on the shared row index.
final_merge_df = call_feature_registry_feature_file_feature_merge_df.merge(
    misc_feature_df,
    left_index=True,
    right_index=True,
)

display(final_merge_df)

Unnamed: 0_level_0,"(call_feature, function, NtOpenSection)","(call_feature, function, NtWaitForSingleObject)","(call_feature, function, GetAsyncKeyState)","(call_feature, function, NtDeleteValueKey)","(call_feature, function, WSARecv)","(call_feature, function, getaddrinfo)","(call_feature, function, InternetGetConnectedState)","(call_feature, function, NtCreateEvent)","(call_feature, function, GetFileVersionInfoSizeW)","(call_feature, function, GetAdaptersAddresses)",...,"(misc_feature, com_sign, infostealer_keylog)","(misc_feature, com_sign, multiple_useragents)","(misc_feature, com_sign, bypass_firewall)","(misc_feature, com_sign, origin_langid)","(misc_feature, com_sign, process_needed)","(misc_feature, com_sign, infostealer_ftp)","(misc_feature, com_sign, bot_russkill)","(misc_feature, com_sign, rat_fynloski_mutexes)","(misc_feature, com_sign, antiemu_wine_reg)","(misc_feature, com_sign, stealth_timeout)"
Keys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27,27,67,0,61,12,23,24,26,62,...,98,39,74,14,33,10,84,51,26,49
1,80,53,60,14,23,35,13,78,96,80,...,85,71,45,92,23,84,91,2,2,6
2,3,39,27,86,60,36,64,42,68,51,...,36,60,74,48,48,21,5,16,87,90


In [48]:
# The final frame should have exactly the columns of its four components.
print(
    final_merge_df.shape[1]
    == sum(
        part.shape[1]
        for part in (call_feature_df, registry_feature_df, file_feature_df, misc_feature_df)
    )
)

True


In [49]:
# Total number of feature columns in the prototype.
final_merge_df.shape[1]

483

### DATASET DATAFRAME GENERATION

In [50]:
# Add an all-NaN `risk_value` column in front of the feature columns.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so guard it to keep this cell safe to re-run.
if 'risk_value' not in final_merge_df.columns:
    final_merge_df.insert(0, 'risk_value', np.nan)

final_merge_df

Unnamed: 0_level_0,risk_value,"(call_feature, function, NtOpenSection)","(call_feature, function, NtWaitForSingleObject)","(call_feature, function, GetAsyncKeyState)","(call_feature, function, NtDeleteValueKey)","(call_feature, function, WSARecv)","(call_feature, function, getaddrinfo)","(call_feature, function, InternetGetConnectedState)","(call_feature, function, NtCreateEvent)","(call_feature, function, GetFileVersionInfoSizeW)",...,"(misc_feature, com_sign, infostealer_keylog)","(misc_feature, com_sign, multiple_useragents)","(misc_feature, com_sign, bypass_firewall)","(misc_feature, com_sign, origin_langid)","(misc_feature, com_sign, process_needed)","(misc_feature, com_sign, infostealer_ftp)","(misc_feature, com_sign, bot_russkill)","(misc_feature, com_sign, rat_fynloski_mutexes)","(misc_feature, com_sign, antiemu_wine_reg)","(misc_feature, com_sign, stealth_timeout)"
Keys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,27,27,67,0,61,12,23,24,26,...,98,39,74,14,33,10,84,51,26,49
1,,80,53,60,14,23,35,13,78,96,...,85,71,45,92,23,84,91,2,2,6
2,,3,39,27,86,60,36,64,42,68,...,36,60,74,48,48,21,5,16,87,90


In [51]:
# Materialise the dense rows as a DataFrame; columns are positional for now.
df = pd.DataFrame(data=dataset_array)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,474,475,476,477,478,479,480,481,482,483
0,0.074074,3.0,,,,,,,,,...,,,,,,,,,,
1,0.684211,,,,,,,,,,...,,,,,,,,,,
2,0.814815,,,,,,,,,,...,,,,,,,,,,
3,0.814815,,,,,,1.0,,,,...,,,,,,,,,,
4,0.421053,,,,,,,,,,...,,,,,,,,,,
5,0.755102,,,,,,,,,,...,,,,,,,,,,
6,0.632653,,,,,,,,,,...,,,,,,,,,,
7,0.886792,,,,,,,,,,...,,,,,,,,,,
8,0.673469,,,,,,,,,,...,,,,,,,,,,
9,0.438596,2.0,,1.0,,,4.0,,,,...,,,,,,,,,,


In [52]:
# Replace the positional column labels with the prototype's labels.
# pandas requires both to have the same length, so this relies on the
# `risk_value` column having been inserted into `final_merge_df` beforehand.
df.columns = final_merge_df.columns
df

Unnamed: 0,risk_value,"(call_feature, function, NtOpenSection)","(call_feature, function, NtWaitForSingleObject)","(call_feature, function, GetAsyncKeyState)","(call_feature, function, NtDeleteValueKey)","(call_feature, function, WSARecv)","(call_feature, function, getaddrinfo)","(call_feature, function, InternetGetConnectedState)","(call_feature, function, NtCreateEvent)","(call_feature, function, GetFileVersionInfoSizeW)",...,"(misc_feature, com_sign, infostealer_keylog)","(misc_feature, com_sign, multiple_useragents)","(misc_feature, com_sign, bypass_firewall)","(misc_feature, com_sign, origin_langid)","(misc_feature, com_sign, process_needed)","(misc_feature, com_sign, infostealer_ftp)","(misc_feature, com_sign, bot_russkill)","(misc_feature, com_sign, rat_fynloski_mutexes)","(misc_feature, com_sign, antiemu_wine_reg)","(misc_feature, com_sign, stealth_timeout)"
0,0.074074,3.0,,,,,,,,,...,,,,,,,,,,
1,0.684211,,,,,,,,,,...,,,,,,,,,,
2,0.814815,,,,,,,,,,...,,,,,,,,,,
3,0.814815,,,,,,1.0,,,,...,,,,,,,,,,
4,0.421053,,,,,,,,,,...,,,,,,,,,,
5,0.755102,,,,,,,,,,...,,,,,,,,,,
6,0.632653,,,,,,,,,,...,,,,,,,,,,
7,0.886792,,,,,,,,,,...,,,,,,,,,,
8,0.673469,,,,,,,,,,...,,,,,,,,,,
9,0.438596,2.0,,1.0,,,4.0,,,,...,,,,,,,,,,


In [53]:
# Make sure the output directory exists before writing into it.
os.makedirs('dataframe', exist_ok=True)

# DataFrame.to_msgpack was deprecated in pandas 0.25 and removed in 1.0. Only
# call it where it still exists so Run All does not abort on newer pandas.
if hasattr(df, 'to_msgpack'):
    df.to_msgpack('dataframe/dataset-nan.msg')
else:
    print('DataFrame.to_msgpack is not available in pandas', pd.__version__,
          '- skipping the msgpack export')

In [54]:
#df.to_csv('dataset-nan.csv')

In [55]:
# Persist the NaN-padded dataset as a pickle for later loading.
df.to_pickle(path='dataframe/dataset-nan.pkl')