# 3 layers of detection on a few PE files
### AV1, AV2, AV3
### Hashes of files considered



In [102]:
import pandas as pd
df = pd.read_csv("data.csv")
print(df["sha256"])
print(df["sha256"].nunique(), "unique samples", len(df), "analyses")


0     6d810d55f0e23f579169db2eb78cad4fb8f038c5279411...
1     88a247b58d284dffaea7c0d7cde2a7979d9e2317e832a1...
2     d3eeb7953bd70e26355dc3d137f76ae8db88b7f77b99b2...
3     25b651798dd8863d2756417dcba34d1802b50a568ee3ef...
4     466059f14e8570d79611515b1f22758954e07c8275f356...
5     efa4e5ea8bb605462924b9a5db269eed6cd74b9a1487e4...
6     7b5b0ce909723174dfb5addb7a7a339604e7ee6c67f3c0...
7     44b422c25be5fc52261ca2f4c3781f9d37bae887af83bf...
8     c9d057c5c5b3989c8bc44cd014758cc1d5d1287419b6a4...
9     39ea74a8a18f900f51bc2be7c6897341ab867cb17370c6...
10    0d5ff3dfb41faab6fef24442df6738f1f9eb443690d63d...
11    e31429f190badb98819f6a9a87a0da37758aa97d8badaf...
12    47adfb751807fe3313aa8ac3ade27d3ac47a6ec4aa3f74...
13    a427cb7a0b453e547318f271d7517a1cb8923b631f5b48...
14    d5a3e2499ff189f96cc8f0142a731b81b4901155e687a5...
15    d5a3e2499ff189f96cc8f0142a731b81b4901155e687a5...
16    32821027d5af75ad14acdfc8efd4577623a57efc0d473e...
17    7fa2cede2f7bef08d7e5984a65bf8da6830f3c9c43

## Step 1: Static analysis
We have 2 static analysis AV engines: AV_1 and AV_2.

N.B. these detections are taken from real antivirus using historic scans and the labels for some of these samples have now changed.

In [86]:
# Check agreement 
df["AV_total"] = df[["AV_1", "AV_2"]].fillna(0).astype(int).max(axis=1)
print(df.groupby(["AV_1", "AV_2"])["AV_total"].count())

print(df[df.duplicated(subset=["sha256"], keep=False)].sort_values(by="sha256")[[
    "sha256", "AV_1", "AV_2", "AV_total"]])


AV_1   AV_2 
False  False    8
       True     4
True   False    2
       True     4
Name: AV_total, dtype: int64
                                               sha256   AV_1   AV_2  AV_total
13  a427cb7a0b453e547318f271d7517a1cb8923b631f5b48...  False  False         0
20  a427cb7a0b453e547318f271d7517a1cb8923b631f5b48...    NaN  False         0
14  d5a3e2499ff189f96cc8f0142a731b81b4901155e687a5...   True   True         1
15  d5a3e2499ff189f96cc8f0142a731b81b4901155e687a5...   True  False         1


In [87]:
# Eliminate samples with at least one detection
print(df[df["AV_total"] >= 1]["sha256"].nunique())

9


In [88]:
# At this point we can check how well the AVs are working... 
df["false_negative"] = (df["malware"] == 1) & (df["AV_total"] == 0)
df["false_positive"] = (df["malware"] == 0) & (df["AV_total"] == 1)
print(df[["AV_total", "malware", "false_negative", "false_positive"]])

    AV_total  malware  false_negative  false_positive
0          1        1           False           False
1          1        1           False           False
2          1        1           False           False
3          1        1           False           False
4          1        1           False           False
5          1        1           False           False
6          0        0           False           False
7          0        0           False           False
8          0        0           False           False
9          0        0           False           False
10         0        0           False           False
11         0        0           False           False
12         1        1           False           False
13         0        0           False           False
14         1        1           False           False
15         1        1           False           False
16         0        0           False           False
17         0        1       

# Step 2. Dynamic analysis 

All of these samples have been run in a (cuckoo) sandbox - let's:
1. Look at the signatures
2. Use a heuristic model to classsify the samples

In [89]:
# Let's look at some signatures (from cuckoo sandbox)
# The names give a reasonable clue as to the signature

dynamic_signatures = ['antivm_memory_available', 'infostealer_browser', 'suspicious_process', 
              'stealth_window', 'installs_bho', 'infostealer_mail',
              'persistence_autorun', 'antivm_queries_computername', 'antisandbox_mouse_hook', 
              'infostealer_keylogger', 'stealth_hiddenfile', 'stops_service', 
              'antivm_network_adapters', 'deletes_self', 'uses_windows_utilities', 
              'creates_service', 'creates_doc', 'creates_shortcut', 'ransomware_dropped_files', 
              'dropper', 'persistence_ads', 'exploit_heapspray', 'has_wmi', 'infostealer_im', 
              'generates_crypto_key', 'browser_startpage', 'ransomware_message', 'timeGetTime', 
              'multiple_useragents', 'antisandbox_idletime']

In [90]:
# look at the postive match signatures for the false negative
print(df[df["false_negative"]][signatures].dropna(axis=1))

    antivm_memory_available  allocates_rwx  suspicious_process  \
17                      1.0            1.0                 2.0   

    stealth_window  infostealer_mail  deletes_self  persistence_ads  \
17             1.0              48.0           2.0              1.0   

    ransomware_message  timeGetTime  
17                 1.0        422.0  


In [91]:
# features for heuristic model
api_calls = ['AssignProcessToJobObject', 'CertControlStore', 'CertOpenStore', 'CoCreateInstance', 
             'CoCreateInstanceEx', 'CoGetClassObject', 'CoInitializeEx', 'CoInitializeSecurity', 
             'CoUninitialize', 'ControlService', 'CopyFileA', 'CopyFileW', 'CreateActCtxW', 'CreateDirectoryW', 
             'CreateJobObjectW', 'CreateProcessInternalW', 'CreateServiceW', 'CreateThread', 
             'CreateToolhelp32Snapshot', 'CryptAcquireContextA', 'CryptAcquireContextW', 'CryptCreateHash', 
             'CryptDecodeObjectEx', 'CryptExportKey', 'CryptHashData', 'CryptUnprotectData', 'DeleteFileW', 
             'DeviceIoControl', 'DrawTextExW', 'EnumWindows', 'FindFirstFileExW', 'FindResourceA', 
             'FindResourceExA', 'FindResourceExW', 'FindResourceW', 'FindWindowExW', 'FindWindowW', 
             'GetAdaptersAddresses', 'GetAdaptersInfo', 'GetBestInterfaceEx', 'GetComputerNameA', 
             'GetComputerNameW', 'GetCursorPos', 'GetFileAttributesExW', 'GetFileAttributesW', 
             'GetFileInformationByHandle', 'GetFileInformationByHandleEx', 'GetFileSize', 'GetFileSizeEx', 
             'GetFileType', 'GetFileVersionInfoSizeW', 'GetFileVersionInfoW', 'GetForegroundWindow', 
             'GetKeyState', 'GetNativeSystemInfo', 'GetShortPathNameW', 'GetSystemDirectoryA', 
             'GetSystemDirectoryW', 'GetSystemInfo', 'GetSystemMetrics', 'GetSystemTimeAsFileTime', 
             'GetSystemWindowsDirectoryA', 'GetSystemWindowsDirectoryW', 'GetTempPathW', 'GetTimeZoneInformation',
             'GetUserNameA', 'GetUserNameW', 'GetVolumeNameForVolumeMountPointW', 'GetVolumePathNameW', 
             'GetVolumePathNamesForVolumeNameW', 'GlobalMemoryStatus', 'GlobalMemoryStatusEx', 'HttpOpenRequestW', 
             'HttpSendRequestW', 'IWbemServices_ExecQuery', 'InternetCloseHandle', 'InternetConnectW', 
             'InternetCrackUrlA', 'InternetCrackUrlW', 'InternetGetConnectedState', 'InternetOpenW', 
             'InternetQueryOptionA', 'InternetSetOptionA', 'InternetSetStatusCallback', 'IsDebuggerPresent', 
             'LdrGetDllHandle', 'LdrGetProcedureAddress', 'LdrLoadDll', 'LdrUnloadDll', 'LoadResource', 
             'LoadStringA', 'LoadStringW', 'LookupAccountSidW', 'LookupPrivilegeValueW', 
             'NtAllocateVirtualMemory', 'NtClose', 'NtCreateFile', 'NtCreateKey', 'NtCreateMutant', 
             'NtCreateSection', 'NtCreateThreadEx', 'NtDelayExecution', 'NtDeviceIoControlFile', 
             'NtDuplicateObject', 'NtEnumerateKey', 'NtEnumerateValueKey', 'NtFreeVirtualMemory', 
             'NtMapViewOfSection', 'NtOpenDirectoryObject', 'NtOpenFile', 'NtOpenKey', 'NtOpenKeyEx', 
             'NtOpenMutant', 'NtOpenProcess', 'NtOpenSection', 'NtOpenThread', 'NtProtectVirtualMemory', 
             'NtQueryAttributesFile', 'NtQueryDirectoryFile', 'NtQueryFullAttributesFile', 
             'NtQueryInformationFile', 'NtQueryKey', 'NtQuerySystemInformation', 'NtQueryValueKey', 'NtReadFile', 
             'NtReadVirtualMemory', 'NtResumeThread', 'NtSetInformationFile', 'NtSetValueKey', 
             'NtTerminateProcess', 'NtUnmapViewOfSection', 'NtWriteFile', 'NtWriteVirtualMemory', 
             'OleInitialize', 'OpenSCManagerA','OpenSCManagerW', 'OpenServiceA', 'OpenServiceW', 
             'Process32FirstW', 'Process32NextW','ReadProcessMemory', 'RegCloseKey', 'RegCreateKeyExA', 
             'RegCreateKeyExW', 'RegDeleteKeyA', 'RegDeleteKeyW', 'RegDeleteValueA', 'RegDeleteValueW', 
             'RegEnumKeyExA', 'RegEnumKeyExW', 'RegEnumKeyW', 'RegEnumValueA', 'RegEnumValueW', 
             'RegOpenKeyExA', 'RegOpenKeyExW', 'RegQueryInfoKeyW', 'RegQueryValueExA', 'RegQueryValueExW', 
             'RegSetValueExA', 'RegSetValueExW', 'RemoveDirectoryA', 'RtlAddVectoredContinueHandler', 
             'RtlAddVectoredExceptionHandler', 'RtlRemoveVectoredExceptionHandler', 'SHGetFolderPathW',
             'SHGetSpecialFolderLocation', 'SearchPathW', 'SendNotifyMessageA', 
             'SendNotifyMessageW', 'SetEndOfFile', 'SetErrorMode', 'SetFileAttributesW', 'SetFilePointer', 
             'SetFilePointerEx', 'SetFileTime', 'SetInformationJobObject', 'SetUnhandledExceptionFilter', 
             'SetWindowsHookExA', 'SetWindowsHookExW', 'ShellExecuteExW', 'SizeofResource', 'StartServiceA', 
             'StartServiceW', 'UnhookWindowsHookEx', 'UuidCreate', 'WSAConnect', 'WSASocketW', 'WSAStartup', 
             'WriteConsoleA', 'WriteConsoleW']


In [95]:
df[api_calls] = df[api_calls].fillna(0)
print(df.groupby("malware")[api_calls].mean())

         AssignProcessToJobObject  CertControlStore  CertOpenStore  \
malware                                                              
0                        0.000000          0.000000       0.000000   
1                        0.181818          0.181818       5.090909   

         CoCreateInstance  CoCreateInstanceEx  CoGetClassObject  \
malware                                                           
0                1.272727            0.000000          0.000000   
1                2.727273            0.181818          0.363636   

         CoInitializeEx  CoInitializeSecurity  CoUninitialize  ControlService  \
malware                                                                         
0              5.272727              0.090909        3.727273        0.000000   
1              4.636364              0.272727        2.545455        0.181818   

         ...  SizeofResource  StartServiceA  StartServiceW  \
malware  ...                                                 
0

In [99]:
# We have a machine learning model (gradient boosted machine) api call frequencies 
# achieved 92% accuracy with no feature selection (but test samples not split by date)

import pickle
with open("dynamic_classifier.pkl", "rb") as f:
    gbm_model = pickle.load(f)

df["dynamic_classifier"] = gbm_model.predict(df[api_calls])

df["false_negative"] = (df["malware"] == 1) & (df["dynamic_classifier"] == 0)
df["false_positive"] = (df["malware"] == 0) & (df["dynamic_classifier"] == 1)
print(df[["dynamic_classifier", "malware", "false_negative", "false_positive"]])
print("accuracy:", (df["dynamic_classifier"] == df["malware"]).mean().round(4) * 100, 
      "FPR", df["false_negative"].mean().round(4) * 100, 
      "FNR",  df["false_positive"].mean().round(4) * 100)

    dynamic_classifier  malware  false_negative  false_positive
0                  1.0        1           False           False
1                  1.0        1           False           False
2                  1.0        1           False           False
3                  1.0        1           False           False
4                  1.0        1           False           False
5                  1.0        1           False           False
6                  0.0        0           False           False
7                  0.0        0           False           False
8                  0.0        0           False           False
9                  0.0        0           False           False
10                 0.0        0           False           False
11                 1.0        0           False            True
12                 0.0        1            True           False
13                 0.0        0           False           False
14                 1.0        1         

## Combine static and dynamic

In [101]:
df["depth_classifier"] = df[["AV_total", "dynamic_classifier"]].max(axis=1)

df["false_negative"] = (df["malware"] == 1) & (df["depth_classifier"] == 0)
df["false_positive"] = (df["malware"] == 0) & (df["depth_classifier"] == 1)
print(df[["dynamic_classifier", "AV_total", "malware", "false_negative", "false_positive"]])
print("accuracy:", (df["depth_classifier"] == df["malware"]).mean().round(4) * 100, 
      "FPR", df["false_negative"].mean().round(4) * 100, 
      "FNR",  df["false_positive"].mean().round(4) * 100)

    dynamic_classifier  AV_total  malware  false_negative  false_positive
0                  1.0         1        1           False           False
1                  1.0         1        1           False           False
2                  1.0         1        1           False           False
3                  1.0         1        1           False           False
4                  1.0         1        1           False           False
5                  1.0         1        1           False           False
6                  0.0         0        0           False           False
7                  0.0         0        0           False           False
8                  0.0         0        0           False           False
9                  0.0         0        0           False           False
10                 0.0         0        0           False           False
11                 1.0         0        0           False            True
12                 0.0         1      