* Asking the right questions
* Preparing the data
* Selecting the algorithm
* Training the model
* Testing the model
Is a PE file Malware or Not Malware?
Hotdog or Not Hotdog?
Attempt to use the machine learning workflow to process and transform sampled PE file data to create a prediction model. Using the generated data, predict with 65% accuracy which PE files are likely to be classified as malware.
http://resources.infosecinstitute.com/machine-learning-malware-detection/
https://app.pluralsight.com/library/courses/python-understanding-machine-learning/exercise-files
Based off of the research "Selecting Features to Classify Malware", we are interested in extracting the following fields of a PE File:
-
Major Image Version: Used to indicate the major version number of the application; in Microsoft Excel version 4.0, it would be 4.
-
Virtual Adress and Size of the IMAGE_DATA_DIRECTORY
-
OS Version (may not give much)
-
Import Adress Table Adress
-
Ressources Size
-
Number Of Sections (we should look into section names)
-
Linker Version (may not give much)
-
Size of Stack Reserve
-
DLL Characteristics
-
Export Table Size and Adress
-
Address of Entry Point
-
Image Base
-
Number Of Import DLL
-
Number Of Import Functions
-
Number Of Sections
Included in the dataset but not used:
DLL name and Imported Symbols (we might be able to create a weighted score to use with this info?)
filename
Stuff to include: DebugSize DebugRVA ImageVersion OperatingSystemVersion SizeOfStackReserve LinkerVersion DllCharacteristics IatRVA ExportSize ExportRVA ExportNameLen ResourceSize ExportFunctionsCount
Pandas - provided data frames
matplotlib.pyplot - plotting support
import os
import pefile
import pprint as pp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import csv
import glob
import magic
import hashlib
import sys
import struct
import peutils
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
To make our code more organized let’s start by creating a class that represents the PE File information as one object. We are using the python module pefile which is a multi-platform Python module to parse and work with Portable Executable (aka PE) files. https://github.com/erocarrera/pefile
def sha256_checksum(filename, block_size=65536):
sha256 = hashlib.sha256()
with open(filename, 'rb') as f:
for block in iter(lambda: f.read(block_size), b''):
sha256.update(block)
return sha256.hexdigest()
class PEFile:
"""
This Class is constructed by parsing the pe file for the interesting features
each pe file is an object by itself and we extract the needed information
into a dictionary
"""
# look to add PEid signatures to detect packers
# https://github.com/erocarrera/pefile/blob/wiki/PEiDSignatures.md
# signatures = peutils.SignatureDatabase('./userdb.txt')
def __init__(self, filename):
self.pe = pefile.PE(filename, fast_load=True)
self.filename = filename
self.DebugSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size
self.DebugRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].VirtualAddress
self.ImageVersion = self.pe.OPTIONAL_HEADER.MajorImageVersion
self.OSVersion = self.pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
self.ExportRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].VirtualAddress
self.ExportSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].Size
self.IATRVA = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[12].VirtualAddress
self.ResSize = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size
self.LinkerVersion = self.pe.OPTIONAL_HEADER.MajorLinkerVersion
self.NumberOfSections = self.pe.FILE_HEADER.NumberOfSections
self.StackReserveSize = self.pe.OPTIONAL_HEADER.SizeOfStackReserve
self.Dll = self.pe.OPTIONAL_HEADER.DllCharacteristics
self.AddressOfEntryPoint = self.pe.OPTIONAL_HEADER.AddressOfEntryPoint
self.ImageBase = self.pe.OPTIONAL_HEADER.ImageBase
# If the PE file was loaded using the fast_load=True argument, we will need to parse the data directories:
self.pe.parse_data_directories()
imported_dll = {}
number_dll = 0
try:
for entry in self.pe.DIRECTORY_ENTRY_IMPORT:
if entry is not None:
#print(entry.dll)
number_dll += 1
for imp in entry.imports:
#print('\t', hex(imp.address), imp.name)
if imp.name is not None:
#print(imp.name.decode())
imported_dll[entry.dll.decode()] = imp.name.decode()
except:
pass#print("[-]")
self.ImportedDLL = imported_dll
self.NumberOfImportDLL = number_dll
section_names = {}
number_sections = 0
try:
for section in self.pe.sections:
number_sections += 1
#print (section.Name, hex(section.VirtualAddress), hex(section.Misc_VirtualSize), section.SizeOfRawData )
section_names[section.Name.decode()] = section.SizeOfRawData
self.SectionNames = section_names
self.NumberOfSections = number_sections
except:
pass#print("[-]")
number_import_functions = 0
import_function = []
try:
if self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT']].VirtualAddress != 0:
self.pe.parse_data_directories(directories=[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT']])
for entry in self.pe.DIRECTORY_ENTRY_IMPORT:
for imp in entry.imports:
#print('\t', hex(imp.address), imp.name)
if imp.name:
number_import_functions += 1
import_function.append(imp.name.decode())
except:
pass#print("[-]")
self.NumberOfImportFunctions = number_import_functions
self.ImportedFunctions = import_function
def Construct(self):
sample = {}
for attr, k in self.__dict__.items():
if(attr != "pe"):
sample[attr] = k
return sample
Now we move on to write a small method that constructs a dictionnary for each PE File thus each sample will be represented as a python dictionnary where keys are the features and values are the value of each parsed field .
def pe2vec(directory):
"""
dirty function (handling all exceptions) for each sample
it construct a dictionary of dictionaries in the format:
sample x : pe informations
"""
dataset = {}
#directory = "./data/"
print("")
print("[*] Extracting the PE file data: ")
print("")
for subdir, dirs, files in os.walk(directory):
for f in files:
file_path = os.path.join(subdir, f)
#print(magic.from_file(file_path))
if re.match('^PE.*', magic.from_file(file_path)):
try:
#print("[+] "+file_path)
pe = PEFile(file_path)
dataset[str(f)] = pe.Construct()
except Exception as e:
raise
else:
#print()
#print("[-] File not PE: "+file_path+" \n=>\t "+magic.from_file(file_path))
#print()
#print("Removing file.")
os.remove(file_path)
return dataset
Print out the dataset for the Malware and Clean Samples
for subdir, dirs, files in os.walk("./data/malware/"):
for f in files:
#print(f)
os.rename("./data/malware/"+f, "./data/malware/"+sha256_checksum("./data/malware/"+f))
print("Done Renaming files to sha256")
Done Renaming files to sha256
# create a dataset dictionary from the collected PE file information
# we might consider adding an MD5 hash value and aappend it to a new column
# an asscoiated MD5 value could be used to lookup the VirusToltal score to confirm it is Malware if we did not get the sample from a good source
ds_malware = {}
ds_malware = pe2vec("./data/malware")
print("")
print("[*] Malware PE information:")
print("")
#pp.pprint(ds_malware)
print("")
print("[*] Completed Malware PE information extraction:")
print("")
[*] Extracting the PE file data:
[*] Malware PE information:
[*] Completed Malware PE information extraction:
for subdir, dirs, files in os.walk("./data/clean/"):
for f in files:
#print(f)
os.rename("./data/clean/"+f, "./data/clean/"+sha256_checksum("./data/clean/"+f))
print("Done Renaming files to sha256")
Done Renaming files to sha256
ds_clean = {}
ds_clean = pe2vec("./data/clean")
print("")
print("[*] Clean PE information:")
print("")
#pp.pprint(ds_clean)
print("")
print("[*] Completed Clean PE information extraction:")
print("")
[*] Extracting the PE file data:
[*] Clean PE information:
[*] Completed Clean PE information extraction:
Loop trough all samples in a folder and process each one of them then dump all those dictionaries into a csv file that we will use .
# now that we have a dictionary let's put it in a clean csv file
def vec2csv(dataset, output_file):
df = pd.DataFrame(dataset)
test_data = df.transpose() # transpose to have the features as columns and samples as rows
# utf-8 is prefered
#output_file = './output/dataset.csv'
test_data.to_csv(output_file,sep=',', encoding='utf-8')
print("")
print("[+] Saving file to: " + output_file)
print("")
datasetOutput_malware = "./output/dataset_malware.csv"
vec2csv(ds_malware, datasetOutput_malware)
datasetOutput_clean = "./output/dataset_clean.csv"
vec2csv(ds_clean, datasetOutput_clean)
[+] Saving file to: ./output/dataset_malware.csv
[+] Saving file to: ./output/dataset_clean.csv
Use Magic %matplotlib to display graphics inline instead of in a popup window.
%matplotlib inline
# get the dataframe
df_malware = pd.read_csv("./output/dataset_malware.csv")
df_malware.shape
df_malware.head(5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Unnamed: 0 | AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | ... | ImportedFunctions | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | SectionNames | StackReserveSize | filename | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0004cec68fdb95507c6161d84e4965db60f997a679ce20... | 1073962461 | 0 | 0 | 32768 | 0 | 0 | 155648 | 4194304 | 0 | ... | ['GetPrivateProfileSectionW', 'CopyFileW', 'Se... | 8 | 14 | 253 | 5 | 4 | 10856 | {'.text\x00\x00\x00': 148992, '.rdata\x00\x00'... | 1048576 | ./data/malware/0004cec68fdb95507c6161d84e4965d... |
1 | 0024eec62931670946abd4240d38127e23b4c0c9321de4... | 37296 | 24720 | 28 | 0 | 0 | 0 | 24576 | 65536 | 6 | ... | ['StorPortPauseDevice', 'StorPortGetDeviceBase... | 9 | 2 | 16 | 7 | 6 | 1536 | {'.text\x00\x00\x00': 18432, '.rdata\x00\x00':... | 262144 | ./data/malware/0024eec62931670946abd4240d38127... |
2 | 00a02d154e7389d3a5fe572e9800f1628e74b8aabe4270... | 1073916548 | 0 | 0 | 32768 | 0 | 0 | 114688 | 5368709120 | 0 | ... | ['LoadBITMAP', 'LoadSTRINGW', 'LoadICON', 'Loa... | 9 | 9 | 179 | 5 | 5 | 2552 | {'.text\x00\x00\x00': 107520, '.rdata\x00\x00'... | 1048576 | ./data/malware/00a02d154e7389d3a5fe572e9800f16... |
3 | 00aac566d9664b844e5d7ae641c58131ce59deced31223... | 51656 | 5424 | 28 | 32832 | 0 | 0 | 4096 | 4294967296 | 6 | ... | ['OpenProcessToken', 'GetTokenInformation', 'R... | 9 | 7 | 151 | 5 | 6 | 149376 | {'.text\x00\x00\x00': 55296, '.data\x00\x00\x0... | 524288 | ./data/malware/00aac566d9664b844e5d7ae641c5813... |
4 | 00cb557ec3c36d07f27e264dd6bffb6c858a3d9568878d... | 39104 | 9360 | 28 | 32768 | 0 | 0 | 8192 | 4294967296 | 5 | ... | [] | 8 | 0 | 0 | 4 | 5 | 3492 | {'.text\x00\x00\x00': 57344, '.data\x00\x00\x0... | 524288 | ./data/malware/00cb557ec3c36d07f27e264dd6bffb6... |
5 rows × 21 columns
# histograms
df_malware['NumberOfImportFunctions'].hist()
plt.title(r'Histogram of the Number of Imported Functions')
plt.xlabel('NumberOfImportFunctions')
plt.ylabel('Executables')
plt.show()
df_malware['NumberOfSections'].hist()
plt.title(r'Histogram of the Number of Sections')
plt.xlabel('NumberOfSections')
plt.ylabel('Executables')
plt.show()
df_clean = pd.read_csv("./output/dataset_clean.csv")
df_clean.shape
df_clean.head(5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Unnamed: 0 | AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | ... | ImportedFunctions | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | SectionNames | StackReserveSize | filename | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00070b0d4cb037c40d5d2464f92841aeb9ad863472bf95... | 21704 | 4880 | 28 | 0 | 0 | 0 | 4096 | 4194304 | 1 | ... | ['__vbaVarTstGt', '__vbaVarSub', '__vbaStrI2',... | 6 | 1 | 139 | 3 | 4 | 2184 | {'.text\x00\x00\x00': 1179648, '.data\x00\x00\... | 1048576 | ./data/clean/00070b0d4cb037c40d5d2464f92841aeb... |
1 | 00696555cbf6db83af785f8acb2270b9411cfc75e7f6d3... | 29424 | 4256 | 28 | 49472 | 32576 | 163 | 36864 | 4194304 | 6 | ... | ['_except_handler4_common', '_controlfp', '?te... | 11 | 21 | 114 | 6 | 6 | 2008 | {'.text\x00\x00\x00': 28672, '.data\x00\x00\x0... | 262144 | ./data/clean/00696555cbf6db83af785f8acb2270b94... |
2 | 007247436f041ca59c5ee0e8636c668c2a43376aeb8cfa... | 227872 | 82400 | 84 | 49472 | 0 | 0 | 3522560 | 4194304 | 10 | ... | ['CryptAcquireContextW', 'CryptCreateHash', 'C... | 12 | 8 | 157 | 5 | 6 | 1892 | {'.text\x00\x00\x00': 246272, '.data\x00\x00\x... | 4194304 | ./data/clean/007247436f041ca59c5ee0e8636c668c2... |
3 | 007bdab757d03d94e60c9b1e3eec13b07562705c514992... | 10656 | 4320 | 28 | 49472 | 0 | 0 | 20480 | 4194304 | 6 | ... | ['??3@YAXPAX@Z', '_controlfp', '?terminate@@YA... | 11 | 12 | 46 | 5 | 6 | 15632 | {'.text\x00\x00\x00': 8704, '.data\x00\x00\x00... | 262144 | ./data/clean/007bdab757d03d94e60c9b1e3eec13b07... |
4 | 008fa2b9697f9a173e40572face100410e51975e34a5ce... | 152696 | 4224 | 28 | 33120 | 0 | 0 | 200704 | 5368709120 | 6 | ... | ['GetFileType', 'GetExitCodeProcess', 'CreateP... | 11 | 7 | 144 | 6 | 6 | 28384 | {'.text\x00\x00\x00': 181248, '.data\x00\x00\x... | 524288 | ./data/clean/008fa2b9697f9a173e40572face100410... |
5 rows × 21 columns
# histograms
df_clean['NumberOfImportFunctions'].hist()
plt.title(r'Histogram of the Number of Imported Functions')
plt.xlabel('NumberOfImportFunctions')
plt.ylabel('Executables')
plt.show()
df_clean['NumberOfSections'].hist()
plt.title(r'Histogram of the Number of Sections')
plt.xlabel('NumberOfSections')
plt.ylabel('Executables')
plt.show()
Helper function that displays correlation by color. Red is most correlated, Blue least.
def plot_corr(df, size=15):
"""
Function plots a graphical correlation matrix for each pair of columns in the dataframe.
Input:
df: pandas DataFrame
size: vertical and horizontal size of the plot
Displays:
matrix of correlation between columns. Blue-cyan-yellow-red-darkred => less to more correlated
0 ------------------> 1
Expect a darkred line running from top left to bottom right
"""
corr = df.corr() # data frame correlation function
fig, ax = plt.subplots(figsize=(size, size))
ax.matshow(corr) # color code the rectangles by correlation value
plt.xticks(range(len(corr.columns)), corr.columns) # draw x tick marks
plt.yticks(range(len(corr.columns)), corr.columns) # draw y tick marks
def correlation(dataset, threshold):
col_corr = set() # Set of all the names of deleted columns
corr_matrix = dataset.corr()
for i in range(len(corr_matrix.columns)):
for j in range(i):
if corr_matrix.iloc[i, j] >= threshold:
colname = corr_matrix.columns[i] # getting the name of column
col_corr.add(colname)
if colname in dataset.columns:
del dataset[colname] # deleting the column from the dataset
print(dataset)
df_malware.corr()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | StackReserveSize | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AddressOfEntryPoint | 1.000000 | 0.035230 | -0.090583 | 0.116372 | -0.005256 | -0.025972 | -0.013244 | -0.061953 | -0.015695 | -0.023625 | 0.074893 | 0.039400 | -0.106693 | -0.265357 | -0.048695 | -0.003774 |
DebugRVA | 0.035230 | 1.000000 | 0.230066 | -0.077622 | 0.019823 | -0.023057 | 0.226593 | -0.023712 | -0.011859 | 0.056641 | -0.018453 | 0.035660 | 0.290906 | -0.058632 | 0.001549 | -0.023430 |
DebugSize | -0.090583 | 0.230066 | 1.000000 | -0.120497 | -0.032691 | 0.031825 | -0.087493 | -0.086138 | -0.035535 | 0.108552 | -0.109140 | -0.178110 | 0.095590 | 0.464331 | 0.032082 | -0.277207 |
Dll | 0.116372 | -0.077622 | -0.120497 | 1.000000 | 0.010906 | 0.031482 | -0.022954 | 0.682349 | 0.006064 | 0.054148 | 0.276413 | 0.216764 | -0.570699 | -0.182660 | 0.071357 | 0.122206 |
ExportRVA | -0.005256 | 0.019823 | -0.032691 | 0.010906 | 1.000000 | 0.001543 | 0.897566 | 0.027485 | -0.002995 | -0.000331 | 0.080676 | -0.013044 | 0.115386 | -0.011312 | -0.005866 | -0.004723 |
ExportSize | -0.025972 | -0.023057 | 0.031825 | 0.031482 | 0.001543 | 1.000000 | -0.012096 | 0.024463 | -0.000733 | -0.012812 | 0.013740 | 0.002069 | -0.030079 | 0.077484 | 0.226113 | -0.023774 |
IATRVA | -0.013244 | 0.226593 | -0.087493 | -0.022954 | 0.897566 | -0.012096 | 1.000000 | 0.026367 | -0.002431 | 0.011505 | 0.184731 | 0.119592 | 0.177349 | -0.056255 | -0.006979 | 0.043554 |
ImageBase | -0.061953 | -0.023712 | -0.086138 | 0.682349 | 0.027485 | 0.024463 | 0.026367 | 1.000000 | -0.056380 | 0.069552 | 0.249721 | 0.237441 | -0.311564 | 0.075914 | 0.049947 | 0.157359 |
ImageVersion | -0.015695 | -0.011859 | -0.035535 | 0.006064 | -0.002995 | -0.000733 | -0.002431 | -0.056380 | 1.000000 | -0.003820 | 0.039075 | 0.002036 | 0.028018 | -0.032553 | -0.003823 | -0.008052 |
LinkerVersion | -0.023625 | 0.056641 | 0.108552 | 0.054148 | -0.000331 | -0.012812 | 0.011505 | 0.069552 | -0.003820 | 1.000000 | 0.095095 | 0.139848 | 0.017280 | 0.127573 | 0.021913 | -0.025194 |
NumberOfImportDLL | 0.074893 | -0.018453 | -0.109140 | 0.276413 | 0.080676 | 0.013740 | 0.184731 | 0.249721 | 0.039075 | 0.095095 | 1.000000 | 0.894713 | -0.012561 | -0.073946 | 0.058651 | 0.233540 |
NumberOfImportFunctions | 0.039400 | 0.035660 | -0.178110 | 0.216764 | -0.013044 | 0.002069 | 0.119592 | 0.237441 | 0.002036 | 0.139848 | 0.894713 | 1.000000 | 0.055095 | -0.113983 | 0.032964 | 0.322661 |
NumberOfSections | -0.106693 | 0.290906 | 0.095590 | -0.570699 | 0.115386 | -0.030079 | 0.177349 | -0.311564 | 0.028018 | 0.017280 | -0.012561 | 0.055095 | 1.000000 | 0.155466 | -0.045400 | -0.038067 |
OSVersion | -0.265357 | -0.058632 | 0.464331 | -0.182660 | -0.011312 | 0.077484 | -0.056255 | 0.075914 | -0.032553 | 0.127573 | -0.073946 | -0.113983 | 0.155466 | 1.000000 | 0.069099 | -0.269090 |
ResSize | -0.048695 | 0.001549 | 0.032082 | 0.071357 | -0.005866 | 0.226113 | -0.006979 | 0.049947 | -0.003823 | 0.021913 | 0.058651 | 0.032964 | -0.045400 | 0.069099 | 1.000000 | -0.033629 |
StackReserveSize | -0.003774 | -0.023430 | -0.277207 | 0.122206 | -0.004723 | -0.023774 | 0.043554 | 0.157359 | -0.008052 | -0.025194 | 0.233540 | 0.322661 | -0.038067 | -0.269090 | -0.033629 | 1.000000 |
plot_corr(df_malware)
#correlation(df_malware, .99)
#plot_corr(df_malware)
print("# rows in dataframe {0}".format(len(df_malware)))
print("# rows missing AddressOfEntryPoint: {0}".format(len(df_malware.loc[df_malware['AddressOfEntryPoint'] == 0])))
print("# rows missing DebugRVA: {0}".format(len(df_malware.loc[df_malware['DebugRVA'] == 0])))
print("# rows missing DebugSize: {0}".format(len(df_malware.loc[df_malware['DebugSize'] == 0])))
print("# rows missing Dll: {0}".format(len(df_malware.loc[df_malware['Dll'] == 0])))
print("# rows missing ExportRVA: {0}".format(len(df_malware.loc[df_malware['ExportRVA'] == 0])))
print("# rows missing ExportSize: {0}".format(len(df_malware.loc[df_malware['ExportSize'] == 0])))
print("# rows missing IATRVA: {0}".format(len(df_malware.loc[df_malware['IATRVA'] == 0])))
print("# rows missing ImageBase: {0}".format(len(df_malware.loc[df_malware['ImageBase'] == 0])))
print("# rows missing ImageVersion: {0}".format(len(df_malware.loc[df_malware['ImageVersion'] == 0])))
print("# rows missing LinkerVersion: {0}".format(len(df_malware.loc[df_malware['LinkerVersion'] == 0])))
print("# rows missing NumberOfSections: {0}".format(len(df_malware.loc[df_malware['NumberOfSections'] == 0])))
print("# rows missing OSVersion: {0}".format(len(df_malware.loc[df_malware['OSVersion'] == 0])))
print("# rows missing ResSize: {0}".format(len(df_malware.loc[df_malware['ResSize'] == 0])))
print("# rows missing StackReserveSize: {0}".format(len(df_malware.loc[df_malware['StackReserveSize'] == 0])))
print("# rows missing NumberOfImportDLL: {0}".format(len(df_malware.loc[df_malware['NumberOfImportDLL'] == 0])))
print("# rows missing NumberOfImportFunctions: {0}".format(len(df_malware.loc[df_malware['NumberOfImportFunctions'] == 0])))
print("# rows missing NumberOfSections: {0}".format(len(df_malware.loc[df_malware['NumberOfSections'] == 0])))
# rows in dataframe 995
# rows missing AddressOfEntryPoint: 3
# rows missing DebugRVA: 267
# rows missing DebugSize: 267
# rows missing Dll: 141
# rows missing ExportRVA: 910
# rows missing ExportSize: 910
# rows missing IATRVA: 19
# rows missing ImageBase: 0
# rows missing ImageVersion: 478
# rows missing LinkerVersion: 0
# rows missing NumberOfSections: 6
# rows missing OSVersion: 0
# rows missing ResSize: 64
# rows missing StackReserveSize: 0
# rows missing NumberOfImportDLL: 147
# rows missing NumberOfImportFunctions: 147
# rows missing NumberOfSections: 6
df_clean.corr()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | StackReserveSize | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AddressOfEntryPoint | 1.000000 | 0.376455 | -0.001188 | -0.046729 | 0.686100 | 0.159009 | 0.378447 | -0.017464 | 0.048777 | 0.082353 | 0.120774 | 0.195789 | -0.026106 | -0.013296 | 0.130714 | 0.085269 |
DebugRVA | 0.376455 | 1.000000 | 0.235510 | -0.030202 | 0.165931 | 0.021710 | 0.550086 | 0.124848 | -0.018012 | 0.158134 | 0.069459 | 0.363659 | -0.050907 | -0.039534 | 0.032462 | 0.068129 |
DebugSize | -0.001188 | 0.235510 | 1.000000 | 0.386591 | -0.001758 | 0.043174 | 0.132536 | 0.108120 | 0.027022 | 0.489312 | 0.062968 | 0.069459 | -0.175783 | 0.499236 | 0.047429 | -0.190557 |
Dll | -0.046729 | -0.030202 | 0.386591 | 1.000000 | -0.064785 | -0.050356 | -0.033908 | -0.014890 | 0.049520 | 0.326582 | 0.250669 | -0.020009 | -0.000204 | 0.537415 | -0.020063 | -0.354395 |
ExportRVA | 0.686100 | 0.165931 | -0.001758 | -0.064785 | 1.000000 | 0.334106 | 0.347014 | -0.008358 | -0.007997 | 0.011629 | 0.083542 | 0.192323 | 0.090855 | -0.021997 | 0.214102 | 0.044172 |
ExportSize | 0.159009 | 0.021710 | 0.043174 | -0.050356 | 0.334106 | 1.000000 | 0.178111 | -0.006341 | -0.006171 | 0.023095 | 0.088044 | 0.071586 | 0.214809 | 0.029201 | 0.005778 | -0.008533 |
IATRVA | 0.378447 | 0.550086 | 0.132536 | -0.033908 | 0.347014 | 0.178111 | 1.000000 | -0.025444 | 0.158022 | 0.041628 | 0.282145 | 0.542615 | 0.170300 | 0.074932 | 0.216881 | 0.084239 |
ImageBase | -0.017464 | 0.124848 | 0.108120 | -0.014890 | -0.008358 | -0.006341 | -0.025444 | 1.000000 | -0.005917 | 0.031480 | -0.101067 | -0.064428 | -0.064906 | -0.007955 | -0.008473 | -0.085809 |
ImageVersion | 0.048777 | -0.018012 | 0.027022 | 0.049520 | -0.007997 | -0.006171 | 0.158022 | -0.005917 | 1.000000 | 0.028758 | 0.137971 | -0.000887 | 0.017238 | 0.044935 | -0.007953 | -0.046360 |
LinkerVersion | 0.082353 | 0.158134 | 0.489312 | 0.326582 | 0.011629 | 0.023095 | 0.041628 | 0.031480 | 0.028758 | 1.000000 | -0.032442 | -0.016006 | -0.331794 | 0.366225 | 0.015706 | -0.160583 |
NumberOfImportDLL | 0.120774 | 0.069459 | 0.062968 | 0.250669 | 0.083542 | 0.088044 | 0.282145 | -0.101067 | 0.137971 | -0.032442 | 1.000000 | 0.481672 | 0.334914 | 0.195613 | 0.026599 | -0.093554 |
NumberOfImportFunctions | 0.195789 | 0.363659 | 0.069459 | -0.020009 | 0.192323 | 0.071586 | 0.542615 | -0.064428 | -0.000887 | -0.016006 | 0.481672 | 1.000000 | 0.168574 | 0.060505 | 0.085063 | 0.061108 |
NumberOfSections | -0.026106 | -0.050907 | -0.175783 | -0.000204 | 0.090855 | 0.214809 | 0.170300 | -0.064906 | 0.017238 | -0.331794 | 0.334914 | 0.168574 | 1.000000 | -0.086133 | -0.035504 | 0.090988 |
OSVersion | -0.013296 | -0.039534 | 0.499236 | 0.537415 | -0.021997 | 0.029201 | 0.074932 | -0.007955 | 0.044935 | 0.366225 | 0.195613 | 0.060505 | -0.086133 | 1.000000 | 0.018603 | -0.326980 |
ResSize | 0.130714 | 0.032462 | 0.047429 | -0.020063 | 0.214102 | 0.005778 | 0.216881 | -0.008473 | -0.007953 | 0.015706 | 0.026599 | 0.085063 | -0.035504 | 0.018603 | 1.000000 | 0.010637 |
StackReserveSize | 0.085269 | 0.068129 | -0.190557 | -0.354395 | 0.044172 | -0.008533 | 0.084239 | -0.085809 | -0.046360 | -0.160583 | -0.093554 | 0.061108 | 0.090988 | -0.326980 | 0.010637 | 1.000000 |
plot_corr(df_clean)
#correlation(df_clean, 0.8)
#plot_corr(df_clean)
print("# rows in dataframe {0}".format(len(df_clean)))
print("# rows missing AddressOfEntryPoint: {0}".format(len(df_clean.loc[df_clean['AddressOfEntryPoint'] == 0])))
print("# rows missing DebugRVA: {0}".format(len(df_clean.loc[df_clean['DebugRVA'] == 0])))
print("# rows missing DebugSize: {0}".format(len(df_clean.loc[df_clean['DebugSize'] == 0])))
print("# rows missing Dll: {0}".format(len(df_clean.loc[df_clean['Dll'] == 0])))
print("# rows missing ExportRVA: {0}".format(len(df_clean.loc[df_clean['ExportRVA'] == 0])))
print("# rows missing ExportSize: {0}".format(len(df_clean.loc[df_clean['ExportSize'] == 0])))
print("# rows missing IATRVA: {0}".format(len(df_clean.loc[df_clean['IATRVA'] == 0])))
print("# rows missing ImageBase: {0}".format(len(df_clean.loc[df_clean['ImageBase'] == 0])))
print("# rows missing ImageVersion: {0}".format(len(df_clean.loc[df_clean['ImageVersion'] == 0])))
print("# rows missing LinkerVersion: {0}".format(len(df_clean.loc[df_clean['LinkerVersion'] == 0])))
print("# rows missing NumberOfSections: {0}".format(len(df_clean.loc[df_clean['NumberOfSections'] == 0])))
print("# rows missing OSVersion: {0}".format(len(df_clean.loc[df_clean['OSVersion'] == 0])))
print("# rows missing ResSize: {0}".format(len(df_clean.loc[df_clean['ResSize'] == 0])))
print("# rows missing StackReserveSize: {0}".format(len(df_clean.loc[df_clean['StackReserveSize'] == 0])))
print("# rows missing NumberOfImportDLL: {0}".format(len(df_clean.loc[df_clean['NumberOfImportDLL'] == 0])))
print("# rows missing NumberOfImportFunctions: {0}".format(len(df_clean.loc[df_clean['NumberOfImportFunctions'] == 0])))
print("# rows missing NumberOfSections: {0}".format(len(df_clean.loc[df_clean['NumberOfSections'] == 0])))
# rows in dataframe 1297
# rows missing AddressOfEntryPoint: 25
# rows missing DebugRVA: 387
# rows missing DebugSize: 387
# rows missing Dll: 90
# rows missing ExportRVA: 1220
# rows missing ExportSize: 1220
# rows missing IATRVA: 114
# rows missing ImageBase: 0
# rows missing ImageVersion: 433
# rows missing LinkerVersion: 0
# rows missing NumberOfSections: 0
# rows missing OSVersion: 3
# rows missing ResSize: 45
# rows missing StackReserveSize: 18
# rows missing NumberOfImportDLL: 29
# rows missing NumberOfImportFunctions: 29
# rows missing NumberOfSections: 0
Supervised machine learning is best understood as approximating a target function (f) that maps input variables (X) to an output variable (Y).
Y = f(X)
This characterization describes the range of classification and prediction problems and the machine algorithms that can be used to address them.
The cause of poor performance in machine learning is either overfitting or underfitting the data.
We will add the two datasets together within a single csv file making a new column to indicate the sample is Malware or Clean. This new column will be a boolean value of 0 = "Clean" and 1 = "Malware" value.
Create a new def to assist in adding a new column for the Malware status and the corresponding value.
def appendCSV(inputfile, outputfile, newheader, newdata):
"""
Function to add a new columns, new header, and new data to a csv file.
This new "Malware" column will be a boolean value of 0 = "Clean" and 1 = "Malware" value.
Input:
inputfile: "./output/dataset_malware.csv"
outputfile: "./dataset/dataset_malware.csv"
newheader: "Malware"
newdata: 1
Example:
appendCSV("./output/dataset_malware.csv", "./dataset/dataset_malware.csv", "Malware", 1)
"""
csv.field_size_limit(100000000)
reader = csv.reader(open(inputfile, "r"))
writer = csv.writer(open(outputfile, "w"))
headers = next(reader)
headers.append(newheader)
writer.writerow(headers)
for row in reader:
row.append(newdata)
writer.writerow(row)
Append the CSV files to mark the datasets as Malware or Clean
appendCSV("./output/dataset_malware.csv", "./dataset/dataset_malware.csv", "Malware", 1)
appendCSV("./output/dataset_clean.csv", "./dataset/dataset_clean.csv", "Malware", 0)
Merge the Clean dataset and Malware dataset into one CSV file
We should also look into removing duplictaes and randomizing the order of samples in the dataset.
interesting_files = glob.glob("./dataset/dataset_*.csv")
header_saved = False
with open("./dataset/merged_output.csv","w") as fout:
for filename in interesting_files:
with open(filename) as fin:
header = next(fin)
if not header_saved:
fout.write(header)
header_saved = True
for line in fin:
fout.write(line)
merged_df = pd.read_csv("./dataset/merged_output.csv")
We dont need the Columns "ImportedSymbols" and "filename" for now so we will just drop them from the dataframe.
try:
del merged_df['ImportedSymbols']
del merged_df['filename']
del merged_df['SectionNames']
del merged_df['ImportedFunctions']
except:
print("")
print("[*] Columns removed")
print("")
[*] Columns removed
merged_df.shape
(2292, 22)
merged_df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | StackReserveSize | Malware | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.292000e+03 | 2.292000e+03 | 2292.000000 | 2292.000000 | 2.292000e+03 | 2.292000e+03 | 2.292000e+03 | 2.292000e+03 | 2292.000000 | 2292.000000 | 2292.000000 | 2292.000000 | 2292.000000 | 2292.00000 | 2.292000e+03 | 2.292000e+03 | 2292.000000 |
mean | 6.568424e+07 | 1.244702e+05 | 23.396597 | 32417.284468 | 7.907806e+04 | 3.421876e+04 | 2.149736e+05 | 2.611155e+10 | 70.522251 | 9.008290 | 7.639616 | 143.489965 | 5.535777 | 5.18630 | 1.236307e+06 | 9.154542e+05 | 0.434119 |
std | 3.082750e+08 | 4.572434e+05 | 18.261576 | 13518.295561 | 1.006857e+06 | 6.635917e+05 | 1.023666e+06 | 4.064159e+11 | 1179.422404 | 5.318443 | 6.294335 | 160.622470 | 1.703473 | 1.17953 | 1.190246e+07 | 1.334605e+06 | 0.495749 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.553600e+04 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
25% | 1.582400e+04 | 0.000000e+00 | 0.000000 | 32768.000000 | 0.000000e+00 | 0.000000e+00 | 8.192000e+03 | 4.194304e+06 | 0.000000 | 8.000000 | 3.000000 | 65.000000 | 5.000000 | 5.00000 | 1.992000e+03 | 3.276800e+05 | 0.000000 |
50% | 5.000000e+04 | 4.880000e+03 | 28.000000 | 33088.000000 | 0.000000e+00 | 0.000000e+00 | 3.686400e+04 | 4.194304e+06 | 6.000000 | 9.000000 | 7.000000 | 120.000000 | 5.000000 | 5.00000 | 1.266400e+04 | 1.048576e+06 | 0.000000 |
75% | 1.204425e+05 | 4.264000e+04 | 28.000000 | 34112.000000 | 0.000000e+00 | 0.000000e+00 | 1.351680e+05 | 5.368709e+09 | 6.000000 | 11.000000 | 10.000000 | 158.000000 | 6.000000 | 6.00000 | 1.327690e+05 | 1.048576e+06 | 1.000000 |
max | 3.490505e+09 | 9.103792e+06 | 145.000000 | 53568.000000 | 3.993506e+07 | 1.298409e+07 | 3.993009e+07 | 6.892871e+12 | 21315.000000 | 187.000000 | 71.000000 | 3659.000000 | 21.000000 | 10.00000 | 3.391820e+08 | 3.355443e+07 | 1.000000 |
merged_df.head(5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Unnamed: 0 | AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | ... | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | SectionNames | StackReserveSize | filename | Malware | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00070b0d4cb037c40d5d2464f92841aeb9ad863472bf95... | 21704 | 4880 | 28 | 0 | 0 | 0 | 4096 | 4194304 | 1 | ... | 6 | 1 | 139 | 3 | 4 | 2184 | {'.text\x00\x00\x00': 1179648, '.data\x00\x00\... | 1048576 | ./data/clean/00070b0d4cb037c40d5d2464f92841aeb... | 0 |
1 | 00696555cbf6db83af785f8acb2270b9411cfc75e7f6d3... | 29424 | 4256 | 28 | 49472 | 32576 | 163 | 36864 | 4194304 | 6 | ... | 11 | 21 | 114 | 6 | 6 | 2008 | {'.text\x00\x00\x00': 28672, '.data\x00\x00\x0... | 262144 | ./data/clean/00696555cbf6db83af785f8acb2270b94... | 0 |
2 | 007247436f041ca59c5ee0e8636c668c2a43376aeb8cfa... | 227872 | 82400 | 84 | 49472 | 0 | 0 | 3522560 | 4194304 | 10 | ... | 12 | 8 | 157 | 5 | 6 | 1892 | {'.text\x00\x00\x00': 246272, '.data\x00\x00\x... | 4194304 | ./data/clean/007247436f041ca59c5ee0e8636c668c2... | 0 |
3 | 007bdab757d03d94e60c9b1e3eec13b07562705c514992... | 10656 | 4320 | 28 | 49472 | 0 | 0 | 20480 | 4194304 | 6 | ... | 11 | 12 | 46 | 5 | 6 | 15632 | {'.text\x00\x00\x00': 8704, '.data\x00\x00\x00... | 262144 | ./data/clean/007bdab757d03d94e60c9b1e3eec13b07... | 0 |
4 | 008fa2b9697f9a173e40572face100410e51975e34a5ce... | 152696 | 4224 | 28 | 33120 | 0 | 0 | 200704 | 5368709120 | 6 | ... | 11 | 7 | 144 | 6 | 6 | 28384 | {'.text\x00\x00\x00': 181248, '.data\x00\x00\x... | 524288 | ./data/clean/008fa2b9697f9a173e40572face100410... | 0 |
5 rows × 22 columns
merged_df.tail(5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Unnamed: 0 | AddressOfEntryPoint | DebugRVA | DebugSize | Dll | ExportRVA | ExportSize | IATRVA | ImageBase | ImageVersion | ... | LinkerVersion | NumberOfImportDLL | NumberOfImportFunctions | NumberOfSections | OSVersion | ResSize | SectionNames | StackReserveSize | filename | Malware | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2287 | fab80c8daa62c97bbb4cac1520a95c25b6cb755fbd1814... | 1074123741 | 219760 | 28 | 32768 | 0 | 0 | 217088 | 5368709120 | 0 | ... | 8 | 10 | 288 | 5 | 4 | 72860 | NaN | 1048576 | ./data/malware/fab80c8daa62c97bbb4cac1520a95c2... | 1 |
2288 | fd353ce31912ea745bf0b47144171a5700b128664711ad... | 6656 | 8800 | 28 | 32768 | 0 | 0 | 8192 | 5368709120 | 0 | ... | 8 | 4 | 65 | 5 | 4 | 48484 | {'.text\x00\x00\x00': 4096, '.rdata\x00\x00': ... | 1048576 | ./data/malware/fd353ce31912ea745bf0b47144171a5... | 1 |
2289 | fde7f22dcebcfbedafd5daecba1dc9952ff51c0ee43316... | 51656 | 5424 | 28 | 32832 | 0 | 0 | 4096 | 4294967296 | 6 | ... | 9 | 7 | 151 | 5 | 6 | 344936 | {'.text\x00\x00\x00': 55296, '.data\x00\x00\x0... | 524288 | ./data/malware/fde7f22dcebcfbedafd5daecba1dc99... | 1 |
2290 | fee18f402375b210fc7b89e29084fb8e478d5ee0f0cdb8... | 72752 | 79760 | 28 | 32768 | 0 | 0 | 77824 | 5368709120 | 0 | ... | 8 | 11 | 216 | 5 | 4 | 13264 | {'.text\x00\x00\x00': 70144, '.rdata\x00\x00':... | 1048576 | ./data/malware/fee18f402375b210fc7b89e29084fb8... | 1 |
2291 | fef17c9f848a3d291aa2070105bbbc143bb48ffd4c1fdf... | 51656 | 5424 | 28 | 32832 | 0 | 0 | 4096 | 4294967296 | 6 | ... | 9 | 7 | 151 | 5 | 6 | 412204 | {'.text\x00\x00\x00': 55296, '.data\x00\x00\x0... | 524288 | ./data/malware/fef17c9f848a3d291aa2070105bbbc1... | 1 |
5 rows × 22 columns
merged_df.isnull().values.any()
True
Rare events are hard to predict
Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test.
num_obs = len(merged_df)
num_true = len(merged_df.loc[merged_df['Malware'] == 1])
num_false = len(merged_df.loc[merged_df['Malware'] == 0])
print("")
print("[*] Number of Malware files: {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("[*] Number of Clean files: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))
print("")
[*] Number of Malware files: 995 (43.41%)
[*] Number of Clean files: 1297 (56.59%)
70% for training, 30% for testing
We are using a simple split of our dataset but in the future we could use K-fold Cross Validation.
Tuning the Hyperparameters with Cross Validation
For each fold
Determine the best hyperparameter value to tune
Next
Set the model hyperparameter value to the average best
The sklearn.cross_validation library assists in this process.
Algorithm CV Variants: Algorithm + Cross Validation = AlgorithmCV
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
feature_col_names = ['AddressOfEntryPoint', 'DebugRVA', 'DebugSize', 'Dll', 'ExportRVA', 'ExportSize', 'IATRVA', 'ImageBase', 'ImageVersion', 'LinkerVersion', 'NumberOfSections', 'OSVersion', 'ResSize', 'StackReserveSize', 'Malware', "NumberOfImportDLL", "NumberOfImportFunctions", "NumberOfSections" ]
predicted_class_names = ['Malware']
X = merged_df[feature_col_names].values # predictor feature columns
y = merged_df[predicted_class_names].values # predicted class (1=true, 0=false) column
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42)
# test_size = 0.3 is 30%, 42 is the answer to everything
We check to ensure we have the the desired 70% train, 30% test split of the data
print("")
print("[*] {0:0.2f}% in training set".format((len(X_train)/len(merged_df.index)) * 100))
print("[*] {0:0.2f}% in test set".format((len(X_test)/len(merged_df.index)) * 100))
print("")
[*] 69.98% in training set
[*] 30.02% in test set
print("")
print("[*] Original Malware : {0} ({1:0.2f}%)".format(len(merged_df.loc[merged_df['Malware'] == 1]), (len(merged_df.loc[merged_df['Malware'] == 1])/len(merged_df.index)) * 100.0))
print("[*] Original Clean : {0} ({1:0.2f}%)".format(len(merged_df.loc[merged_df['Malware'] == 0]), (len(merged_df.loc[merged_df['Malware'] == 0])/len(merged_df.index)) * 100.0))
print("")
print("[*] Training Malware : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("[*] Training Clean : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("")
print("[*] Test Malware : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))
print("[*] Test Clean : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))
print("")
[*] Original Malware : 995 (43.41%)
[*] Original Clean : 1297 (56.59%)
[*] Training Malware : 698 (43.52%)
[*] Training Clean : 906 (56.48%)
[*] Test Malware : 297 (43.17%)
[*] Test Clean : 391 (56.83%)
https://github.com/urwithajit9/ClaMP
https://archive.ics.uci.edu/ml/datasets/Detect+Malacious+Executable(AntiVirus)
https://marcoramilli.blogspot.com/2016/12/malware-training-sets-machine-learning.html
https://github.com/jivoi/awesome-ml-for-cybersecurity#-datasets
https://github.com/ytisf/theZoo
https://zeltser.com/malware-sample-sources/
(Naive Bayes, Logisitic Regression, Decision Tree)
We may need to test several algorithms with the dataset which we have generated.
For our initial testing algorithm we have chosen the Gaussian Naive Bayes model which is based on likelihood and probability. We have chosen this algorithm because it is fast, simple, and stable.
Algorithm decision factors:
* Learning Type
* Prediction Model => Supervised machine learning
* Results (Regression vs Classification)
* Classification (Malware vs Not Malware)
* Complexity (Ensemble vs Simple)
* Keep it Simple
* Basic vs Enhanced
* Basic
Hidden missing values?
Are these 0 values possible?
How many rows have unexpected 0 values?
Common Problems with missing data, what options do we have?
* Ignore the missing data values
* Drop observation (rows) with missing data values
* Replace missing data values (impute)
I have chosen not to imput missing data for initial testing
#from sklearn.preprocessing import Imputer
#Impute with mean all 0 readings
#fill_0 = Imputer(missing_values=0, strategy="mean", axis=0)
#X_train = fill_0.fit_transform(X_train)
#X_test = fill_0.fit_transform(X_test)
Designed to work with NumPy, SciPy and Pandas
Toolset for training and evaluation tasks
from sklearn.naive_bayes import GaussianNB
# create Gaussian Naive Bayes model object and train it with the data
nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())
GaussianNB(priors=None)
# predict the values using the training data
nb_predict_train = nb_model.predict(X_train)
# import the performance metrics library
from sklearn import metrics
# Accuracy
print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))
print()
Training Accuracy: 0.4470
# predict the values using the testing data
nb_predict_test = nb_model.predict(X_test)
# Accuracy
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))
print()
Testing Accuracy: 0.4375
| TN | FP |
| FN | TP |
Recall is the True Positive rate and indicates the probabilty of a true result.
Recall = TP / (TP +FN ) (How well the model is predicting true Malware)
Precision = TP / (TP + FP) (Positive predictor value)
True positives (TP) The amount of labels, which were correctly identified by the classifier, that is assigned point labeled A to class A.
True negatives (TN) The amount of labels, which were correctly rejected by the classifier, that is assigned point labeled A’ to class A’.
False positives (FP) The amount of labels, which were incorrectly identified by the classifier, that is assigned point labeled A’ to class A.
False negatives (FN) The amount of labels, which were incorrectly rejected by the classifier, that is assigned point labeled A to class A’.
Accuracy Accuracy measures a fraction of the classifier’s predictions that are correct, that is the number of correct assessments divided by the number of all assessments – (TN + TP)/(TN + TP + FN + FP).
Precision (P) Precision is the fraction of positive predictions that are correct – TP/(TP + FP). Be careful as classifier predicting only a single positive instance, that happens to be correct, will achieve perfect precision.
Recall (R) Recall, sometimes called sensitivity in medical domains, measures the fraction of the truly positive instances. A score of 1 indicates, that no false negative were present – TP/(TP + FN). Be careful as classifier predicting positive for every example will achieve a recall of 1.
F1 score Both precision and recall scores provide an incomplete view on the classifier performance and sometimes may provide skewed results. The F1 measure provides a better view by calculating weighted average of the scores – 2PR/(P + R). A model with perfect precision and recall scores will achieve an F1 score of one.
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, nb_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(y_test, nb_predict_test))
Confusion Matrix
[[ 5 386]
[ 1 296]]
Classification Report
precision recall f1-score support
0 0.83 0.01 0.03 391
1 0.43 1.00 0.60 297
avg / total 0.66 0.44 0.28 688
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, nb_predict_test, normalize=True)
plt.show()
#random forests
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train.ravel())
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=42, verbose=0, warm_start=False)
# predict the values using the training data
rf_predict_train = rf_model.predict(X_train)
# Accuracy
print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))
print()
Training Accuracy: 1.0000
# predict the values using the testing data
rf_predict_test = rf_model.predict(X_test)
# Accuracy
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))
print()
Testing Accuracy: 1.0000
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, rf_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(y_test, rf_predict_test))
Confusion Matrix
[[391 0]
[ 0 297]]
Classification Report
precision recall f1-score support
0 1.00 1.00 1.00 391
1 1.00 1.00 1.00 297
avg / total 1.00 1.00 1.00 688
skplt.metrics.plot_confusion_matrix(y_test, rf_predict_test, normalize=True)
plt.show()
#logistic regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C=0.7, random_state=42)
lr_model.fit(X_train, y_train.ravel())
LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
# predict the values using the training data
lr_predict_train = lr_model.predict(X_train)
# Accuracy
print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, lr_predict_train)))
print()
Training Accuracy: 0.3990
# predict the values using the testing data
lr_predict_test = lr_model.predict(X_test)
# Accuracy
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))
print()
Testing Accuracy: 0.4070
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, lr_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
Confusion Matrix
[[176 215]
[193 104]]
Classification Report
precision recall f1-score support
0 0.48 0.45 0.46 391
1 0.33 0.35 0.34 297
avg / total 0.41 0.41 0.41 688
skplt.metrics.plot_confusion_matrix(y_test, lr_predict_test, normalize=True)
plt.show()
#logistic regressioncv
from sklearn.linear_model import LogisticRegressionCV
lr_cv_model = LogisticRegressionCV(n_jobs=-1, Cs=3, cv=10, refit=False, class_weight="balanced", random_state=42)
lr_cv_model.fit(X_train, y_train.ravel())
LogisticRegressionCV(Cs=3, class_weight='balanced', cv=10, dual=False,
fit_intercept=True, intercept_scaling=1.0, max_iter=100,
multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
refit=False, scoring=None, solver='lbfgs', tol=0.0001,
verbose=0)
# predict the values using the training data
lr_cv_predict_train = lr_cv_model.predict(X_train)
# Accuracy
print("Training Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, lr_cv_predict_train)))
print()
Training Accuracy: 0.5648
# predict the values using the testing data
lr_cv_predict_test = lr_cv_model.predict(X_test)
# Accuracy
print("Testing Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_cv_predict_test)))
print()
Testing Accuracy: 0.5683
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(y_test, lr_cv_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(y_test, lr_cv_predict_test))
Confusion Matrix
[[391 0]
[297 0]]
Classification Report
precision recall f1-score support
0 0.57 1.00 0.72 391
1 0.00 0.00 0.00 297
avg / total 0.32 0.57 0.41 688
/Users/orlandobarreraii/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
'precision', 'predicted', average, warn_for)
skplt.metrics.plot_confusion_matrix(y_test, lr_cv_predict_test, normalize=True)
plt.show()
We don’t know which algorithms would be good on this problem or what configurations to use. We get an idea from the plots that some of the classes are partially linearly separable in some dimensions, so we are expecting generally good results.
Let’s evaluate 6 different algorithms:
Logistic Regression (LR) Linear Discriminant Analysis (LDA) K-Nearest Neighbors (KNN). Classification and Regression Trees (CART). Gaussian Naive Bayes (NB). Support Vector Machines (SVM). This is a good mixture of simple linear (LR and LDA), nonlinear (KNN, CART, NB and SVM) algorithms. We reset the random number seed before each run to ensure that the evaluation of each algorithm is performed using exactly the same data splits. It ensures the results are directly comparable.
import warnings
warnings.filterwarnings("ignore")
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=42)
cv_results = model_selection.cross_val_score(model, X_train, y_train.ravel(), cv=kfold, scoring='accuracy')
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
LR: 0.472457 (0.075952)
LDA: 0.672675 (0.038834)
KNN: 0.923940 (0.018017)
CART: 1.000000 (0.000000)
NB: 0.446374 (0.045136)
SVM: 0.658979 (0.033874)