In [12]:
import pandas as pd
import time
from sklearn import ensemble
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV

Let's use 3000 samples for selecting the best features

In [4]:
READ_LINES = 3000
malicious_data = pd.read_csv('/rodata/exercise03/clam_mal.csv', index_col=None, nrows=READ_LINES)
benign_data = pd.read_csv('/rodata/exercise03/clam_benign.csv', index_col=None, nrows=READ_LINES)

Merge malicious and clean data (features), remove sample hashes, and split out classes

In [5]:
learn_data = malicious_data.append(benign_data, ignore_index=True)
learn_data.fillna(0, inplace=True) # fill missing entries from either set with 0
classes = learn_data['malicious']
del learn_data['malicious']
del learn_data['hash']

  learn_data = malicious_data.append(benign_data, ignore_index=True)


Random Forests can provide stats on how relevant each feature is in the trees. Let's train them first

In [6]:
classifier = ensemble.RandomForestClassifier()
classifier.fit(learn_data, classes)

Random Forests have information how important a specific feature is within the trees

Q: Which are the most important features?

In [31]:
feature_importances = list(classifier.feature_importances_)
feature_count = len(classifier.feature_importances_)

print("Out of %i total features:" % feature_count)

for i in range(feature_count):
    feat_importance = feature_importances[i]
    if feat_importance > 0.01:
        feat_name = learn_data.columns[i]
        print("{:30}: {:.03}".format(feat_name, feat_importance))

Out of 3424 total features:
AddressOfEntryPoint           : 0.0265
ID                            : 0.016
NumberOfRvaAndSizes           : 0.0101
NumberOfSections              : 0.021
Security version              : 0.0206
SizeOfHeaders                 : 0.0245
SizeOfImage                   : 0.0223
SizeOfOptionalHeader          : 0.0212
SizeOfUninitializedData       : 0.031
Version                       : 0.0173
section_488876_VirtualAddress : 0.0168
section_488876_VirtualSize    : 0.0195
section_488876_out_of_bounds  : 0.0152
section_488876_write          : 0.0211
section_5a175_SizeOfRawData   : 0.01
section_5a175_VirtualAddress  : 0.0187
section_5a175_out_of_bounds   : 0.0135
section_5a816_PointerToRawData: 0.0118
section_780358_VirtualAddress : 0.0173
section_780358_VirtualSize    : 0.0158
section_780358_out_of_bounds  : 0.0215
section_786901_PointerToRawData: 0.019
section_794558_VirtualSize    : 0.0193
section_794558_exec           : 0.0239
section_794558_out_of_bounds  : 0.0134
se

In [37]:
best_features = [learn_data.columns[i] for i in range(feature_count) if feature_importances[i] > 0.01]
print(best_features)
type(best_features)
print(len(best_features))
actualList = [i for i in best_features]
learn_trim = learn_data[actualList]

['AddressOfEntryPoint', 'ID', 'NumberOfRvaAndSizes', 'NumberOfSections', 'Security version', 'SizeOfHeaders', 'SizeOfImage', 'SizeOfOptionalHeader', 'SizeOfUninitializedData', 'Version', 'section_488876_VirtualAddress', 'section_488876_VirtualSize', 'section_488876_out_of_bounds', 'section_488876_write', 'section_5a175_SizeOfRawData', 'section_5a175_VirtualAddress', 'section_5a175_out_of_bounds', 'section_5a816_PointerToRawData', 'section_780358_VirtualAddress', 'section_780358_VirtualSize', 'section_780358_out_of_bounds', 'section_786901_PointerToRawData', 'section_794558_VirtualSize', 'section_794558_exec', 'section_794558_out_of_bounds', 'section_794559_PointerToRawData', 'section_794559_VirtualAddress', 'section_7D6kqN5h_PointerToRawData', 'section_7D6kqN5h_SizeOfRawData', 'section_7D6kqN5h_VirtualAddress', 'section_7D6kqN5h_VirtualSize', 'section_86046_out_of_bounds', 'section_873e1_PointerToRawData', 'section_915142_VirtualAddress']
34


In [38]:
learn_trim.shape

(6000, 34)

In [39]:
# Sort the feature importance in descending order
sorted_indices = np.argsort(feature_importances)[::-1]

learn_data.columns[sorted_indices]

Index(['section_7D6kqN5h_SizeOfRawData', 'SizeOfUninitializedData',
       'section_7D6kqN5h_VirtualSize', 'AddressOfEntryPoint',
       'section_7D6kqN5h_VirtualAddress', 'SizeOfHeaders',
       'section_794558_exec', 'section_7D6kqN5h_PointerToRawData',
       'SizeOfImage', 'section_780358_out_of_bounds',
       ...
       'section_3003_PointerToRawData', 'section_CLR_UEF_VirtualAddress',
       'section_3087f_PointerToRawData', 'section_CLR_UEF_SizeOfRawData',
       'section_6_exec', 'section_3087e_VirtualAddress',
       'section_3087e_VirtualSize', 'section_3087e_exec',
       'section_3087e_out_of_bounds', 'section_EzdtWnVc_out_of_bounds'],
      dtype='object', length=3424)

In [14]:
# try some feature reducction first, detelting columns with only unique data
count = 0
learn_copy = learn_data.copy()
for (columnName, columnData) in learn_copy.iteritems():
    if len(np.unique(columnData.values)) == 1:
        del learn_copy[columnName]
        count = count +1
print('Number of eliminated feautures: {}'.format(count)) 

Number of eliminated feautures: 8253


In [29]:
# fit the model without the features
classifier = ensemble.RandomForestClassifier()
classifier.fit(learn_copy, classes)

feature_importances = list(classifier.feature_importances_)
feature_count = len(classifier.feature_importances_)

print("Out of %i total features:" % feature_count)

for i in range(feature_count):
    feat_importance = feature_importances[i]
    if feat_importance > 0.01:
        feat_name = learn_copy.columns[i]
        print("{:30}: {:.03}".format(feat_name, feat_importance))

# Sort the feature importance in descending order
sorted_indices = np.argsort(feature_importances)[::-1]

best_features = [learn_copy.columns[i] for i in range(feature_count) if feature_importances[i] > 0.01]
print(best_features)
type(best_features)
actualList = [i for i in best_features]
type(actualList)
#learn_copy.columns[sorted_indices]

learn_copy = learn_copy[[actualList]].copy()



Out of 3424 total features:
AddressOfEntryPoint           : 0.0265
MajorLinkerVersion            : 0.016
NumberOfSections              : 0.0101
PointerToRawData              : 0.021
SizeOfCode                    : 0.0206
SizeOfImage                   : 0.0245
SizeOfInitializedData         : 0.0223
SizeOfRawData                 : 0.0212
TimeDateStamp                 : 0.031
VirtualAddress                : 0.0173
section_data_PointerToRawData : 0.0168
section_data_SizeOfRawData    : 0.0195
section_data_VirtualAddress   : 0.0152
section_data_VirtualSize      : 0.0211
section_idata_PointerToRawData: 0.01
section_idata_SizeOfRawData   : 0.0187
section_idata_VirtualSize     : 0.0135
section_idata_out_of_bounds   : 0.0118
section_rdata_PointerToRawData: 0.0173
section_rdata_SizeOfRawData   : 0.0158
section_rdata_VirtualAddress  : 0.0215
section_rdata_VirtualSize     : 0.019
section_reloc_PointerToRawData: 0.0193
section_reloc_SizeOfRawData   : 0.0239
section_reloc_VirtualAddress  : 0.0134
sec

KeyError: "None of [Index([('AddressOfEntryPoint', 'MajorLinkerVersion', 'NumberOfSections', 'PointerToRawData', 'SizeOfCode', 'SizeOfImage', 'SizeOfInitializedData', 'SizeOfRawData', 'TimeDateStamp', 'VirtualAddress', 'section_data_PointerToRawData', 'section_data_SizeOfRawData', 'section_data_VirtualAddress', 'section_data_VirtualSize', 'section_idata_PointerToRawData', 'section_idata_SizeOfRawData', 'section_idata_VirtualSize', 'section_idata_out_of_bounds', 'section_rdata_PointerToRawData', 'section_rdata_SizeOfRawData', 'section_rdata_VirtualAddress', 'section_rdata_VirtualSize', 'section_reloc_PointerToRawData', 'section_reloc_SizeOfRawData', 'section_reloc_VirtualAddress', 'section_reloc_VirtualSize', 'section_reloc_out_of_bounds', 'section_rsrc_PointerToRawData', 'section_rsrc_SizeOfRawData', 'section_rsrc_VirtualAddress', 'section_rsrc_VirtualSize', 'section_text_SizeOfRawData', 'section_text_VirtualSize', 'type_Executable')], dtype='object')] are in the [columns]"

In [30]:
actualList

['AddressOfEntryPoint',
 'MajorLinkerVersion',
 'NumberOfSections',
 'PointerToRawData',
 'SizeOfCode',
 'SizeOfImage',
 'SizeOfInitializedData',
 'SizeOfRawData',
 'TimeDateStamp',
 'VirtualAddress',
 'section_data_PointerToRawData',
 'section_data_SizeOfRawData',
 'section_data_VirtualAddress',
 'section_data_VirtualSize',
 'section_idata_PointerToRawData',
 'section_idata_SizeOfRawData',
 'section_idata_VirtualSize',
 'section_idata_out_of_bounds',
 'section_rdata_PointerToRawData',
 'section_rdata_SizeOfRawData',
 'section_rdata_VirtualAddress',
 'section_rdata_VirtualSize',
 'section_reloc_PointerToRawData',
 'section_reloc_SizeOfRawData',
 'section_reloc_VirtualAddress',
 'section_reloc_VirtualSize',
 'section_reloc_out_of_bounds',
 'section_rsrc_PointerToRawData',
 'section_rsrc_SizeOfRawData',
 'section_rsrc_VirtualAddress',
 'section_rsrc_VirtualSize',
 'section_text_SizeOfRawData',
 'section_text_VirtualSize',
 'type_Executable']

In [21]:
learn_trim = learn_copy[learn_copy.columns[sorted_indices]]

32

In [23]:
learn_copy.shape

(6000, 3424)

In [22]:
learn_trim.shape

(6000, 3424)

In [17]:
# Recursively eliminate features
rfecv = RFECV(estimator=classifier, step=1, scoring="accuracy")
rfecv.fit(learn_copy, classes)
rfecv.transform(learn_copy)

KeyboardInterrupt: 

In [None]:
# Number of best features
rfecv.n_features_

In [None]:
# Which categories are best
rfecv.support_

In [None]:
# Rank features best (1) to worst
rfecv.ranking_