In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample table; the file uses '|' as its field separator, not a comma.
malData = pd.read_csv('malwaredata.csv', sep='|')

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
malData.head()

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18,1


In [4]:
# Partition the rows by their label rather than by a hard-coded row offset, so
# the split stays correct if the CSV is re-ordered or gains/loses samples.
# The 'legitimate' column is 1 for legitimate files and 0 otherwise; it is
# dropped from both views because it is the label, not a feature.
is_legit = malData["legitimate"] == 1
legit = malData[is_legit].drop(["legitimate"], axis=1)
mal = malData[~is_legit].drop(["legitimate"], axis=1)

# Show [number of records, number of columns] for each class.
print([legit.shape[0], legit.shape[1]])
print([mal.shape[0], mal.shape[1]])

[41323, 56]
[96724, 56]


In [5]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Fall back to its replacement module under the same name so that code
# calling cross_validation.train_test_split keeps working on current releases.
try:
    from sklearn import cross_validation
except ImportError:
    from sklearn import model_selection as cross_validation



In [6]:
# The label vector is the 'legitimate' column. The feature matrix is every
# other column except 'Name' and 'md5', which identify a sample rather than
# describe it.
labels = malData['legitimate'].values
data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values

# Fit an extra-trees ensemble on all rows, then let SelectFromModel keep only
# the features whose importance clears its default threshold.
# NOTE(review): selection is fitted on every row, including rows that later
# end up in the held-out split - confirm this is acceptable for the reported
# scores.
extratrees = ExtraTreesClassifier()
extratrees.fit(data_in, labels)
select = SelectFromModel(extratrees, prefit=True)
data_in_new = select.transform(data_in)

# (rows, features) before and after selection.
print(data_in.shape, data_in_new.shape)

(138047, 54) (138047, 13)


## Tree Classifier

In [7]:
# Order all input features by the importance the fitted extra-trees model
# assigned to them, most important first.
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

# Only as many entries are listed as SelectFromModel kept.
features = data_in_new.shape[1]

# The +2 skips the leading 'Name' and 'md5' columns of malData, which were
# dropped before fitting, so importance positions line up with column names.
for rank, col in enumerate(indices[:features], start=1):
    print("%d" % rank, malData.columns[2 + col], importances[col])

1 DllCharacteristics 0.131088166018
2 Machine 0.115400541185
3 VersionInformationSize 0.0992766833439
4 ResourcesMaxEntropy 0.0736070430947
5 ImageBase 0.0626341824575
6 SectionsMaxEntropy 0.059730904853
7 Characteristics 0.0567015306854
8 Subsystem 0.0499294153023
9 MajorSubsystemVersion 0.0469968002487
10 SizeOfOptionalHeader 0.0442428648513
11 SectionsMeanEntropy 0.0441973333364
12 MajorOperatingSystemVersion 0.0362151080321
13 SectionsMinEntropy 0.0198574529258


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the samples for evaluation. train_test_split (imported from
# sklearn.model_selection earlier in the notebook) is used directly because
# the sklearn.cross_validation module was removed in scikit-learn 0.20.
# NOTE: despite their names, legit_train / legit_test hold the feature
# matrices and mal_train / mal_test hold the matching labels. The names are
# kept because later cells refer to them.
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels, test_size=0.2)
classif = RandomForestClassifier(n_estimators=50)

classif.fit(legit_train, mal_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# Accuracy of the random forest on the held-out split, as a percentage.
print("Score: ", classif.score(legit_test,mal_test)*100)

Score:  99.449474828


In [10]:
from sklearn.metrics import confusion_matrix

# Classify the held-out features, then tabulate the true labels (rows)
# against the predicted labels (columns).
result = classif.predict(legit_test)
conf_mat = confusion_matrix(y_true=mal_test, y_pred=result)

In [11]:
# Rows are true labels and columns are predicted labels, both in the order 0, 1.
conf_mat

array([[19275,    90],
       [   62,  8183]], dtype=int64)

In [12]:
# Label 1 ('legitimate') is treated as the positive class here:
#   row 0 = samples whose true label is 0, row 1 = samples whose true label is 1.
# False-positive rate: share of true-0 samples that were predicted as 1.
print("False positives: ", conf_mat[0, 1] / conf_mat[0].sum() * 100)
# False-negative rate: share of true-1 samples that were predicted as 0.
print("False negatives: ", conf_mat[1, 0] / conf_mat[1].sum() * 100)

False positives:  0.464756003098
False negatives:  0.751970891449


### Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a 50-stage gradient-boosting model on the same training split used for
# the random forest (legit_train = features, mal_train = labels).
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train, mal_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [14]:
# Accuracy of the gradient-boosting model on the held-out split, as a percentage.
print("Score: ", grad_boost.score(legit_test,mal_test)*100)

Score:  98.8554871423


In [15]:
import tensorflow as tf

In [16]:
# List the column names; the numeric ones are copied into cols_to_norm below.
malData.columns

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
       'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
       'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
       'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
       'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
       'Impor

In [17]:
# Every numeric feature column of malData, i.e. all columns except 'Name',
# 'md5' and the 'legitimate' label, in their original order.
cols_to_norm = [
    # header fields
    'Machine',
    'SizeOfOptionalHeader',
    'Characteristics',
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'SizeOfCode',
    'SizeOfInitializedData',
    'SizeOfUninitializedData',
    'AddressOfEntryPoint',
    'BaseOfCode',
    'BaseOfData',
    'ImageBase',
    'SectionAlignment',
    'FileAlignment',
    'MajorOperatingSystemVersion',
    'MinorOperatingSystemVersion',
    'MajorImageVersion',
    'MinorImageVersion',
    'MajorSubsystemVersion',
    'MinorSubsystemVersion',
    'SizeOfImage',
    'SizeOfHeaders',
    'CheckSum',
    'Subsystem',
    'DllCharacteristics',
    'SizeOfStackReserve',
    'SizeOfStackCommit',
    'SizeOfHeapReserve',
    'SizeOfHeapCommit',
    'LoaderFlags',
    'NumberOfRvaAndSizes',
    # section statistics
    'SectionsNb',
    'SectionsMeanEntropy',
    'SectionsMinEntropy',
    'SectionsMaxEntropy',
    'SectionsMeanRawsize',
    'SectionsMinRawsize',
    'SectionMaxRawsize',
    'SectionsMeanVirtualsize',
    'SectionsMinVirtualsize',
    'SectionMaxVirtualsize',
    # import / export counts
    'ImportsNbDLL',
    'ImportsNb',
    'ImportsNbOrdinal',
    'ExportNb',
    # resource statistics
    'ResourcesNb',
    'ResourcesMeanEntropy',
    'ResourcesMinEntropy',
    'ResourcesMaxEntropy',
    'ResourcesMeanSize',
    'ResourcesMinSize',
    'ResourcesMaxSize',
    # remaining sizes
    'LoadConfigurationSize',
    'VersionInformationSize',
]

In [18]:
def _min_max_scale(col):
    """Rescale one numeric column linearly onto the range [0, 1].

    A column whose values are all identical has max == min. Dividing by that
    zero range would turn the entire column into NaN, so such a column is
    mapped to 0.0 instead.
    """
    lowest = col.min()
    value_range = col.max() - lowest
    if value_range == 0:
        return (col - lowest).astype('float64')
    return (col - lowest) / value_range


# Overwrite the numeric columns of malData with their min-max scaled values.
# NOTE(review): the min and max are taken over the full dataset, before any
# train/test split is made - confirm that is intended.
malData[cols_to_norm] = malData[cols_to_norm].apply(_min_max_scale)

In [19]:
# Preview the frame again to check the numeric columns after min-max scaling.
malData.head()

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,0.0,0.0,0.005167,0.035294,0.0,0.000199,2.7e-05,0.0,...,0.00052,0.407867,0.321117,0.442242,3.641264e-06,8.940696e-08,4.198465e-06,0.0,0.615385,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,0.0,0.0,0.067166,0.035294,0.0,7.2e-05,5e-06,0.0,...,0.00026,0.531326,0.427608,0.635022,3.46452e-07,2.144111e-07,2.691563e-07,1.676381e-08,0.692308,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,0.0,0.0,0.067166,0.035294,0.0,0.000284,0.000145,0.0,...,0.00143,0.55331,0.355818,0.658977,1.287389e-05,4.30478e-08,6.295277e-05,1.676381e-08,0.692308,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,0.0,0.0,0.005167,0.035294,0.0,0.000322,8.6e-05,0.0,...,0.0013,0.545555,0.333676,0.80009,6.030831e-07,3.72529e-08,9.928048e-07,1.676381e-08,0.692308,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,0.0,0.0,0.005167,0.035294,0.0,0.000162,5.8e-05,0.0,...,0.00026,0.538281,0.427715,0.648825,4.447583e-07,3.51419e-07,3.026844e-07,1.676381e-08,0.692308,1


In [20]:
# One TensorFlow numeric feature column per input feature. Each variable is
# the lower-cased column name, and the string argument is the DataFrame column
# the estimator reads. These variables are gathered into feat_cols in the
# next cell.
machine = tf.feature_column.numeric_column('Machine')
sizeofoptionalheader = tf.feature_column.numeric_column('SizeOfOptionalHeader')
characteristics = tf.feature_column.numeric_column('Characteristics')
majorlinkerversion = tf.feature_column.numeric_column('MajorLinkerVersion')
minorlinkerversion = tf.feature_column.numeric_column('MinorLinkerVersion')
sizeofcode = tf.feature_column.numeric_column('SizeOfCode')
sizeofinitializeddata = tf.feature_column.numeric_column('SizeOfInitializedData')
sizeofuninitializeddata = tf.feature_column.numeric_column('SizeOfUninitializedData')
addressofentrypoint = tf.feature_column.numeric_column('AddressOfEntryPoint')
baseofcode = tf.feature_column.numeric_column('BaseOfCode')
baseofdata = tf.feature_column.numeric_column('BaseOfData')
imagebase = tf.feature_column.numeric_column('ImageBase')
sectionalignment = tf.feature_column.numeric_column('SectionAlignment')
filealignment = tf.feature_column.numeric_column('FileAlignment')
majoroperatingsystemversion = tf.feature_column.numeric_column('MajorOperatingSystemVersion')
minoroperatingsystemversion = tf.feature_column.numeric_column('MinorOperatingSystemVersion')
majorimageversion = tf.feature_column.numeric_column('MajorImageVersion')
minorimageversion = tf.feature_column.numeric_column('MinorImageVersion')
majorsubsystemversion = tf.feature_column.numeric_column('MajorSubsystemVersion')
minorsubsystemversion = tf.feature_column.numeric_column('MinorSubsystemVersion')
sizeofimage = tf.feature_column.numeric_column('SizeOfImage')
sizeofheaders = tf.feature_column.numeric_column('SizeOfHeaders')
checksum = tf.feature_column.numeric_column('CheckSum')
subsystem = tf.feature_column.numeric_column('Subsystem')
dllcharacteristics = tf.feature_column.numeric_column('DllCharacteristics')
sizeofstackreserve = tf.feature_column.numeric_column('SizeOfStackReserve')
sizeofstackcommit = tf.feature_column.numeric_column('SizeOfStackCommit')
sizeofheapreserve = tf.feature_column.numeric_column('SizeOfHeapReserve')
sizeofheapcommit = tf.feature_column.numeric_column('SizeOfHeapCommit')
loaderflags = tf.feature_column.numeric_column('LoaderFlags')
numberofrvaandsizes = tf.feature_column.numeric_column('NumberOfRvaAndSizes')
sectionsnb = tf.feature_column.numeric_column('SectionsNb')
sectionsmeanentropy = tf.feature_column.numeric_column('SectionsMeanEntropy')
sectionsminentropy = tf.feature_column.numeric_column('SectionsMinEntropy')
sectionsmaxentropy = tf.feature_column.numeric_column('SectionsMaxEntropy')
sectionsmeanrawsize = tf.feature_column.numeric_column('SectionsMeanRawsize')
sectionsminrawsize = tf.feature_column.numeric_column('SectionsMinRawsize')
sectionmaxrawsize = tf.feature_column.numeric_column('SectionMaxRawsize')
sectionsmeanvirtualsize = tf.feature_column.numeric_column('SectionsMeanVirtualsize')
sectionsminvirtualsize = tf.feature_column.numeric_column('SectionsMinVirtualsize')
sectionmaxvirtualsize = tf.feature_column.numeric_column('SectionMaxVirtualsize')
importsnbdll = tf.feature_column.numeric_column('ImportsNbDLL')
importsnb = tf.feature_column.numeric_column('ImportsNb')
importsnbordinal = tf.feature_column.numeric_column('ImportsNbOrdinal')
exportnb = tf.feature_column.numeric_column('ExportNb')
resourcesnb = tf.feature_column.numeric_column('ResourcesNb')
resourcesmeanentropy = tf.feature_column.numeric_column('ResourcesMeanEntropy')
resourcesminentropy = tf.feature_column.numeric_column('ResourcesMinEntropy')
resourcesmaxentropy = tf.feature_column.numeric_column('ResourcesMaxEntropy')
resourcesmeansize = tf.feature_column.numeric_column('ResourcesMeanSize')
resourcesminsize = tf.feature_column.numeric_column('ResourcesMinSize')
resourcesmaxsize = tf.feature_column.numeric_column('ResourcesMaxSize')
loadconfigurationsize = tf.feature_column.numeric_column('LoadConfigurationSize')
versioninformationsize = tf.feature_column.numeric_column('VersionInformationSize')

In [21]:
# One numeric feature column per normalised input, in the same order as
# cols_to_norm. Building the list from cols_to_norm keeps a single source of
# truth for the feature names instead of a second hand-maintained 54-entry
# list that can drift out of sync with it.
feat_cols = [tf.feature_column.numeric_column(name) for name in cols_to_norm]

In [22]:
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# Feature frame for the TensorFlow model: every column except the two
# identifier columns and the label. malData was min-max scaled in an earlier
# cell, so these are the scaled values.
x_data = malData.drop(['Name','md5','legitimate'], axis=1)
# NOTE: this rebinds `labels`, which an earlier cell set to a NumPy array, to
# a pandas Series of the same column.
labels = malData['legitimate']

In [24]:
# Hold out 20% of the rows for evaluating the TensorFlow model. No
# random_state is passed, so the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(x_data, labels, test_size = 0.2)

In [25]:
# Training input pipeline: shuffled mini-batches of 10 rows drawn from the
# training split, cycling over the data for up to 1000 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

In [26]:
# Binary linear classifier over the numeric feature columns in feat_cols.
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_log_step_count_steps': 100, '_model_dir': 'C:\\Users\\M20210~1\\AppData\\Local\\Temp\\tmp9fqe7qp2', '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_tf_random_seed': 1}


In [27]:
# Train for at most 1000 steps. With the batch size of 10 configured on
# input_func, that is at most 10,000 rows drawn from the training split.
model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\M20210~1\AppData\Local\Temp\tmp9fqe7qp2\model.ckpt.
INFO:tensorflow:loss = 6.93147, step = 1
INFO:tensorflow:global_step/sec: 134.843
INFO:tensorflow:loss = 2.78763, step = 101 (0.750 sec)
INFO:tensorflow:global_step/sec: 128.206
INFO:tensorflow:loss = 1.33322, step = 201 (0.791 sec)
INFO:tensorflow:global_step/sec: 129.866
INFO:tensorflow:loss = 1.45699, step = 301 (0.775 sec)
INFO:tensorflow:global_step/sec: 129.199
INFO:tensorflow:loss = 2.44562, step = 401 (0.767 sec)
INFO:tensorflow:global_step/sec: 117.509
INFO:tensorflow:loss = 0.78918, step = 501 (0.849 sec)
INFO:tensorflow:global_step/sec: 138.157
INFO:tensorflow:loss = 2.76817, step = 601 (0.724 sec)
INFO:tensorflow:global_step/sec: 136.432
INFO:tensorflow:loss = 2.69982, step = 701 (0.731 sec)
INFO:tensorflow:global_step/sec: 130.719
INFO:tensorflow:loss = 1.28007, step = 801 (0.774 sec)
INFO:tensorflow:global_step/sec: 130.394

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1e4a8ba39b0>

In [28]:
# Evaluation input pipeline: a single, unshuffled pass over the held-out
# split, with labels attached.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [29]:
# Compute the evaluation metrics of the trained linear model on the held-out split.
results = model.evaluate(eval_input_func)

INFO:tensorflow:Starting evaluation at 2022-04-16-08:04:52
INFO:tensorflow:Restoring parameters from C:\Users\M20210~1\AppData\Local\Temp\tmp9fqe7qp2\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2022-04-16-08:05:14
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.938645, accuracy_baseline = 0.701087, auc = 0.980935, auc_precision_recall = 0.965395, average_loss = 0.181692, global_step = 1000, label/mean = 0.298913, loss = 1.81692, prediction/mean = 0.289894


In [30]:
# Show the metrics dictionary returned by model.evaluate.
results

{'accuracy': 0.93864542,
 'accuracy_baseline': 0.70108652,
 'auc': 0.98093516,
 'auc_precision_recall': 0.96539485,
 'average_loss': 0.18169199,
 'global_step': 1000,
 'label/mean': 0.29891345,
 'loss': 1.8169198,
 'prediction/mean': 0.28989446}

In [31]:
# Prediction input pipeline: the held-out features only (no labels), read
# once and in their original order so predictions line up with X_test rows.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [32]:
# model.predict is lazy: nothing is computed until the result is iterated.
predictions = model.predict(pred_input_func)

In [33]:
# Consume the prediction iterator into a list, one dict per held-out row.
my_pred = list(predictions)

INFO:tensorflow:Restoring parameters from C:\Users\M20210~1\AppData\Local\Temp\tmp9fqe7qp2\model.ckpt-1000


In [34]:
# Show only the first few predictions. my_pred holds one dict per held-out
# row, so displaying the whole list floods the notebook output.
my_pred[:5]

[{'class_ids': array([1], dtype=int64),
  'classes': array([b'1'], dtype=object),
  'logistic': array([ 0.99233401], dtype=float32),
  'logits': array([ 4.86326838], dtype=float32),
  'probabilities': array([ 0.00766597,  0.99233401], dtype=float32)},
 {'class_ids': array([1], dtype=int64),
  'classes': array([b'1'], dtype=object),
  'logistic': array([ 0.89043212], dtype=float32),
  'logits': array([ 2.09516239], dtype=float32),
  'probabilities': array([ 0.1095679 ,  0.89043212], dtype=float32)},
 {'class_ids': array([1], dtype=int64),
  'classes': array([b'1'], dtype=object),
  'logistic': array([ 0.9959746], dtype=float32),
  'logits': array([ 5.51108599], dtype=float32),
  'probabilities': array([ 0.00402545,  0.9959746 ], dtype=float32)},
 {'class_ids': array([0], dtype=int64),
  'classes': array([b'0'], dtype=object),
  'logistic': array([ 0.04682266], dtype=float32),
  'logits': array([-3.01343369], dtype=float32),
  'probabilities': array([ 0.95317739,  0.04682266], dtype=floa