In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from malwaredata.csv; fields in this file are separated by
# '|' rather than commas, hence the explicit sep argument.
malData = pd.read_csv('malwaredata.csv', sep='|')

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
malData.head()

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18,1


In [4]:
# The 'legitimate' column is the classification target, so pull it out first.
labels = malData['legitimate'].values

# Feature matrix: every remaining column except Name and md5, which are not
# useful for machine learning, and except the target column itself.
data_in = malData.drop(
    ['Name', 'md5', 'legitimate'],
    axis='columns',
).values

## Random Forest classifier

In [5]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn import cross_validation



In [6]:
# Fit an extra-trees ensemble on all features, then let SelectFromModel keep
# only the features that the already-fitted (prefit=True) model rates as
# important.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed in a later cell, so the held-out rows influence
# which features are kept -- confirm this is acceptable for the evaluation.
extratrees = ExtraTreesClassifier()
extratrees.fit(data_in, labels)
select = SelectFromModel(extratrees, prefit=True)
data_in_new = select.transform(data_in)

# Show the matrix shape before and after feature selection.
print(data_in.shape, data_in_new.shape)

(138047, 54) (138047, 12)


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the samples for evaluation.
# train_test_split is called directly (it is imported from
# sklearn.model_selection in an earlier cell) instead of through
# sklearn.cross_validation: that module was deprecated in scikit-learn 0.18
# and removed in 0.20, so the old call fails on current releases.
#
# Naming note: despite the names, legit_train / legit_test hold the FEATURE
# rows (train / test) and mal_train / mal_test hold the matching LABELS;
# train_test_split returns X_train, X_test, y_train, y_test in that order.
# The names are kept because later cells refer to them.
legit_train, legit_test, mal_train, mal_test = train_test_split(
    data_in_new, labels, test_size=0.2
)

# Random forest with 50 trees, trained on the training portion only.
classif = RandomForestClassifier(n_estimators=50)

# Last expression of the cell: fit() returns the estimator, so its repr is
# displayed as the cell output.
classif.fit(legit_train, mal_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
# Mean accuracy of the random forest on the held-out split, as a percentage.
print(
    "Accuracy: ",
    classif.score(legit_test, mal_test) * 100,
)

Accuracy:  99.340818544


In [9]:
from sklearn.metrics import confusion_matrix

# Predict the held-out samples with the random forest, then tabulate the true
# labels (rows) against the predicted labels (columns).
result = classif.predict(legit_test)
conf_mat = confusion_matrix(y_true=mal_test, y_pred=result)

In [10]:
# Display the confusion matrix built from (true labels, predictions): row i
# counts samples whose true label is i, column j those predicted as j.
# Label values come from the 'legitimate' column (presumably 0 = malware,
# 1 = legitimate -- confirm against the dataset description).
conf_mat

array([[19156,   101],
       [   81,  8272]], dtype=int64)

In [11]:
# Error rates as a percentage of each true-label row of conf_mat.
# "Positive" here means label 1 of the 'legitimate' column, so:
#   false positive = true label 0 predicted as 1 (row 0, column 1)
#   false negative = true label 1 predicted as 0 (row 1, column 0)
print("False positives: ", conf_mat[0, 1] / conf_mat[0].sum() * 100)
print("False negatives: ", conf_mat[1, 0] / conf_mat[1].sum() * 100)

False positives:  0.524484603002
False negatives:  0.969711480905


## Gradient Boosting classifier

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a 50-stage gradient boosting classifier on the same training split
# used for the random forest. fit() returns the estimator itself, so the
# assignment captures the fitted model.
grad_boost = GradientBoostingClassifier(n_estimators=50).fit(legit_train, mal_train)

# Echo the fitted estimator so the cell still displays its configuration.
grad_boost

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [13]:
# Mean accuracy of the gradient boosting model on the held-out split, as a
# percentage.
print(
    "Accuracy: ",
    grad_boost.score(legit_test, mal_test) * 100,
)

Accuracy:  98.8047808765


In [14]:
# Predict the held-out samples with the gradient boosting model and rebuild
# the confusion matrix (true labels as rows, predictions as columns).
# Note: this overwrites the result / conf_mat values computed for the random
# forest in the earlier cells.
result = grad_boost.predict(legit_test)
conf_mat = confusion_matrix(y_true=mal_test, y_pred=result)

In [15]:
# Error rates as a percentage of each true-label row of conf_mat.
# "Positive" here means label 1 of the 'legitimate' column, so:
#   false positive = true label 0 predicted as 1 (row 0, column 1)
#   false negative = true label 1 predicted as 0 (row 1, column 0)
print("False positives: ", conf_mat[0, 1] / conf_mat[0].sum() * 100)
print("False negatives: ", conf_mat[1, 0] / conf_mat[1].sum() * 100)

False positives:  0.877602949577
False negatives:  1.92745121513
