## ML INFOSEC 9: Malware Detector
### Reference: Saxe/Sanders, "Malware Data Science", Chapter 8


In [0]:
import os
import sys
import re
import numpy
import random
from matplotlib import pyplot
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction import FeatureHasher


In [0]:
def get_string_features(path, hasher):
    """Return the hashed bag-of-strings feature vector for one file.

    The file at ``path`` is read as latin-1 text, so every byte maps to
    exactly one character. Each run of at least five printable ASCII
    characters (space through ``~``) counts as one string. The per-string
    occurrence counts are handed to ``hasher.transform`` and the first row
    of the densified result is returned as a numpy array.
    """
    min_length = 5
    printable = r" -~"
    string_pattern = re.compile('[%s]{%d,}' % (printable, min_length))

    # only the read needs the file handle; everything else works on the text
    with open(path, encoding="latin-1") as handle:
        contents = handle.read()

    # bag of words: distinct string -> number of occurrences
    counts = {}
    for token in string_pattern.findall(contents):
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # hashing trick: transform() takes a list of samples, we pass exactly one
    hashed = hasher.transform([counts])

    # densify and pick the single row that belongs to this file
    return numpy.asarray(hashed.todense())[0]

In [0]:
hasher = FeatureHasher(20000)

benign_path = # 
malicious_path = # 


def get_training_paths(directory):
    """Return the paths of the regular files directly inside ``directory``.

    Entries are visited in sorted name order because ``os.listdir`` makes no
    ordering guarantee; a stable order keeps the sample order (and therefore
    any unshuffled cross-validation split) reproducible between runs.
    Sub-directories and other non-file entries are skipped, since the feature
    extractor opens every returned path as a file and would fail on them.

    Args:
        directory: path of the directory to list (not searched recursively).

    Returns:
        A list of ``os.path.join(directory, name)`` strings, one per regular
        file; empty if the directory contains no files.

    Raises:
        OSError: propagated from ``os.listdir`` when ``directory`` cannot be
            listed (missing, not a directory, no permission).
    """
    candidates = (
        os.path.join(directory, name) for name in sorted(os.listdir(directory))
    )
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

malicious_paths = get_training_paths(malicious_path)
benign_paths = get_training_paths(benign_path)

# One feature vector per sample, malicious samples first; the labels below
# are laid out in the same order (1 = malicious, 0 = benign).
X = numpy.array(
    [get_string_features(path, hasher) for path in malicious_paths + benign_paths]
)
y = numpy.array([1] * len(malicious_paths) + [0] * len(benign_paths))

### Cross validation (short version)

In [0]:
# Random forest with 64 trees, scored with 5-fold cross validation.
clf = RandomForestClassifier(n_estimators=64)

scores = cross_val_score(clf, X, y, cv=5)
print(scores)
# report the mean score and a band of two standard deviations
print("Accuracy: %0.3f (+/- %0.3f)" % (numpy.mean(scores), 2 * numpy.std(scores)))

### Cross validation (with KFold and ROC curves)

In [0]:
# Five shuffled folds; a fresh forest is trained per fold and its ROC curve
# is added to a shared plot with a logarithmic false-positive-rate axis.
kf = KFold(n_splits=5, shuffle=True, random_state=None)

for fold_counter, (train_index, test_index) in enumerate(kf.split(X)):
    classifier = RandomForestClassifier(64)
    classifier.fit(X[train_index], y[train_index])
    # last predict_proba column is used as the detection score
    # (presumably the malicious class, label 1 -- holds when both labels
    # occur in the training fold)
    scores = classifier.predict_proba(X[test_index])[:, -1]
    fpr, tpr, _ = metrics.roc_curve(y[test_index], scores)
    pyplot.semilogx(fpr, tpr, label="Fold number {0}".format(fold_counter))

pyplot.title("Detector ROC curve")
pyplot.xlabel("detector false positive rate")
pyplot.ylabel("detector true positive rate")
pyplot.legend()
pyplot.grid()
pyplot.show()

### Train classifier for production

In [0]:
# Fit the production detector on the complete data set (no hold-out split),
# using the same 64-tree forest configuration as in the evaluation cells.
clf = RandomForestClassifier(n_estimators=64)
clf.fit(X, y)

### Classify a file

In [0]:
path_to_file = #
features = get_string_features(path_to_file,hasher)

result_proba = clf.predict_proba([features])[:,1]

if result_proba > 0.5:
   print("It appears this file is malicious!", result_proba)
else:
   print("It appears this file is benign.",result_proba)
