In this exercise, we build on the previous exercises to prepare a labeled dataset of binary feature vectors, and use it to train a *Random Forest* binary classifier of malware/benign feature vectors. 

In [1]:
!pip install nltk 
!pip install pefile

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25
Collecting pefile
  Downloading pefile-2023.2.7-py3-none-any.whl.metadata (1.4 kB)
Downloading pefile-2023.2.7-py3-none-any.whl (71 kB)
[2K   [90m━━

In [2]:
!pip install scikit-learn==1.2.1

Collecting scikit-learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.1.post1
    Uninstalling scikit-learn-1.4.1.post1:
      Successfully uninstalled scikit-learn-1.4.1.post1
Successfully installed scikit-learn-1.2.1


In [3]:
!wget https://storage.googleapis.com/aiec-s24/4-%20Training%20a%20Static%20Malware%20Detector.zip

--2024-04-03 16:50:49--  https://storage.googleapis.com/aiec-s24/4-%20Training%20a%20Static%20Malware%20Detector.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.62.207, 142.251.163.207, 142.251.167.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.62.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97064832 (93M) [application/x-zip-compressed]
Saving to: ‘4- Training a Static Malware Detector.zip’


2024-04-03 16:50:50 (71.6 MB/s) - ‘4- Training a Static Malware Detector.zip’ saved [97064832/97064832]



In [4]:
#!unzip "4- Training a Static Malware Detector.zip"

In [5]:
import os
directoriesWithLabels = [("Training a Static Malware Detector/Code/Samples/Benign",0), ("Training a Static Malware Detector/Code/Samples/Malware",1)]
listOfSamples = []
labels = []
for datasetPath, label in directoriesWithLabels:
    samples = [f for f in os.listdir(datasetPath)]
    for file in samples:
        filePath = os.path.join(datasetPath, file)
        listOfSamples.append(filePath)
        labels.append(label)

In [6]:
# labels # y_train

In [7]:
#Train-Test data split
from sklearn.model_selection import train_test_split
samples_train, samples_test, labels_train, labels_test = train_test_split(listOfSamples, labels, test_size=0.33, stratify=labels, random_state=42)

In [8]:
#samples_train, labels_train

In [9]:
import collections
from nltk import ngrams
import numpy as np
import pefile

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
    return data

def byteSequenceToNgrams(byteSequence, n):
    Ngrams = ngrams(byteSequence, n)
    return list(Ngrams)
    
def extractNgramCounts(file, N):
    fileByteSequence = readFile(file)
    fileNgrams = byteSequenceToNgrams(fileByteSequence, N)
    return collections.Counter(fileNgrams)

def getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list):
    K1 = len(K1_most_common_Ngrams_list)
    fv = K1*[0]
    print(N)
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_common_Ngrams_list[i]]
    return fv

def preprocessImports(listOfDLLs):
    processedListOfDLLs = []
    temp = [x.decode().split(".")[0].lower() for x in listOfDLLs]
    return " ".join(temp)

def getImports(pe):
    listOfImports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        listOfImports.append(entry.dll)
    return preprocessImports(listOfImports)

def getSectionNames(pe):
    listOfSectionNames = []
    for eachSection in pe.sections:
        refined_name = eachSection.Name.decode().replace('\x00','').lower()
        listOfSectionNames.append(refined_name)
    return " ".join(listOfSectionNames)

In [10]:
# Generate 2-Grams, 
# and produce feature vectors based on the frequency method
# This may take a few minutes to run
N=2
totalNgramCount = collections.Counter([])
for file in samples_train:
    totalNgramCount += extractNgramCounts(file, N)
K1 = 100
K1_most_common_Ngrams = totalNgramCount.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [11]:
K1_most_common_Ngrams_list

[(0, 0),
 (255, 255),
 (204, 204),
 (2, 100),
 (1, 0),
 (0, 139),
 (131, 196),
 (2, 0),
 (68, 36),
 (139, 69),
 (0, 131),
 (255, 117),
 (133, 192),
 (255, 139),
 (46, 46),
 (254, 255),
 (139, 77),
 (141, 77),
 (7, 0),
 (69, 252),
 (255, 21),
 (8, 139),
 (76, 36),
 (0, 1),
 (4, 0),
 (137, 69),
 (4, 139),
 (141, 69),
 (255, 131),
 (0, 137),
 (51, 192),
 (0, 255),
 (80, 232),
 (255, 141),
 (85, 139),
 (3, 100),
 (8, 0),
 (0, 232),
 (0, 116),
 (15, 182),
 (139, 236),
 (100, 0),
 (80, 141),
 (64, 0),
 (15, 132),
 (12, 139),
 (255, 0),
 (65, 68),
 (73, 78),
 (84, 36),
 (80, 65),
 (68, 68),
 (253, 255),
 (78, 71),
 (68, 73),
 (0, 204),
 (16, 0),
 (198, 69),
 (192, 116),
 (199, 69),
 (204, 139),
 (4, 137),
 (80, 255),
 (3, 0),
 (139, 68),
 (116, 36),
 (100, 139),
 (101, 0),
 (139, 76),
 (0, 8),
 (64, 2),
 (106, 0),
 (196, 12),
 (139, 70),
 (36, 8),
 (196, 4),
 (69, 8),
 (117, 8),
 (32, 0),
 (0, 89),
 (86, 139),
 (100, 232),
 (95, 94),
 (139, 255),
 (0, 16),
 (131, 192),
 (0, 117),
 (0, 80),
 (

In [12]:
# Extract N-gram features based on the frequency method
# Also, extracts some metadata such as DLL imports, 
# and PE Sections. We will combine these with
# our N-gram features to enrich the sample representation.
# This will take a few minutes to run.
# Some samples will generate errors such as 'not a PE file',
# 'DOS header not found', and 'invalid attribute'. These are OK.
importsCorpus_train = []
numSections_train = []
sectionNames_train = []
NgramFeaturesList_train = []
y_train = []
for i in range(len(samples_train)):
    file = samples_train[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_train.append(imports)
        numSections_train.append(nSections)
        sectionNames_train.append(secNames)
        NgramFeaturesList_train.append(NGramFeatures)
        y_train.append(labels_train[i])
    except Exception as e: 
        print(file+":")
        print(e)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/pmsort.exe:
'Invalid e_lfanew value, probably not a PE file'
2
Training a Static Malware Detector/Code/Samples/Malware/VirusShare_1a89b7d4fb8ded72e1f8e81ee9352262.exe:
'utf-8' codec can't decode byte 0xb1 in position 0: invalid start byte
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/aspnetca.exe:
'DOS Header magic not found.'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/LogCollector.exe:
'DOS Header magic not found.'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/FixSqlRegistryKey_x64.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samp

In [13]:
importsCorpus_train

['ntoskrnl hal',
 'kernel32 user32 advapi32 ole32 msvcrt loadperf ws2_32',
 'mscoree',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'kernel32 msvcrt user32',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'libgimpui-2 kernel32 msvcrt libgimp-2 libgimpbase-2 libgimpwidgets-2 libglib-2 libgobject-2 libgtk-win32-2 libintl-8',
 'kernel32 user32 advapi32 ole32 msvcrt loadperf ws2_32',
 'ws2_32 kernel32 user32 advapi32 ole32 oleaut32',
 'libgimpui-2 kernel32 msvcrt libgimp-2 libgimpbase-2 libglib-2 libintl-8',
 'kernel32 msvcr110 mfc110u user32 ole32 rpcrt4',
 'advapi32 kernel32 user32 msvcrt ntdll shell32 shlwapi uxtheme',
 'mscoree',
 'ntoskrnl hal',
 'advapi32 kernel32 msvcrt ntdll',
 'ntoskrnl hal',
 'libglib-2 kernel32 msvcrt shell32',
 'ntoskrnl hal',
 'ntoskrnl hal',
 'msys-1 msys-intl-8 kernel32',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 

In the following lines, we define a pipeline of sequential transforms (HashingVectorizer and TfidfTransformer) to extract N-gram featurs and construct feature vectors from the DLL imports and Section names extracted for each sample. 

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
imports_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
section_names_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
importsCorpus_train_transformed = imports_featurizer.fit_transform(importsCorpus_train)
sectionNames_train_transformed = section_names_featurizer.fit_transform(sectionNames_train)

In [15]:
# Combine the binary N-gram features with 
# the DLL imports and section names features to create
# vectorized training samples
from scipy.sparse import hstack, csr_matrix
X_train = hstack([NgramFeaturesList_train, importsCorpus_train_transformed,sectionNames_train_transformed, csr_matrix(numSections_train).transpose()])

In [16]:
# Convert X_train to CSR format
X_train_csr = X_train.tocsr()

# Extract the first row
first_row = X_train_csr[0]

# Extracting non-zero elements and their indices
nonzero_indices = first_row.indices
nonzero_values = first_row.data

# Printing nonzero indices and values
for index, value in zip(nonzero_indices, nonzero_values):
    print(f"Column Index: {index}, Value: {value}")


Column Index: 0, Value: 10433.0
Column Index: 1, Value: 1922.0
Column Index: 2, Value: 1973.0
Column Index: 3, Value: 3.0
Column Index: 4, Value: 1014.0
Column Index: 5, Value: 375.0
Column Index: 6, Value: 60.0
Column Index: 7, Value: 110.0
Column Index: 8, Value: 14.0
Column Index: 9, Value: 376.0
Column Index: 10, Value: 104.0
Column Index: 11, Value: 350.0
Column Index: 12, Value: 133.0
Column Index: 13, Value: 327.0
Column Index: 14, Value: 3.0
Column Index: 15, Value: 138.0
Column Index: 16, Value: 300.0
Column Index: 17, Value: 119.0
Column Index: 18, Value: 687.0
Column Index: 19, Value: 189.0
Column Index: 20, Value: 89.0
Column Index: 21, Value: 280.0
Column Index: 22, Value: 7.0
Column Index: 23, Value: 138.0
Column Index: 24, Value: 223.0
Column Index: 25, Value: 143.0
Column Index: 26, Value: 333.0
Column Index: 27, Value: 244.0
Column Index: 28, Value: 44.0
Column Index: 29, Value: 134.0
Column Index: 30, Value: 132.0
Column Index: 31, Value: 135.0
Column Index: 32, Value

In [17]:
#Train the Random Forest classifier
# This may take a few minutes.
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=1)
clf = clf.fit(X_train,y_train)

In [18]:
# Training accuracy
clf.score(X_train, y_train)

0.9957983193277311

In [19]:
# Generate feature vectors for the test samples
# This may take a few minutes
importsCorpus_test = []
numSections_test = []
sectionNames_test = []
NgramFeaturesList_test = []
y_test = []
for i in range(len(samples_test)):
    file = samples_test[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_test.append(imports)
        numSections_test.append(nSections)
        sectionNames_test.append(secNames)
        NgramFeaturesList_test.append(NGramFeatures)
        y_test.append(labels_test[i])
    except Exception as e: 
        print(file+":")
        print(e)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/BootExpCfg.exe:
'DOS Header magic not found.'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/LockAppHost.exe:
'DOS Header magic not found.'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Malware/VirusShare_7a30183b105b4200fc201925aba4886c.exe:
'utf-8' codec can't decode byte 0xb8 in position 0: invalid start byte
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/oisicon.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/malias.exe:
'Invalid e_lfanew value, probably not a PE file'
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Training a Static Malware Detector/Code/Samples/Benign/evntwin.exe:
'DOS Header mag

In [20]:
importsCorpus_test_transformed = imports_featurizer.transform(importsCorpus_test)
sectionNames_test_transformed = section_names_featurizer.transform(sectionNames_test)
X_test = hstack([NgramFeaturesList_test, importsCorpus_test_transformed,sectionNames_test_transformed, csr_matrix(numSections_test).transpose()])

In [21]:
X_test

<234x2097253 sparse matrix of type '<class 'numpy.float64'>'
	with 26387 stored elements in COOrdinate format>

In [22]:
clf.score(X_test, y_test)

0.9786324786324786

In [23]:
import joblib
#saving model
joblib.dump(clf, "model.joblib", protocol=2)

['model.joblib']

In [24]:
# load
loaded_model = joblib.load("model.joblib")

In [25]:
loaded_model.score(X_train, y_train)

0.9957983193277311

In [26]:
importsCorpus_pred = []
numSections_pred = []
sectionNames_pred = []
NgramFeaturesList_pred = []

NGramFeatures_pred = getNGramFeaturesFromSample('MSTeamsSetup_c_l_.exe', K1_most_common_Ngrams_list)
pe_pred = pefile.PE('MSTeamsSetup_c_l_.exe')
imports_pred = getImports(pe_pred)
nSections_pred = len(pe_pred.sections)
secNames_pred = getSectionNames(pe_pred)
importsCorpus_pred.append(imports_pred)
numSections_pred.append(nSections_pred)
sectionNames_pred.append(secNames_pred)
NgramFeaturesList_pred.append(NGramFeatures_pred)

2


In [27]:
importsCorpus_pred_transformed = imports_featurizer.transform(importsCorpus_pred)
sectionNames_pred_transformed = section_names_featurizer.transform(sectionNames_pred)
X_pred = hstack([NgramFeaturesList_pred, importsCorpus_pred_transformed,sectionNames_pred_transformed, csr_matrix(numSections_pred).transpose()])

In [28]:
X_pred

<1x2097253 sparse matrix of type '<class 'numpy.float64'>'
	with 129 stored elements in COOrdinate format>

In [29]:
numSections_pred

[5]

In [30]:
{
        "NgramFeaturesList_pred": NgramFeaturesList_pred,
        "importsCorpus_pred_transformed": importsCorpus_pred,
        "sectionNames_pred_transformed": sectionNames_pred,
        "numSections_pred": numSections_pred
}

{'NgramFeaturesList_pred': [[106318,
   10490,
   2550,
   14,
   1779,
   1590,
   771,
   828,
   189,
   1279,
   807,
   1627,
   1001,
   1031,
   12,
   1064,
   821,
   390,
   335,
   684,
   791,
   498,
   68,
   944,
   1278,
   770,
   225,
   570,
   608,
   404,
   702,
   991,
   634,
   277,
   989,
   14,
   735,
   717,
   840,
   541,
   980,
   203,
   283,
   585,
   449,
   376,
   885,
   15,
   28,
   177,
   22,
   14,
   635,
   118,
   21,
   242,
   392,
   133,
   470,
   197,
   91,
   142,
   2893,
   755,
   119,
   49,
   26,
   731,
   56,
   503,
   29,
   367,
   284,
   301,
   134,
   18,
   465,
   594,
   499,
   546,
   386,
   9,
   342,
   560,
   352,
   242,
   454,
   505,
   161,
   18,
   326,
   46,
   366,
   290,
   76,
   208,
   559,
   142,
   425,
   191]],
 'importsCorpus_pred_transformed': ['kernel32 user32 advapi32 shell32 ole32 oleaut32 urlmon version shlwapi comctl32'],
 'sectionNames_pred_transformed': ['.text .rdata .data .r

In [31]:
loaded_model.predict(X_pred)

array([1])

In [32]:
joblib.dump(imports_featurizer, 'imports_featurizer.pkl')
joblib.dump(section_names_featurizer, 'section_names_featurizer.pkl')

['section_names_featurizer.pkl']