In [1]:
import pandas as pd
import numpy as np
import os
from scipy.stats import entropy
from tqdm.auto import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide configuration
# Random seed handed to every estimator
seed = 42

# Original metadata CSVs (train, test)
csv_input_train_path, csv_input_test_path = '.../train.csv', '.../test.csv'

# CSVs holding the per-file data computed by this notebook (train, test)
csv_train_data_path, csv_test_data_path = '.../train-data.csv', '.../test-data.csv'

# CSVs holding the labels (train, test)
csv_train_labels_path, csv_test_labels_path = '.../train-labels.csv', '.../test-labels.csv'

# Directories containing the sample files (train, test)
dir_train, dir_test = '.../train', '.../test'

"\ncsv_input_train_path = './train.csv'\ncsv_input_test_path = './test.csv'\ncsv_train_data_path = './train-data.csv'\ncsv_test_data_path = './test-data.csv'\ncsv_train_labels_path = './train-labels.csv'\ncsv_test_labels_path = './test-labels.csv'\ndir_train = './train'\ndir_test = './test'\n"

In [3]:
# Load the original train/test metadata tables into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (csv_input_train_path, csv_input_test_path)
)

# Preview the first rows of the training table
df_train.head()

Unnamed: 0,id,md5,sha1,sha256,total,positives,list,filetype,submitted,user_id,length,entropy,imphash
0,28334,037e2811993e89e7ab29f3f6223389be,8e03ee0e57d2713497ebe05ed3883acb08b3423d,2c117cab05082964bc9b8a69941ae2f61aa32255e31f4a...,41,37,Blacklist,exe,2018-03-22 13:55:41.237023-05,1,152191,6.92442,6039c26165040db47e28057ca34786ef
1,423707,87c5e175cb7be1eef9b3e72de8a140ef,9072647a5a499075f992f48618e3554a0492bcf9,dcc1200d5a9881c170e1e0521ae1214e95b775a2826df7...,68,35,Blacklist,exe,2018-08-23 13:45:18.78216-05,1,305488,7.367747,f4fd5e474d548b6e56174e1335f360a9
2,27320,04932ff07a4d08cecca693d50e1d2b56,2b757c39e26c4cfd998f52a5eecbe64f8964f6e6,0d2cab743747d18cf4646f3bdc6a53589d23dc16b01779...,52,46,Blacklist,exe,2018-03-22 13:55:07.143493-05,1,135680,4.478637,aef439f1829f69458eb455e4017c94f9
3,286131,6e67fb3835da739a11570bba44a19dbc,5d640560134b2dbddeb9957b711f8e115b73e282,40accff9b9d71053d4d6f95e6efd7eca1bb1ef5af77c31...,68,59,Blacklist,exe,2018-07-15 20:17:32.131236-05,1,57344,6.459846,fd387f4c0fe3c64fed0e1a90cad09db7
4,40716,021484080d230917bcdda81be020fa90,e371c5875ee07f5a04594695667f3ee079f8a066,b22b013ca558497303e88e943b837d7760d097f9f9b7e9...,56,48,Blacklist,exe,2018-03-25 23:22:43.397561-05,1,328192,7.693209,79cd79c1d39c6ba0201242a6e6738c57


In [4]:
# Use descriptive column names: 'id' becomes 'path' and 'list' becomes 'malicious'
column_renames = {'id': 'path', 'list': 'malicious'}
df_train = df_train.rename(columns=column_renames)
df_test = df_test.rename(columns=column_renames)

In [5]:
# Replace each sample id with the path of its file inside the matching
# samples directory, then order the rows by that path with a fresh index
df_train = (
    df_train
    .assign(path=lambda frame: frame['path'].map(
        lambda sample_id: os.path.join(dir_train, str(sample_id))))
    .sort_values(by='path')
    .reset_index(drop=True)
)

df_test = (
    df_test
    .assign(path=lambda frame: frame['path'].map(
        lambda sample_id: os.path.join(dir_test, str(sample_id))))
    .sort_values(by='path')
    .reset_index(drop=True)
)

df_train.head()

Unnamed: 0,path,md5,sha1,sha256,total,positives,malicious,filetype,submitted,user_id,length,entropy,imphash
0,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,0000e1cccca9ca2ae5d1dfb124686920,7704b8f6a9538f9bc2366696cf85eeec1f15e224,e1bde0b326a91688e8718086d06d1377394f2cb7feddec...,64,42,Blacklist,exe,2018-02-15 19:40:22.182416-06,1,67328,7.26516,621035f939a89d24a029ff9852d1c1ef
1,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,000a1a3e8204261b7408ff51ffa5b440,fb4a5e7801cf6d0f293b6051569e325c46271309,9d5fc038b67068fa5fba1bbb34b938db83b26638b4e22f...,52,23,Blacklist,exe,2018-02-15 19:40:24.546965-06,1,773024,7.935332,f1331430ada73b1c300686e59b460e0d
2,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,00a1a53ae5ed39cbc04df7bd5bb3f9d1,249042d8478015cefdcd114a2009db14c2c20d48,3a3d86e16b03f98d10e3bd28d69f8486b868ce107b3332...,54,51,Blacklist,exe,2018-02-18 15:10:37.143773-06,1,57856,7.860059,d02d7c876403d019c2d57da6a6033ccd
3,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,018ec08e4531507a8d6854067e46c010,bd763783f514932f55ad82f7bfb5af4a44b930fe,5d8b6abcbc43efaa24738f2e9529ac7fa2667c7be607c7...,50,44,Blacklist,exe,2018-03-02 05:52:12.465901-06,1,548864,6.650733,7dbc18688de2f0cb7db845986420a463
4,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,ad1ff5e5b3e1ac3c76c3ee6fc25212b4,ab8492ec0b15bd760438f8fefe301e6a4623719c,a17ecd67504e4753193175eadcd307ae9ec1f62459bd39...,55,35,Blacklist,exe,2018-04-07 11:37:09.638431-05,1,1107336,6.823947,d6c5fbe4e6736ca89e1da08b01daf547


In [6]:
# Turn the 'malicious' column into booleans: True where the value is
# 'Blacklist', False for anything else
df_train['malicious'] = df_train['malicious'] == 'Blacklist'
df_test['malicious'] = df_test['malicious'] == 'Blacklist'

df_train.head()

Unnamed: 0,path,md5,sha1,sha256,total,positives,malicious,filetype,submitted,user_id,length,entropy,imphash
0,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,0000e1cccca9ca2ae5d1dfb124686920,7704b8f6a9538f9bc2366696cf85eeec1f15e224,e1bde0b326a91688e8718086d06d1377394f2cb7feddec...,64,42,True,exe,2018-02-15 19:40:22.182416-06,1,67328,7.26516,621035f939a89d24a029ff9852d1c1ef
1,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,000a1a3e8204261b7408ff51ffa5b440,fb4a5e7801cf6d0f293b6051569e325c46271309,9d5fc038b67068fa5fba1bbb34b938db83b26638b4e22f...,52,23,True,exe,2018-02-15 19:40:24.546965-06,1,773024,7.935332,f1331430ada73b1c300686e59b460e0d
2,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,00a1a53ae5ed39cbc04df7bd5bb3f9d1,249042d8478015cefdcd114a2009db14c2c20d48,3a3d86e16b03f98d10e3bd28d69f8486b868ce107b3332...,54,51,True,exe,2018-02-18 15:10:37.143773-06,1,57856,7.860059,d02d7c876403d019c2d57da6a6033ccd
3,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,018ec08e4531507a8d6854067e46c010,bd763783f514932f55ad82f7bfb5af4a44b930fe,5d8b6abcbc43efaa24738f2e9529ac7fa2667c7be607c7...,50,44,True,exe,2018-03-02 05:52:12.465901-06,1,548864,6.650733,7dbc18688de2f0cb7db845986420a463
4,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,ad1ff5e5b3e1ac3c76c3ee6fc25212b4,ab8492ec0b15bd760438f8fefe301e6a4623719c,a17ecd67504e4753193175eadcd307ae9ec1f62459bd39...,55,35,True,exe,2018-04-07 11:37:09.638431-05,1,1107336,6.823947,d6c5fbe4e6736ca89e1da08b01daf547


In [7]:
# Per-column count of missing (NaN) entries in the training table
df_train.isnull().sum(axis=0)

path            0
md5             0
sha1            0
sha256          0
total           0
positives       0
malicious       0
filetype        0
submitted       0
user_id         0
length          0
entropy         0
imphash      8511
dtype: int64

In [8]:
# Per-column count of distinct values in the training table
df_train.nunique(axis=0)

path         138896
md5          138896
sha1         138896
sha256       138896
total            56
positives        70
malicious         2
filetype          1
submitted    138383
user_id           1
length        45897
entropy      130493
imphash       50544
dtype: int64

In [9]:
# Start the data tables with a single column: the path of every
# train / test sample file
df_train_data = df_train[['path']].copy()
df_test_data = df_test[['path']].copy()

df_train_data.head()

Unnamed: 0,path
0,/Volumes/ARCHIMEDES/pe-machine-learning-datase...
1,/Volumes/ARCHIMEDES/pe-machine-learning-datase...
2,/Volumes/ARCHIMEDES/pe-machine-learning-datase...
3,/Volumes/ARCHIMEDES/pe-machine-learning-datase...
4,/Volumes/ARCHIMEDES/pe-machine-learning-datase...


In [10]:
# Store the sizes of the files (currently disabled: the lines below are commented out)
#df_train_data['size'] = df_train_data['path'].apply(os.path.getsize).astype('uint64')
#df_test_data['size'] = df_test_data['path'].apply(os.path.getsize).astype('uint64')

#df_train_data.head()

In [11]:
def get_entropy(path, chunk_size=1 << 20):
    """Return the Shannon entropy (in bits per byte) of a file's contents.

    The file is streamed in chunks so that arbitrarily large files can be
    processed without loading them fully into memory.

    Parameters
    ----------
    path : str or os.PathLike
        Path of the file to read (opened in binary mode).
    chunk_size : int, optional
        Number of bytes requested per read. It only affects I/O granularity,
        not the result.

    Returns
    -------
    float
        Entropy of the byte-value distribution, between 0.0 and 8.0.
        An empty file yields 0.0.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    byte_counts = np.zeros(256, dtype=np.uint64)
    with open(path, 'rb') as file:
        while chunk := file.read(chunk_size):
            # bincount returns a signed integer array; cast it so the in-place
            # addition into the uint64 accumulator is allowed.
            byte_counts += np.bincount(
                np.frombuffer(chunk, dtype=np.uint8), minlength=256
            ).astype(np.uint64)

    # Normalize by the number of bytes actually read instead of asking the
    # filesystem for the size again: this avoids extra stat calls and keeps the
    # empty-file check consistent with the data that was counted.
    total = byte_counts.sum()
    if total == 0:
        return 0.0
    return float(entropy(byte_counts / total, base=2))

In [12]:
# Compute the Shannon entropy of every sample file and keep it in an
# 'entropy' column; tqdm.pandas() enables progress_apply so each pass
# shows a progress bar
tqdm.pandas()
for frame in (df_train_data, df_test_data):
    frame['entropy'] = frame['path'].progress_apply(get_entropy)

df_train_data.head()

  0%|          | 0/138896 [00:00<?, ?it/s]

100%|██████████| 138896/138896 [23:47<00:00, 97.33it/s] 
100%|██████████| 34724/34724 [09:56<00:00, 58.26it/s] 


Unnamed: 0,path,entropy
0,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,7.26516
1,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,7.935332
2,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,7.860059
3,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,6.650733
4,/Volumes/ARCHIMEDES/pe-machine-learning-datase...,6.823947


In [13]:
# Write the train and test data tables to their CSV files, without the index
for frame, destination in (
    (df_train_data, csv_train_data_path),
    (df_test_data, csv_test_data_path),
):
    frame.to_csv(destination, index=False)

In [14]:
# Write the path/label pairs of both splits to their CSV files, without the index
label_columns = ['path', 'malicious']
df_train[label_columns].to_csv(csv_train_labels_path, index=False)
df_test[label_columns].to_csv(csv_test_labels_path, index=False)

*** IMPORTANT ***

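The cells above generate the CSV files needed for training and testing. After running them once, you don't need to run them again; on later sessions, run only the import and global-variable cells (the first two cells) and then continue with the cells below.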

In [3]:
# Load the previously saved train and test data tables
df_train_data, df_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (csv_train_data_path, csv_test_data_path)
)

In [4]:
# Load the previously saved train and test label tables
df_train_labels, df_test_labels = (
    pd.read_csv(csv_path)
    for csv_path in (csv_train_labels_path, csv_test_labels_path)
)

In [5]:
# Standardize the entropy column (z-score). The scaler is fitted on the
# training data only and then applied unchanged to both splits.
feature_columns = ['entropy']
scaler = StandardScaler().fit(df_train_data[feature_columns])
x_train = scaler.transform(df_train_data[feature_columns])
x_test = scaler.transform(df_test_data[feature_columns])

x_train[:5]

array([[0.83732684],
       [1.44593871],
       [1.37758008],
       [0.27933922],
       [0.43664271]])

In [6]:
# Extract the 'malicious' labels of both splits as NumPy arrays
y_train = np.array(df_train_labels['malicious'])
y_test = np.array(df_test_labels['malicious'])

# Show the class balance of each split, separated by a blank line
train_label_counts = df_train_labels['malicious'].value_counts()
test_label_counts = df_test_labels['malicious'].value_counts()
print(train_label_counts)
print()
print(test_label_counts)

malicious
True     69448
False    69448
Name: count, dtype: int64

malicious
True     17362
False    17362
Name: count, dtype: int64


In [7]:
# Candidate classifiers keyed by display name, all created with the same
# random seed and otherwise default settings (LinearSVC also sets dual='auto').
# NOTE(review): the recorded output only lists the two single-tree models;
# confirm which of these estimators were actually run on the full training set.
models = {
    # Ensembles
    'Adaptive Boosting Classifier': AdaBoostClassifier(random_state=seed),
    'Bagging Classifier': BaggingClassifier(random_state=seed),
    'Extra-Trees Classifier': ExtraTreesClassifier(random_state=seed),
    'GBM': GradientBoostingClassifier(random_state=seed),
    'Random Forest Classifier': RandomForestClassifier(random_state=seed),
    'Histogram-based GBM': HistGradientBoostingClassifier(random_state=seed),
    # Gaussian process
    'GPC': GaussianProcessClassifier(random_state=seed),
    # Linear models
    'Passive Aggressive Classifier': PassiveAggressiveClassifier(random_state=seed),
    'Ridge Classifier': RidgeClassifier(random_state=seed),
    'SGD': SGDClassifier(random_state=seed),
    # Neural network
    'MLP': MLPClassifier(random_state=seed),
    # Support vector machines
    'Linear SVC': LinearSVC(dual='auto', random_state=seed),
    'Nu-SVC': NuSVC(random_state=seed),
    'C-SVC': SVC(random_state=seed),
    # Single trees
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=seed),
    'Extremely Randomized Tree Classifier': ExtraTreeClassifier(random_state=seed),
}

# Fit each classifier on the training split and print its test accuracy
for name in models:
    model = models[name].fit(x_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    print(f'{name}: {test_accuracy}')

Decision Tree Classifier: 0.7151537841262527
Extremely Randomized Tree Classifier: 0.7181200322543486
