In [1]:
import joblib
import json
import numpy as np
import os
import pandas as pd
import pefile
import time
from io import DEFAULT_BUFFER_SIZE
from scipy.stats import entropy
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide configuration

# Seed passed as `random_state` to the models
seed = 42

# Sample directories (training / testing)
dir_train = 'train'
dir_test = 'test'

# Original input CSVs (training / testing)
csv_input_train_path = 'train.csv'
csv_input_test_path = 'test.csv'

# Generated feature tables (training / testing)
csv_train_data_path = 'train-data.csv'
csv_test_data_path = 'test-data.csv'

# Generated label tables (training / testing)
csv_train_labels_path = 'train-labels.csv'
csv_test_labels_path = 'test-labels.csv'

In [3]:
def _read_sample_index(csv_path, sample_dir):
    """Load one split's CSV and normalise it.

    Renames `id` -> `path` and `list` -> `malicious`, turns `malicious` into a
    bool (True where the value is 'Blacklist'), prefixes each file name with
    `sample_dir`, and returns the rows sorted by path with a fresh index.
    """
    return (
        pd.read_csv(csv_path)
        .rename(columns={'id': 'path', 'list': 'malicious'})
        .assign(
            malicious=lambda frame: frame['malicious'].eq('Blacklist'),
            path=lambda frame: frame['path'].apply(
                lambda name: os.path.join(sample_dir, str(name))
            ),
        )
        .sort_values(by='path')
        .reset_index(drop=True)
    )


# Build the training and testing indexes the same way
df_train = _read_sample_index(csv_input_train_path, dir_train)
df_test = _read_sample_index(csv_input_test_path, dir_test)

# Show the first training rows
df_train.head()

Unnamed: 0,path,md5,sha1,sha256,total,positives,malicious,filetype,submitted,user_id,length,entropy
0,train\1,0000e1cccca9ca2ae5d1dfb124686920,7704b8f6a9538f9bc2366696cf85eeec1f15e224,e1bde0b326a91688e8718086d06d1377394f2cb7feddec...,64,42,True,exe,40:22.2,1,67328,7.26516
1,train\10,000a1a3e8204261b7408ff51ffa5b440,fb4a5e7801cf6d0f293b6051569e325c46271309,9d5fc038b67068fa5fba1bbb34b938db83b26638b4e22f...,52,23,True,exe,40:24.5,1,773024,7.935332
2,train\100,013539d21eb2a5db8d0554affb3541b8,7d03a5db7166724ce98840ecb925bdab488e41e6,321d5a6d4f78f64b579f4182d18857e342da95c80dd30c...,51,46,True,exe,23:36.9,1,237766,5.571339
3,train\1000,00a1a53ae5ed39cbc04df7bd5bb3f9d1,249042d8478015cefdcd114a2009db14c2c20d48,3a3d86e16b03f98d10e3bd28d69f8486b868ce107b3332...,54,51,True,exe,10:37.1,1,57856,7.860059
4,train\1001,00a1abf0d976822bd049a72194af7440,cc96a187b2eca3f1b05fe0ed4c2ed7d005c3cb5a,e61e81190ca872eead63880a86a76f5c12563d51bd62d3...,51,40,True,exe,10:37.2,1,288256,7.016225


In [4]:
# Labels table: one row per sample with its path and its `malicious` flag
_label_columns = ['path', 'malicious']
df_train_labels = df_train.loc[:, _label_columns].copy()
df_test_labels = df_test.loc[:, _label_columns].copy()

# Persist the labels (no index column in the CSV)
for _labels, _destination in (
    (df_train_labels, csv_train_labels_path),
    (df_test_labels, csv_test_labels_path),
):
    _labels.to_csv(_destination, index=False)

# Data table: starts with only `path`; feature columns are added to it later
df_train_data = df_train.loc[:, ['path']].copy()
df_test_data = df_test.loc[:, ['path']].copy()

# Show the first training rows
df_train_data.head()

Unnamed: 0,path
0,train\1
1,train\10
2,train\100
3,train\1000
4,train\1001


In [5]:
# Row counts, missing paths and distinct paths for both splits, emitted in one print
_summary_lines = [
    f'# train rows:               {len(df_train_data)}',
    f'# test rows:                {len(df_test_data)}',
    f"# NaN train path values:    {df_train_data['path'].isna().sum()}",
    f"# NaN test path values:     {df_test_data['path'].isna().sum()}",
    f"# unique train path values: {df_train_data['path'].nunique()}",
    f"# unique test path values:  {df_test_data['path'].nunique()}",
]
print('\n'.join(_summary_lines))

# train rows:               4000
# test rows:                1000
# NaN train path values:    0
# NaN test path values:     0
# unique train path values: 4000
# unique test path values:  1000


In [6]:
def get_entropy(path):
    """Return the Shannon entropy (base 2) of the byte values in the file at `path`.

    The file is streamed in `DEFAULT_BUFFER_SIZE` chunks while a 256-bin
    histogram of byte values is accumulated. An empty file yields 0.
    """
    histogram = np.zeros(256, dtype=np.uint64)
    with open(path, 'rb') as stream:
        # Two-argument iter() keeps calling read() until it returns b'' (EOF)
        for block in iter(lambda: stream.read(DEFAULT_BUFFER_SIZE), b''):
            block_counts = np.bincount(np.frombuffer(block, dtype=np.uint8), minlength=256)
            # bincount returns a signed integer array; cast so the in-place add is allowed
            histogram += block_counts.astype(np.uint64)
    total_bytes = os.path.getsize(path)
    if not total_bytes:
        return 0
    return entropy(histogram / total_bytes, base=2)

In [7]:
# Register `progress_apply` on pandas objects so the per-file work shows a progress bar
tqdm.pandas()

# Add an `entropy` column holding each file's Shannon entropy
for _frame in (df_train_data, df_test_data):
    _frame['entropy'] = _frame['path'].progress_apply(get_entropy)

# Show the first training rows
df_train_data.head()

100%|██████████| 4000/4000 [00:20<00:00, 193.28it/s]
100%|██████████| 1000/1000 [00:04<00:00, 224.59it/s]


Unnamed: 0,path,entropy
0,train\1,7.26516
1,train\10,7.935332
2,train\100,5.571339
3,train\1000,7.860059
4,train\1001,7.016225


In [8]:
def parse_pe(path):
    """Parse the file at `path` with pefile and return its dumped fields as a dict.

    The `LOAD_CONFIG` and `TLS` entries and every top-level entry whose value
    is a list are removed. `'Parsing Warnings'` is replaced by a bool: True if
    pefile reported warnings or if parsing failed, False otherwise.

    Parsing is best-effort: any `Exception` is swallowed and reported through
    the `'Parsing Warnings'` flag. `KeyboardInterrupt` and `SystemExit` are not
    caught, so a long-running batch can still be interrupted.
    """
    d = dict()
    try:
        pe = pefile.PE(path)
        try:
            d = pe.dump_dict()
        finally:
            # Release the handle pefile keeps on the file instead of waiting for GC
            pe.close()
        d.pop('LOAD_CONFIG', None)
        d.pop('TLS', None)
        # Collapse the warnings entry into a simple presence flag
        d['Parsing Warnings'] = 'Parsing Warnings' in d
        # Snapshot the keys because entries are removed while iterating
        for key in list(d.keys()):
            if isinstance(d[key], list):
                d.pop(key, None)

    except Exception:
        # Keep whatever was collected so far and flag the file as problematic
        d['Parsing Warnings'] = True

    return d

In [9]:
# Snapshot both dataframes first: the parsing step below is slow
_temp_df_train_data, _temp_df_test_data = df_train_data.copy(), df_test_data.copy()

# Parse every file into a dict of portable executable fields
# (LOAD_CONFIG, TLS and list-valued entries are excluded by `parse_pe`)
_temp_train, _temp_test = (
    _frame['path'].progress_apply(parse_pe)
    for _frame in (_temp_df_train_data, _temp_df_test_data)
)

100%|██████████| 4000/4000 [08:56<00:00,  7.45it/s]
100%|██████████| 1000/1000 [02:00<00:00,  8.33it/s]


In [10]:
# Add the portable executable file information to the data
df_train_data = pd.concat([_temp_df_train_data, pd.json_normalize(_temp_train)], axis=1)
df_test_data = pd.concat([_temp_df_test_data, pd.json_normalize(_temp_test)], axis=1)

# Convert bools to int
# (builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24)
_bool_cols = df_train_data.select_dtypes(include=bool).columns
df_train_data[_bool_cols] = df_train_data[_bool_cols].astype(int)
_bool_cols = df_test_data.select_dtypes(include=bool).columns
df_test_data[_bool_cols] = df_test_data[_bool_cols].astype(int)

# Replace all NaN values with 0
df_train_data = df_train_data.fillna(0)
df_test_data = df_test_data.fillna(0)

# Exclude columns that hold a single value in the training data
# (the same columns are dropped from the test data to keep both tables aligned)
_cols = df_train_data.columns[df_train_data.nunique() <= 1].tolist()
df_train_data = df_train_data.drop(columns=_cols)
df_test_data = df_test_data.drop(columns=_cols)

# Exclude columns that end with 'Offset' (this removes a lot of useless data)
_cols = [_col for _col in df_train_data.columns if _col.endswith('Offset')]
df_train_data = df_train_data.drop(columns=_cols)
df_test_data = df_test_data.drop(columns=_cols)

In [11]:
# Preview the first rows of the training data now that the PE fields have been merged in
df_train_data.head()

Unnamed: 0,path,entropy,DOS_HEADER.e_cblp.Value,DOS_HEADER.e_cp.Value,DOS_HEADER.e_crlc.Value,DOS_HEADER.e_cparhdr.Value,DOS_HEADER.e_minalloc.Value,DOS_HEADER.e_maxalloc.Value,DOS_HEADER.e_ss.Value,DOS_HEADER.e_sp.Value,...,OPTIONAL_HEADER.SizeOfHeaders.Value,OPTIONAL_HEADER.CheckSum.Value,OPTIONAL_HEADER.Subsystem.Value,OPTIONAL_HEADER.DllCharacteristics.Value,OPTIONAL_HEADER.SizeOfStackReserve.Value,OPTIONAL_HEADER.SizeOfStackCommit.Value,OPTIONAL_HEADER.SizeOfHeapReserve.Value,OPTIONAL_HEADER.SizeOfHeapCommit.Value,OPTIONAL_HEADER.LoaderFlags.Value,OPTIONAL_HEADER.NumberOfRvaAndSizes.Value
0,train\1,7.26516,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,...,1152.0,105926.0,1.0,0.0,262144.0,4096.0,1048576.0,4096.0,0.0,16.0
1,train\10,7.935332,64.0,1.0,0.0,2.0,0.0,65535.0,0.0,184.0,...,512.0,830116.0,2.0,33024.0,1048576.0,53248.0,1048576.0,4096.0,0.0,16.0
2,train\100,5.571339,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,...,1024.0,0.0,2.0,32768.0,262144.0,4096.0,1048576.0,4096.0,0.0,16.0
3,train\1000,7.860059,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,...,1024.0,0.0,2.0,0.0,1048576.0,4096.0,1048576.0,4096.0,0.0,16.0
4,train\1001,7.016225,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,...,1024.0,145923.0,3.0,32832.0,1310720.0,4096.0,2097152.0,4096.0,0.0,16.0


In [12]:
# Inspect the column dtypes: some columns are numeric, but many hold strings (object).
# One option would be to one-hot encode the columns whose dtype is not numeric:
# https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics
df_train_data.dtypes

path                                                  object
entropy                                              float64
DOS_HEADER.e_cblp.Value                              float64
DOS_HEADER.e_cp.Value                                float64
DOS_HEADER.e_crlc.Value                              float64
DOS_HEADER.e_cparhdr.Value                           float64
DOS_HEADER.e_minalloc.Value                          float64
DOS_HEADER.e_maxalloc.Value                          float64
DOS_HEADER.e_ss.Value                                float64
DOS_HEADER.e_sp.Value                                float64
DOS_HEADER.e_csum.Value                              float64
DOS_HEADER.e_ip.Value                                float64
DOS_HEADER.e_cs.Value                                float64
DOS_HEADER.e_lfarlc.Value                            float64
DOS_HEADER.e_ovno.Value                              float64
DOS_HEADER.e_res.Value                                object
DOS_HEADER.e_oemid.Value

In [13]:
# Or... we can just drop non-numeric data for simplicity
# Collect the *labels* of the non-numeric columns (not the data itself),
# keeping `path` because it identifies each sample
_cols = [
    _col
    for _col in df_train_data.select_dtypes(exclude=np.number).columns
    if _col != 'path'
]
df_train_data = df_train_data.drop(columns=_cols)
df_test_data = df_test_data.drop(columns=_cols)

# All of the columns should be numeric except for `path`
df_train_data.dtypes

path                                                  object
entropy                                              float64
DOS_HEADER.e_cblp.Value                              float64
DOS_HEADER.e_cp.Value                                float64
DOS_HEADER.e_crlc.Value                              float64
DOS_HEADER.e_cparhdr.Value                           float64
DOS_HEADER.e_minalloc.Value                          float64
DOS_HEADER.e_maxalloc.Value                          float64
DOS_HEADER.e_ss.Value                                float64
DOS_HEADER.e_sp.Value                                float64
DOS_HEADER.e_csum.Value                              float64
DOS_HEADER.e_ip.Value                                float64
DOS_HEADER.e_cs.Value                                float64
DOS_HEADER.e_lfarlc.Value                            float64
DOS_HEADER.e_ovno.Value                              float64
DOS_HEADER.e_oemid.Value                             float64
DOS_HEADER.e_oeminfo.Val

In [14]:
# Checkpoint progress: write both feature tables to disk without the index column
for _frame, _destination in (
    (df_train_data, csv_train_data_path),
    (df_test_data, csv_test_data_path),
):
    _frame.to_csv(_destination, index=False)

*** IMPORTANT ***

The cells above generate the CSV files needed for training and testing. After running them once, you don't need to run them again — except for the first two cells (imports and global variables), which the cells below still depend on.

In [15]:
# Restore the checkpointed feature tables from disk
df_train_data, df_test_data = (
    pd.read_csv(_source) for _source in (csv_train_data_path, csv_test_data_path)
)

# Restore the checkpointed label tables from disk
df_train_labels, df_test_labels = (
    pd.read_csv(_source) for _source in (csv_train_labels_path, csv_test_labels_path)
)

In [16]:
# Feature matrices: everything except `path`, which isn't numeric
_train_features = df_train_data.drop(columns=['path'])
_test_features = df_test_data.drop(columns=['path'])

# Z-score normalize, fitting the scaler on the training features only
scaler = StandardScaler()
x_train = scaler.fit_transform(_train_features)

# Persist the fitted scaler, then carry on with the copy reloaded from disk
joblib.dump(scaler, 'scaler.pkl')
scaler = joblib.load('scaler.pkl')
x_test = scaler.transform(_test_features)

# Show the first rows of the scaled training input
x_train[:5]

array([[ 8.42332569e-01, -6.20624604e-02, -7.34767209e-02,
        -7.04036935e-02, -6.30046152e-02, -6.86799332e-02,
         1.85405551e-01, -2.86972504e-02, -6.11441838e-02,
        -5.67933396e-02, -8.46921441e-02, -7.94451193e-02,
        -8.01580409e-02, -7.88597302e-02, -8.97287396e-02,
        -8.65410040e-02, -1.43974306e+00, -4.55385856e-01,
         1.88692055e+00, -6.09072100e-02, -7.33710321e-02,
        -3.24801716e-01, -6.79522636e-01, -4.52841806e-01,
        -1.17510416e-02, -2.83145337e-01, -3.00496835e-02,
        -2.67172076e-02, -7.08613498e-02, -6.56800773e-02,
        -4.45222481e-02, -3.37364583e-01, -1.99843691e-01,
        -2.30021475e+00, -7.55484664e-01, -4.48849835e-02,
        -4.15350331e-02, -5.39171951e-02, -5.52618206e-02,
         1.67445230e-01,  4.18040654e-01, -4.25690532e-02,
        -2.97523788e-01, -4.19588306e-01, -1.49193380e-01,
        -2.70900839e+00, -9.03034454e-01, -7.69458623e-01,
        -4.56408928e-02,  5.46366544e-02, -2.30262665e-0

In [17]:
# Extract the `malicious` labels of both splits as NumPy arrays
y_train, y_test = (
    np.array(_labels['malicious']) for _labels in (df_train_labels, df_test_labels)
)

# Check class balance: print the label counts of both splits, separated by a blank line
print(
    df_train_labels['malicious'].value_counts(),
    df_test_labels['malicious'].value_counts(),
    sep='\n\n',
)

malicious
True     2000
False    2000
Name: count, dtype: int64

malicious
True     500
False    500
Name: count, dtype: int64


In [18]:
# Candidate classifiers, all seeded for reproducibility
models = {
    'Adaptive Boosting Classifier': AdaBoostClassifier(random_state=seed),
    'GBM': GradientBoostingClassifier(random_state=seed),
    'Histogram-based GBM': HistGradientBoostingClassifier(random_state=seed),
    'Random Forest Classifier': RandomForestClassifier(random_state=seed),
    'MLP': MLPClassifier(max_iter=1000, random_state=seed),
    'C-SVC': SVC(random_state=seed),
}

# Maps model name -> {'accuracy': test accuracy, 'time_ns': prediction time in ns}
results = dict()

for name, model in models.items():
    # Train the model
    model = model.fit(x_train, y_train)
    # Test the model, timing only the prediction step
    start_time = time.perf_counter_ns()
    y_pred = model.predict(x_test)
    end_time = time.perf_counter_ns()
    # Save the results; `time_ns` is the elapsed time, not a raw timestamp
    results[name] = dict()
    results[name]['accuracy'] = accuracy_score(y_test, y_pred)
    results[name]['time_ns'] = end_time - start_time
    # Print the results of the test
    # (outer double quotes keep this valid on Python versions before 3.12)
    print(f"{name}: {results[name]['accuracy']}")

# Save results
path_results = 'test_results.json'
with open(path_results, 'w') as file:
    json.dump(results, file)

# Use the model with the highest accuracy
# If two models tie, take the faster one (hence the negated time in the key)
best_model_name = max(results, key=lambda m: (results[m]['accuracy'], -results[m]['time_ns']))
print(f'The best model is {best_model_name}')



Adaptive Boosting Classifier: 0.949
GBM: 0.961
Histogram-based GBM: 0.961
Random Forest Classifier: 0.967
MLP: 0.945
C-SVC: 0.929
The best model is Random Forest Classifier


In [19]:
# Make sure the output directory exists before writing into it
os.makedirs('models', exist_ok=True)

# Save the models
for name, model in models.items():
    # Outer double quotes keep this valid on Python versions before 3.12
    model_path = f"models/{name.replace(' ', '_')}_model.pkl"
    joblib.dump(model, model_path)
    print(f'{name} model saved in {model_path}')

# Save the best model as `best_model.pkl`
with open('models/best_model_name.txt', 'w') as file:
    file.write(best_model_name)
best_model_path = 'models/best_model.pkl'
# Dump the fitted estimator itself (from `models`), not its metrics entry in `results`
joblib.dump(models[best_model_name], best_model_path)

Adaptive Boosting Classifier model saved in models/Adaptive_Boosting_Classifier_model.pkl
GBM model saved in models/GBM_model.pkl
Histogram-based GBM model saved in models/Histogram-based_GBM_model.pkl
Random Forest Classifier model saved in models/Random_Forest_Classifier_model.pkl
MLP model saved in models/MLP_model.pkl
C-SVC model saved in models/C-SVC_model.pkl


['models/best_model.pkl']