In [1]:
# Imports.
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

%matplotlib inline

In [2]:
# Read the two CSV exports: the training frame and the small hold-out frame.
malware_train = pd.read_csv(filepath_or_buffer='kaggle_dataset_malware.csv')
malware_val = pd.read_csv(filepath_or_buffer='kaggle_dataset_test.csv')

In [3]:
# Preview the first rows of the training frame to sanity-check the parsed columns.
malware_train.head()

Unnamed: 0,Name,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,...,SectionMaxChar,SectionMainChar,DirectoryEntryImport,DirectoryEntryImportSize,DirectoryEntryExport,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity
0,VirusShare_a878ba26000edaac5c98eff4432723b3,23117,144,3,0,4,0,65535,0,184,...,3758096608,0,7,152,0,0,54440,77824,73728,0
1,VirusShare_ef9130570fddc174b312b2047f5f4cf0,23117,144,3,0,4,0,65535,0,184,...,3791650880,0,16,311,0,0,262276,294912,0,346112
2,VirusShare_ef84cdeba22be72a69b198213dada81a,23117,144,3,0,4,0,65535,0,184,...,3221225536,0,6,176,0,0,36864,40960,0,0
3,VirusShare_6bf3608e60ebc16cbcff6ed5467d469e,23117,144,3,0,4,0,65535,0,184,...,3224371328,0,8,155,0,0,356352,1003520,0,14109472
4,VirusShare_2cc94d952b2efb13c7d6bbe0dd59d3fb,23117,144,3,0,4,0,65535,0,184,...,3227516992,0,2,43,0,0,61440,73728,0,90624


In [4]:
# True if any cell of the training frame is missing.
malware_train.isna().values.any()

False

In [5]:
# True if any cell of the hold-out frame is missing.
malware_val.isna().values.any()

False

In [6]:
# Column names, dtypes and non-null counts of the hold-out frame.
malware_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 78 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          17 non-null     object 
 1   e_magic                       17 non-null     int64  
 2   e_cblp                        17 non-null     int64  
 3   e_cp                          17 non-null     int64  
 4   e_crlc                        17 non-null     int64  
 5   e_cparhdr                     17 non-null     int64  
 6   e_minalloc                    17 non-null     int64  
 7   e_maxalloc                    17 non-null     int64  
 8   e_ss                          17 non-null     int64  
 9   e_sp                          17 non-null     int64  
 10  e_csum                        17 non-null     int64  
 11  e_ip                          17 non-null     int64  
 12  e_cs                          17 non-null     int64  
 13  e_lfarl

In [7]:
# Columns that exist in the training frame but are absent from the hold-out frame.
[
    column
    for column in malware_train.columns
    if column not in malware_val.columns
]

['Malware']

In [8]:
# Target vector: 0 = benign, 1 = malicious; show the class balance.
y = malware_train["Malware"]
y.value_counts()

1    14599
0     5012
Name: Malware, dtype: int64

In [9]:
# Build the feature frames: drop the file-name identifier from both frames
# and, for the training frame, the target column as well.
X = malware_train.drop(columns=["Name", "Malware"])
X_val = malware_val.drop(columns=["Name"])

In [10]:
# Hold out 20 % of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Hyper-parameter search for the MLP: 5-fold cross-validation over the
# 2 x 2 x 2 x 2 x 2 = 32 combinations below, run on all available cores.
#
# random_state pins weight initialisation and batch shuffling so that the
# selected parameters are the same after "Restart Kernel -> Run All".
#
# NOTE(review): per the scikit-learn docs `learning_rate` is only used by the
# 'sgd' solver, so the 'adam' combinations are evaluated twice.
# NOTE(review): the features are passed in unscaled; scikit-learn's MLP guide
# recommends standardising inputs - consider a StandardScaler pipeline.
mlp_gs = MLPClassifier(random_state=42)
parameter_space = {
    'hidden_layer_sizes': [(10, 30, 10), (20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)  # training split only; X_test / y_test stay unseen

GridSearchCV(cv=5, estimator=MLPClassifier(), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(10, 30, 10), (20,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [16]:
# Report the parameter combination selected by the grid search.
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate': 'constant', 'solver': 'adam'}


In [12]:
# Default seeds used by collect_statistics: one model is trained per value.
stat_range = range(30)


def collect_statistics(create_classifier, list_initializers, Xtrain=None, ytrain=None, Xval=None, yval=None, seeds=None):
    """Train one model per seed and collect per-run statistics.

    Parameters
    ----------
    create_classifier : callable
        Called as ``create_classifier(seed)``; must return an object exposing
        ``fit``, ``predict`` and ``predict_proba``.
    list_initializers : sequence of callables
        Each is called as ``f(y_pred, proba, p0, p1)`` after every run, where
        ``y_pred`` is ``model.predict(Xval)``, ``proba`` is
        ``model.predict_proba(Xval)``, ``p0`` is column 0 of ``proba`` for the
        rows whose true label is 0 and ``p1`` is column 1 of ``proba`` for the
        rows whose true label is 1.
    Xtrain, ytrain, Xval, yval : array-like, optional
        Training and evaluation data. When omitted they fall back to the
        notebook globals ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
        looked up at call time so a re-run of the split cell is picked up.
    seeds : iterable, optional
        Values passed to ``create_classifier``; defaults to ``stat_range``.

    Returns
    -------
    tuple of list
        One list per entry of ``list_initializers`` (same order), holding that
        callable's result for every seed.
    """
    # Resolve defaults lazily: binding them in the signature would freeze
    # whatever objects the globals pointed to when this cell was executed.
    if Xtrain is None:
        Xtrain = X_train
    if ytrain is None:
        ytrain = y_train
    if Xval is None:
        Xval = X_test
    if yval is None:
        yval = y_test
    if seeds is None:
        seeds = stat_range

    # The class masks do not depend on the model, so build them once.
    # np.asarray makes the masks plain positional arrays for any array-like.
    true_labels = np.asarray(yval)
    is_class0 = true_labels == 0
    is_class1 = true_labels == 1

    stats = [[] for _ in list_initializers]

    for seed in seeds:
        model = create_classifier(seed)
        model.fit(Xtrain, ytrain)

        y_pred = model.predict(Xval)
        proba = model.predict_proba(Xval)
        p0 = proba[is_class0, 0]
        p1 = proba[is_class1, 1]

        for bucket, compute in zip(stats, list_initializers):
            bucket.append(compute(y_pred, proba, p0, p1))

    return tuple(stats)

In [20]:
# Define the MLP with the best hyper-parameters reported by the grid search.
# random_state is fixed so the metrics printed below are reproducible;
# without it the scores change on every run of this cell.
mlp = MLPClassifier(
    activation='relu',
    alpha=0.0001,
    hidden_layer_sizes=(20,),
    learning_rate='constant',
    solver='adam',
    random_state=42,
)

# Train the model on the training split
mlp.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = mlp.predict(X_test)

# Evaluate the model: per-class precision/recall/F1 and the confusion matrix
# (rows = true class, columns = predicted class)
print(f"\nClassification Report\n {classification_report(y_test, y_pred)}")
print(f"\nConfusion Matrix\n {confusion_matrix(y_test, y_pred)}")


Classification Report
               precision    recall  f1-score   support

           0       0.74      0.85      0.79       995
           1       0.94      0.90      0.92      2928

    accuracy                           0.89      3923
   macro avg       0.84      0.87      0.86      3923
weighted avg       0.89      0.89      0.89      3923


Confusion Matrix
 [[ 841  154]
 [ 293 2635]]


This is the result of evaluating a classification model on a test set of data. The classification report shows the precision, recall, and f1-score for each class, as well as the overall accuracy of the model. The Confusion matrix also provides a detailed breakdown of the model's performance.

The precision for class 0 is 0.74, which means that 74% of the instances that the model predicted to be class 0 were actually class 0. The recall for class 0 is 0.85, which means that 85% of the actual class 0 instances were correctly predicted by the model. The F1-score is the harmonic mean of precision and recall, and it is 0.79 for class 0.

Similarly, the precision for class 1 is 0.94, which means that 94% of the instances that the model predicted to be class 1 were actually class 1. The recall for class 1 is 0.90, which means that 90% of the actual class 1 instances were correctly predicted by the model. The F1-score is the harmonic mean of precision and recall, and it is 0.92 for class 1.

The overall accuracy of the model is 0.89, which means that 89% of the test instances were correctly classified by the model.

The confusion matrix shows the number of true positives, false positives, true negatives, and false negatives for each class. In this case, the model correctly predicted 841 instances of class 0, 154 instances of class 0 were incorrectly predicted as class 1, 2635 instances of class 1 were correctly predicted and 293 instances of class 1 were incorrectly predicted as class 0.

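Overall, this is a reasonably good result. The model has a high accuracy, and class 1 (malicious) is detected with high precision, recall and F1-score. Precision for class 0 (benign) is noticeably lower at 0.74: 293 of the 1134 files predicted as benign are actually malware.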

In [14]:
# Predict labels for the hold-out files. This frame has no `Malware` column,
# so the predictions can only be inspected by eye, not scored.
y_valid = mlp.predict(X_val)
y_valid

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int64)

In [15]:
# File names of the hold-out samples, in the same row order as y_valid.
malware_val['Name']

0                                    Skype-8.10.0.9.exe
1                                   vlc-3.0.2-win64.exe
2                                         stinger32.exe
3                                  SpotifyFullSetup.exe
4                                      uftp_english.exe
5     161a59f2525518f799c63f916c80fe85f50c5b09c74dc2...
6     eaa478e65696ad5cbdb42c1b4bd6954f2a876fdde2e519...
7                                     reverse_shell.exe
8     873b9eaef6ea5ed6126086594529a3395bdbc5d63c97d8...
9                               ScratchInstaller1.4.exe
10    69eb27dd3bbf5077dcd795872535b89af9a898254b90ad...
11    3334686141a400bb522824fa6f7faf30614372fe11837a...
12    3ec4cb928846f8298e5a13b3e96bfc2a709cb3b005a31e...
13    252f705dc15d7a305afd3e0619fa014c10b679248f71b7...
14                                         wordweb8.exe
15    c89f1e55b418a4447394994498971c6e6f3848bfe39ef9...
16                                   winrar-x64-550.exe
Name: Name, dtype: object