# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the CSV at the relative path below into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../dataset/dataset_merged.csv')

In [3]:
# Drop each listed column from `df` in place, reporting per column whether it
# was removed or was not present to begin with.
columns_to_remove = [
    'Unnamed: 0',
    'filename',
    'SectionNames',
    'ImportedFunctions',
    'ImportedDLL',
    'e_res',
    'e_res2',
    'TimeDateStamp',
]
for col in columns_to_remove:
    # Guard clause: nothing to delete when the column is absent.
    if col not in df.columns:
        print(f"[!] Column '{col}' does not exist in DataFrame")
        continue
    del df[col]
    print(f"[*] Column '{col}' removed successfully")

[*] Column 'Unnamed: 0' removed successfully
[*] Column 'filename' removed successfully
[*] Column 'SectionNames' removed successfully
[*] Column 'ImportedFunctions' removed successfully
[*] Column 'ImportedDLL' removed successfully
[*] Column 'e_res' removed successfully
[*] Column 'e_res2' removed successfully
[*] Column 'TimeDateStamp' removed successfully


In [4]:
# Fill missing feature values with the mean of that feature within the row's
# "Malware" class.
#
# NOTE(review): this imputation uses the target label and is computed on the
# full dataset before the train/test split, so test rows influence the fill
# values and the same step cannot be applied to unlabeled samples at
# prediction time. Consider imputing from training-set statistics only.
feature_cols = [name for name in df.columns if name != "Malware"]

# One grouped pass over all features replaces a separate groupby + Python
# lambda per column. The grouping column is excluded: it is the key, so there
# is no per-class mean to fill it with.
class_means = df.groupby("Malware")[feature_cols].transform("mean")
df[feature_cols] = df[feature_cols].fillna(class_means)

In [5]:
# Preview the first five rows after column removal and imputation.
df.head(n=5)

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,BoundImportSize,IATRVA,IATSize,DelayImportRVA,DelayImportSize,COMDescriptorRVA,COMDescriptorSize,NumberOfImportDLL,NumberOfImportFunctions,Malware
0,23117,144,3,0,4,0,65535,0,184,0,...,0.0,266240.0,1960.0,0.0,0.0,0.0,0.0,10,235,0
1,23117,144,3,0,4,0,65535,0,184,0,...,0.0,38332.0,1208.0,0.0,0.0,0.0,0.0,12,139,0
2,23117,144,3,0,4,0,65535,0,184,0,...,0.0,54108.0,760.0,0.0,0.0,0.0,0.0,4,91,0
3,23117,144,3,0,4,0,65535,0,184,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,67,0
4,23117,80,2,0,4,15,65535,0,184,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,82,0


In [6]:
# Explicit list of feature columns fed to the models, in the order used for training.
feature_columns = [
    # DOS header fields
    'e_magic', 'e_cblp', 'e_cp', 'e_crlc', 'e_cparhdr', 'e_minalloc',
    'e_maxalloc', 'e_ss', 'e_sp', 'e_csum', 'e_ip', 'e_cs', 'e_lfarlc',
    'e_ovno', 'e_oemid', 'e_oeminfo', 'e_lfanew',
    # file header fields
    'Machine', 'NumberOfSections', 'PointerToSymbolTable', 'NumberOfSymbols',
    'SizeOfOptionalHeader', 'Characteristics',
    # optional header fields
    'Magic', 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
    'SizeOfInitializedData', 'SizeOfUninitializedData', 'AddressOfEntryPoint',
    'BaseOfCode', 'BaseOfData', 'ImageBase', 'SectionAlignment',
    'FileAlignment', 'MajorOperatingSystemVersion',
    'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
    'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
    'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
    'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
    'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes',
    # data directory RVA / size pairs
    'ExportRVA', 'ExportSize', 'ImportRVA', 'ImportSize', 'ResourceRVA',
    'ResourceSize', 'ExceptionRVA', 'ExceptionSize', 'SecurityRVA',
    'SecuritySize', 'BaserelocRVA', 'BaserelocSize', 'DebugRVA', 'DebugSize',
    'ArchitectureRVA', 'ArchitectureSize', 'GlobalPtrRVA', 'GlobalPtrSize',
    'TLSRVA', 'TLSSize', 'LoadConfigRVA', 'LoadConfigSize', 'BoundImportRVA',
    'BoundImportSize', 'IATRVA', 'IATSize', 'DelayImportRVA',
    'DelayImportSize', 'COMDescriptorRVA', 'COMDescriptorSize',
    # import counts
    'NumberOfImportDLL', 'NumberOfImportFunctions',
]

# Feature matrix and target vector.
X = df[feature_columns]
y = df['Malware']

In [7]:
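# After running BER: reduced feature set (kept commented out).
# NOTE: 'OSVersion', 'StackReserveSize' and 'Dll' are not among the columns selected in the cell above; confirm these names exist in df before uncommenting.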
# X = df[['DebugSize', 'DebugRVA', 'OSVersion', 'ExportRVA', 'NumberOfSections', 'StackReserveSize', 'Dll', 'AddressOfEntryPoint', 'NumberOfImportDLL', 'NumberOfImportFunctions']]
# y= df['Malware']

# Doing train_test_split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Training the model Decision Tree


In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:
# Gini decision tree with capped depth and minimum split / leaf sizes.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=0,
)
# Fit on the training split; as the cell's last expression this also displays the estimator.
clf.fit(X_train, y_train)

# Checking accuracy of training dataset

In [77]:
from sklearn.metrics import accuracy_score

In [78]:
# Predict on the rows the tree was trained on, then compare with the true labels.
predictions_train = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=predictions_train)

0.980349344978166

# Checking accuracy of testing dataset

In [79]:
# Predict on the held-out rows, then compare with the true labels.
predictions_test = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions_test)

0.9779286926994907

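#### Here the training accuracy (≈0.980) and the test accuracy (≈0.978) are almost identical, so this depth-limited tree (max_depth=5, min_samples_split=20, min_samples_leaf=10) shows no sign of overfitting. A large gap between the two scores would indicate overfitting, in which case pruning could be applied.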

# Evaluating our training dataset

In [81]:
from sklearn.metrics import classification_report,confusion_matrix

In [82]:
# Per-class precision / recall / F1 followed by the confusion matrix (training split).
print(
    classification_report(y_train, predictions_train),
    confusion_matrix(y_train, predictions_train),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       675
           1       0.97      0.99      0.98       699

    accuracy                           0.98      1374
   macro avg       0.98      0.98      0.98      1374
weighted avg       0.98      0.98      0.98      1374

[[655  20]
 [  7 692]]


# Evaluating our test dataset

In [83]:
# Per-class precision / recall / F1 followed by the confusion matrix (test split).
print(
    classification_report(y_test, predictions_test),
    confusion_matrix(y_test, predictions_test),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       288
           1       0.97      0.99      0.98       301

    accuracy                           0.98       589
   macro avg       0.98      0.98      0.98       589
weighted avg       0.98      0.98      0.98       589

[[278  10]
 [  3 298]]


# Training Initial Algorithm - Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes model and train it on the training split.
nb_model = GaussianNB()

# y_train is a pandas Series; Series.ravel() is deprecated in recent pandas,
# and to_numpy() yields the same 1-D label array.
nb_model.fit(X_train, y_train.to_numpy())



## Testing the Accuracy

### Performance on Training Data

In [36]:
# Metrics helpers; later evaluation cells also reference `metrics`.
from sklearn import metrics

# Score the fitted Naive Bayes model on the rows it was trained on.
nb_predict_train = nb_model.predict(X_train)
nb_train_accuracy = metrics.accuracy_score(y_train, nb_predict_train)

# Accuracy to four decimals, followed by one blank line.
print("Training Accuracy: {0:.4f}".format(nb_train_accuracy), end="\n\n")



Training Accuracy: 0.5808



In [37]:
# Confusion matrix, a blank separator line, then the per-class report (training split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_train, nb_predict_train)),
    "",
    "Classification Report",
    metrics.classification_report(y_train, nb_predict_train),
    sep="\n",
)

Confusion Matrix
[[673   2]
 [574 125]]

Classification Report
              precision    recall  f1-score   support

           0       0.54      1.00      0.70       675
           1       0.98      0.18      0.30       699

    accuracy                           0.58      1374
   macro avg       0.76      0.59      0.50      1374
weighted avg       0.77      0.58      0.50      1374



### Performance on Testing Data

In [38]:
# Score the fitted Naive Bayes model on the held-out rows.
nb_predict_test = nb_model.predict(X_test)
nb_test_accuracy = metrics.accuracy_score(y_test, nb_predict_test)

# Accuracy to four decimals, followed by one blank line.
print("Testing Accuracy: {0:.4f}".format(nb_test_accuracy), end="\n\n")




Testing Accuracy: 0.5484



In [39]:
# Confusion matrix, a blank separator line, then the per-class report (test split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_test, nb_predict_test)),
    "",
    "Classification Report",
    metrics.classification_report(y_test, nb_predict_test),
    sep="\n",
)



Confusion Matrix
[[288   0]
 [266  35]]

Classification Report
              precision    recall  f1-score   support

           0       0.52      1.00      0.68       288
           1       1.00      0.12      0.21       301

    accuracy                           0.55       589
   macro avg       0.76      0.56      0.45       589
weighted avg       0.77      0.55      0.44       589



In [40]:
# import scikitplot as skplt
# skplt.metrics.plot_confusion_matrix(y_test, nb_predict_test, normalize=True)
# plt.show()

# Random Forest

In [85]:
# random forests
from sklearn.ensemble import RandomForestClassifier
# Random forest with the same depth / split / leaf limits as the decision tree cell.
rf_model = RandomForestClassifier(random_state=42, max_depth=5, min_samples_split=20, min_samples_leaf=10)
# y_train is a pandas Series; Series.ravel() is deprecated in recent pandas,
# and to_numpy() yields the same 1-D label array.
rf_model.fit(X_train, y_train.to_numpy())

# NOTE: the repr below is stale. It lists max_depth=None, min_samples_split=2
# and min_samples_leaf=1, which do not match the hyper-parameters passed above,
# so it does not describe the model trained in this cell.
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=None, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#             oob_score=False, random_state=42, verbose=0, warm_start=False)

### Performance on Training Data

In [86]:
# Score the fitted random forest on the rows it was trained on.
rf_predict_train = rf_model.predict(X_train)
rf_train_accuracy = metrics.accuracy_score(y_train, rf_predict_train)

# Accuracy to four decimals, followed by one blank line.
print("Training Accuracy: {0:.4f}".format(rf_train_accuracy), end="\n\n")



Training Accuracy: 0.9869



In [87]:
# Confusion matrix, a blank separator line, then the per-class report (training split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_train, rf_predict_train)),
    "",
    "Classification Report",
    metrics.classification_report(y_train, rf_predict_train),
    sep="\n",
)

Confusion Matrix
[[665  10]
 [  8 691]]

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       675
           1       0.99      0.99      0.99       699

    accuracy                           0.99      1374
   macro avg       0.99      0.99      0.99      1374
weighted avg       0.99      0.99      0.99      1374



### Performance on Testing Data

In [88]:
# Score the fitted random forest on the held-out rows.
rf_predict_test = rf_model.predict(X_test)
rf_test_accuracy = metrics.accuracy_score(y_test, rf_predict_test)

# Accuracy to four decimals, followed by one blank line.
print("Testing Accuracy: {0:.4f}".format(rf_test_accuracy), end="\n\n")

Testing Accuracy: 0.9898



In [89]:
# Confusion matrix, a blank separator line, then the per-class report (test split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_test, rf_predict_test)),
    "",
    "Classification Report",
    metrics.classification_report(y_test, rf_predict_test),
    sep="\n",
)



Confusion Matrix
[[284   4]
 [  2 299]]

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       288
           1       0.99      0.99      0.99       301

    accuracy                           0.99       589
   macro avg       0.99      0.99      0.99       589
weighted avg       0.99      0.99      0.99       589



In [46]:
# skplt.metrics.plot_confusion_matrix(y_test, rf_predict_test, normalize=True)
# plt.show()



# Logistic Regression

In [47]:
#logistic regression
from sklearn.linear_model import LogisticRegression
# Logistic regression with C=0.7 using the liblinear solver.
# NOTE(review): the features are passed in unscaled, and the convergence warning
# stored with this cell recommends scaling the data. Worth trying before
# comparing this model against the tree-based ones.
lr_model = LogisticRegression(C=0.7, random_state=42, solver='liblinear')
# y_train is a pandas Series; Series.ravel() is deprecated in recent pandas,
# and to_numpy() yields the same 1-D label array.
lr_model.fit(X_train, y_train.to_numpy())
# lr_model = LogisticRegression(C=0.7, random_state=42)
# lr_model.fit(X_train, y_train.ravel())

# LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
#           penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
#           verbose=0, warm_start=False)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Performance on Training Data

In [48]:
# Score the fitted logistic regression on the rows it was trained on.
lr_predict_train = lr_model.predict(X_train)
lr_train_accuracy = metrics.accuracy_score(y_train, lr_predict_train)

# Accuracy to four decimals, followed by one blank line.
print("Training Accuracy: {0:.4f}".format(lr_train_accuracy), end="\n\n")



Training Accuracy: 0.7627



In [49]:
# Confusion matrix, a blank separator line, then the per-class report (training split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_train, lr_predict_train)),
    "",
    "Classification Report",
    metrics.classification_report(y_train, lr_predict_train),
    sep="\n",
)

Confusion Matrix
[[559 116]
 [210 489]]

Classification Report
              precision    recall  f1-score   support

           0       0.73      0.83      0.77       675
           1       0.81      0.70      0.75       699

    accuracy                           0.76      1374
   macro avg       0.77      0.76      0.76      1374
weighted avg       0.77      0.76      0.76      1374



### Performance on Testing Data

In [50]:
# Score the fitted logistic regression on the held-out rows.
lr_predict_test = lr_model.predict(X_test)
lr_test_accuracy = metrics.accuracy_score(y_test, lr_predict_test)

# Accuracy to four decimals, followed by one blank line.
print("Testing Accuracy: {0:.4f}".format(lr_test_accuracy), end="\n\n")




Testing Accuracy: 0.7402



In [51]:
# Confusion matrix, a blank separator line, then the per-class report (test split).
print(
    "Confusion Matrix",
    "{0}".format(metrics.confusion_matrix(y_test, lr_predict_test)),
    "",
    "Classification Report",
    metrics.classification_report(y_test, lr_predict_test),
    sep="\n",
)



Confusion Matrix
[[233  55]
 [ 98 203]]

Classification Report
              precision    recall  f1-score   support

           0       0.70      0.81      0.75       288
           1       0.79      0.67      0.73       301

    accuracy                           0.74       589
   macro avg       0.75      0.74      0.74       589
weighted avg       0.75      0.74      0.74       589



In [52]:
import joblib
# Serialize the fitted decision tree to a pickle file in the current working directory.
joblib.dump(value=clf, filename='./Decision_Tree.pkl')

['./Decision_Tree.pkl']

In [53]:
import joblib
# Serialize the fitted Naive Bayes model to a pickle file in the current working directory.
joblib.dump(value=nb_model, filename='./Naive_Bayes.pkl')

['./Naive_Bayes.pkl']

In [54]:
import joblib
# Serialize the fitted random forest to a pickle file in the current working directory.
joblib.dump(value=rf_model, filename='./Random_Forest.pkl')

['./Random_Forest.pkl']

In [55]:
import joblib
# Serialize the fitted logistic regression to a pickle file in the current working directory.
joblib.dump(value=lr_model, filename='./Logistic_Regression.pkl')

['./Logistic_Regression.pkl']