In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./cic-pdfmalware2022-part-2-simple-models.ipynb
./PDFMalware2022.csv
./PDFMalware2022.parquet
./pdf_detection.ipynb
./pdf_malware.ipynb
./.ipynb_checkpoints\cic-pdfmalware2022-part-2-simple-models-checkpoint.ipynb
./.ipynb_checkpoints\pdf_detection-checkpoint.ipynb
./.ipynb_checkpoints\pdf_malware-checkpoint.ipynb


In [2]:
df = pd.read_parquet('./PDFMalware2022.parquet')


In [3]:
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold, train_test_split
from scipy.stats import uniform, randint


In [4]:
# install xgboost if not available
try: import xgboost
except ModuleNotFoundError:
    !pip install -Uq xgboost
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping

In [5]:
df.columns

Index(['FileName', 'PdfSize', 'MetadataSize', 'Pages', 'XrefLength',
       'TitleCharacters', 'isEncrypted', 'EmbeddedFiles', 'Images', 'Text',
       'Header', 'Obj', 'Endobj', 'Stream', 'Endstream', 'Xref', 'Trailer',
       'StartXref', 'PageNo', 'Encrypt', 'ObjStm', 'JS', 'Javascript', 'AA',
       'OpenAction', 'Acroform', 'JBIG2Decode', 'RichMedia', 'Launch',
       'EmbeddedFile', 'XFA', 'Colors', 'Class'],
      dtype='object')

In [6]:
df['Text'].unique()

['No', 'Yes', 'unclear', '-1', '0']
Categories (5, object): ['-1', '0', 'No', 'Yes', 'unclear']

In [7]:
# Extract version from header column
df['Version'] = df['Header'].str.extract(r'%PDF-(\d+\.\d+)', expand=False)

# Drop original header column
df = df.drop(['Header'], axis=1)

# Fill missing values with mode
df['Version'] = df['Version'].fillna(df['Version'].mode()[0])

# Convert version to float
df['Version'] = df['Version'].astype(float)

In [8]:
dep = 'Class'
cats = df.select_dtypes(include='category').columns
conts = df.columns.difference([dep]+list(cats))
cats  

Index(['FileName', 'Images', 'Text', 'Obj', 'Endobj', 'Endstream', 'Xref',
       'StartXref', 'PageNo', 'JS', 'Javascript', 'AA', 'OpenAction',
       'Acroform', 'JBIG2Decode', 'RichMedia', 'Launch', 'EmbeddedFile', 'XFA',
       'Class'],
      dtype='object')

In [9]:
df['Version'].unique()

array([1.3  , 1.6  , 1.5  , 1.4  , 1.   , 1.7  , 1.1  , 1.2  , 1.544,
       1.344, 0.9  , 2.4  , 1.8  ])

In [10]:
# Creating holders to store the model performance results
list1  = []
list2  = []
list3  = []
list4  = []
list5  = []

#function to call for storing the results
def storeResults(model, a,b,c,d):
  list1.append(model)
  list2.append(round(a, 3))
  list3.append(round(b, 3))
  list4.append(round(c, 3))
  list5.append(round(d, 3))

In [11]:
def xs_y(df_, targ):    
    if not isinstance(targ, list):
        xs = df_[df_.columns.difference([targ])].copy()
    else:
        xs = df_[df_.columns.difference(targ)].copy()
    y = df_[targ].copy()
    return xs, y

In [12]:
trn_df  , val_df  = train_test_split(df  , test_size=0.30)
val_df , test_df  = train_test_split(df,test_size=0.75)
trn_df[cats] = trn_df[cats].apply(lambda x:x.cat.codes)
val_df[cats] = val_df[cats].apply(lambda x:x.cat.codes)
test_df[cats] = test_df[cats].apply(lambda x:x.cat.codes)

In [13]:
X_train, y_train = xs_y(trn_df, dep)
X_val, y_val = xs_y(val_df, dep)
X_test, y_test = xs_y(test_df, dep)

## Part 1 XGBoost once, without parameter optimization on a 70% train, 22.5% test split
The separate *X_val, y_val*, another 7.5% validation set remains unused, but is required later to avoid model contamination.

In [14]:
xgb = XGBClassifier(
    n_estimators= 100,
    use_label_encoder= False,
    max_depth= 8,
    booster= 'gbtree',
    tree_method= 'hist',
    subsample= 0.5,
    colsample_bytree= 0.5,
    importance_type= 'gain',
    objective='binary:logistic',
    eval_metric='logloss',
    predictor= 'cpu_predictor',
    n_jobs= -1)

xgb.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=-1, num_parallel_tree=1, predictor='cpu_predictor',
              random_state=0, ...)

In [15]:
xgb_preds = xgb.predict(X_test)
xgb_preds.shape

roc_xgb  = roc_auc_score(y_true=y_test, y_score=xgb_preds)
acu_xgb  = accuracy_score(y_true=y_test, y_pred=xgb_preds)
pre_xgb  = precision_score(y_true=y_test, y_pred=xgb_preds) 
rec_xgb  = recall_score(y_true=y_test, y_pred=xgb_preds) 

print("The XGBoost model")
print("ROC-AUC",roc_xgb )
print("Accuracy", acu_xgb)
print("Precision",pre_xgb)
print("Recall", rec_xgb)


The XGBoost model
ROC-AUC 0.9970993097994721
Accuracy 0.9972067039106145
Precision 0.9968862275449102
Recall 0.9980815347721822


In [16]:

storeResults('XGboost  ',roc_xgb ,acu_xgb,
             pre_xgb,rec_xgb)

In [17]:
from sklearn import tree

dt_model  =tree.DecisionTreeClassifier(random_state=101, max_depth=23,criterion="entropy")
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=23, random_state=101)

In [18]:
dt_preds = dt_model.predict(X_test)
dt_preds.shape

roc_dt  = roc_auc_score(y_true=y_test, y_score=dt_preds)
acu_dt  = accuracy_score(y_true=y_test, y_pred=dt_preds)
pre_dt  = precision_score(y_true=y_test, y_pred=dt_preds) 
rec_dt  = recall_score(y_true=y_test, y_pred=dt_preds) 

print("Decision tree model")
print("ROC-AUC",roc_dt )
print("Accuracy", acu_dt)
print("Precision",pre_dt)
print("Recall", rec_dt)

Decision tree model
ROC-AUC 0.9952124322047738
Accuracy 0.9952114924181963
Precision 0.9961593855016803
Recall 0.9952038369304557


In [19]:
storeResults('Decision tree',roc_dt ,acu_dt,
             pre_dt,rec_dt)

In [20]:
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(n_estimators=30, random_state=42)
ab.fit(X_train,y_train), 

(AdaBoostClassifier(n_estimators=30, random_state=42),)

In [21]:
ab_preds = ab.predict(X_test)
ab_preds.shape

roc_ab  = roc_auc_score(y_true=y_test, y_score=ab_preds)
acu_ab  = accuracy_score(y_true=y_test, y_pred=ab_preds)
pre_ab  = precision_score(y_true=y_test, y_pred=ab_preds) 
rec_ab  = recall_score(y_true=y_test, y_pred=ab_preds) 

print("Ada boost  model")
print("ROC-AUC",roc_ab )
print("Accuracy", acu_ab)
print("Precision",pre_ab)
print("Recall", rec_ab)

Ada boost  model
ROC-AUC 0.9861485005543953
Accuracy 0.9862995477520617
Precision 0.9877668505636843
Recall 0.9875299760191847


In [22]:
storeResults('Ada boost ',roc_ab ,acu_ab,
             pre_ab,rec_ab)

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
# Fit the model to the training data
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [24]:
knn_preds = knn.predict(X_test)
knn_preds.shape

roc_knn  = roc_auc_score(y_true=y_test, y_score=knn_preds)
acu_knn  = accuracy_score(y_true=y_test, y_pred=knn_preds)
pre_knn  = precision_score(y_true=y_test, y_pred=knn_preds) 
rec_knn  = recall_score(y_true=y_test, y_pred=knn_preds) 

print("KNN model")
print("ROC-AUC",roc_knn )
print("Accuracy", acu_knn)
print("Precision",pre_knn)
print("Recall", rec_knn)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN model
ROC-AUC 0.9130134601995823
Accuracy 0.905559989358872
Precision 0.9824316787506971
Recall 0.8448441247002398


In [25]:
storeResults('knn  ',roc_knn ,acu_knn,
             pre_knn,rec_knn)

In [26]:
from sklearn.linear_model import LogisticRegression

lg_model  = LogisticRegression( random_state=101, max_iter=300)
lg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300, random_state=101)

In [27]:
lg_preds = lg_model.predict(X_test)
lg_preds.shape

roc_lg  = roc_auc_score(y_true=y_test, y_score=lg_preds)
acu_lg  = accuracy_score(y_true=y_test, y_pred=lg_preds)
pre_lg  = precision_score(y_true=y_test, y_pred=lg_preds) 
rec_lg  = recall_score(y_true=y_test, y_pred=lg_preds) 

print("LogisticRegression model ")
print("ROC-AUC",roc_lg )
print("Accuracy", acu_lg)
print("Precision",pre_lg)
print("Recall", rec_lg)

LogisticRegression model 
ROC-AUC 0.8935346346578651
Accuracy 0.897047086991221
Precision 0.8926919518963923
Recall 0.9256594724220624


In [28]:
storeResults('LogisticRegression',roc_lg ,acu_lg,
             pre_lg,rec_lg)

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train, y_train)

In [30]:
gbc_preds = gbc.predict(X_test)
gbc_preds.shape

roc_gbc  = roc_auc_score(y_true=y_test, y_score=gbc_preds)
acu_gbc  = accuracy_score(y_true=y_test, y_pred=gbc_preds)
pre_gbc  = precision_score(y_true=y_test, y_pred=gbc_preds) 
rec_gbc  = recall_score(y_true=y_test, y_pred=gbc_preds) 

print("GradientBoostingClassifier model ")
print("ROC-AUC",roc_gbc )
print("Accuracy", acu_gbc)
print("Precision",pre_gbc)
print("Recall", rec_gbc)

GradientBoostingClassifier model 
ROC-AUC 0.9923411808187858
Accuracy 0.9924181963288109
Precision 0.9932837610937875
Recall 0.9930455635491606


In [31]:
storeResults('GradientBoostingClassifier',roc_gbc ,acu_gbc,
             pre_gbc,rec_gbc)

In [32]:
from sklearn import svm
svc = svm.SVC()
svc.fit(X_train, y_train)

SVC()

In [33]:
svc_preds = svc.predict(X_test)
svc_preds.shape

roc_svc  = roc_auc_score(y_true=y_test, y_score=svc_preds)
acu_svc  = accuracy_score(y_true=y_test, y_pred=svc_preds)
pre_svc  = precision_score(y_true=y_test, y_pred=svc_preds) 
rec_svc  = recall_score(y_true=y_test, y_pred=svc_preds) 

print("Support vector machine  model ")
print("ROC-AUC",roc_svc )
print("Accuracy", acu_svc)
print("Precision",pre_svc)
print("Recall", rec_svc)

Support vector machine  model 
ROC-AUC 0.5688780158668764
Accuracy 0.6142591114658154
Precision 0.5915513264129181
Recall 0.9839328537170264


In [34]:
storeResults('Support vector machine',roc_svc ,acu_svc,
             pre_svc,rec_svc)

In [35]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=20, random_state=100,max_depth=15,criterion='entropy')
RF.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=15, n_estimators=20,
                       random_state=100)

In [36]:
rf_preds = RF.predict(X_test)
rf_preds.shape

roc_rf  = roc_auc_score(y_true=y_test, y_score=rf_preds)
acu_rf  = accuracy_score(y_true=y_test, y_pred=rf_preds)
pre_rf  = precision_score(y_true=y_test, y_pred=rf_preds) 
rec_rf  = recall_score(y_true=y_test, y_pred=rf_preds) 

print("RandomForestClassifier model ")
print("ROC-AUC",roc_rf )
print("Accuracy", acu_rf)
print("Precision",pre_rf)
print("Recall", rec_rf)

RandomForestClassifier model 
ROC-AUC 0.9973075303198301
Accuracy 0.9973397180101091
Precision 0.9976019184652278
Recall 0.9976019184652278


In [37]:
storeResults('Support vector machine',roc_rf ,acu_rf,
             pre_rf,rec_rf)

In [38]:
result = pd.DataFrame({ 'ML Model' : list1,
                        'Accuracy' : list2,
                        'f1_score' : list3,
                        'Recall'   : list4,
                        'Precision': list5,
                      })

In [39]:
#Sorting the datafram on accuracy
sorted_result=result.sort_values(by=['Accuracy', 'f1_score'],ascending=False).reset_index(drop=True)

In [40]:
# dispalying total result
sorted_result

Unnamed: 0,ML Model,Accuracy,f1_score,Recall,Precision
0,XGboost,0.997,0.997,0.997,0.998
1,Support vector machine,0.997,0.997,0.998,0.998
2,Decision tree,0.995,0.995,0.996,0.995
3,GradientBoostingClassifier,0.992,0.992,0.993,0.993
4,Ada boost,0.986,0.986,0.988,0.988
5,knn,0.913,0.906,0.982,0.845
6,LogisticRegression,0.894,0.897,0.893,0.926
7,Support vector machine,0.569,0.614,0.592,0.984
