# SIT ICT 3204 Coursework 2 Classification Machine Learning Python Notebook 
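This notebook trains, evaluates, and visualises the classification models (Logistic Regression, K-Nearest Neighbours, Multinomial Naive Bayes, and Random Forest).
<br>
See the companion notebook `datacleaning.ipynb` for the data pre-processing phase.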

In [10]:
!pip install explainerdashboard #Install explainerdashboard libary for visualisation

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from sklearn.neighbors import KNeighborsClassifier #Dependency for KNN algo
from sklearn.linear_model import LogisticRegression #Dependency for logistic regression algo
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score #Accuracy score to evaluate models
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn import set_config

from sklearn.model_selection import train_test_split #Dependency to split dataset into training and testing
import pandas as pd #Dependency to for Pandas dataframe
import numpy as np 

import sys, glob, os.path

from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from explainerdashboard.custom import ClassifierModelStatsComposite

import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from joblib import dump, load
set_config(display="diagram")

In [12]:
# Mount Google Drive (Colab only) so the project files under /content/drive
# can be located and read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# User Defined Functions

In [13]:
def getFilePath(fileName='Coursework 2/Deliverables/datacleaning.ipynb', searchRoot='/content/drive'):
  '''Return the directory that contains the project's anchor notebook.

  The mounted drive is searched recursively for ``fileName``; the directory of
  the match is used as the project root by the rest of this notebook.

  Args:
    fileName: Path (relative to any folder under ``searchRoot``) of the file
      used to locate the project. Defaults to the data-cleaning notebook that
      ships alongside this one.
    searchRoot: Directory to search recursively. Defaults to the Colab
      Google Drive mount point.

  Returns:
    The directory name of the matching path. If several paths match, the
    longest one is used (the first such path in glob order on ties).

  Raises:
    FileNotFoundError: If nothing under ``searchRoot`` matches ``fileName``.
  '''
  # '**' with recursive=True matches zero or more intermediate directories.
  pattern = os.path.join(searchRoot, '**', fileName)
  matches = glob.glob(pattern, recursive=True)

  if not matches:
    # Fail with an actionable message instead of the bare
    # "max() arg is an empty sequence" the empty list would otherwise cause.
    raise FileNotFoundError("Could not find '%s' under '%s'" % (fileName, searchRoot))

  # max() returns the first element of maximal length, i.e. the same pick as
  # filtering for the longest paths and taking index 0.
  fullPath = max(matches, key=len)
  print(fullPath)
  return os.path.dirname(fullPath)

**Load the pre-processed dataset CSV file as a Pandas DataFrame**

---



In [14]:
def ReadDataset(fileName):
  '''Load the combined, pre-processed dataset CSV into a Pandas DataFrame.

  Args:
    fileName: Path of the CSV file to read.

  Returns:
    The loaded DataFrame, or None when the file is missing or cannot be
    read. In that case the reason is printed rather than raised, so callers
    must check for None.
  '''
  try:
    print("Reading dataset")
    dataset = pd.read_csv(fileName)
  except FileNotFoundError:
    print("File %s not found" % fileName)
    return None
  except Exception as err:
    # Any other read problem (empty file, malformed CSV, ...) is reported only.
    print(err)
    return None
  else:
    print("Reading dataset complete")
    return dataset

**Splitting the dataset**

In [15]:
def SplitDataset(df, test_size=0.20, random_state=1, target="data_exfil"):
    """Split a Pandas DataFrame into random train and test subsets.

    Every column except ``target`` is treated as a feature.

    Args:
        df: DataFrame holding the features and the ``target`` column.
        test_size: Fraction of rows placed in the test set. Splits considered
            for this coursework: 0.20 (80/20, the default), 0.33 (67/33)
            and 0.50 (50/50).
        random_state: Seed for the shuffle, so the split is reproducible.
        target: Name of the label column.

    Returns:
        A pair ``(train_set, test_set)`` where ``train_set`` is
        ``[X_train, y_train]`` and ``test_set`` is ``[X_test, y_test]``.
    """
    print("Splitting the dataset into training and test set")
    # Separate the features from the label column.
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    train_set = [X_train, y_train]
    test_set = [X_test, y_test]
    return train_set, test_set

# Execute main function to perform the training and evaluation



In [16]:
def _encodeCategoricals(df, cat_cols, cols_to_move):
  '''One-hot encode ``cat_cols`` and move ``cols_to_move`` to the end of the frame.'''
  # NOTE(review): with no ``columns=`` argument pd.get_dummies only encodes
  # object/string/category columns; any numeric column listed in cat_cols is
  # passed through unchanged and then removed by the drop below. Confirm the
  # dtypes in DataExfil.csv (or pass columns=cat_cols) if that is not intended.
  ohe_df = pd.get_dummies(df[cat_cols])
  encoded = pd.concat([df, ohe_df], axis=1).drop(cat_cols, axis=1)
  ordered = encoded.columns.difference(cols_to_move).to_list() + cols_to_move
  return encoded[ordered]


def _trainAndEvaluate(classifier, X_train, y_train, X_test, y_test):
  '''Fit ``classifier`` in its own Pipeline, print its test metrics and return the fitted Pipeline.'''
  # A fresh Pipeline per classifier: sharing one Pipeline and swapping the
  # step with set_params() leaves every stored reference pointing at whichever
  # model was trained last.
  model = Pipeline(steps=[("clf", classifier)])
  model.fit(X_train, y_train)
  predict = model.predict(X_test)

  print('---------------------------------')
  print(str(classifier))
  print('-----------------------------------')
  # Accuracy from the predictions already made (same value as model.score,
  # without predicting a second time).
  print(f'{accuracy_score(y_test, predict):.3f}')
  print("F1-Score:", f1_score(y_test, predict, average='macro'))
  print("Precision Score:", precision_score(y_test, predict, average='macro'))
  print("Recall Score:", recall_score(y_test, predict, average='macro'))
  return model


def _launchDashboard(model, X_test, y_test, cat_cols, title, port):
  '''Build a ClassifierExplainer for ``model`` and serve its inline dashboard on ``port``.'''
  explainer = ClassifierExplainer(model, X_test, y_test, cats=cat_cols, shap='kernel', target='data_exfil')
  ExplainerDashboard(explainer, ClassifierModelStatsComposite, mode='inline', title=title).run(port=port)


def main():
  '''Train, evaluate and visualise the classification models.

  Steps:
    1. Locate the project folder and load the pre-processed dataset.
    2. One-hot encode the categorical columns, drop duplicate rows and save
       the encoded dataset as ``DataExfil_Encoded.csv``.
    3. Split into train/test sets via SplitDataset (80/20, fixed seed).
    4. Fit each classifier and print accuracy plus macro F1/precision/recall.
    5. Launch one inline explainer dashboard per model (ports 8050-8053).

  Returns None. Stops early if the dataset could not be loaded.
  '''
  print("Training and creating ML Classification models")
  ProjectPath = getFilePath()
  df = ReadDataset(ProjectPath + "/datasets/processed/DataExfil.csv")
  if df is None:
    # ReadDataset has already printed the reason; nothing to train on.
    return

  cat_cols = ['host_name', 'src_ip-1', 'src_ip-2', 'src_ip-3', 'src_ip-4',
              'fn', 'ext', 'dest_port', 'type', 'network_transport',
              'src_mac']
  # Columns kept at the end of the frame: the numeric feature and the label.
  cols_to_move = ['bytes_in', 'data_exfil']

  # No scaling is applied to bytes_in. If scaling becomes necessary, use
  # MinMaxScaler rather than StandardScaler: MultinomialNB cannot handle
  # negative values.
  df = _encodeCategoricals(df, cat_cols, cols_to_move)

  # Drop fully duplicated rows so identical rows cannot end up in both the
  # train and the test set (needed for the tree-based algorithm).
  df = df.drop_duplicates()

  outputFullFilePath = ProjectPath + '/datasets/processed/DataExfil_Encoded.csv'
  df.to_csv(outputFullFilePath, index=False, header=True) #Output to a CSV

  # Getting Train and Test Set
  (X_train, y_train), (X_test, y_test) = SplitDataset(df)

  # Keyed by dashboard title so each model is looked up by name, not by
  # position in a list.
  classifiers = {
    'Logistic Regression': LogisticRegression(solver='lbfgs', max_iter=400),
    'K-Nearest Neighbour': KNeighborsClassifier(n_neighbors=3),
    'Multinomial Naive Bayes': MultinomialNB(),
    # Seeded so repeated runs give the same forest and the same scores.
    'Random Forest': RandomForestClassifier(n_jobs=-1, oob_score=False, random_state=1),
  }

  trained_models = {}
  for title, classifier in classifiers.items():
    trained_models[title] = _trainAndEvaluate(classifier, X_train, y_train, X_test, y_test)

  # One dashboard per model, each on its own port.
  dashboards = [
    ('K-Nearest Neighbour', 8050),
    ('Multinomial Naive Bayes', 8051),
    ('Random Forest', 8052),
    ('Logistic Regression', 8053),
  ]

  print("\nVisualising the models")
  for index, (title, port) in enumerate(dashboards):
    print(("\n" if index else "") + "Dashboard for " + title)
    _launchDashboard(trained_models[title], X_test, y_test, cat_cols, title, port)

# Run the full workflow defined in main(): train the models, print their
# metrics and launch the explainer dashboards.
main()

Training and creating ML Classification models
/content/drive/MyDrive/ICT3204-SA/Coursework 2/Deliverables/datacleaning.ipynb
Reading dataset
Reading dataset complete
---------------------------------
LogisticRegression(max_iter=400)
-----------------------------------
0.678
F1-Score: 0.6778119601030179
Precision Score: 0.7384106868553872
Recall Score: 0.7532931165122108
---------------------------------
KNeighborsClassifier(n_neighbors=3)
-----------------------------------
0.970
F1-Score: 0.9656023222060958
Precision Score: 0.9638127457783947
Recall Score: 0.9674622603019176
---------------------------------
MultinomialNB()
-----------------------------------
0.891
F1-Score: 0.8633998439774491
Precision Score: 0.9259142730233936
Recall Score: 0.8353296030774611
---------------------------------
RandomForestClassifier(n_jobs=-1)
-----------------------------------
0.997
F1-Score: 0.9971278784838107
Precision Score: 0.9961538461538462
Recall Score: 0.9981203007518797

Visualising the m

<IPython.core.display.Javascript object>


Dashboard for Multinomial Naive Bayes
Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')
Building ExplainerDashboard..
Generating layout...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8051))


<IPython.core.display.Javascript object>


Dashboard for Random Forest
Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')
Building ExplainerDashboard..
Generating layout...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8052))


<IPython.core.display.Javascript object>


Dashboard for Logistic Regression
Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')
Building ExplainerDashboard..
Generating layout...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating pred_percentiles...
Calculating predictions...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard inline (terminate it with ExplainerDashboard.terminate(8053))


<IPython.core.display.Javascript object>