# **Import Necessary Libraries**
Commonly used libraries are imported here. Model-specific libraries may instead be imported in the cell where they are first used.

In [1]:
import numpy as np
import pandas as pd
import os, re, time, math, tqdm, itertools
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# !pip install interpret
# from interpret.blackbox import LimeTabular
# from interpret import show

import lime
import lime.lime_tabular
from lime.lime_tabular import LimeTabularExplainer
import graphviz
import shap

import pickle


AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'

# **Load Dataset**

In [None]:
# for dirname, _, filenames in os.walk('/kaggle/input/ids-intrusion-csv'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# **Read Dataset**

In [None]:
%%time
# Read the CSV into a DataFrame; low_memory=False disables pandas' chunked
# parsing so column dtypes are inferred from the whole file.
network_data_d1 = pd.read_csv(filepath_or_buffer="02-14-2018.csv", low_memory=False)

# **EDA**

## **Data Properties**

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = network_data_d1.shape
print ('Number of rows (Samples): ' , n_rows)
print ('Number of columns (Features): ' , n_cols)

In [None]:
network_data_d1

In [None]:
network_data_d1.info()

In [None]:
network_data_d1['Label'].value_counts()

## **Data Visualizations**

### **Bar Chart**

In [None]:
# bar chart of packets label
# value_counts() orders classes by frequency, so take the bar names from the
# same result as the bar heights; a hard-coded name list could pair a class
# name with another class's count.
label_counts = network_data_d1['Label'].value_counts()
plt.figure(figsize=(5, 5))
plt.title('Packet Distribution')
plt.bar(x=label_counts.index.astype(str), height=label_counts.values, color=['blue', 'magenta', 'cyan'])
p = plt.gcf()

### **Pie Chart**

In [None]:
# pie chart of packets label
# Wedge names come from the same value_counts() result as the wedge sizes, so
# each class name is always attached to its own count.
label_counts = network_data_d1['Label'].value_counts()
plt.figure(figsize=(5, 5))
# white disc drawn over the centre of the pie to give a ring shape
circle = plt.Circle((0, 0), 0.7, color='white')
plt.title('Packet Distribution')
plt.pie(label_counts.values, labels=label_counts.index.astype(str), colors=['blue', 'magenta', 'cyan'])
p = plt.gcf()
p.gca().add_artist(circle)

## **Data Preprocessing**

### **Temporarily Hold Data**
This makes it easier to experiment while manipulating the data. If an unexpected operation is applied to the data, re-running from this cell is enough; there is no need to re-run the full notebook.

In [None]:
# while working comment one line and uncomment another as needed
# Take a real copy: a plain assignment only creates a second name for the same
# DataFrame, so any in-place operation on one would also change the other.
temp_network_data_d1 = network_data_d1.copy()
#network_data_d1 = temp_network_data_d1.copy()

### **Drop Infinite and Null**

In [None]:
print (network_data_d1.shape)

# Turn textual infinities into numeric ones, then every infinity into NaN,
# so that a single dropna() removes all unusable rows.
network_data_d1 = (
    network_data_d1
    .replace(["Infinity", "infinity"], np.inf)
    .replace([np.inf, -np.inf], np.nan)
)
network_data_d1.dropna(inplace=True)

print (network_data_d1.shape)

### **Drop Unnecessary Column**

In [None]:
# Remove the Timestamp column from the frame in place.
network_data_d1.drop("Timestamp", axis=1, inplace=True)
print (network_data_d1.shape)

### **Transform Target Label into Binary Class**

In [None]:
%%time
# Collapse the target to two classes: keep 'Benign' as is and relabel every
# other value as "Malicious".
network_data_d1['Label'] = network_data_d1['Label'].where(
    network_data_d1['Label'].eq('Benign'), "Malicious"
)
print(network_data_d1['Label'].unique())

In [None]:
network_data_d1['Label'].value_counts()

### **Data Balancing**

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
y = network_data_d1["Label"]
X = network_data_d1.drop(columns=["Label"])

#### **Oversampling**

In [None]:
%%time
# applying oversampling
# Fix the seed so the same rows are duplicated on every run (42 matches the
# random_state used by the splitters in this notebook).
# NOTE(review): resampling is done before any train/test or K-fold split, so
# copies of one original row can land on both sides of a later split and
# inflate the scores — confirm whether that is intended.
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

#### **SMOTE**

In [None]:
%%time
# # applying SMOTE
# smote = SMOTE(random_state=42)
# X_balanced, y_balanced = smote.fit_resample(X, y)

In [None]:
# Re-attach the target to the resampled features and show the class counts.
network_data_d1 = pd.concat(objs=[X_balanced, y_balanced], axis="columns")
print (network_data_d1.shape)
print(network_data_d1['Label'].value_counts())

### **Data Normalization**

In [None]:
# Collect the names of all numeric columns.
numeric_cols = list(network_data_d1.select_dtypes(include="number").columns)
print(numeric_cols)

#### **Z Score**

In [None]:
%%time
# apply z-score normalization
# std = StandardScaler()
# network_data_d1[numeric_cols] = std.fit_transform(network_data_d1[numeric_cols])

#### **Min Max**

In [None]:
%%time
# apply min-max normalization
# mnmx = MinMaxScaler()
# network_data_d1[numeric_cols] = mnmx.fit_transform(network_data_d1[numeric_cols])

#### **Robust**

In [None]:
%%time
# apply robust normalization
# Fit the scaler on the numeric columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the full frame, before any split — confirm intended.
rbst = RobustScaler().fit(network_data_d1[numeric_cols])
network_data_d1[numeric_cols] = rbst.transform(network_data_d1[numeric_cols])

In [None]:
network_data_d1

## **Feature Selection**

### **Drop Constant Column**

In [None]:
# Drop the constant columns, i.e. numeric columns whose variance is exactly 0.
variances = network_data_d1.var(numeric_only=True)
constant_columns = variances.loc[variances.eq(0)].index
network_data_d1 = network_data_d1.drop(columns=constant_columns)

print (network_data_d1.shape)

### **Check and Drop Duplicate Column**

In [None]:
# Flag every column whose content is identical to that of an earlier column.
column_names = list(network_data_d1.columns)
duplicates = set()
for i, col1 in enumerate(column_names):
    if col1 in duplicates:
        # Already known to equal an earlier column; everything equal to it was
        # flagged when that earlier column was processed, so skip the
        # expensive full-column comparisons.
        continue
    for col2 in column_names[i + 1:]:
        if col2 not in duplicates and network_data_d1[col1].equals(network_data_d1[col2]):
            duplicates.add(col2)

print (duplicates)
# Pass an ordered list rather than the set itself, so the labels handed to
# drop() have a deterministic order.
network_data_d1.drop(columns=sorted(duplicates), inplace=True)
print (network_data_d1.shape)

### **Encode Target Label**

In [None]:
# Encode the target as integers: 0 for 'Benign', 1 for any other value.
network_data_d1['Label'] = network_data_d1['Label'].ne('Benign').astype('int64')
print(network_data_d1['Label'].unique())

### **Drop Column Based on Correlations**

#### **Correlations of Data**

In [None]:
# Pearson correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr = network_data_d1.corr(numeric_only=True)
plt.figure(figsize=(70, 70))
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='RdBu', annot=True, square=True)
plt.show()

#### **Drop Columns**

In [None]:
# Pick one column to discard out of every pair of highly correlated features.
threshold = 0.85

# Compare features with features only: leaving the target in the matrix would
# discard a feature merely for being strongly related to 'Label'. The absolute
# value is used so strong negative correlations count as well.
feature_corr = corr.drop(index="Label", columns="Label", errors="ignore").abs()
feature_names = list(feature_corr.columns)

correlated_col = set()
for i, later_col in enumerate(feature_names):
    if later_col in correlated_col:
        # A column already marked for removal must not cause further removals.
        continue
    for j in range(i):
        earlier_col = feature_names[j]
        # Of a correlated pair, the earlier column is the one marked for removal.
        if earlier_col not in correlated_col and feature_corr.iloc[i, j] >= threshold:
            correlated_col.add(earlier_col)

print(correlated_col)
print(len(correlated_col))

In [None]:
# Remove, in place, the columns collected in correlated_col.
network_data_d1.drop(columns=list(correlated_col), inplace=True)
print (network_data_d1.shape)

#### **Correlations of Data**

In [None]:
# Recompute the Pearson correlation matrix on the current frame and plot it.
corr = network_data_d1.corr(numeric_only=True)
plt.figure(figsize=(70, 70))
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='RdBu', annot=True, square=True)
plt.show()

### **Recursive Feature Elimination (RFE) with Cross-Validation**

In [None]:
# # split data into features and target
# X=network_data_d1.drop(["Label"], axis=1)
# y=network_data_d1["Label"]

In [None]:
# %%time
# # applying RFE with CV
# dt = DecisionTreeClassifier()
# cv = StratifiedKFold(5)
# min_features = 1

# rfecv = RFECV(estimator=dt, step=1, cv=cv, scoring="accuracy", min_features_to_select=min_features, n_jobs=2)
# rfecv.fit(X, y)

In [None]:
# num_selected_features = rfecv.n_features_
# selected_features = X.columns[rfecv.support_]
# print(num_selected_features)
# print(selected_features)

In [None]:
# network_data_d1 = network_data_d1.loc[:, selected_features]
# network_data_d1

In [None]:
# # presenting number of feature vs accuracy
# num_scores = len(rfecv.cv_results_["mean_test_score"])
# plt.figure()
# plt.xlabel("Number of Selected Features")
# plt.ylabel("Mean Test Accuracy")
# plt.errorbar(
#     range(min_features, num_scores + min_features),
#     rfecv.cv_results_["mean_test_score"],
#     yerr=rfecv.cv_results_["std_test_score"],
# )
# plt.title("Recursive Feature Elimination")
# plt.show()

### **Sequential Feature Selection**

In [None]:
# # split data into features and target
# X=network_data_d1.drop(["Label"], axis=1)
# y=network_data_d1["Label"]

#### **Forward Feature Selection**

In [None]:
# %%time
# # applying forward feature selection
# dt = DecisionTreeClassifier()
# sfs_forward = SequentialFeatureSelector(
#     dt, n_features_to_select=35, direction="forward"
# ).fit(X, y)

In [None]:
# selected_features = X.columns[sfs_forward.get_support()]
# print(selected_features)

#### **Backward Feature Selection**

In [None]:
# %%time
# # applying backward feature selection
# dt = DecisionTreeClassifier()
# sfs_backward = SequentialFeatureSelector(
#     dt, n_features_to_select=35, direction="backward"
# ).fit(X, y)

In [None]:
# selected_features = X.columns[sfs_backward.get_support()]
# print(selected_features)

In [None]:
# network_data_d1 = network_data_d1.loc[:, selected_features] # add target label
# network_data_d1

# **Classification**
Here, several algorithms will be used to classify the data.

### **Split Data**

In [None]:
# Features (X) and target (y) used by the cross-validated models.
y = network_data_d1["Label"]
X = network_data_d1.drop(columns=["Label"])

# split the data for evaluation
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state =42, shuffle=True)

# 10-fold cross-validation, shuffled with a fixed seed
kf = KFold(n_splits=10, random_state=42, shuffle=True)

### **Confusion Matrix Heatmap**

In [None]:
def confusionMatrixHeatMap(cm, title, categories=None):
    """Draw an annotated heatmap of a confusion matrix.

    Every cell shows the raw count and its share of all samples. The overall
    accuracy and misclassification rate are written below the plot. Both are
    derived from ``cm`` itself, so the figure always describes the matrix it
    was given rather than whatever ``y`` / ``y_pred`` happen to be defined
    globally when the function is called.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    title : str
        Text placed above the heatmap.
    categories : sequence of str, optional
        Tick labels for both axes. Defaults to ``['Benign', 'Malicious']``.

    Raises
    ------
    ValueError
        If ``cm`` contains no samples, since no rate can be computed.
    """
    cm = np.asarray(cm)
    total_samples = cm.sum()
    if total_samples == 0:
        raise ValueError("confusion matrix contains no samples")
    if categories is None:
        categories = ['Benign', 'Malicious']

    # box labels: count on the first line, share of all samples on the second
    box_labels = np.asarray(
        [f"{count:0.0f}\n{share:.2%}" for count, share in zip(cm.flatten(), cm.flatten() / total_samples)]
    ).reshape(cm.shape[0], cm.shape[1])

    # create a heatmap of the confusion matrix
    sns.heatmap(cm, annot=box_labels, fmt='', cmap='Blues', cbar=False, xticklabels=categories, yticklabels=categories)

    # create and add rectangle patch (black frame around the whole matrix)
    ax = plt.gca()
    rect = patches.Rectangle((0, 0), cm.shape[1], cm.shape[0], linewidth=2, edgecolor='black', facecolor='none')
    ax.add_patch(rect)

    # set labels, title, and axis ticks
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    title_font = { 'fontsize': 16, 'fontname': 'Times New Roman' }
    plt.title(title + '\n', **title_font)

    # calculate accuracy and misclassification rate from the matrix:
    # the diagonal holds the correctly classified samples
    correct_predictions = np.trace(cm)
    incorrect_predictions = total_samples - correct_predictions
    accuracy = correct_predictions / total_samples
    misclassification_rate = incorrect_predictions / total_samples

    # add accuracy and misclassification rate to the heatmap
    plt.text(0.5, -0.1, f'\n\n\nAccuracy: {accuracy:.2f}', ha='center', va='center', transform=plt.gca().transAxes)
    plt.text(0.5, -0.2, f'\nMisclassification Rate: {misclassification_rate:.4f}', ha='center', va='center', transform=plt.gca().transAxes)

    # Show the plot
    plt.show()

## **LazyClassifier**

In [None]:
# LazyClassifier
# clf = LazyClassifier(verbose = 0,
#                      ignore_warnings = True,
#                      custom_metric = None,
#                      predictions = False,
#                      random_state = 12,
#                      classifiers = 'all')

# models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# models

## **Decision Tree**

In [None]:
%%time
# create a Decision Tree model
# A fixed seed makes the fitted trees, and therefore the reported metrics,
# repeatable between runs.
dt = DecisionTreeClassifier(random_state=42)

# predict: each sample is predicted by a model fitted on the other folds
y_pred = cross_val_predict(dt, X, y, cv=kf)


print ("Decision Tree")
# generate report
cm=confusion_matrix(y, y_pred)
cr=classification_report(y, y_pred)
# NOTE: this AUC is computed from hard class predictions, not from
# predicted probabilities.
auc = roc_auc_score(y, y_pred)

print("Confusion Matrix:")
print(cm)

print("Performance Matrix:")
print(cr)

print("AUC:", auc)

confusionMatrixHeatMap(cm, title="Decision Tree")

## **Extra Tree**

In [None]:
# %%time
# # create a Extra Trees model
# et = ExtraTreeClassifier()

# # predict
# y_pred = cross_val_predict(et, X, y, cv=kf)

# print ("Extra Tree")
# # generate report
# cm=confusion_matrix(y, y_pred)
# cr=classification_report(y, y_pred)

# print("Confusion Matrix:")
# print(cm)

# print("Performance Matrix:")
# print(cr)

## **Random Forest**

In [None]:
# %%time
# # create a Random Forest model
# rf = RandomForestClassifier()

# # predict
# y_pred = cross_val_predict(rf, X, y, cv=kf)

# print ("Random Forest")
# # generate report
# cm=confusion_matrix(y, y_pred)
# cr=classification_report(y, y_pred)

# print("Confusion Matrix:")
# print(cm)

# print("Performance Matrix:")
# print(cr)

## **Extra Trees**

In [None]:
# %%time
# # create a Extra Trees model
# ett = ExtraTreesClassifier()

# # predict
# y_pred = cross_val_predict(ett, X, y, cv=kf)

# print ("Extra Trees")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **Light Gradient Boosting Machine (LightGBM)**

In [None]:
# %%time
# # create a LightGBM model
# lgb = LGBMClassifier()

# # predict
# y_pred = cross_val_predict(lgb, X, y, cv=kf)

# print ("LightGBM")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **eXtreme Gradient Boosting (XGBoost)**

In [None]:
# %%time
# # create a XGBoost model
# xgb = XGBClassifier()

# # predict
# y_pred = cross_val_predict(xgb, X, y, cv=kf)

# print ("XGBoost")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **Cat Boost**

In [None]:
# %%time
# # create a Cat Boost model
# cb = CatBoostClassifier()

# # predict
# y_pred = cross_val_predict(cb, X, y, cv=kf)

# print ("Cat Boost")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **Ada Boost**

In [None]:
# %%time
# # create a Ada Boost model
# ada = AdaBoostClassifier()

# # predict
# y_pred = cross_val_predict(ada, X, y, cv=kf)

# print ("Ada Boost")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **K-Nearest Neighbors (KNN)**

In [None]:
# %%time
# # create a KNN model
# knn = KNeighborsClassifier()

# # predict
# y_pred = cross_val_predict(knn, X, y, cv=kf)

# print ("KNN")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **Gaussian Naive Bayes**

In [None]:
# %%time
# # create a Gaussian Naive Bayes model
# gnb = GaussianNB()

# # predict
# y_pred = cross_val_predict(gnb, X, y, cv=kf)

# print ("Gaussian Naive Bayes")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

## **Support Vector Classifier (SVC)**

In [None]:
# %%time
# # create a SVC model
# svc = SVC()

# # predict
# y_pred = cross_val_predict(svc, X, y, cv=kf)

# print ("Support Vector Classifier")
# # generate report
# print("Confusion Matrix:")
# print(confusion_matrix(y, y_pred))

# print("Performance Matrix:")
# print(classification_report(y, y_pred))

# **Explainable Artificial Intelligence (XAI)**

### **Split Data**

In [None]:
# Features (X) and target (y) for the explainability experiments.
y = network_data_d1["Label"]
X = network_data_d1.drop(columns=["Label"])

# Hold out 10% of the rows for evaluation, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.1, shuffle=True, random_state=42
)

# K-fold
# kf = KFold(n_splits=10, shuffle=True, random_state=42)

## **Classifier Model**

### **Decision Tree**

In [None]:
# Train a decision tree on the training split and report its scores on the
# held-out split. The seed is fixed (matching the seeded split) so the fitted
# tree is the same on every run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Serialize the fitted tree (dtc) to model.pkl
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(dtc, model_file)

# # Load the model
# with open('model.pkl', 'rb') as file:
#     loaded_model = pickle.load(file)

## **LIME**

In [None]:
# Explain the tree's prediction for the first held-out row with LIME.
test_sample = X_test.iloc[0]
explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=list(X.columns),
    mode='classification',
    verbose=True,
)
exp = explainer.explain_instance(data_row=test_sample.values, predict_fn=dtc.predict_proba)
exp.show_in_notebook()

In [None]:
# lime = LimeTabular(data=X_train, random_state=1, model = dtc)
# lime_local = lime.explain_local(X_test[-5:], y_test[-5:], name='LIME')

# show(lime_local)

## **GSM (IDT)**

In [None]:
# Fit a second tree on X_test using y_pred as the target instead of the true
# labels y_test (y_pred is expected to hold dtc's predictions for X_test from
# the Decision Tree cell above).
# NOTE(review): presumably this is the induced/surrogate tree named in the
# section heading — confirm.
# The seed is fixed so the rendered tree is the same on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_test, y_pred)

In [None]:
# Export the fitted tree as DOT source, render it to a file and display it.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=X.columns,
    class_names=["0", "1"],
    filled=True,
)
graph = graphviz.Source(source=dot_data)
graph.render(filename="induced_dt_tree_view")
graph

## **SHAP**

In [None]:
# Compute SHAP values of the tree for the held-out rows and draw the
# bar-type summary plot.
explainer = shap.Explainer(dtc)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test, plot_type='bar')
plt.show()
