In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'nfuqnids:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2768123%2F4782263%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240514%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240514T200645Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8dd601c0bee47437f104ed01d360f3e40966d73e0fcef330a865fe01a3009355004080d9c8143c45470b71c2a14b703f6033f461201b84061498169be11796e420b17ddbe6f77919fddb9fc4285de8af8cbca7c036616c31720ebedbb9fa1faa6cf967d2bd8258de7a57681cf0b14fa76d3357be94f499bd1ad5b1839e6943e1d0ba8fe82674085de410f38e9561b06ba1159267f1b9f4494a9968739c7f0fb3cb86383e2330402361e7264061e79ed9a9b122a31e96dfe6fa30e92baaff9feedd589059df86ff65bd9e80b7cfa81e775a915446d3a783d7d2773deb018c4fb7fd5872b93fda43fd015a95c8d74a25f75b057ad173176b0b865d79cd6a7022b8'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading nfuqnids, 113750249 bytes compressed
Downloaded and uncompressed: nfuqnids
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfuqnids/NF-UQ-NIDS.csv


In [None]:
from fastai.tabular.all import df_shrink

In [None]:
df = pd.read_csv('/kaggle/input/nfuqnids/NF-UQ-NIDS.csv', sep=',', encoding='utf-8')
df.shape

(11994893, 15)

In [None]:
df.dtypes

IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                         object
Dataset                        object
dtype: object

In [None]:
df.Label.value_counts()

Label
0    9208048
1    2786845
Name: count, dtype: int64

In [None]:
df.Attack.value_counts()

Attack
Benign            9208048
DDoS               763285
Reconnaissance     482946
injection          468575
DoS                348962
Brute Force        291955
password           156299
xss                 99944
Infilteration       62072
Exploits            24736
scanning            21467
Fuzzers             19463
Backdoor            19029
Bot                 15683
Generic              5570
Analysis             1995
Theft                1909
Shellcode            1365
mitm                 1295
Worms                 153
ransomware            142
Name: count, dtype: int64

In [None]:
df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'])

In [None]:
df = df_shrink(df, obj2cat=False, int2uint=False)

In [None]:
df.dtypes

L4_SRC_PORT                     int32
L4_DST_PORT                     int32
PROTOCOL                        int16
L7_PROTO                      float32
IN_BYTES                        int32
OUT_BYTES                       int32
IN_PKTS                         int32
OUT_PKTS                        int32
TCP_FLAGS                       int16
FLOW_DURATION_MILLISECONDS      int32
Label                            int8
Attack                         object
Dataset                        object
dtype: object

In [None]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
df.dropna(inplace=True)

0 rows with at least one NaN to remove


In [None]:
print(df.duplicated().sum(), "fully duplicate rows to remove")
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True, drop=True)

2838568 fully duplicate rows to remove


In [None]:
df.to_csv(f"NF-UQ-NIDS pre.csv")

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Import necessary libraries
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/precleaning.csv'
# Replace with your dataset path
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)
df.head()


Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,0,62073,56082,6,0.0,9672,416,11,8,25,15,0,Benign
1,1,32284,1526,6,0.0,1776,104,6,2,25,0,0,Benign
2,2,21,21971,6,1.0,1842,1236,26,22,25,1111,0,Benign
3,3,23800,46893,6,0.0,528,8824,10,12,27,124,0,Benign
4,4,63062,21,6,1.0,1786,2340,32,34,25,1459,0,Benign


In [None]:
df.tail()

Unnamed: 0.1,Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
549994,549994,56173,60992,6,11.0,4376,3080,28,30,27,376,0,Benign
549995,549995,35440,445,6,10.16,2664,3080,22,20,19,1188,1,Exploits
549996,549996,49896,4422,6,36.0,2230,15236,34,36,27,8,0,Benign
549997,549997,43617,3370,6,36.0,2542,21294,40,42,27,11,0,Benign
549998,549998,6146,26310,6,0.0,424,8824,8,12,27,223,0,Benign


In [None]:
df["Attack"].value_counts()

Attack
Benign            514205
Exploits           13245
Fuzzers             9760
Reconnaissance      6233
Generic             2468
DoS                 2205
Shellcode            769
Backdoor             520
Analysis             507
Worms                 87
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Assuming your dataset is stored in a DataFrame called 'data'
X = df.drop(columns=['Label', 'Attack'])  # Features
y = df['Label']  # Target variable

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (439999, 11) (439999,)
Testing set shape: (110000, 11) (110000,)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Define a list of classifiers
classifiers = [

    KNeighborsClassifier(),
    GaussianNB(),
    MLPClassifier(),
    XGBClassifier()

]

# Iterate over each classifier
for clf in classifiers:
    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy and F1 score using cross-validation
    accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()

    # Print results
    print("Classifier:", clf.__class__.__name__)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print()

Classifier: KNeighborsClassifier
Accuracy: 0.9494317029429051
F1 Score: 0.4410292533403265

Classifier: GaussianNB
Accuracy: 0.9172293549565129
F1 Score: 0.1521950880724074





Classifier: MLPClassifier
Accuracy: 0.940045307513825
F1 Score: 0.1251368753089051

Classifier: XGBClassifier
Accuracy: 0.9895090672001438
F1 Score: 0.9179393307597697



In [None]:
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Define a list of classifiers
classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

# Iterate over each classifier
for clf in classifiers:
    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy and F1 score using cross-validation
    accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()

    # Print results
    print("Classifier:", clf.__class__.__name__)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print()

    # Save the trained Random Forest Classifier as a pickle file
    if isinstance(clf, RandomForestClassifier):
        with open('random_forest_classifier.pkl', 'wb') as f:
            pickle.dump(clf, f)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier: LogisticRegression
Accuracy: 0.936661219084101
F1 Score: 0.07252840780834552

Classifier: DecisionTreeClassifier
Accuracy: 0.991079525258035
F1 Score: 0.9311433029101988

Classifier: RandomForestClassifier
Accuracy: 0.9911408891526661
F1 Score: 0.9304045493317183

Classifier: GradientBoostingClassifier
Accuracy: 0.9781726769880649
F1 Score: 0.8230817931776734



In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier

# Assuming you have already trained your RandomForestClassifier and stored it in a variable called 'rf_classifier'

# Save the model to a file
with open('random_forest_classifier0032.pkl', 'wb') as f:
    pickle.dump(clf, f)
