# Import Required Modules

In [1]:
import os
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Download the Datasets

In [2]:
# Directory where the downloaded dataset files are stored
DATA_DIR = "data"

# Create the directory when it is missing. exist_ok=True makes a separate
# existence check unnecessary and removes the gap between checking and creating.
os.makedirs(DATA_DIR, exist_ok=True)

# Local file name -> download URL for each dataset file
URLS = {
    'KDDTrain+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt',
    'KDDTest+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt',
}

# Download one dataset file into DATA_DIR (skipped when it is already there)
def download_file(url, filename, timeout=30):
    """Download ``url`` to ``DATA_DIR/filename`` unless that file already exists.

    The response body is streamed into a temporary ``<filename>.part`` file
    and renamed to its final name only after everything has been written, so
    an interrupted download cannot leave a truncated file that a later run
    would report as "already exists".

    Args:
        url: Address to fetch.
        filename: Name of the file to create inside ``DATA_DIR``.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so
            an unresponsive server cannot block the call forever.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised while downloading are caught and not re-raised.
    """
    local_path = os.path.join(DATA_DIR, filename)

    if os.path.exists(local_path):
        print(f"Correct file found: {filename} already exists")
        return

    print(f"Downloading {filename}...")
    partial_path = local_path + ".part"
    try:
        # Using the response as a context manager releases the connection
        # even when writing fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8000):
                    f.write(chunk)

        # Only a fully written download ever appears under the final name.
        os.replace(partial_path, local_path)
        print(f"Success saved to {local_path}")
    except Exception as e:
        print(f"Failed to download {filename} :{e}")
        # Best-effort cleanup of partial data; the file may never have been
        # created, so a failure to remove it is not an error.
        try:
            os.remove(partial_path)
        except OSError:
            pass
# Fetch every dataset file listed in URLS
for dataset_name in URLS:
    download_file(URLS[dataset_name], dataset_name)
                    

Correct file found: KDDTrain+.txt already exists
Correct file found: KDDTest+.txt already exists


# Create and save a new Labeled Dataset

In [3]:
# Where the training file lives inside the data directory
DATA_DIR = "data"
TRAIN_PATH = os.path.join(DATA_DIR, "KDDTrain+.txt")

# Column names for the NSL-KDD files, listed in file order
COLUMNS = [
    # connection basics
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    # login / privilege indicators
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    # counts and rates over recent connections
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    # dst_host_* statistics
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    # label columns
    "class",
    "difficulty_level",
]

# Load the training file, which has no header row, using the COLUMNS names
df = pd.read_csv(TRAIN_PATH, names=COLUMNS, header=None)

# Write a copy whose first row carries the column names
labeled_csv = "labeled_dataset.csv"
df.to_csv(labeled_csv, index=False)

print(f"Dataset saved as {labeled_csv}")


Dataset saved as labeled_dataset.csv


In [4]:
# Locations of the two dataset splits inside DATA_DIR
train_path, test_path = (
    os.path.join(DATA_DIR, split_file)
    for split_file in ('KDDTrain+.txt', 'KDDTest+.txt')
)

# Load both splits with the COLUMNS names. Passing names without a header
# argument makes pandas treat the first line as data, not as a header.
train_df, test_df = (
    pd.read_csv(split_path, names=COLUMNS)
    for split_path in (train_path, test_path)
)

print("Data Loaded Successfully!")

Data Loaded Successfully!


# Basic Data Overview

In [5]:
# Preview the first five rows of the training split
train_df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [6]:
# Preview the first five rows of the test split
test_df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [7]:
# Column labels of the training frame
print("Columns Present:", train_df.columns, sep="\n", end="\n\n\n")

# (rows, columns)
print("Shape:", train_df.shape, sep="\n", end="\n\n\n")

# dtype of every column
print("Data Types:", train_df.dtypes, sep="\n", end="\n\n\n")

# info() prints its own report
print("Basic Info:")
train_df.info()

# Summary statistics; kept as the last expression so the notebook renders it
print("\n\nDescribed Data:")
train_df.describe()

Columns Present:
Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class', 'difficulty_level'],
      dtype='object')


Shape:
(125973, 43)


Data Types:
duration                         int64
protocol_type                   objec

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,difficulty_level
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,19.50406
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,...,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459,2.291503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,21.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


## Checking Duplicates

In [8]:
# Count the repeated and the distinct values in each column
def check_duplicates(df):
    """Count repeated and distinct values in every column of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        A ``(duplicates, unique_counts)`` pair of Series indexed by column
        name. ``duplicates[col]`` is the number of entries that repeat an
        earlier value in that column. ``unique_counts[col]`` is the number of
        distinct values in that column, with missing values counted as one
        value, so the two counts are expected to add up to ``len(df)`` and can
        be used to cross-check each other.
    """
    duplicates = {}
    unique_counts = {}
    for col in df.columns:
        column = df[col]
        duplicates[col] = column.duplicated().sum()
        # Count the distinct values. Summing the array returned by unique()
        # would add the values themselves together (and concatenate strings
        # for text columns) rather than count them.
        unique_counts[col] = column.nunique(dropna=False)
    return pd.Series(duplicates), pd.Series(unique_counts)

# Run the check on the training split and print both result Series
result, check = check_duplicates(train_df)
print(f"{result} \n\n\n {check}")

duration                       122992
protocol_type                  125970
service                        125903
flag                           125962
src_bytes                      122632
dst_bytes                      116647
land                           125971
wrong_fragment                 125970
urgent                         125969
hot                            125945
num_failed_logins              125967
logged_in                      125971
num_compromised                125885
root_shell                     125971
su_attempted                   125970
num_root                       125891
num_file_creations             125938
num_shells                     125970
num_access_files               125963
num_outbound_cmds              125972
is_host_login                  125971
is_guest_login                 125971
count                          125461
srv_count                      125464
serror_rate                    125884
srv_serror_rate                125887
rerror_rate 

In [9]:
# Number of training records per label
train_df.loc[:, "class"].value_counts()

class
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64

# All the Columns Information

- **duration:** The length of the connection (in seconds).  
- **protocol_type:** The protocol used in the connection (e.g., tcp, udp, icmp).  
- **service:** The network service requested at the destination (e.g., http, ftp, smtp, telnet).  
- **flag:** The status of the connection (e.g., SF means normal connection setup and teardown, S0 means connection attempt but no reply).  
- **src_bytes:** Number of data bytes sent from the source to the destination.  
- **dst_bytes:** Number of data bytes sent from the destination back to the source.  
- **land:** A binary flag (1 or 0). It is 1 if the source IP/port and destination IP/port are the same (indicates a "Land" DoS attack).  
- **wrong_fragment:** Number of fragmented packets that are incorrectly formatted (often seen in teardrop attacks).  
- **urgent:** Number of packets with the "urgent" bit set.  
- **hot:** Number of "hot" indicators (e.g., entering system directories, executing specific root commands).  
- **num_failed_logins:** Number of failed login attempts.  
- **logged_in:** Binary (1 or 0). 1 if the user successfully logged into the system.  
- **num_compromised:** Number of conditions indicating the system was compromised.  
- **root_shell:** Binary. 1 if a root shell was successfully obtained.  
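- **su_attempted:** Flag for an attempt to switch to the root user with the `su` command (1 if attempted). Note that the duplicate check above implies three distinct values in this column, so it is not strictly binary in this dataset.  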
- **num_root:** Number of root accesses or operations performed.  
- **num_file_creations:** Number of file creation operations.  
- **num_shells:** Number of shell prompts generated.  
- **num_access_files:** Number of attempts to access critical system files (e.g., /etc/passwd).  
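- **num_outbound_cmds:** Number of outbound commands in an FTP session. The duplicate check above implies a single distinct value for this column in the training set, so it is constant there.  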
- **is_host_login:** Binary. 1 if the login belongs to a specific host administration list.  
- **is_guest_login:** Binary. 1 if the login is a "guest" login.  
- **count:** The number of connections to the same destination host as the current connection in the past 2 seconds.  
- **srv_count:** The number of connections to the same service (e.g., HTTP) as the current connection in the past 2 seconds.  
- **serror_rate:** The fraction (0–1) of connections that had "SYN" errors (connection request errors) to the same host.  
- **srv_serror_rate:** The fraction (0–1) of connections that had "SYN" errors to the same service.  
- **rerror_rate:** The fraction (0–1) of connections that had "REJ" (rejected) errors to the same host.  
- **srv_rerror_rate:** The fraction (0–1) of connections that had "REJ" errors to the same service.  
- **same_srv_rate:** The fraction (0–1) of connections to the same host that also went to the same service.  
- **diff_srv_rate:** The fraction (0–1) of connections to the same host that went to a different service.  
- **srv_diff_host_rate:** The fraction (0–1) of connections to the same service that came from different hosts.  
- **dst_host_count:** Number of connections to the same destination host out of the last 100.  
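- **dst_host_srv_count:** Number of connections to the same destination host and same service, nominally counted over the last 100 connections. The summary statistics above show values up to 255 for this column, so do not assume an upper bound of 100.  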
- **dst_host_same_srv_rate:** Fraction (0–1) of connections that accessed the same service.  
- **dst_host_diff_srv_rate:** Fraction (0–1) of connections that accessed different services.  
- **dst_host_same_src_port_rate:** Fraction (0–1) of connections that came from the exact same source port.  
- **dst_host_srv_diff_host_rate:** Fraction (0–1) of connections to the same service that involved different destination hosts.  
- **dst_host_serror_rate:** Fraction (0–1) of "SYN" errors to this specific destination host.  
- **dst_host_srv_serror_rate:** Fraction (0–1) of "SYN" errors to this specific destination host and service.  
- **dst_host_rerror_rate:** Fraction (0–1) of "REJ" (rejected) errors to this destination host.  
- **dst_host_srv_rerror_rate:** Fraction (0–1) of "REJ" errors to this destination host and service.  
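- **class:** This is your target variable! It contains the label stating whether the traffic is normal or a specific attack (e.g., neptune, smurf, satan). Note that the test set contains labels that never occur in the training set (e.g., saint and mscan in the test preview above), so a model trained on the raw labels cannot predict them.  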
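- **difficulty_level:** This is a score from 0 to 21 (the range observed in the training set) indicating how difficult it is for standard machine learning algorithms to correctly classify this record. A higher number usually implies an "easier" classification, while a lower number indicates a tricky or borderline record. Because it describes how well classifiers handle the record rather than the traffic itself, it should not be used as an input feature.  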
