In [None]:
import os
import requests

In [None]:
# Folder (relative to the notebook's working directory) that holds the raw files.
DATA_DIR = "data"
# exist_ok=True creates the folder only when it is missing, without a separate
# check-then-create step.
os.makedirs(DATA_DIR, exist_ok=True)

# Local file name -> download URL for each NSL-KDD split.
URLS = {
    'KDDTrain+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt',
    'KDDTest+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt',
}

def download_file(url, filename, timeout=30):
    """Download ``url`` into ``DATA_DIR/filename`` unless that file already exists.

    Args:
        url: Address to fetch.
        filename: Name of the file to create inside ``DATA_DIR``.
        timeout: Seconds to wait on the server before giving up; passed to
            ``requests.get``.

    Returns:
        None. Progress and failures are reported with ``print``. Network and
        file-system errors are caught so a calling loop can continue with the
        next file.
    """
    local_path = os.path.join(DATA_DIR, filename)

    if os.path.exists(local_path):
        print(f"Correct file found: {filename} already exists")
        return

    print(f"Downloading {filename}...")
    # Stream into a temporary sibling file and rename it only after the whole
    # body has been written. An interrupted download therefore never leaves a
    # truncated file at local_path that the exists-check above would accept.
    partial_path = local_path + ".part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8000):
                    f.write(chunk)
        os.replace(partial_path, local_path)
        print(f"Success saved to {local_path}")
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download {filename} :{e}")
        # Best-effort cleanup of the incomplete temporary file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        

In [None]:
# Fetch every file listed in URLS; download_file skips the ones already present.
for filename in URLS:
    url = URLS[filename]
    download_file(url, filename)
print("\nAll files should now be in 'data' folder.")

In [None]:
import pandas as pd
import os

# Location of the training split inside the local data folder.
DATA_DIR = 'data'
TRAIN_PATH = os.path.join(DATA_DIR, 'KDDTrain+.txt')

# Column names applied to the header-less data files, in file order. Written
# as a whitespace-separated block and split into a list of 43 names.
COLUMNS = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins
    logged_in num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate class difficulty_level
""".split()


# header=None: the first line of the file is treated as data, and the column
# names are supplied explicitly.
df = pd.read_csv(TRAIN_PATH, header=None, names=COLUMNS)

# Display the first few rows
df.head()

In [None]:
# Build the paths of both splits inside DATA_DIR.
train_path, test_path = (
    os.path.join(DATA_DIR, split_file)
    for split_file in ('KDDTrain+.txt', 'KDDTest+.txt')
)

# Read each split with the shared column names (training split first).
train_df, test_df = (
    pd.read_csv(split_path, names=COLUMNS)
    for split_path in (train_path, test_path)
)

print("Data Loaded Successfully!")

In [None]:
# Show the first and last rows of the training set and the last rows of the
# test set. Only a cell's final expression is rendered automatically, so each
# frame is passed to display() (provided by the Jupyter/IPython kernel).
display(train_df.head())
display(train_df.tail())
display(test_df.tail())

In [None]:
train_df.describe()

In [None]:
print("\n---Data Info---")
train_df.info()

In [None]:
# isnull().sum() gives one count per column; the second .sum() adds those up so
# each line reports a single total for the whole frame.
print(f"Missing Values in teh Training Data: {train_df.isnull().sum().sum()}")
print(f"Missing Values in teh Testing Data: {test_df.isnull().sum().sum()}")

In [None]:
# duplicated() returns one boolean per row; .sum() turns that into the number
# of duplicated rows for both splits.
print(f"Duplicate data in the training dataset {train_df.duplicated().sum()}")
print(f"Duplicate data in the test dataset {test_df.duplicated().sum()}")

In [None]:
train_df['class']

In [None]:
# duplicated() returns one boolean per row; .sum() turns that into the number
# of duplicated rows for both splits.
print(f"Duplicate data in the training dataset {train_df.duplicated().sum()}")
print(f"Duplicate data in the test dataset {test_df.duplicated().sum()}")

In [None]:
train_df['class']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
unique_labels = train_df['class'].unique()
print(f" There are {len(unique_labels)} unique lables including 'normal'.")
print(unique_labels)

In [None]:

train_df['class'].value_counts()

In [None]:
# Output folder for the labeled CSVs. It is created if missing because
# DataFrame.to_csv does not create parent directories.
LABELED_DIR = "../data"
os.makedirs(LABELED_DIR, exist_ok=True)

# Save labeled training data as CSV
train_df.to_csv(os.path.join(LABELED_DIR, "labeled_train.csv"), index=False)

# Save labeled test data as CSV
test_df.to_csv(os.path.join(LABELED_DIR, "labeled_test.csv"), index=False)

print("CSV files created successfully ✅")


In [None]:
# Paths to the raw splits under a "labeled_CSV" folder.
# NOTE(review): TRAIN_PATH and TEST_PATH are reassigned to "../data/..." in a
# later cell before anything in this notebook reads these values - confirm
# whether this cell is still needed.
TRAIN_PATH = "labeled_CSV/KDDTrain+.txt"
TEST_PATH  = "labeled_CSV/KDDTest+.txt"


In [None]:
# Column names for the header-less data files, in file order. The last name is
# 'difficulty_level' so that frames loaded after this cell use the same schema
# as the COLUMNS list defined earlier in the notebook.
COLUMNS = [
'duration','protocol_type','service','flag','src_bytes','dst_bytes','land',
'wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
'root_shell','su_attempted','num_root','num_file_creations','num_shells',
'num_access_files','num_outbound_cmds','is_host_login','is_guest_login',
'count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate',
'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count',
'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate',
'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate',
'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate',
'class','difficulty_level'
]


In [None]:
import os
import pandas as pd

# Read the splits from DATA_DIR, the folder the download step writes to, so the
# paths resolve after a fresh Restart & Run All.
TRAIN_PATH = os.path.join(DATA_DIR, "KDDTrain+.txt")
TEST_PATH  = os.path.join(DATA_DIR, "KDDTest+.txt")

# Sanity check before loading.
print("Train path exists:", os.path.exists(TRAIN_PATH))
print("Test path exists:", os.path.exists(TEST_PATH))


In [None]:
df = pd.read_csv(TRAIN_PATH, header=None)
df.head()


In [None]:
import pandas as pd
df = pd.read_csv(TRAIN_PATH, header=None, names=COLUMNS)

# Preview the table explicitly: a bare `df` in the middle of a cell is not
# rendered, only the cell's final expression is.
display(df.head())

# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
df['protocol_type'].unique()

In [None]:
df['protocol_type'].unique()
df['protocol_type']

In [None]:
# TCP connections whose source sent more than 500 bytes.
# (The extra CSV re-read into an unused variable and the mid-cell expressions
# that were never rendered have been dropped; the displayed result is the same.)
df[(df['protocol_type'] == 'tcp') & (df['src_bytes'] > 500)]

In [None]:
# Narrow view with just the columns of interest.
subset = df[['duration','protocol_type','class']]
# display() is required here because only a cell's final expression is
# rendered automatically.
display(subset.head())

df['protocol_type'].unique()

In [None]:
tcp_traffic = df[df['protocol_type'] == 'tcp']
print(f"Traffic of the TCP Count is: {len(tcp_traffic)}")

In [None]:
# Rows whose protocol is UDP, stored under a name that matches the filter.
udp_traffic = df[df['protocol_type'] == 'udp']
print(f"Traffic of the UDP Count is: {len(udp_traffic)}")

In [None]:
# Rows whose protocol is ICMP, stored under a name that matches the filter.
icmp_traffic = df[df['protocol_type'] == 'icmp']
print(f"Traffic of the ICMP Count is: {len(icmp_traffic)}")

In [None]:
df['land'].unique()

In [None]:
land_attack = df[df['land']==1]
print(f"Land attacks Found are: {len(land_attack)}")

In [None]:
# Preview the first 30 rows (iloc slices are end-exclusive, so 0:30 covers
# positions 0 through 29). The name now matches the number of rows selected.
first_thirty_rows = df.iloc[0:30]
first_thirty_rows

In [None]:
df['service'].unique()

In [None]:
print(df['class'].value_counts())

In [None]:
print("\nAverage Duration by the Protocols")
print(df.groupby('protocol_type')['duration'].mean())

In [None]:
df[df['protocol_type']=='icmp']['src_bytes']

In [None]:
print("\nAverage Duration by the Protocols")
print(df.groupby('protocol_type')['duration'].mean())

In [None]:
df[df['protocol_type']=='icmp']['src_bytes']

In [None]:
protocol_counts = df['protocol_type'].value_counts()
protocol_counts

In [None]:
import matplotlib.pyplot as plt

protocol_counts = df['protocol_type'].value_counts()
protocol_counts

def set_plot(title,xlabel,ylabel):
    """Apply a title and both axis labels to the current matplotlib axes.

    Args:
        title: Text passed to ``plt.title``.
        xlabel: Text passed to ``plt.xlabel``.
        ylabel: Text passed to ``plt.ylabel``.
    """
    # Pair each pyplot setter with its text and apply them in order.
    for apply_text, text in ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        apply_text(text)
plt.figure(figsize=(8,4))
plt.bar(protocol_counts.index,protocol_counts.values,color = "red",edgecolor = "black")
set_plot("Traffic Volume by protocol","protocol","Count")
plt.show()

In [None]:
import seaborn as sns
# Keep only connections with fewer than 6000 source bytes for this histogram.
small_bytes = df[df['src_bytes'] < 6000]

plt.figure(figsize=(8, 4))
sns.histplot(small_bytes['src_bytes'], bins=50, kde=True)
# The title states the filter that is actually applied above.
plt.title("Distribution of Source Bytes (< 6000)")
plt.show()

In [None]:
# Keep connections below 100000 source bytes, then compare the spread of
# src_bytes across protocols with one box per protocol.
small_bytes = df.loc[df['src_bytes'] < 100000]

plt.figure(figsize=(8, 4))
sns.boxplot(data=small_bytes, x='protocol_type', y='src_bytes')
plt.title("Source Byte outliers by teh protocols")
plt.show()

In [None]:
print(df['protocol_type'].value_counts())

In [None]:
ct = pd.crosstab(df['protocol_type'],df['flag'])
ct

In [None]:
plt.figure(figsize=(12,5))
sns.heatmap(ct,annot = True, fmt='d', cmap='Blues')
plt.title("Protocol vs Connection Flag")
plt.show()

In [None]:
missing_count = df['src_bytes'].isnull().sum()
print("Missing values in src_bytes:", missing_count)

In [None]:
df.isnull().sum()


In [None]:
df['src_bytes'].isna().sum()


In [None]:
#to find percentage of missing values
(df['src_bytes'].isna().mean()) * 100


In [None]:
#to check how many rows have src_bytes=0
(df['src_bytes'] == 0).sum( )

In [None]:
#to see the rows where src_bytes=0
df[df['src_bytes']==0]


In [None]:
((df['src_bytes']==0).mean())*100

In [None]:
#check zero values by protocol
pd.crosstab(df['protocol_type'],df['src_bytes']==0)

In [None]:
import numpy as np
df['src_bytes_log'] = np.log1p(df['src_bytes'])


In [None]:
import numpy as np
df['src_bytes_log']=np.log1p(df['src_bytes'])

In [None]:
df[['src_bytes','src_bytes_log']].head()


In [None]:
#Scaling
# Standardize the src_bytes_log column with scikit-learn's StandardScaler and
# keep the result in a separate src_bytes_scaled column.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Double brackets select a one-column DataFrame, i.e. the 2-D input that
# fit_transform is given here.
df['src_bytes_scaled'] = scaler.fit_transform(df[['src_bytes_log']])


In [None]:
df['dst_bytes'].isnull().sum()


In [None]:
#to check how many values are ZERO
(df['dst_bytes']==0).sum()

In [None]:
import numpy as np

df['dst_bytes_log'] = np.log1p(df['dst_bytes'])


In [None]:
df[['dst_bytes', 'dst_bytes_log']].describe()


In [None]:
# Fit a new StandardScaler on both log columns and write the scaled values
# back into those same columns.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# NOTE(review): this assignment replaces the contents of 'src_bytes_log' and
# 'dst_bytes_log' with their scaled values, so every later read of those
# columns (e.g. the "After preprocessing" boxplots) sees scaled data rather
# than the plain log1p output. Consider writing to separate '*_scaled' columns
# and updating the cells that read them.
df[['src_bytes_log','dst_bytes_log']] = scaler.fit_transform(
    df[['src_bytes_log','dst_bytes_log']]
)


In [None]:
#to see scaled values
df[['src_bytes_log','dst_bytes_log']].head()


In [None]:
df['src_bytes']


In [None]:
df['dst_bytes']

In [None]:
df['src_bytes_log']

In [None]:
# Side-by-side boxplots: the raw src_bytes column (left) next to the
# src_bytes_log column (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 4))

ax_before.boxplot(df['src_bytes'])
ax_before.set_title("Before preprocessing")

ax_after.boxplot(df['src_bytes_log'])
ax_after.set_title("After preprocessing")

plt.show()

In [None]:
# Side-by-side boxplots: the raw dst_bytes column (left) next to the
# dst_bytes_log column (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 4))

ax_before.boxplot(df['dst_bytes'])
ax_before.set_title("Before preprocessing")

ax_after.boxplot(df['dst_bytes_log'])
ax_after.set_title("After preprocessing")

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# log1p is applied to src_bytes to reduce its skew; the transformed values are
# kept in their own column and plotted as a histogram with matplotlib.
df['log_src_bytes'] = np.log1p(df['src_bytes'])

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df['log_src_bytes'], bins=60, color="blue")
ax.set_title("Distribution of Log Transformed Source Bytes")
ax.set_xlabel("log_src_bytes")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Numeric features included in the correlation analysis.
numerical_cols = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']

# Pairwise correlation between the selected columns.
corr = df.loc[:, numerical_cols].corr()
print("\nCorrelation Matrix:\n", corr)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6,4))
# Pin the colour scale to the full correlation range [-1, 1]. Without
# vmin/vmax the colours are stretched over the values actually present, so
# the neutral colour would not necessarily correspond to a correlation of 0.
sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)

plt.title("Correlation Matrix (Numerical Features)")
plt.show()
# Seaborn draws onto the current matplotlib figure; matplotlib handles the
# figure itself and its display.
# The heatmap shows relationships between the input features in numerical_cols
# only; the class label is not part of it.
# Reading the colours with the scale pinned to [-1, 1]:
#   dark red   -> close to +1 (strong positive relation)
#   light red  -> roughly +0.3 to +0.7 (moderate positive)
#   near white -> around 0 (no linear relation)
#   light blue -> roughly -0.3 to -0.7 (moderate negative)
#   dark blue  -> close to -1 (strong negative relation)





In [None]:
df['class'].unique( )
df['duration']

In [None]:
df['duration']

In [None]:
# One protocol-count panel per traffic class. Each panel is titled with its
# class so the three charts can be told apart when the notebook is skimmed.
fig, axes = plt.subplots(1,3, figsize=(12,5))

panels = [('phf', 'red'), ('multihop', 'orange'), ('normal', 'green')]
for ax, (class_name, bar_color) in zip(axes, panels):
    sns.countplot(x='protocol_type', data=df[df['class'] == class_name],
                  ax=ax, color=bar_color)
    ax.set_title(f"class = {class_name}")

plt.tight_layout()
plt.show()


In [None]:
# Display name for each traffic class shown in this chart.
traffic_labels = {
    'multihop': 'Multihop Attack',
    'normal': 'Normal Traffic',
    'phf': 'PHF Attack',
}

# Restrict the table to the three classes being compared and rename the
# columns by key. A crosstab over the whole frame has one column per class
# value present in the data, so overwriting the legend with three positional
# labels would attach those names to whichever three columns come first.
selected = df[df['class'].isin(list(traffic_labels))]
protocol_class = (
    pd.crosstab(selected['protocol_type'], selected['class'])
    .rename(columns=traffic_labels)
)

ax = protocol_class.plot(kind='bar', figsize=(8,5))

plt.title("Protocol Usage by Traffic Type")
plt.xlabel("Protocol Type")
plt.ylabel("Number of Connections")

# Legend entries come from the renamed columns, so only the title is set here.
ax.legend(title="Traffic Type")

plt.xticks(rotation=0)
plt.show()


In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x='class',y='duration',data=df)
plt.xticks(rotation=90)
plt.show() 


In [None]:
import pandas as pd
import numpy as np


# Add the log-transformed source bytes (log1p is defined at zero).
df['log_src_bytes'] = np.log1p(df['src_bytes'])

# The six columns that make up the reduced dataset.
export_cols = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count', 'log_src_bytes']
new_df = df[export_cols]

# Write the reduced dataset without the index column.
new_df.to_csv("new_dataset.csv", index=False)

# Quick look at the first rows.
print(new_df.head())


In [None]:
avg_distribution = df.groupby('class')['duration'].mean().sort_values()
avg_distribution

In [None]:
# Mean connection duration per class, ascending, drawn as a bar chart.
duration_by_class = df.groupby('class')['duration']
avg_distribution = duration_by_class.mean().sort_values()

avg_distribution.plot.bar()
plt.show()

In [None]:
# Mean connection duration per class, ascending, drawn as an area chart.
duration_by_class = df.groupby('class')['duration']
avg_distribution = duration_by_class.mean().sort_values()

avg_distribution.plot.area()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# split=True draws the two halves of each violin from two hue levels. The raw
# 'class' column is not limited to two labels (this notebook already queries
# 'normal', 'phf' and 'multihop'), so it is collapsed into a two-level
# normal-vs-attack column for this plot. assign() returns a copy; df itself is
# left unchanged.
violin_df = df.assign(
    traffic_type=df['class'].eq('normal').map({True: 'normal', False: 'attack'})
)

plt.figure(figsize=(12,5))

sns.violinplot(
    x='protocol_type',
    y='duration',
    hue='traffic_type',
    hue_order=['normal', 'attack'],
    data=violin_df,
    split=True
)

plt.title("Distribution of Connection Duration by Protocol and Traffic Type")
plt.xlabel("Protocol Type")
plt.ylabel("Connection Duration")

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Set style
sns.set(style="whitegrid")

# Plot-only copy of the data:
#  - log_duration: log1p of duration, computed once and shared by both layers
#  - traffic_type: two-level normal-vs-attack label, because split=True draws
#    the two halves of each violin from two hue levels
plot_df = df.assign(
    log_duration=np.log1p(df['duration']),
    traffic_type=df['class'].eq('normal').map({True: 'normal', False: 'attack'}),
)
# Fixed category orders keep the violin layer and the point layer aligned even
# though they are drawn from different subsets of rows.
protocol_order = sorted(plot_df['protocol_type'].unique())
traffic_order = ['normal', 'attack']

# Overlay only a reproducible sample of points. A swarm layout over every row
# of the frame places each point individually and does not scale, so a
# jittered strip plot on a sample is used instead.
points_df = plot_df.sample(n=min(len(plot_df), 2000), random_state=42)

plt.figure(figsize=(12,6))
# Violin plot with log-transformed duration
ax = sns.violinplot(x='protocol_type', y='log_duration', hue='traffic_type',
                    order=protocol_order, hue_order=traffic_order,
                    data=plot_df, split=True, inner='box', palette='Set2')
# Overlay individual data points (slightly transparent)
sns.stripplot(x='protocol_type', y='log_duration', hue='traffic_type',
              order=protocol_order, hue_order=traffic_order,
              data=points_df, dodge=True, alpha=0.3, ax=ax)
# Add title and labels
plt.title("Connection Duration by Protocol Type and Traffic Class", fontsize=16)
plt.xlabel("Protocol Type", fontsize=12)
plt.ylabel("Log(Duration + 1)", fontsize=12)
# Move legend outside the plot
plt.legend(title='Traffic Type', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.tight_layout()
plt.show()
