In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Never elide columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [71]:
# Read the three pre-split CSV files and stack them row-wise into one frame
# with a fresh 0..n-1 index.
split_files = ["./testing-set.csv", "./training-set.csv", "./validation-set.csv"]
df = pd.concat([pd.read_csv(path) for path in split_files], axis=0, ignore_index=True)

In [15]:
# List the column labels of the combined frame.
df.columns

Index(['pkSeqID', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'seq', 'stddev',
       'N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max', 'attack', 'category', 'subcategory'],
      dtype='object')

In [16]:
# Map every object-dtype column (treated here as a string column) to its
# distinct values, keeping the column order of df.
unique_values = {
    name: df[name].unique()
    for name in df.columns
    if df[name].dtype == 'object'
}
string_columns = list(unique_values)

print("String Columns:", string_columns)
print("Unique Values:")
for column, values in unique_values.items():
    print(column, ":", values)

String Columns: ['proto', 'saddr', 'sport', 'daddr', 'dport', 'category', 'subcategory']
Unique Values:
proto : ['udp' 'tcp' 'icmp' 'arp' 'ipv6-icmp']
saddr : ['192.168.100.150' '192.168.100.148' '192.168.100.149' '192.168.100.147'
 '192.168.100.5' '192.168.100.3' '192.168.100.6' '192.168.100.7'
 '192.168.100.46' 'fe80::250:56ff:febe:e9d9' 'fe80::250:56ff:febe:bf1a'
 'fe80::c0c0:aa20:45b9:bdd9' '192.168.100.1' '192.168.100.4'
 'fe80::250:56ff:febe:254' '192.168.100.27' 'fe80::250:56ff:febe:c038'
 'fe80::2c6a:ff9b:7e14:166a' '192.168.100.55' 'fe80::250:56ff:febe:89ee'
 'fe80::250:56ff:febe:26db']
sport : ['48516' '22267' '28629' ... '48073' '56632' '42157']
daddr : ['192.168.100.3' '192.168.100.5' '192.168.100.7' '192.168.100.6'
 '192.168.100.150' '192.168.100.147' '192.168.100.148' '205.251.193.205'
 '199.19.57.1' '192.168.100.149' '8.8.8.8' '156.154.101.3' '192.168.217.2'
 '192.54.112.30' '205.251.196.160' '192.5.5.241' '192.12.94.30'
 '129.250.35.250' '224.0.0.251' '192.168.100.255' 

In [27]:
# Swap every cell that is exactly '-' for the integer -1. Rebinding the name
# (rather than inplace=True) keeps the cell chainable.
df = df.replace('-', -1)

In [28]:
# Render df as the cell output to inspect it.
df

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,792371,udp,192.168.100.150,48516,192.168.100.3,80,175094,0.226784,100,4.100436,4,4.457383,100,0.000000,0.404711,4.719438,1,DoS,UDP
1,2056418,tcp,192.168.100.148,22267,192.168.100.3,80,143024,0.451998,100,3.439257,1,3.806172,100,0.225077,0.401397,4.442930,1,DDoS,TCP
2,2795650,udp,192.168.100.149,28629,192.168.100.3,80,167033,1.931553,73,0.000000,4,2.731204,100,0.000000,0.407287,4.138455,1,DDoS,UDP
3,2118009,tcp,192.168.100.148,42142,192.168.100.3,80,204615,0.428798,56,3.271411,1,3.626428,100,0.000000,0.343654,4.229700,1,DDoS,TCP
4,303688,tcp,192.168.100.149,1645,192.168.100.5,80,40058,2.058381,100,0.000000,3,1.188407,100,0.000000,0.135842,4.753628,1,DoS,TCP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3668517,2897013,udp,192.168.100.149,7433,192.168.100.3,80,6235,0.949105,55,2.043210,4,3.385446,100,0.000000,0.492941,4.057239,1,DDoS,UDP
3668518,3093940,udp,192.168.100.150,50641,192.168.100.3,80,203162,1.899304,88,0.000000,4,2.686019,100,0.000000,0.494864,4.031712,1,DDoS,UDP
3668519,853819,udp,192.168.100.149,51041,192.168.100.5,80,236542,2.117987,72,0.000000,4,2.117096,72,0.000000,0.225706,4.321042,1,DoS,UDP
3668520,3555645,udp,192.168.100.147,43226,192.168.100.3,80,140564,1.879053,70,0.983380,4,3.640733,100,0.000000,1.005739,4.980003,1,DDoS,UDP


In [110]:
# The IQR outlier filter is applied only to rows labelled 'DoS' or 'DDoS';
# rows of every other category are passed through untouched.
categories = ['DoS', 'DDoS']
is_target_category = df['category'].isin(categories)
filtered_df = df[is_target_category]

# Keep only the columns that already have a numeric dtype (nothing is converted).
numeric_cols = filtered_df.select_dtypes(include='number')

# Per-column quartiles and interquartile range.
Q1 = numeric_cols.quantile(0.25)
Q3 = numeric_cols.quantile(0.75)
IQR = Q3 - Q1

print("Q1:\n", Q1)
print("Q3:\n", Q3)
print("IQR:\n", IQR)

# Multiplier applied to the IQR to obtain the lower/upper fences.
threshold = 1.5
lower_fence = Q1 - threshold * IQR
upper_fence = Q3 + threshold * IQR

# A row is dropped as soon as any of its numeric values lies outside the fences.
# NOTE(review): the numeric selection also covers pkSeqID and attack, and a
# column whose IQR is 0 (drate in the printed quartiles) flags every value that
# differs from its quartile — confirm that filtering on these is intended.
is_outlier = (numeric_cols < lower_fence) | (numeric_cols > upper_fence)
outliers_removed_df = filtered_df[~is_outlier.any(axis=1)]

# Re-attach the rows of the other categories after the surviving DoS/DDoS rows.
filtered_df = pd.concat([outliers_removed_df, df[~is_target_category]])

print("Original DataFrame:")
print(df)

print("\nDataFrame after removing outliers from numeric columns and filtering 'DoS' and 'DDoS' categories:")
print(filtered_df)


Q1:
 pkSeqID              894221.750000
seq                   59458.000000
stddev                    0.040894
N_IN_Conn_P_SrcIP        70.000000
min                       0.000000
state_number              3.000000
mean                      0.283002
N_IN_Conn_P_DstIP       100.000000
drate                     0.000000
srate                     0.166452
max                       0.364790
attack                    1.000000
Name: 0.25, dtype: float64
Q3:
 pkSeqID              2.682663e+06
seq                  1.866930e+05
stddev               1.750794e+00
N_IN_Conn_P_SrcIP    1.000000e+02
min                  2.501472e+00
state_number         4.000000e+00
mean                 3.574361e+00
N_IN_Conn_P_DstIP    1.000000e+02
drate                0.000000e+00
srate                4.898930e-01
max                  4.308070e+00
attack               1.000000e+00
Name: 0.75, dtype: float64
IQR:
 pkSeqID              1.788442e+06
seq                  1.272350e+05
stddev               1.709900e+00


In [111]:
# Row count per category in the filtered frame.
filtered_df['category'].value_counts()

category
DDoS              1286334
DoS                746530
Reconnaissance      91082
Normal                477
Theft                  79
Name: count, dtype: int64

In [113]:
# Render df as the cell output.
df

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,792371,udp,192.168.100.150,48516,192.168.100.3,80,175094,0.226784,100,4.100436,4,4.457383,100,0.000000,0.404711,4.719438,1,DoS,UDP
1,2056418,tcp,192.168.100.148,22267,192.168.100.3,80,143024,0.451998,100,3.439257,1,3.806172,100,0.225077,0.401397,4.442930,1,DDoS,TCP
2,2795650,udp,192.168.100.149,28629,192.168.100.3,80,167033,1.931553,73,0.000000,4,2.731204,100,0.000000,0.407287,4.138455,1,DDoS,UDP
3,2118009,tcp,192.168.100.148,42142,192.168.100.3,80,204615,0.428798,56,3.271411,1,3.626428,100,0.000000,0.343654,4.229700,1,DDoS,TCP
4,303688,tcp,192.168.100.149,1645,192.168.100.5,80,40058,2.058381,100,0.000000,3,1.188407,100,0.000000,0.135842,4.753628,1,DoS,TCP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3668517,2897013,udp,192.168.100.149,7433,192.168.100.3,80,6235,0.949105,55,2.043210,4,3.385446,100,0.000000,0.492941,4.057239,1,DDoS,UDP
3668518,3093940,udp,192.168.100.150,50641,192.168.100.3,80,203162,1.899304,88,0.000000,4,2.686019,100,0.000000,0.494864,4.031712,1,DDoS,UDP
3668519,853819,udp,192.168.100.149,51041,192.168.100.5,80,236542,2.117987,72,0.000000,4,2.117096,72,0.000000,0.225706,4.321042,1,DoS,UDP
3668520,3555645,udp,192.168.100.147,43226,192.168.100.3,80,140564,1.879053,70,0.983380,4,3.640733,100,0.000000,1.005739,4.980003,1,DDoS,UDP


In [114]:
# Render filtered_df as the cell output.
filtered_df

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,792371,udp,192.168.100.150,48516,192.168.100.3,80,175094,0.226784,100,4.100436,4,4.457383,100,0.0,0.404711,4.719438,1,DoS,UDP
2,2795650,udp,192.168.100.149,28629,192.168.100.3,80,167033,1.931553,73,0.000000,4,2.731204,100,0.0,0.407287,4.138455,1,DDoS,UDP
4,303688,tcp,192.168.100.149,1645,192.168.100.5,80,40058,2.058381,100,0.000000,3,1.188407,100,0.0,0.135842,4.753628,1,DoS,TCP
7,1064106,udp,192.168.100.150,19625,192.168.100.3,80,184672,1.788452,100,0.000000,4,3.576574,100,0.0,0.446612,4.492080,1,DoS,UDP
11,2012355,tcp,192.168.100.148,2808,192.168.100.3,80,98961,1.902401,100,0.000000,3,2.643317,100,0.0,0.271451,4.398972,1,DDoS,TCP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3668240,3598706,tcp,192.168.100.147,51614,192.168.100.3,1028,3495,0.000000,100,0.000242,1,0.000242,100,0.0,0.000000,0.000242,1,Reconnaissance,Service_Scan
3668317,3600394,udp,192.168.100.148,35008,192.168.100.3,4008,5203,0.000000,15,0.000000,4,0.000000,58,0.0,0.000000,0.000000,1,Reconnaissance,Service_Scan
3668420,3640096,icmp,192.168.100.5,0x0303,192.168.100.150,0xd007,16596,0.000000,43,0.000000,5,0.000000,10,0.0,0.000000,0.000000,1,Reconnaissance,Service_Scan
3668452,3648026,tcp,192.168.100.149,49014,192.168.100.5,57797,24742,0.000000,100,0.000000,3,0.000000,100,0.0,0.000000,0.000000,1,Reconnaissance,Service_Scan


In [115]:
# Rebind df to the outlier-filtered frame; the cells below operate on it.
# NOTE(review): once this has run, re-executing the outlier-filter cell (which
# reads df) filters already-filtered data; the combined frame can then only be
# restored by re-running the load cell.
df = filtered_df

In [54]:
# Inspect the numeric-dtype columns selected in the outlier-filter cell.
numeric_cols

Unnamed: 0,pkSeqID,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack
0,792371,175094,0.226784,100,4.100436,4,4.457383,100,0.000000,0.404711,4.719438,1
1,2056418,143024,0.451998,100,3.439257,1,3.806172,100,0.225077,0.401397,4.442930,1
2,2795650,167033,1.931553,73,0.000000,4,2.731204,100,0.000000,0.407287,4.138455,1
3,2118009,204615,0.428798,56,3.271411,1,3.626428,100,0.000000,0.343654,4.229700,1
4,303688,40058,2.058381,100,0.000000,3,1.188407,100,0.000000,0.135842,4.753628,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3668517,2897013,6235,0.949105,55,2.043210,4,3.385446,100,0.000000,0.492941,4.057239,1
3668518,3093940,203162,1.899304,88,0.000000,4,2.686019,100,0.000000,0.494864,4.031712,1
3668519,853819,236542,2.117987,72,0.000000,4,2.117096,72,0.000000,0.225706,4.321042,1
3668520,3555645,140564,1.879053,70,0.983380,4,3.640733,100,0.000000,1.005739,4.980003,1


In [65]:
# Show the first five rows of df.
df.head()

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,792371,udp,192.168.100.150,48516,192.168.100.3,80,175094,0.226784,100,4.100436,4,4.457383,100,0.0,0.404711,4.719438,1,DoS,UDP
2,2795650,udp,192.168.100.149,28629,192.168.100.3,80,167033,1.931553,73,0.0,4,2.731204,100,0.0,0.407287,4.138455,1,DDoS,UDP
4,303688,tcp,192.168.100.149,1645,192.168.100.5,80,40058,2.058381,100,0.0,3,1.188407,100,0.0,0.135842,4.753628,1,DoS,TCP
7,1064106,udp,192.168.100.150,19625,192.168.100.3,80,184672,1.788452,100,0.0,4,3.576574,100,0.0,0.446612,4.49208,1,DoS,UDP
11,2012355,tcp,192.168.100.148,2808,192.168.100.3,80,98961,1.902401,100,0.0,3,2.643317,100,0.0,0.271451,4.398972,1,DDoS,TCP


In [67]:
# Name of the label column; later cells access it as df[output].
output = 'category'

In [69]:
# Distinct values present in the label column.
df[output].unique()

array(['DoS', 'DDoS'], dtype=object)

In [70]:
# Row count per label value.
df[output].value_counts()

category
DDoS    1294549
DoS      746530
Name: count, dtype: int64

In [38]:
# Number of non-null entries in the label column.
df[output].count()

2041831

In [117]:
# Hold out 20% of the rows as the test set. Only the frame itself is split:
# the label column travels inside it, so no separate target array (and no
# throw-away `_` bindings) is needed. The row assignment depends solely on the
# number of rows and random_state, so it is unchanged.
# NOTE(review): the split is not stratified; consider stratify=df[output] if
# per-category proportions must be preserved in each split.
df_x, df_test = train_test_split(df, test_size=0.2, random_state=189)

In [118]:
def csv_save_zip(df, filename):
    """Write ``df`` as a CSV file inside a single-member zip archive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise; its index is not written.
    filename : str or path-like
        Output path without extension. The archive is written to
        ``f'{filename}.zip'`` and its only member is named after the final
        path component of ``filename`` with ``.csv`` appended.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide os

    # Name the member after the base name only, so that a filename containing
    # directories (e.g. 'out/testing-set') does not recreate that directory
    # tree inside the archive. For a bare name the result is unchanged.
    member_name = f'{os.path.basename(str(filename))}.csv'
    compression_options = dict(method='zip', archive_name=member_name)
    df.to_csv(f'{filename}.zip', compression=compression_options, index=False)

In [119]:
# Write the held-out test split to testing-set.zip.
csv_save_zip(df_test, 'testing-set')

In [120]:
# Carve 10% of the remaining rows off as the validation set. As with the test
# split, only the frame is passed: the label column is part of it, and the row
# assignment depends solely on the number of rows and random_state.
# NOTE(review): not stratified; consider stratify=df_x[output] if per-category
# proportions must be preserved.
df_train, df_val = train_test_split(df_x, test_size=0.1, random_state=189)

In [121]:
# Write the training split to training-set.zip.
csv_save_zip(df_train, 'training-set')

In [124]:
# NOTE(review): this writes df_train a second time, now as BoT_IoT.zip — the
# same frame that is saved as 'training-set'. If this archive is meant to hold
# a different frame (e.g. the full dataset), pass that frame instead; confirm.
csv_save_zip(df_train, 'BoT_IoT')

In [122]:
# Write the validation split to validating-set.zip.
# NOTE(review): the load cell reads "./validation-set.csv", whereas this writes
# 'validating-set' — confirm the differing name is intended.
csv_save_zip(df_val, 'validating-set')