In [1]:
# load libraries
import pandas as pd
from pandas import DataFrame as df
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import make_column_transformer
import pickle

# fold number of this run; the seed below is derived from it
current_k_fold = 1

# random seed: cube of the fold number
SEED = current_k_fold ** 3

# let pandas render every column and every row of a frame (no truncation)
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# helper functions
from model_utils import imb_ratio

### Pre-process KDD data

In [2]:
# location of the raw CSV file
corrected_data = "./kddcup_data_corrected.csv"

# Column names in file order (41 feature columns followed by "label"),
# taken from https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

# Subset of the columns above that is treated as categorical
# (same source: https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names)
categorical_cols = [
    "protocol_type", "service", "flag",
    "land", "logged_in", "is_host_login", "is_guest_login",
]

# Load the raw CSV; the file carries no header row, so the names are supplied.
# NOTE(review): the earlier comment called this the "10 % subset", while the path
# variable points at a "corrected" file -- confirm which data set is intended.
df_data = pd.read_csv(corrected_data, header=None, names=col_names)

# Binary target: "normal." becomes "normal", every other label becomes "attack"
df_data["label"] = df_data["label"].map(
    lambda raw_label: "normal" if raw_label == "normal." else "attack"
)

# One-hot encoder for the categorical columns: dense int8 output to keep the
# frame small, and categories not seen during fit are ignored on later transforms
oh_enc = OneHotEncoder(dtype="int8", sparse_output=False, handle_unknown="ignore")

# Fit + encode, and label the resulting columns with the generated feature names
oh_encoded_data = pd.DataFrame(
    oh_enc.fit_transform(df_data[categorical_cols]),
    columns=oh_enc.get_feature_names_out(categorical_cols),
)

# All non-categorical columns (this frame still contains "label")
num_cols = df_data.drop(columns=categorical_cols)

# Standardize the numeric features. "label" is not passed to the transformer,
# so it is not part of the transformed output.
# NOTE(review): encoder and scaler are fitted on the complete data set, i.e.
# before the train/validation/test split done in a later cell -- confirm intended.
ct = make_column_transformer((StandardScaler(), num_cols.columns.drop("label")))
scaled_num_cols = ct.fit_transform(num_cols)

# Reassemble side by side: scaled numerics (positional column labels 0..n-1),
# the one-hot columns, and the binary label
df_data = pd.concat(
    [pd.DataFrame(scaled_num_cols), oh_encoded_data, df_data["label"]],
    axis="columns",
)
# column labels are a mix of ints and strings at this point; make them all strings
df_data.columns = df_data.columns.astype(str)

df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,service_ctf,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_efs,service_exec,service_finger,service_ftp,service_ftp_data,service_gopher,service_hostnames,service_http,service_http_443,service_icmp,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pm_dump,service_pop_2,service_pop_3,service_printer,service_private,service_remote_job,service_rje,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tftp_u,service_tim_i,service_time,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_host_login_1,is_guest_login_0,is_guest_login_1,label
0,-0.043918,-0.012743,-0.037344,-0.018876,-0.005238,-0.047032,-0.047272,-0.005741,-0.01412,-0.003784,-0.003861,-0.004961,-0.006465,-0.026199,0.0,-1.220225,-0.980245,-0.253194,-0.252082,-0.410243,-0.408485,0.496083,-0.228343,-0.202418,0.323695,0.546387,0.533484,-0.155751,-1.113742,-0.127644,-0.254066,-0.252326,-0.414249,-0.408842,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,normal
1,-0.043918,-0.012743,-0.037344,-0.018876,-0.005238,-0.047032,-0.047272,-0.005741,-0.01412,-0.003784,-0.003861,-0.004961,-0.006465,-0.026199,0.0,-1.220225,-0.980245,-0.253194,-0.252082,-0.410243,-0.408485,0.496083,-0.228343,-0.202418,0.323695,0.546387,0.533484,-0.155751,-1.113742,-0.127644,-0.254066,-0.252326,-0.414249,-0.408842,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,normal
2,-0.043918,-0.012743,-0.037344,-0.018876,-0.005238,-0.047032,-0.047272,-0.005741,-0.01412,-0.003784,-0.003861,-0.004961,-0.006465,-0.026199,0.0,-1.220225,-0.980245,-0.253194,-0.252082,-0.410243,-0.408485,0.496083,-0.228343,-0.202418,0.323695,0.546387,0.533484,-0.155751,-1.113742,-0.127644,-0.254066,-0.252326,-0.414249,-0.408842,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,normal
3,-0.043918,-0.012743,-0.037344,-0.018876,-0.005238,-0.047032,-0.047272,-0.005741,-0.01412,-0.003784,-0.003861,-0.004961,-0.006465,-0.026199,0.0,-1.215676,-0.976066,-0.253194,-0.252082,-0.410243,-0.408485,0.496083,-0.228343,-0.202418,0.323695,0.546387,0.533484,-0.155751,-1.113742,-0.127644,-0.254066,-0.252326,-0.414249,-0.408842,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,attack
4,-0.043918,-0.012743,-0.037344,-0.018876,-0.005238,-0.047032,-0.047272,-0.005741,-0.01412,-0.003784,-0.003861,-0.004961,-0.006465,-0.026199,0.0,-1.215676,-0.976066,-0.253194,-0.252082,-0.410243,-0.408485,0.496083,-0.228343,-0.202418,0.323695,0.546387,0.533484,-0.155751,-1.093415,-0.127644,-0.254066,-0.252326,-0.414249,-0.408842,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,attack


In [36]:
# Encode the string labels as integers. LabelEncoder numbers the classes in sorted
# order, so with the labels "attack" / "normal" this gives attack -> 0, normal -> 1
# (the mapping is kept in le.classes_).
# The encoded values go straight into y instead of overwriting df_data["label"]:
# the cell can then be re-run on its own and still fits on the original strings,
# so le.classes_ keeps the class names instead of degrading to [0, 1].
le = LabelEncoder()
y = pd.Series(le.fit_transform(df_data["label"]), index=df_data.index, name="label")

# Feature matrix: every column except the label
X = df_data.drop("label", axis=1)

# 80/20 stratified split: training partition vs. a hold-out partition
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# split the 20 % hold-out in half (10 % / 10 %) to obtain test and validation sets
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SEED, stratify=y_holdout
)

# the three partitions must cover the data exactly once
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# imbalance ratio of each partition, computed by the project helper imb_ratio
# from the per-class counts
IR_train = imb_ratio(y_train.value_counts())
IR_val = imb_ratio(y_val.value_counts())
IR_test = imb_ratio(y_test.value_counts())

# print imbalance ratios. Because the splits are stratified they should be
# (nearly) the same.
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

Imbalance ratio in training data: 4.13
Imbalance ratio in validation data: 4.13
Imbalance ratio in test data: 4.13


In [37]:
# Collect the full data, the three partitions, the fitted preprocessing objects
# and the column metadata in one dict and serialize it with pickle.
kdd_preprocessed = {
    "X": X,
    "y": y,
    "X_train": X_train,
    "y_train": y_train,

    "X_val": X_val,
    "y_val": y_val,

    "X_test": X_test,
    "y_test": y_test,

    "label_enc": le,
    "oh_enc": oh_enc,
    # fitted column transformer wrapping the StandardScaler; stored next to the
    # two encoders so the numeric scaling can be re-applied to new data as well
    "scaler": ct,

    "col_names": col_names,
    "categorical_cols": categorical_cols,
}

# one file per fold
out_path = f'kdd_preprocessed_k{current_k_fold}.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(kdd_preprocessed, f)

# report success only after the with-block has closed (and flushed) the file
# NOTE: pickle files must only be loaded from trusted sources
print(f'Data serialized to {out_path}')

Data serialized to kdd_preprocessed_k1.pkl
