In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Disable truncation when frames are displayed: no cap on rows or columns.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [None]:
# Column names, in file order, applied to the header-less CSV when it is loaded.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    # error / service rates
    'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # dst_host_* columns
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    # target and trailing column
    'Label',
    'level',
]

In [None]:
# Shell command: decompress the gzipped data file in place.
# NOTE(review): the load cell below reads
# '/kaggle/input/kdd-cup-1999-data/kddcup.data/kddcup.data', which is not the
# path this command would produce -- confirm this step is actually required.
!gzip -d /kaggle/input/kdd-cup-1999-data/kddcup.data.gz

In [None]:
# Load the raw data; the file has no header row, so names come from `columns`.
df = pd.read_csv(
    '/kaggle/input/kdd-cup-1999-data/kddcup.data/kddcup.data',
    names=columns,
    header=None,
)

Drop 'level' column

In [None]:
# Remove the 'level' column. errors='ignore' keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns='level', errors='ignore')

In [None]:
# Class distribution: number of rows for each distinct value of 'Label'.
df.value_counts('Label')

In [None]:
# Total number of missing (NaN) cells across the whole frame.
df.isna().sum().sum()

# Preprocessing

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) before any filtering.
df.describe()

Drop column 'num_outbound_cmds' for being null

In [None]:
# Remove the 'num_outbound_cmds' column. errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns='num_outbound_cmds', errors='ignore')

## 1. Outlier Filtering

In [None]:
from scipy import stats


# Define which columns will be screened for outliers. A column qualifies when
# it is numeric, has more than MIN_DISTINCT_VALUES distinct values (i.e. it is
# not a flag or small categorical code) and its maximum exceeds
# MAX_TO_MEAN_RATIO times its mean (a heavy right tail).
MIN_DISTINCT_VALUES = 100
MAX_TO_MEAN_RATIO = 10

columns_to_filter = [
    col
    for col in df.columns
    # Explicit numeric check instead of `dtype != 'object'`, so string columns
    # stored with a non-object dtype are skipped as well.
    if pd.api.types.is_numeric_dtype(df[col])
    and df[col].nunique() > MIN_DISTINCT_VALUES
    # Plain Series.mean(): the axis/numeric_only arguments add nothing for a
    # single numeric column.
    and df[col].max() > MAX_TO_MEAN_RATIO * df[col].mean()
]


# Define a function to filter outliers using Z-score
def filter_outliers_zscore(data, threshold):
    """Split the rows of ``data`` into inliers and outliers by absolute z-score.

    Z-scores are computed per column. A row is an outlier when the absolute
    z-score of at least one of its columns is strictly greater than
    ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Two-dimensional numeric data (rows are observations).
    threshold : float
        Absolute z-score above which a value is treated as an outlier.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(inliers, outliers)``; both keep the original index labels.
    """
    # nan_policy='omit' leaves NaN cells out of the mean/std. With the default
    # ('propagate') a single NaN turns every z-score of its column into NaN,
    # which would silently disable outlier detection for that column.
    z_scores = np.abs(stats.zscore(data, nan_policy='omit'))
    # NaN z-scores (missing cells, constant columns) compare False here, so
    # such cells are never flagged.
    outlier_mask = (z_scores > threshold).any(axis=1)
    return data[~outlier_mask], data[outlier_mask]

# Absolute z-score above which a value counts as an outlier
threshold = 7

# Filter every selected column on its own; each call yields the pair
# (rows kept, rows flagged) as single-column frames.
split_results = [
    filter_outliers_zscore(df[[name]], threshold) for name in columns_to_filter
]
filtered_cols = [kept for kept, _ in split_results]
removed_outliers = [flagged for _, flagged in split_results]

# Side-by-side join of the kept values for every filtered column
df_filtered = pd.concat(filtered_cols, axis=1)

# Side-by-side join of the flagged values for every filtered column
df_outliers = pd.concat(removed_outliers, axis=1)

# Dataframe filtering comparison

print(f'Shape: {df.shape}')

# Print the number of rows that hold at least one outlier value
# (df_outliers has one row per such row of df).
print('Outlier removal summary:')
n_outliers = df_outliers.shape[0]
print(f'{n_outliers} outliers rows to be removed')

# Show the frame before the outlier values are blanked out
print('\nOriginal dataframe:')
display(df.head())

# Write the filtered columns back into the original frame. Cells flagged as
# outliers are absent from df_filtered and therefore become NaN after
# re-aligning on df's index.
# - `columns_to_filter` is used directly: rebinding the global `columns` here
#   would clobber the schema list that the load cell passes to read_csv.
# - Whole-column replacement (df[cols] = ...) is used instead of
#   df.loc[:, cols] = ..., so integer columns can become float to hold NaN
#   without an in-place upcast.
df[columns_to_filter] = df_filtered[columns_to_filter].reindex(df.index)


print('\nFiltered dataframe:')
display(df.head())

# Print the removed outliers dataframe
print('\nRemoved outliers:')
display(df_outliers.head())

# Count, per label, the rows of the original dataframe that hold an outlier
values_orig = df.loc[df.index.isin(df_outliers.index), 'Label']
print(f'\n{values_orig.value_counts()}')

In [None]:

# One box plot per numeric column, laid out on a 15 x 3 grid of subplots.
df.plot.box(subplots=True, layout=(15, 3), figsize=(20, 40))
plt.show()

In [None]:
# Summary statistics again, now that outlier values have been replaced by NaN.
df.describe()

In [None]:
# Concise overview of the frame: index range, column dtypes and memory usage.
df.info()

Count how many NaN values there are in the dataset overall (all of them were introduced by the outlier-removal step):

In [None]:
# Total number of NaN cells in the frame after the outlier step.
df.isna().sum().sum()

Count how many rows have at least one NaN value:

In [None]:
# A row is counted once, no matter how many of its cells are missing.
na_row_count = df.isna().any(axis=1).sum()
print(na_row_count)

In [None]:
# Discard every row that still contains at least one NaN cell.
df = df.dropna(axis=0, how='any')

## 2. Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every column except the categorical features and the target.
non_scaled = {'protocol_type', 'service', 'flag', 'Label'}
columns = [name for name in df.columns if name not in non_scaled]

# Fit the scaler and rescale the same columns in one step.
min_max_scaler = MinMaxScaler()
df[columns] = min_max_scaler.fit_transform(df[columns])
display(df.head())

## 3. Feature Correlation Filtering

In [None]:
# Only these columns take part in the correlation analysis.
excluded = ('protocol_type', 'service', 'flag', 'Label')
columns = [name for name in df.columns if name not in excluded]

# Absolute pairwise correlation between the selected columns
corr_matrix = df[columns].corr().abs()

threshold = 0.99

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal is ignored; every other cell becomes NaN.
strict_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper)

# A feature is dropped when it correlates above the threshold with any
# feature that precedes it in column order.
exceeds = (upper > threshold).any(axis=0)
to_drop = exceeds[exceeds].index.tolist()

# Print features to drop
print(f"The following {len(to_drop)} features will be dropped due to high correlation: {to_drop}")

df = df.drop(columns=to_drop)

In [None]:
# Show the absolute correlation matrix. It was computed before the highly
# correlated features were dropped, so those features still appear here.
display(corr_matrix)

Replace every attack label with "abnormal"

In [None]:
# Collapse the target to two classes: "normal" vs. "abnormal" (every attack).
# Both the raw spelling 'normal.' and the already-mapped 'normal' are accepted,
# so re-running this cell does not relabel normal traffic as "abnormal".
is_normal = df["Label"].isin(["normal.", "normal"])
df["Label"] = is_normal.map({True: "normal", False: "abnormal"})

Numerically encode the categorical columns in the dataset

In [None]:
# Integer-encode the categorical feature columns. One encoder is fitted and
# kept per column (in `label_encoders`) so each mapping stays available for
# inverse_transform; refitting a single shared encoder would keep only the
# mapping of the last column.
# Note: re-running this cell after encoding refits the encoders on the integer
# codes, which replaces the original string mappings.
clm = ['protocol_type', 'service', 'flag']
label_encoders = {}
for x in clm:
    le = LabelEncoder()
    df[x] = le.fit_transform(df[x])
    label_encoders[x] = le

## 4. Equalization

In [None]:
# Shuffle the rows. A fixed seed makes the shuffle, and therefore the balanced
# subset built below, reproducible across runs.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

df2 = df[df["Label"] == "normal"]
df1 = df[df["Label"] == "abnormal"]

# Truncate BOTH classes to the size of the smaller one, so the result is
# balanced whichever class is the minority.
n_per_class = min(len(df1), len(df2))
df1 = df1[:n_per_class]
df2 = df2[:n_per_class]

# Balanced frame: abnormal rows first, then normal rows.
df_equal = pd.concat([df1, df2], axis=0)

# Save dataset

In [None]:
output_dir = '/kaggle/output/'

# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists() check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this writes `df` (the full, unbalanced frame), not the
# balanced `df_equal` built in the Equalization step -- confirm this is intended.
filename = os.path.join(output_dir, 'KDD_all_attacks.csv')
df.to_csv(filename, index=False)

In [50]:
# Number of rows per class in the final frame.
df['Label'].value_counts()

abnormal    3903730
normal       934908
Name: Label, dtype: int64

In [49]:
# Shell command: list the contents of the output directory.
!ls -Flash /kaggle/output/

total 12K
4.0K drwxr-xr-x 3 root root 4.0K Jul 29 23:56 ./
4.0K drwxr-xr-x 6 root root 4.0K Jul 29 23:56 ../
4.0K drwxr-xr-x 2 root root 4.0K Jul 29 23:56 kddcup.data_preprocessed.csv/
