# UNSW-NB15 Cleaning

## Setup

In [1]:
import numpy as np
import pandas as pd
import config
from pathlib import Path

In [15]:
def load_data(root=config.UNSWNB15_ROOT, shuffle_seed=0):
    file_paths = {
        'testing': 'Training and Testing Sets/UNSW_NB15_testing-set.csv',
        'training': 'Training and Testing Sets/UNSW_NB15_training-set.csv',
        'events': 'UNSW-NB15_LIST_EVENTS.csv',
    }
    data = {name: pd.read_csv(root / path) for name, path in file_paths.items()}
    data['features'] = pd.read_csv(root / 'NUSW-NB15_features.csv', encoding='cp1252')
    NB15_parts = [pd.read_csv(root / f'UNSW-NB15_{i}.csv') for i in range(1, 5)]
    for df in NB15_parts:
        df.columns = data['features']['Name']
    
    train_df = pd.concat(NB15_parts, ignore_index=True)
    if shuffle_seed:
        train_df = train_df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    
    return train_df

train_df = load_data().drop_duplicates()

  pd.read_csv(root / f'UNSW-NB15_{i}.csv') for i in range(1, 5)
  pd.read_csv(root / f'UNSW-NB15_{i}.csv') for i in range(1, 5)


In [16]:
train_df.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
1,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
2,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
3,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
4,59.166.0.0,32119,149.171.126.9,111,udp,CON,0.078339,568,312,31,...,0,2,4,2,3,1,1,2,,0


In [17]:
data['features']

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


## Impute NaN

In [18]:
print('NaN entries:')
for c in train_df.columns:
    n_nan = pd.isna(train_df[c]).sum()
    frac = n_nan / len(train_df)
    if n_nan > 0: print(f'{c}: {n_nan} ({frac:.2f})')

NaN entries:
ct_flw_http_mthd: 933603 (0.45)
is_ftp_login: 1014165 (0.49)
attack_cat: 1959771 (0.95)


In [19]:
train_df['attack_cat'].fillna('normal', inplace = True)
train_df['attack_cat'] = train_df['attack_cat'].apply(lambda x: x.strip().lower())
train_df['ct_flw_http_mthd'].fillna(0, inplace=True)
train_df['is_ftp_login'].fillna(0, inplace=True)

pd.isna(train_df).any(axis=1).sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['attack_cat'].fillna('normal', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['ct_flw_http_mthd'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are

np.int64(0)

## Unique Values

In [20]:
train_df.describe(include="O")

Name,srcip,sport,dstip,dsport,proto,state,service,ct_ftp_cmd,attack_cat
count,2059417,2059417,2059417,2059417,2059417,2059417,2059417,2059417.0,2059417
unique,43,100343,47,128297,135,16,13,13.0,11
top,59.166.0.4,0,149.171.126.3,53,tcp,FIN,-,,normal
freq,192686,20653,192515,233820,1448858,1433507,1169180,1014165.0,1959771


In [21]:
categorical = ('srcip',	'sport',	'dstip',	'dsport',	'proto',	'state',	'service',	'ct_ftp_cmd',	'attack_cat')
for c in categorical:
    print('\n' + c)
    u = train_df[c].unique()
    print(len(u))
    print(u)


srcip
43
['59.166.0.0' '59.166.0.6' '59.166.0.5' '59.166.0.3' '10.40.182.3'
 '59.166.0.7' '10.40.170.2' '59.166.0.1' '59.166.0.2' '59.166.0.4'
 '175.45.176.3' '175.45.176.2' '175.45.176.0' '59.166.0.8' '59.166.0.9'
 '175.45.176.1' '10.40.182.1' '10.40.85.1' '192.168.241.243' '10.40.85.30'
 '149.171.126.16' '149.171.126.2' '149.171.126.11' '149.171.126.4'
 '149.171.126.5' '149.171.126.17' '149.171.126.19' '149.171.126.9'
 '149.171.126.8' '149.171.126.7' '149.171.126.15' '149.171.126.6'
 '149.171.126.0' '149.171.126.1' '149.171.126.3' '149.171.126.13'
 '149.171.126.12' '149.171.126.10' '149.171.126.18' '127.0.0.1'
 '149.171.126.14' '10.40.85.10' '10.40.182.6']

sport
100343
[33661 1464 3593 ... 533 933 706]

dstip
47
['149.171.126.9' '149.171.126.7' '149.171.126.5' '149.171.126.0'
 '149.171.126.4' '10.40.182.3' '149.171.126.6' '10.40.170.2'
 '149.171.126.18' '149.171.126.16' '149.171.126.8' '149.171.126.2'
 '149.171.126.1' '149.171.126.10' '149.171.126.3' '149.171.126.15'
 '149.171.126.

In [22]:
train_df['ct_ftp_cmd'] = train_df['ct_ftp_cmd'].astype('str')
train_df['ct_ftp_cmd'] = train_df['ct_ftp_cmd'].replace(' ', '0')
train_df['ct_ftp_cmd'] = train_df['ct_ftp_cmd'].astype('int')
train_df['ct_ftp_cmd'].unique()

array([0, 1, 6, 2, 4, 8, 5, 3])

In [23]:
train_df['is_ftp_login'] = (train_df['is_ftp_login'] > 0).astype(int)
train_df.is_ftp_login.unique()

array([0, 1])

In [24]:
train_df['sport'] = train_df['sport'].astype('str')
train_df['sport'] = pd.to_numeric(train_df['sport'], errors='coerce').fillna(0).astype(int)

train_df['dsport'] = train_df['dsport'].astype('str')
train_df['dsport'] = pd.to_numeric(train_df['dsport'], errors='coerce').fillna(0).astype(int)

train_df.sport.unique(), train_df.dsport.unique()

(array([33661,  1464,  3593, ...,   533,   933,   706]),
 array([1024,   53,  111, ...,  632,  186,  518]))

## Outliers

In [28]:
# TODO
pass

In [30]:
train_df.to_csv(config.INTERM_DIR / 'full_unsw_cleaned.csv', index=False)