In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
files = [
    Path('..\\..\\IDS datasets\\NF-UQ-NIDS\\NF-UNSW-NB15-v3\\data\\NetFlow_v3_Features.csv'),
    Path('..\\..\\IDS datasets\\NF-UQ-NIDS\\NF-UNSW-NB15-v3\\data\\NF-UNSW-NB15-v3.csv')
]

In [3]:
features = pd.read_csv(files[0])
features

Unnamed: 0,Feature,Description
0,IPV4_SRC_ADDR,IPv4 source address
1,IPV4_DST_ADDR,IPv4 destination address
2,L4_SRC_PORT,IPv4 source port number
3,L4_DST_PORT,IPv4 destination port number
4,PROTOCOL,IP protocol identifier byte
5,L7_PROTO,Layer 7 protocol (numeric)
6,IN_BYTES,Incoming number of bytes
7,OUT_BYTES,Outgoing number of bytes
8,IN_PKTS,Incoming number of packets
9,OUT_PKTS,Outgoing number of packets


In [4]:
df = pd.read_csv(files[1])
df.head(3)

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1424242193040,1424242193043,59.166.0.2,4894,149.171.126.3,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
1,1424242192744,1424242193079,59.166.0.4,52671,149.171.126.6,31992,6,11.0,4704,28,...,0,91,12,19,0,90,12,19,0,Benign
2,1424242190649,1424242193109,59.166.0.0,47290,149.171.126.9,6881,6,37.0,13662,238,...,0,1843,10,119,0,1843,5,88,0,Benign


In [5]:
len(df['FTP_COMMAND_RET_CODE'].unique())

16

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2365424 entries, 0 to 2365423
Data columns (total 55 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   FLOW_START_MILLISECONDS      int64  
 1   FLOW_END_MILLISECONDS        int64  
 2   IPV4_SRC_ADDR                object 
 3   L4_SRC_PORT                  int64  
 4   IPV4_DST_ADDR                object 
 5   L4_DST_PORT                  int64  
 6   PROTOCOL                     int64  
 7   L7_PROTO                     float64
 8   IN_BYTES                     int64  
 9   IN_PKTS                      int64  
 10  OUT_BYTES                    int64  
 11  OUT_PKTS                     int64  
 12  TCP_FLAGS                    int64  
 13  CLIENT_TCP_FLAGS             int64  
 14  SERVER_TCP_FLAGS             int64  
 15  FLOW_DURATION_MILLISECONDS   int64  
 16  DURATION_IN                  int64  
 17  DURATION_OUT                 int64  
 18  MIN_TTL                      int64  
 19  

In [6]:
NF_V3_CATEGORICAL = ['TCP_FLAGS','L7_PROTO','PROTOCOL', 
                    'ICMP_IPV4_TYPE', 'FTP_COMMAND_RET_CODE']


data = df
pk_cols = (
    'IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'
)
for k in pk_cols:
    assert k in data.columns, f'{k} not in columns {data.columns}'
    data[k] = data[k].apply(str)
    
data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']
data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'],inplace=True)

In [10]:
data = data.drop("Label") # binary 

KeyError: "['Label'] not found in axis"

In [11]:
df = data 
categorical = ['TCP_FLAGS','L7_PROTO','PROTOCOL', 
                    'ICMP_IPV4_TYPE', 'FTP_COMMAND_RET_CODE', 'Attack']
pk = ['IPV4_SRC_ADDR','IPV4_DST_ADDR']
numerical = [c for c in df.columns if (c not in categorical) and (c not in pk)]

for c in numerical:
    n = (~np.isfinite(df[c])).sum()
    if n > 0:
        print(n, c)
    
df[numerical] = df[numerical].replace([np.inf, -np.inf], np.nan)
df[numerical] = df[numerical].fillna(df[numerical].mean())

for c in numerical:
    n = (~np.isfinite(df[c])).sum()
    if n > 0:
        print(n, c)

122493 SRC_TO_DST_SECOND_BYTES
122493 DST_TO_SRC_SECOND_BYTES


In [12]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# paper used labelencoding for all categories
le = LabelEncoder()
for c in categorical:
    df[c] = le.fit_transform(df[c])

# and standradization for the rest
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

In [24]:
df

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,0.961464,0.961463,59.166.0.2:4894,149.171.126.3:53,17,5,-0.063037,-0.424547,-0.220167,-0.367112,...,-0.02266,-0.254760,-0.091755,-0.225001,-0.003418,-0.282617,-0.265662,-0.321781,-0.23888,2
1,0.961463,0.961463,59.166.0.4:52671,149.171.126.6:31992,6,20,0.004290,-0.092886,-0.202124,-0.143637,...,-0.02266,-0.149352,-0.046893,-0.130660,-0.003418,-0.163214,-0.075074,-0.172734,-0.23888,2
2,0.961461,0.961463,59.166.0.0:47290,149.171.126.9:6881,6,32,0.136610,2.585908,3.313893,3.380384,...,-0.02266,1.880034,-0.054370,0.365874,-0.003418,2.162490,-0.186250,0.368541,-0.23888,2
3,0.961464,0.961463,59.166.0.8:43310,149.171.126.7:53,17,5,-0.063037,-0.424547,-0.220167,-0.367112,...,-0.02266,-0.254760,-0.091755,-0.225001,-0.003418,-0.282617,-0.265662,-0.321781,-0.23888,2
4,0.961464,0.961463,59.166.0.1:45870,149.171.126.1:53,17,5,-0.063274,-0.424547,-0.220270,-0.367112,...,-0.02266,-0.254760,-0.091755,-0.225001,-0.003418,-0.282617,-0.265662,-0.321781,-0.23888,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2365419,-1.020862,-1.020862,59.166.0.6:8647,149.171.126.8:25,6,32,0.398119,0.085700,-0.204768,-0.126447,...,-0.02266,-0.170202,-0.065586,-0.120729,-0.003418,-0.184441,-0.106839,-0.125667,-0.23888,2
2365420,-1.020862,-1.020862,59.166.0.2:4681,149.171.126.1:5190,6,32,-0.034854,-0.169423,-0.205335,-0.178018,...,-0.02266,-0.253602,-0.091755,-0.225001,-0.003418,-0.281290,-0.265662,-0.321781,-0.23888,2
2365421,-1.020862,-1.020862,59.166.0.2:47560,149.171.126.2:6125,6,32,0.066683,-0.271473,-0.219251,-0.332731,...,-0.02266,-0.054370,-0.031939,0.003405,-0.003418,-0.051771,0.433159,0.203805,-0.23888,2
2365422,-1.020861,-1.020862,59.166.0.3:44991,149.171.126.5:53,17,5,-0.063037,-0.424547,-0.220167,-0.367112,...,-0.02266,-0.254760,-0.091755,-0.225001,-0.003418,-0.282617,-0.265662,-0.321781,-0.23888,2


In [13]:
import networkx as nx

attrs = [c for c in data.columns if c not in ("IPV4_SRC_ADDR", "IPV4_DST_ADDR")]
data['h'] = data[attrs].values.tolist()

G = nx.from_pandas_edgelist(
        data, 
        "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','Attack'],
        create_using=nx.MultiGraph())


In [17]:
import pickle

with open("../../interm/NF_unsw_nb15_flowgraph.pkl", "wb") as f:
    pickle.dump(G, f)

In [18]:
for i, (u, v, k, d) in enumerate(G.edges(keys=True, data=True)):
    print(f"Edge {i}: {u} -> {v} (key={k}) | data: {d}")
    if i >= 9:
        break

for i, (n, d) in enumerate(G.nodes(data=True)):
    print(f"Node {i}: {n} | data: {d}")
    if i >= 9:
        break

Edge 0: 59.166.0.2:4894 -> 149.171.126.3:53 (key=0) | data: {'h': [0.9614635338821296, 0.9614630106534172, 17.0, 5.0, -0.06303732490899411, -0.4245466900961518, -0.2201671595362142, -0.36711179954540774, 0.0, -1.7597434162149328, -1.742715276485113, -0.16495739713973925, -0.16346637187481758, -0.260976133076301, -0.20552463882037067, -0.2244097040759245, -1.009470492915238, 0.3954197191765958, 1.5457415443662677, -1.009470492915238, -0.5146510249131897, -0.24978834267197667, -0.04671562566623494, -0.2203523083627334, -0.21705162393783953, -0.29102199797577677, -0.28556811049450975, -0.4976270855890753, -0.6479660428904961, -0.31226193864535945, -0.19973751500682901, -0.2131000207285106, -0.21011947673462403, -1.0240879933410827, -1.1200370342779653, -1.1653075172435845, 0.0, 3.0097431946953113, -0.027115790283339165, -0.0029436333405469684, 0.0, -0.022660331711157052, -0.25475986142230894, -0.09175522235181705, -0.22500094210731522, -0.0034179529678438235, -0.2826166038830273, -0.26566

In [19]:
len(G.edges)

2365424

In [21]:
len(G.nodes)

1092975