In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import dgl
from sklearn.preprocessing import LabelEncoder, StandardScaler
import h5py
import networkx as nx 
from torch_geometric.utils.convert import from_dgl
import dask.dataframe as dd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
features = pd.read_csv('raw/NetFlow_v3_Features.csv')

category = ['TCP_FLAGS','L7_PROTO','PROTOCOL', 
     'ICMP_IPV4_TYPE', 'FTP_COMMAND_RET_CODE', 'Attack']

pk = ['IPV4_SRC_ADDR','IPV4_DST_ADDR']
numerical = [c for c in features.columns if (c not in category) and (c not in pk)]

In [5]:
features

Unnamed: 0,Feature,Description
0,IPV4_SRC_ADDR,IPv4 source address
1,IPV4_DST_ADDR,IPv4 destination address
2,L4_SRC_PORT,IPv4 source port number
3,L4_DST_PORT,IPv4 destination port number
4,PROTOCOL,IP protocol identifier byte
5,L7_PROTO,Layer 7 protocol (numeric)
6,IN_BYTES,Incoming number of bytes
7,OUT_BYTES,Outgoing number of bytes
8,IN_PKTS,Incoming number of packets
9,OUT_PKTS,Outgoing number of packets


In [6]:
dtypes = {
    'IPV4_SRC_ADDR': 'category',	                # IPv4 source address
    'IPV4_DST_ADDR': 'category',	                # IPv4 destination address
    'L4_SRC_PORT': 'category',	                # IPv4 source port number
    'L4_DST_PORT': 'category',	                # IPv4 destination port number

    'PROTOCOL': 'int32',	                # IP protocol identifier byte
    'L7_PROTO': 'category',	                # Layer 7 protocol (numeric)
    'IN_BYTES': 'int32',	                # Incoming number of bytes
    'OUT_BYTES': 'int32',	                # Outgoing number of bytes
    'IN_PKTS': 'int32',	                # Incoming number of packets
    'OUT_PKTS': 'int32',	                # Outgoing number of packets
    'FLOW_DURATION_MILLISECONDS': 'float32',	                # Flow duration in milliseconds
    'TCP_FLAGS': 'int32',	                # Cumulative of all TCP flags
    'CLIENT_TCP_FLAGS': 'int32',	                # Cumulative of all client TCP flags
    'SERVER_TCP_FLAGS': 'int32',	                # Cumulative of all server TCP flags
    'DURATION_IN': 'float32',	                # Client to Server stream duration (msec)
    'DURATION_OUT': 'float32',	                # Client to Server stream duration (msec)
    'MIN_TTL': 'float32',	                # Min flow TTL
    'MAX_TTL': 'float32',	                # Max flow TTL
    'LONGEST_FLOW_PKT': 'int32',	            # Longest packet (bytes) of the flow
    'SHORTEST_FLOW_PKT': 'int32',	            # Shortest packet (bytes) of the flow
    'MIN_IP_PKT_LEN': 'int32',	            # Len of the smallest flow IP packet observed
    'MAX_IP_PKT_LEN': 'int32',	            # Len of the largest flow IP packet observed
    'SRC_TO_DST_SECOND_BYTES': 'float32',	            # Src to dst Bytes/sec
    'DST_TO_SRC_SECOND_BYTES': 'float32',	            # Dst to src Bytes/sec
    'RETRANSMITTED_IN_BYTES': 'int32',	            # Number of retransmitted TCP flow bytes (src->dst)
    'RETRANSMITTED_IN_PKTS': 'int32',	            # Number of retransmitted TCP flow packets (src-...
    'RETRANSMITTED_OUT_BYTES': 'int32',         # 	Number of retransmitted TCP flow bytes (dst->src)
    'RETRANSMITTED_OUT_PKTS': 'int32',	            # Number of retransmitted TCP flow packets (dst-...
    'SRC_TO_DST_AVG_THROUGHPUT': 'float32',	            # Src to dst average thpt (bps)
    'DST_TO_SRC_AVG_THROUGHPUT': 'float32',         # 	Dst to src average thpt (bps)
    'NUM_PKTS_UP_TO_128_BYTES': 'int32',	            # Packets whose IP size <= 128
    'NUM_PKTS_128_TO_256_BYTES': 'int32',	            # Packets whose IP size > 128 and <= 256
    'NUM_PKTS_256_TO_512_BYTES': 'int32',	            # Packets whose IP size > 256 and <= 512
    'NUM_PKTS_512_TO_1024_BYTES': 'int32',	            # Packets whose IP size > 512 and <= 1024
    'NUM_PKTS_1024_TO_1514_BYTES': 'int32',	            # Packets whose IP size >��1024 and <= 1514
    'TCP_WIN_MAX_IN': 'int32',	            # Max TCP Window (src->dst)
    'TCP_WIN_MAX_OUT': 'int32',         # 	Max TCP Window (dst->src)

    # ?
    'ICMP_TYPE': 'category',	            # ICMP Type * 256 + ICMP code
    'ICMP_IPV4_TYPE': 'category',	            # ICMP Type

    'DNS_QUERY_ID': 'float32',	            #    DNS query transaction Id
    'DNS_QUERY_TYPE': 'category',	        #            DNS query type (e.g. 1=A, 2=NS..)
    'DNS_TTL_ANSWER': 'float32',	        #    TTL of the first A record (if any)
    'FTP_COMMAND_RET_CODE': 'category',	    #    FTP client command return code
    'FLOW_START_MILLISECONDS': 'float32',	#        Flow start timestamp in milliseconds
    'FLOW_END_MILLISECONDS': 'float32',	    #        Flow end timestamp in milliseconds
    'SRC_TO_DST_IAT_MIN': 'float32',	    #    Minimum Inter-Packet Arrval Time (src->dst)
    'SRC_TO_DST_IAT_MAX': 'float32',	    #        Maximum Inter-Packet Arrval Time (src->dst)
    'SRC_TO_DST_IAT_AVG': 'float32',	    #        Average Inter-Packet Arrval Time (src->dst)
    'SRC_TO_DST_IAT_STDDEV': 'float32',	    #        Sandard Deviaion Inter-Packet Arrval Time (src...
    'DST_TO_SRC_IAT_MIN': 'float32',	    #        Minimum Inter-Packet Arrval Time (dst > src)
    'DST_TO_SRC_IAT_MAX': 'float32',	    #        Minimum Inter-Packet Arrval Time (dst > src)
    'DST_TO_SRC_IAT_AVG': 'float32',	    #        Minimum Inter-Packet Arrval Time (dst > src)
    'DST_TO_SRC_IAT_STDDEV': 'float32',	    #        Minimum Inter-Packet Arrval Time (dst > src)
}

In [None]:
df = pd.read_csv('raw/NF-BoT-IoT-v3.csv', dtype=dtypes)
df[:2]

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1528104000000.0,1528104000000.0,192.168.100.7,365,192.168.100.3,565,17,0.0,238142,5177,...,1.0,30.0,22.0,4.0,0.0,0.0,0.0,0.0,0,Benign
1,1528104000000.0,1528104000000.0,192.168.100.46,3456,192.168.100.5,80,17,0.0,9015892,9630,...,1.0,28.0,12.0,1.0,1.0,30.0,12.0,2.0,0,Benign


In [17]:
from GraphSAGE_utils import prepare_flows

df = prepare_flows(df)

: 