In [None]:
import pandas as pd
import numpy as np
import dgl
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path
import sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def _prepare_flows(df):
    data = df
    pk_cols = (
        'IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'
    )
    for k in pk_cols:
        assert k in data.columns, f'{k} not in columns {data.columns}'
        data[k] = data[k].apply(str)
        
    data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
    data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']
    data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'],inplace=True)

    data = data.drop("Label") # binary 

    df = data 
    categorical = ['TCP_FLAGS','L7_PROTO','PROTOCOL', 
                        'ICMP_IPV4_TYPE', 'FTP_COMMAND_RET_CODE', 'Attack']
    pk = ['IPV4_SRC_ADDR','IPV4_DST_ADDR']
    numerical = [c for c in df.columns if (c not in categorical) and (c not in pk)]

    # mean impute infinite/nan
    def _check(df):
        for c in numerical:
            n = (~np.isfinite(df[c])).sum()
            if n > 0:
                print(n, c)
    _check(df)
    df[numerical] = df[numerical].replace([np.inf, -np.inf], np.nan)
    df[numerical] = df[numerical].fillna(df[numerical].mean())
    _check(df)

    # paper used labelencoding for all categories
    le = LabelEncoder()
    for c in categorical:
        df[c] = le.fit_transform(df[c])
    
    # and standradization for the rest
    scaler = StandardScaler()
    df[numerical] = scaler.fit_transform(df[numerical])

    attrs = [c for c in data.columns if c not in ("IPV4_SRC_ADDR", "IPV4_DST_ADDR")]
    data['h'] = data[attrs].values.tolist()
    return df

def flowgraph_encode(df):
    df = _prepare_flows(df)
    G = nx.from_pandas_edgelist(
            df, 
            "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','Attack'],
            create_using=nx.MultiGraph())
    return G


In [None]:
import os

d = Path('../raw/') / 'NetFlow_v3_Features.csv'
features_df = pd.read_csv(d)
flow_features = list(features_df['Feature'])

In [None]:
os.listdir(d)

In [None]:
# only unsw nb15 for now
df = pd.read_csv(d / 'NF-UNSW-NB15-v3.csv')
G = flowgraph_encode(df)
assert G    
with open(f"../../interm/flowgraph_NF-UNSW-NB15-v3.pkl", "wb") as g:
    pickle.dump(G, g)

In [None]:
del df

### Memory errors - consider cloud

In [None]:

d = Path('../raw/')
files = os.listdir(d)

# smallest first
files[1], temp = files[0], files[1]; files[0] = temp
files = files[:-1]# features file
print(files)


columns = {}
for f in files:
    print(f)
    df = pd.read_csv(d / f)
    print('loaded')
    columns[f] = sorted(list(df.columns))

columns

['NF-UNSW-NB15-v3.csv', 'NF-BoT-IoT-v3.csv', 'NF-ToN-IoT-v3.csv', 'NF-CICIDS2018-v3.csv']
NF-UNSW-NB15-v3.csv
loaded
NF-BoT-IoT-v3.csv


In [15]:
from tqdm import tqdm
import pickle 

for f in files:
    print(f)
    df = pd.read_csv(d / f)
    print('loaded')
    
    # NOTE
    # assert set(list(df.columns)) == set(flow_features), f"! {list(df.columns)} != {flow_features}"
    
    G = flowgraph_encode(df)
    assert G    
    with open(f"../../interm/flowgraph_{f}.pkl", "wb") as g:
        pickle.dump(G, g)

['NF-UNSW-NB15-v3.csv', 'NF-BoT-IoT-v3.csv', 'NF-ToN-IoT-v3.csv', 'NF-CICIDS2018-v3.csv', 'NetFlow_v3_Features.csv']
NF-UNSW-NB15-v3.csv
loaded


KeyError: "['Label'] not found in axis"