In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from datetime import datetime

from pfcm.PFCM import pfcm, pfcm_predict

from combo.models.cluster_eac import EAC
from combo.utils.example import visualize_clusters

window_size = 30
shift = 30
idx = pd.date_range(start=datetime(2021, 1, 3, 13, 0, 0), end=datetime(2021, 1, 3, 13, 14, 55), freq="{}s".format(shift))

### 資料時間序列特徵

- 讀取原始資料

In [None]:
# ts        Start time              2021-02-02 13:00:00
# td        Duration                0.347
# sa        Source Address          155.150.16.184
# da        Destination Address     163.34.157.27
# sp        Source Port             80
# dp        Destination Port        50952
# pr        Protocol                TCP
# stos      Src Tos                 0
# ipkt      Input Packets           1598
# ibyt      Input Bytes             2394418

dt = {
    "ts": np.dtype("datetime64"),
    "td": "float",
    "sa": "str",
    "da": "str",
    "sp": "int",
    "dp": "int",
    "pr": "str",
    "stos": "int",
    "ipkt": "float",
    "ibyt": "float",
}

df = pd.read_csv("20210103.csv")
df = df[:df[df["ts"] == "Summary"].index.item()]
df = df[dt.keys()]
df = df.astype(dt)

- 加上 IP pair 屬性

In [None]:
# nSrcPort      Number of unique source port
# nDstPort      Number of unique destination port
# rSrcDstPort   Ratio of nSrcPort to nDstPort
# nPkt          Number of packets
# nByte         Number of bytes sent
# avgPktRate    Average packet transmission rate
# avgByteRate   Average transmission rate

df["sa_da_pair"] = df["sa"] + "_" + df["da"]

In [None]:
df.groupby("sa").filter(lambda x: len(x) > 1)["sa"].value_counts()

- 資料暫存

In [None]:
df.to_csv("20210103_pair.csv", index=False)

- 讀取暫存資料

In [None]:
dt = {
    "ts": np.dtype("datetime64"),
    "td": "float",
    "sa": "str",
    "da": "str",
    "sp": "int",
    "dp": "int",
    "pr": "str",
    "stos": "int",
    "ipkt": "float",
    "ibyt": "float",
}
df = pd.read_csv("20210103_pair.csv")
df = df.astype(dt)

- 以 30 秒為單位形成個時間 time window 的資料
  - **未來需測試其他時間長度的 time window**

In [None]:
df_time_series = []
index = 0

for start_time in idx:
    end_time = start_time + pd.Timedelta(seconds=window_size)
    df_time = df[(df["ts"] >= start_time) & (df["ts"] <= end_time)]
    df_time = df_time.groupby("sa_da_pair").agg(
        {
            "ipkt": "sum",
            "ibyt": "sum",
            "sp": "nunique",
            "dp": "nunique",
            "pr": "nunique",
            "stos": "nunique",
        }
    )
    df_time["nPkt"] = df_time["ipkt"]
    df_time["nByte"] = df_time["ibyt"]
    df_time["nSrcPort"] = df_time["sp"]
    df_time["nDstPort"] = df_time["dp"]
    df_time["rSrcDstPort"] = df_time["nSrcPort"] / df_time["nDstPort"]
    df_time["avgPktRate"] = df_time["nPkt"] / (window_size + 1)
    df_time["avgByteRate"] = df_time["nByte"] / (window_size + 1)
    df_time["time_index"] = index
    df_time = df_time.reset_index()
    df_time_series.append(df_time)
    index += 1

- 資料暫存

In [None]:
pd.concat(df_time_series).to_csv("20210103_pair_time_series.csv", index=False)

- 讀取暫存資料

In [2]:
dt = {
    "sa_da_pair": "str",
    "ipkt": "float",
    "ibyt": "float",
    "sp": "int",
    "dp": "int",
    "pr": "int",
    "stos": "int",
    "nPkt": "float",
    "nByte": "float",
    "nSrcPort": "int",
    "nDstPort": "int",
    "rSrcDstPort": "float",
    "avgPktRate": "float",
    "avgByteRate": "float",
    "time_index": "int"
}
df_time_series = pd.read_csv("20210103_pair_time_series.csv")
df_time_series = df_time_series.astype(dt)

- 準備分類資料

In [3]:
feature = ["nPkt", "nByte", "nSrcPort", "nDstPort", "rSrcDstPort", "avgPktRate", "avgByteRate"]

- 重組資料
  - 暫時不用排除 flow 過少出現的資料

In [4]:
pair_time_series = dict()

for group_id, df_pair in df_time_series.groupby("sa_da_pair"):
    df_temp = pd.DataFrame({"time_index": range(15*60//shift)})
    merged = df_temp.merge(df_pair, on="time_index", how="left").fillna(0)
    pair_time_series[group_id] = merged[feature].values.tolist()

Exception ignored in: <generator object _combine_concat_plans at 0x000001F5699C3B10>
MemoryError: 


MemoryError: 

- 資料暫存

In [None]:
with open('20210103_pair_time_series.pickle', 'wb') as handle:
    pickle.dump(pair_time_series, handle, protocol=pickle.HIGHEST_PROTOCOL)

- 讀取暫存資料

In [None]:
with open('20210103_pair_time_series.pickle', 'rb') as handle:
    df_pair_time_series = pickle.load(handle)

### PFCM

### EAC

### WDTW

- 透過權重控制時間序列之間的比較更偏向 DTW 或是 Euclidean
- 目前先不考慮這個解法，因為權重是一項需要人為調整的參數