last modified: 2023.11.26 23:17

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Model 평가

사용할 feature
1. `count`: total number of packets
2. `incoming_count`: total number of incoming packets
3. `incoming_rate`: incoming_count / count
4. `outgoing_count`: total number of outgoing packets
5. `outgoing_rate`: outgoing_count / count

+ `initial_{n}_{feature}`: number of feature among first n packets

In [2]:
import pickle
import numpy as np
import pandas as pd
from itertools import chain

In [3]:
"""
파일의 폴더 위치 (절대경로 및 상대경로 무관)

ex)
/root
  ㄴ data
    ㄴ mon_standard.pkl
    ㄴ unmon_standard10.pkl

=> BASE_PATH = "/root/data/"
"""
# BASE_PATH = "./data/"
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/23-02 Machine Learning/project/"

In [4]:
# data 선택
FT_UNMON = True
FT_MON = True

# label type - False 인 경우 monitored(1), unmonitored(-1)
FT_MULTI_LABEL = True

In [5]:
PACKET_SIZE = 512

#categorical
FT_CATEGORICAL = True
FT_CATEGORICAL_SIZE = range(10, 60, 10) # 10부터 10씩 늘려감, 60보다 작을 때까지. -> (10, 20, 30, 40, 50)
FT_SEQ_SIZE = 50

# feature 선택
# continuous
FT_TIMESTAMPS = False

FT_DIRECTION = False # 방향만 표기 [1. -1, 1, 1, ...]
FT_PACKET_SIZE = False # packet의 size로 표기 [512, -512, 512, 512, ...]


# busrt와 cumulative의 경우, Packet size를 반영하여 계산할 수도 있지만 Tor에서는 모든 패킷이 512 단위이므로, 그냥 scale을 줄여서 사용할 수도 있을거 같습니다.
# (저희에게 주어진 데이터의 경우 모두 512이므로 그냥 방향만으로 계산한다고 생각해도 무방)
FT_BURST_DIR = False # [1, -1, 1, 2, ...]
FT_BURST_SIZE = False # burst_dir에 packet size를 반영한 것 (burst_dir * 512)
FT_CUMULATIVE_DIR = False # [1, 0, 1, 2, ...]
FT_CUMULATIVE_SIZE = False # cumulative dir에 packet size를 반영한 것 (cumulative_dir * 512)

In [6]:
seqeunce_features_size = {
    'timestamps' :FT_SEQ_SIZE,
    'direction' :FT_SEQ_SIZE,
    'packet_size' :FT_SEQ_SIZE,
    'burst_dir' :FT_SEQ_SIZE,
    'burst_size' :FT_SEQ_SIZE,
    'cumulative_dir' :FT_SEQ_SIZE,
    'cumulative_size' :FT_SEQ_SIZE
    }

In [7]:
def load_pickle_file(file_name):
    # Load the pickle file
    print("Loading datafile...")
    with open(BASE_PATH + file_name, 'rb') as fi: # Path to mon_standard.pkl in Colab
        data = pickle.load(fi)

    print("Done.")
    return data

In [8]:
SITE_CNT = 95
URL_PER_SITE = 10

def parse_dataset(mon_data, unmon_data):
    print("parsing dataset...")
    dataset = []
    label = []

    if mon_data:
        if FT_MULTI_LABEL:
            label = [i for i in range(95) for _ in range(200)]
        else:
            label = [1] * 95 * 200

        for i in range(SITE_CNT):
            temp = []
            for j in range(URL_PER_SITE):
                temp.append(mon_data[i*URL_PER_SITE + j])
            dataset.append(list(chain.from_iterable(temp)))

    if unmon_data:
        label.extend([-1]*10000)
        dataset.append(unmon_data)


    dataset = list(chain.from_iterable(dataset))
    print("Done.")
    return dataset, label

In [9]:
if FT_MON:
    mon_data = load_pickle_file("mon_standard.pkl")
else:
    mon_data = None
if FT_UNMON:
    unmon_data = load_pickle_file("unmon_standard10.pkl")
else:
    unmon_data = None

dataset, label = parse_dataset(mon_data, unmon_data)

Loading datafile...
Done.
Loading datafile...
Done.
parsing dataset...
Done.


메모리 확보

In [10]:
del mon_data
del unmon_data

In [11]:
def pad_right(arr, size):
    if len(arr) >= size:
        return arr[:size]
    return list(np.pad(arr, (0, max(0, size - len(arr))), mode='constant'))


def calculate_burst_pattern(arr):
    if not arr:
        return []
    prev = arr[0]
    result = []

    for dir in arr[1:]:
        if (prev * dir) > 0:
            prev += dir
        else:
            result.append(prev)
            prev = dir

    result.append(prev)
    return result


def unfold_array(arr, size, col_name):
    return pd.DataFrame(map(lambda x:pad_right(x, size), arr), columns=[f"{col_name}_{i}" for i in range(size)])

def get_categorical_features(direction_arr, prefix):
    df = pd.DataFrame()

    length = list(map(len, direction_arr))

    df[f"{prefix}_incoming_count"] = list(map(lambda x:x.count(-1), direction_arr))
    df[f"{prefix}_incoming_rate"] = np.array(df[f"{prefix}_incoming_count"]) / length
    df[f"{prefix}_outgoing_count"] = length - np.array(df[f"{prefix}_incoming_count"])
    df[f"{prefix}_outgoing_rate"] = np.array(df[f"{prefix}_outgoing_count"]) / length
    return df

def extract_features(direction_arr, timestamps_arr):
    df = pd.DataFrame()

    columns = []

    if FT_TIMESTAMPS:
        columns.append(unfold_array(timestamps_arr, seqeunce_features_size['timestamps'], 'timestamp'))
        columns[0] = columns[0].drop(columns=['timestamp_0'])

    if FT_DIRECTION:
        columns.append(unfold_array(direction_arr, seqeunce_features_size['direction'], 'direction'))


    if FT_PACKET_SIZE:
        packet_size = list(map(lambda x: np.array(x) * PACKET_SIZE, direction_arr))
        columns.append(unfold_array(packet_size, seqeunce_features_size['packet_size'], 'packet_size'))

    if FT_BURST_DIR or FT_BURST_SIZE:
        burst = list(map(calculate_burst_pattern, direction_arr))
        if FT_BURST_DIR:
            columns.append(unfold_array(burst, seqeunce_features_size['burst_dir'], 'burst_dir'))
        if FT_BURST_SIZE:
            burst = list(map(lambda x: np.array(x) * PACKET_SIZE, burst))
            columns.append(unfold_array(burst, seqeunce_features_size['burst_size'], 'burst_size'))

    if FT_CUMULATIVE_DIR or FT_CUMULATIVE_SIZE:
        cumulative = list(map(lambda x: np.cumsum(x), direction_arr))
        if FT_CUMULATIVE_DIR:
            columns.append(unfold_array(cumulative, seqeunce_features_size['cumulative_dir'], 'cumulative_dir'))
        if FT_CUMULATIVE_SIZE:
            cumulative = list(map(lambda x:x * PACKET_SIZE, cumulative))
            columns.append(unfold_array(cumulative, seqeunce_features_size['cumulative_size'], 'cumulative_size'))

    if FT_CATEGORICAL:
        df["count"] = list(map(lambda x:len(x), direction_arr))
        columns.append(get_categorical_features(direction_arr, ''))

        for i in FT_CATEGORICAL_SIZE:
            columns.append(get_categorical_features(list(map(lambda x:pad_right(x, i), direction_arr)), f"initial{i}"))

    return pd.concat([df, *columns], axis=1)

In [12]:
timestamps = []
direction = []

for data in dataset:
    timestamps.append(abs(np.array(data)))
    direction.append(list(map(lambda x: 1 if x > 0 else -1, data)))

del dataset

df = extract_features(direction, timestamps)
df['label'] = label

메모리 확보

In [13]:
del label
del timestamps
del direction

In [14]:
from sklearn.model_selection import train_test_split

class Dataset:
    x = None
    y = None

    def __init__(self, x, y, random_state=0, test_size=0.2):
        self.x = x
        self.y = y
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)

## 사용 예시

In [15]:
open_multi_dataset = Dataset(df.drop(columns=['label']), df['label'])
open_binary_dataset = Dataset(df.drop(columns=['label']), df['label'].map(lambda x:1 if x>= 0 else -1))
closed_multi_dataset = Dataset(df[df['label'] >= 0].drop(columns=['label']), df[df['label'] >= 0]['label'])

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV

In [28]:
RESULT_ARCHIEVE_PATH = "/content/drive/MyDrive/2023-2ML/result/"

In [39]:
def save_from_grid_search(model, param, name):
    grid = GridSearchCV(model(), param, refit = True, verbose = 3)
    grid.fit(open_multi_dataset.x_train, open_multi_dataset.y_train)

    open_multi_pred_y = grid.predict(open_multi_dataset.x_test)

    with open(f"{RESULT_ARCHIEVE_PATH}{name}.pkl", "wb") as f:
        pickle.dump(confusion_matrix(open_multi_dataset.y_test, open_multi_pred_y), f)


In [40]:
param = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy']
}
save_from_grid_search(RandomForestClassifier, param, 'random_forest')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...criterion=gini, n_estimators=10;, score=0.808 total time=   0.8s
[CV 2/5] END ...criterion=gini, n_estimators=10;, score=0.808 total time=   0.8s
[CV 3/5] END ...criterion=gini, n_estimators=10;, score=0.807 total time=   0.8s
[CV 4/5] END ...criterion=gini, n_estimators=10;, score=0.807 total time=   0.8s
[CV 5/5] END ...criterion=gini, n_estimators=10;, score=0.799 total time=   0.8s
[CV 1/5] END ...criterion=gini, n_estimators=50;, score=0.816 total time=   4.5s
[CV 2/5] END ...criterion=gini, n_estimators=50;, score=0.819 total time=   5.2s
[CV 3/5] END ...criterion=gini, n_estimators=50;, score=0.822 total time=   3.6s
[CV 4/5] END ...criterion=gini, n_estimators=50;, score=0.821 total time=   3.6s
[CV 5/5] END ...criterion=gini, n_estimators=50;, score=0.812 total time=   4.0s
[CV 1/5] END ..criterion=gini, n_estimators=100;, score=0.819 total time=   9.4s
[CV 2/5] END ..criterion=gini, n_estimators=100;,

In [41]:
param = {
    'n_estimators':[10, 50, 100],
    'learning_rate': [0.5, 1.0]
}
save_from_grid_search(AdaBoostClassifier, param, 'adaboost')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END learning_rate=0.5, n_estimators=10;, score=0.352 total time=   0.9s
[CV 2/5] END learning_rate=0.5, n_estimators=10;, score=0.359 total time=   0.9s
[CV 3/5] END learning_rate=0.5, n_estimators=10;, score=0.355 total time=   0.9s
[CV 4/5] END learning_rate=0.5, n_estimators=10;, score=0.354 total time=   0.9s
[CV 5/5] END learning_rate=0.5, n_estimators=10;, score=0.351 total time=   1.7s
[CV 1/5] END learning_rate=0.5, n_estimators=50;, score=0.120 total time=   7.8s
[CV 2/5] END learning_rate=0.5, n_estimators=50;, score=0.120 total time=   5.1s
[CV 3/5] END learning_rate=0.5, n_estimators=50;, score=0.153 total time=   4.4s
[CV 4/5] END learning_rate=0.5, n_estimators=50;, score=0.163 total time=   5.5s
[CV 5/5] END learning_rate=0.5, n_estimators=50;, score=0.137 total time=   7.2s
[CV 1/5] END learning_rate=0.5, n_estimators=100;, score=0.066 total time=   9.7s
[CV 2/5] END learning_rate=0.5, n_estimators=100

In [43]:
BaggingClassifier()
param = {
    'n_estimators':[10, 50, 100],
}
save_from_grid_search(BaggingClassifier, param, 'bagging')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ...................n_estimators=10;, score=0.809 total time=   5.6s
[CV 2/5] END ...................n_estimators=10;, score=0.803 total time=   4.1s
[CV 3/5] END ...................n_estimators=10;, score=0.814 total time=   3.2s
[CV 4/5] END ...................n_estimators=10;, score=0.809 total time=   3.8s
[CV 5/5] END ...................n_estimators=10;, score=0.800 total time=   4.1s
[CV 1/5] END ...................n_estimators=50;, score=0.814 total time=  16.3s
[CV 2/5] END ...................n_estimators=50;, score=0.816 total time=  16.5s
[CV 3/5] END ...................n_estimators=50;, score=0.824 total time=  16.5s
[CV 4/5] END ...................n_estimators=50;, score=0.823 total time=  16.4s
[CV 5/5] END ...................n_estimators=50;, score=0.815 total time=  16.5s
[CV 1/5] END ..................n_estimators=100;, score=0.815 total time=  33.0s
[CV 2/5] END ..................n_estimators=100;,

In [44]:
param = {
    'n_estimators':[10, 50, 100],
}
save_from_grid_search(GradientBoostingClassifier, param, 'gradient_boosting')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ...................n_estimators=10;, score=0.604 total time= 1.5min
[CV 2/5] END ...................n_estimators=10;, score=0.623 total time= 1.5min
[CV 3/5] END ...................n_estimators=10;, score=0.605 total time= 1.5min
[CV 4/5] END ...................n_estimators=10;, score=0.597 total time= 1.6min
[CV 5/5] END ...................n_estimators=10;, score=0.606 total time= 1.5min
[CV 1/5] END ...................n_estimators=50;, score=0.734 total time= 7.7min
[CV 2/5] END ...................n_estimators=50;, score=0.746 total time= 7.7min
[CV 3/5] END ...................n_estimators=50;, score=0.739 total time= 7.7min
[CV 4/5] END ...................n_estimators=50;, score=0.742 total time= 7.6min
[CV 5/5] END ...................n_estimators=50;, score=0.740 total time= 7.6min
[CV 1/5] END ..................n_estimators=100;, score=0.752 total time=15.4min
[CV 2/5] END ..................n_estimators=100;,

In [45]:
param = {
    'voting':['hard', 'soft'],
}
save_from_grid_search(VotingClassifier, param, 'voting')

TypeError: ignored

### 모델 저장

- 학습 완료하면 모델 백업 해주세요!

공유 폴더 바로가기를 [내 드라이브] 밑에 생성해두면 바로 연동 됩니다!

In [None]:
MODEL_ARCHIEVE_PATH = "/content/drive/MyDrive/2023-2ML/models/"

In [None]:
import joblib
joblib.dump(tree_grid, MODEL_ARCHIEVE_PATH+'open-multi-1.pkl')

['/content/drive/MyDrive/2023-2ML/models/open-multi-1.pkl']

### 모델 불러오기

In [None]:
model = joblib.load(MODEL_ARCHIEVE_PATH+"open-multi-1.pkl")
model.predict(open_multi_dataset.x_train)

array([ 6, 28, 48, ..., 49, 53, 13])