# Open-World Experiment



## 준비
1. 데이터 업로드
    - mon_standard.pkl
    - X3_mon_data.pkl
    - X4_mon_data.pkl
2. 파일 경로 설정
    - 데이터를 업로드한 위치의 경로를 path 변수에 할당


In [4]:
# 파일 경로 설정
path = '/content/drive/MyDrive/24-2/ML'
mon_file = f'{path}/mon_standard.pkl'
unmon_file = f'{path}/unmon_standard10.pkl'
X3_mon_file = f'{path}/X3_mon_data.pkl'
X4_mon_file = f'{path}/X4_mon_data.pkl'

## Categorical Features extraction
categorical_feature.ipynb 파일의 내용과 겹칩니다.

In [5]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

# 설정
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS_MON = 950  # Monitored 데이터 URL 수
TOTAL_URLS_UNMON = 3000  # Unmonitored 데이터 URL 수

# Number of Incoming/Outgoing Packets
def count_in_out_packets(data):
    num_incoming = sum(1 for c in data if c > 0)  # Positive indicates incoming
    num_outgoing = sum(1 for c in data if c < 0)  # Negative indicates outgoing
    return num_incoming, num_outgoing

# Number of Incoming/Outgoing Packets as a Fraction of Total
def fraction_in_out_packets(num_incoming, num_outgoing):
    total_packets = num_incoming + num_outgoing
    if total_packets == 0:  # To avoid division by zero
        return 0, 0
    fraction_incoming = num_incoming / total_packets
    fraction_outgoing = num_outgoing / total_packets
    return fraction_incoming, fraction_outgoing


# Standard Deviation of Outgoing Packet Ordering List
def std_dev_outgoing_packets(data):
    outgoing_sizes = [abs(c) * 512 for c in data if c < 0]  # Only outgoing packets
    if len(outgoing_sizes) > 1:
        return np.std(outgoing_sizes)
    return 0


# Sum of All Items in the Alternative Concentration Feature List
def sum_alternative_concentration(size_seq):
    return sum(size_seq)


# Average of the Outgoing and Total Number of Packets
def avg_outgoing_and_total_packets(num_incoming, num_outgoing):
    total_packets = num_incoming + num_outgoing
    if total_packets == 0:  # To avoid division by zero
        return 0
    avg = (num_outgoing + total_packets) / 2
    return avg


# Sum of Incoming, Outgoing, and Total Number of Packets
def sum_in_out_total_packets(num_incoming, num_outgoing):
    return num_incoming + num_outgoing


# Sum of Alternative Number of Packets per Second
def sum_packets_per_second(time_seq, size_seq):
    total_time = sum(time_seq)
    if total_time == 0:
        return 0
    total_packets = sum(size_seq)
    return total_packets / total_time

# Total Number of Packets
def total_packets(num_incoming, num_outgoing):
    return num_incoming + num_outgoing

# 데이터 로드
with open(mon_file, 'rb') as f:
    mon_data = pickle.load(f)
with open(unmon_file, 'rb') as f:
    unmon_data = pickle.load(f)

# Monitored 데이터 변환
X1_mon, X2_mon, X3_mon, X4_mon, X5_mon, X6_mon, X7_mon, X8_mon, X9_mon, y_mon = [], [], [], [], [], [], [], [], [], []

# For monitored data
for i in range(TOTAL_URLS_MON):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE

    for sample in mon_data[i]:
        size_seq = []
        time_seq = []

        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)

        num_incoming, num_outgoing = count_in_out_packets(sample)
        fraction_incoming, fraction_outgoing = fraction_in_out_packets(num_incoming, num_outgoing)
        std_dev_outgoing = std_dev_outgoing_packets(sample)
        sum_alternative = sum_alternative_concentration(size_seq)
        avg_outgoing_total = avg_outgoing_and_total_packets(num_incoming, num_outgoing)
        sum_in_out_total = sum_in_out_total_packets(num_incoming, num_outgoing)
        sum_per_second = sum_packets_per_second(time_seq, size_seq)
        total = total_packets(num_incoming, num_outgoing)

        # Store all features in respective lists
        X1_mon.append(time_seq)
        X2_mon.append(size_seq)
        X5_mon.append(fraction_incoming)
        X6_mon.append(fraction_outgoing)
        X7_mon.append(std_dev_outgoing)
        X8_mon.append(sum_alternative)
        X9_mon.append(avg_outgoing_total)
        y_mon.append(label)

with open(X3_mon_file, 'rb') as f_x3:
    X3_mon = pickle.load(f_x3)

with open(X4_mon_file, 'rb') as f_x4:
    X4_mon = pickle.load(f_x4)

X1_unmon, X2_unmon, X3_unmon, X4_unmon, X5_unmon, X6_unmon, X7_unmon, X8_unmon, X9_unmon = [], [], [], [], [], [], [], [], []

# For unmonitored data
for i in range(TOTAL_URLS_UNMON):
    size_seq = []
    time_seq = []

    for c in unmon_data[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512)

    num_incoming, num_outgoing = count_in_out_packets(unmon_data[i])
    fraction_incoming, fraction_outgoing = fraction_in_out_packets(num_incoming, num_outgoing)
    std_dev_outgoing = std_dev_outgoing_packets(unmon_data[i])
    sum_alternative = sum_alternative_concentration(size_seq)
    avg_outgoing_total = avg_outgoing_and_total_packets(num_incoming, num_outgoing)
    sum_in_out_total = sum_in_out_total_packets(num_incoming, num_outgoing)
    sum_per_second = sum_packets_per_second(time_seq, size_seq)
    total = total_packets(num_incoming, num_outgoing)

    # Store all features in respective lists
    X1_unmon.append(time_seq)
    X2_unmon.append(size_seq)
    X5_unmon.append(fraction_incoming)
    X6_unmon.append(fraction_outgoing)
    X7_unmon.append(std_dev_outgoing)
    X8_unmon.append(sum_alternative)
    X9_unmon.append(avg_outgoing_total)




### Extracting X3_unmon, X4_unmon
- Use X2_unmon.
- Check X3_unmon, X4_unmon

In [6]:

import numpy as np
from collections import Counter

# Function to calculate X3 (Cumulative Packet Sizes)
def compute_cumulative_sizes(X2_unmon):
    return [np.cumsum(seq).tolist() for seq in X2_unmon]

# Function to calculate X4 (Bursts)
def compute_bursts(X2_unmon):
    bursts = []
    for seq in X2_unmon:
        current_burst = 0
        burst_sequence = []
        for i, value in enumerate(seq):
            if i == 0 or np.sign(value) == np.sign(seq[i - 1]):
                current_burst += value
            else:
                burst_sequence.append(current_burst)
                current_burst = value
        burst_sequence.append(current_burst)  # Append the last burst
        bursts.append(burst_sequence)
    return bursts

# Extract X3 and X4
X3_unmon = compute_cumulative_sizes(X2_unmon)
X4_unmon = compute_bursts(X2_unmon)

# Monitored와 Unmonitored 데이터를 Pandas 데이터프레임으로 변환
mon_df = pd.DataFrame({
    'Fraction Incoming (X5)': X5_mon,
    'Fraction Outgoing (X6)': X6_mon,
    'Std Dev Outgoing (X7)': X7_mon,
    'Sum Alternative (X8)': X8_mon,
    'Average Outgoing & Total (X9)': X9_mon,
    'Label': ['Monitored'] * len(X5_mon)
})

unmon_df = pd.DataFrame({
    'Fraction Incoming (X5)': X5_unmon,
    'Fraction Outgoing (X6)': X6_unmon,
    'Std Dev Outgoing (X7)': X7_unmon,
    'Sum Alternative (X8)': X8_unmon,
    'Average Outgoing & Total (X9)': X9_unmon,
    'Label': ['Unmonitored'] * len(X5_unmon)
})

# 데이터 병합
data = pd.concat([mon_df, unmon_df], ignore_index=True)
print("데이터 로드 및 준비 완료!")

데이터 로드 및 준비 완료!


## model 학습 및 실험 수행
1) Binary Classification: Determine whether the web traffic trace corresponds to a monitored website.   

 To do this, reassign the label '1' to all monitored website instances (positive samples) and assign the label '-1' to all unmonitored website instances (negative samples). Train and test the model in this binary setting.

2) Multi-Class Classification: Classify 95 monitored website traces with unique labels against additional unmonitored websites.

   In the multi-class setting, label the monitored website instances with {0, 1, 2, ..., 94} and the unmonitored website instances with the label '-1'.


### SVM

#### 1) 리스트 형식의 X1~X4 요약해서 사용

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 리스트 데이터를 요약하는 함수 (평균, 표준편차, 최대값, 최소값, 합계 등 사용 가능)
def summarize_features(data):
    return {
        'mean': np.mean(data) if len(data) > 0 else 0,
        'std': np.std(data) if len(data) > 0 else 0,
        'max': np.max(data) if len(data) > 0 else 0,
        'min': np.min(data) if len(data) > 0 else 0,
        'sum': np.sum(data)
    }

# Monitored 데이터 요약
mon_features = pd.DataFrame({
    'X1_mean': [summarize_features(seq)['mean'] for seq in X1_mon],
    'X1_std': [summarize_features(seq)['std'] for seq in X1_mon],
    'X1_max': [summarize_features(seq)['max'] for seq in X1_mon],
    'X2_mean': [summarize_features(seq)['mean'] for seq in X2_mon],
    'X2_std': [summarize_features(seq)['std'] for seq in X2_mon],
    'X2_max': [summarize_features(seq)['max'] for seq in X2_mon],
    'X3_mean': [summarize_features(seq)['mean'] for seq in X3_mon],
    'X3_std': [summarize_features(seq)['std'] for seq in X3_mon],
    'X3_max': [summarize_features(seq)['max'] for seq in X3_mon],
    'X4_mean': [summarize_features(seq)['mean'] for seq in X4_mon],
    'X4_std': [summarize_features(seq)['std'] for seq in X4_mon],
    'X4_max': [summarize_features(seq)['max'] for seq in X4_mon],
    'X5': X5_mon,
    'X6': X6_mon,
    'X7': X7_mon,
    'X8': X8_mon,
    'X9': X9_mon,
    'Label': 1  # Monitored는 1
})

# Unmonitored 데이터 요약
unmon_features = pd.DataFrame({
    'X1_mean': [summarize_features(seq)['mean'] for seq in X1_unmon],
    'X1_std': [summarize_features(seq)['std'] for seq in X1_unmon],
    'X1_max': [summarize_features(seq)['max'] for seq in X1_unmon],
    'X2_mean': [summarize_features(seq)['mean'] for seq in X2_unmon],
    'X2_std': [summarize_features(seq)['std'] for seq in X2_unmon],
    'X2_max': [summarize_features(seq)['max'] for seq in X2_unmon],
    'X3_mean': [summarize_features(seq)['mean'] for seq in X3_unmon],
    'X3_std': [summarize_features(seq)['std'] for seq in X3_unmon],
    'X3_max': [summarize_features(seq)['max'] for seq in X3_unmon],
    'X4_mean': [summarize_features(seq)['mean'] for seq in X4_unmon],
    'X4_std': [summarize_features(seq)['std'] for seq in X4_unmon],
    'X4_max': [summarize_features(seq)['max'] for seq in X4_unmon],
    'X5': X5_unmon,
    'X6': X6_unmon,
    'X7': X7_unmon,
    'X8': X8_unmon,
    'X9': X9_unmon,
    'Label': -1  # Unmonitored는 -1
})

# Binary와 Multi-Class 데이터 생성
binary_data = pd.concat([mon_features, unmon_features], ignore_index=True)
multi_data = binary_data.copy()

# Multi-Class Label 설정
multi_data.loc[:len(mon_features)-1, 'Label'] = np.repeat(range(95), len(mon_features) // 95)

# Binary Classification: 데이터 분리
X_bin = binary_data.drop('Label', axis=1)
y_bin = binary_data['Label']
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X_bin, y_bin, test_size=0.2, random_state=0)

# Multi-Class Classification: 데이터 분리
X_multi = multi_data.drop('Label', axis=1)
y_multi = multi_data['Label']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=0)

# 데이터 표준화
scaler_bin = StandardScaler()
X_train_bin = scaler_bin.fit_transform(X_train_bin)
X_test_bin = scaler_bin.transform(X_test_bin)

scaler_multi = StandardScaler()
X_train_multi = scaler_multi.fit_transform(X_train_multi)
X_test_multi = scaler_multi.transform(X_test_multi)

# SVM 모델 생성 및 학습
# Binary Classification
svm_bin = SVC(kernel='rbf', C=1, gamma='scale', random_state=0)
svm_bin.fit(X_train_bin, y_train_bin)

# Multi-Class Classification
svm_multi = SVC(kernel='rbf', C=1, gamma='scale', decision_function_shape='ovr', random_state=0)
svm_multi.fit(X_train_multi, y_train_multi)

# 평가
# Binary Classification 결과
y_pred_bin = svm_bin.predict(X_test_bin)
print("Binary Classification Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print("Binary Classification Report:")
print(classification_report(y_test_bin, y_pred_bin))

# Multi-Class Classification 결과
y_pred_multi = svm_multi.predict(X_test_multi)
print("Multi-Class Classification Accuracy:", accuracy_score(y_test_multi, y_pred_multi))
print("Multi-Class Classification Report:")
print(classification_report(y_test_multi, y_pred_multi))


Binary Classification Accuracy: 0.9211363636363636
Binary Classification Report:
              precision    recall  f1-score   support

          -1       0.99      0.43      0.60       609
           1       0.92      1.00      0.96      3791

    accuracy                           0.92      4400
   macro avg       0.95      0.72      0.78      4400
weighted avg       0.93      0.92      0.91      4400

Multi-Class Classification Accuracy: 0.3640909090909091
Multi-Class Classification Report:
              precision    recall  f1-score   support

          -1       0.22      0.82      0.35       609
           0       0.00      0.00      0.00        40
           1       0.33      0.24      0.28        46
           2       0.00      0.00      0.00        43
           3       0.00      0.00      0.00        47
           4       0.10      0.05      0.06        43
           5       0.70      0.35      0.47        40
           6       0.42      0.76      0.54        29
           7  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### 2)리스트 형식의 X1~X4 제외하고 X5부터 사용

In [8]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# 1. Feature와 Target 준비
# X1~X9 모두 사용 (리스트들을 열로 변환)
mon_features = pd.DataFrame({
    'X5': X5_mon, 'X6': X6_mon, 'X7': X7_mon, 'X8': X8_mon, 'X9': X9_mon
})
unmon_features = pd.DataFrame({
    'X5': X5_unmon, 'X6': X6_unmon, 'X7': X7_unmon, 'X8': X8_unmon, 'X9': X9_unmon
})

# Binary Classification Label (Monitored: 1, Unmonitored: -1)
mon_features['Label'] = 1
unmon_features['Label'] = -1

# Multi-Class Classification Label (Monitored: 0~94, Unmonitored: -1)
mon_features_multi = mon_features.copy()
mon_features_multi['Label'] = np.repeat(range(95), len(mon_features) // 95)
unmon_features_multi = unmon_features.copy()
unmon_features_multi['Label'] = -1

# 2. 데이터 결합
binary_data = pd.concat([mon_features, unmon_features], ignore_index=True)
multi_data = pd.concat([mon_features_multi, unmon_features_multi], ignore_index=True)

# 3. 데이터 정규화 및 Train/Test Split
def preprocess_data(data):
    X = data.drop(columns=['Label']).values  # Features
    y = data['Label'].values  # Labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

# Binary 데이터 전처리
X_train_bin, X_test_bin, y_train_bin, y_test_bin = preprocess_data(binary_data)

# Multi-Class 데이터 전처리
X_train_multi, X_test_multi, y_train_multi, y_test_multi = preprocess_data(multi_data)

# 4. SVM 모델 학습 및 평가
def train_and_evaluate_svm(X_train, X_test, y_train, y_test, task_name="Task"):
    svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=0)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    print(f"=== {task_name} ===")
    print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print()

# Binary Classification
train_and_evaluate_svm(X_train_bin, X_test_bin, y_train_bin, y_test_bin, "Binary Classification (Open-World)")

# Multi-Class Classification
train_and_evaluate_svm(X_train_multi, X_test_multi, y_train_multi, y_test_multi, "Multi-Class Classification (Open-World)")


=== Binary Classification (Open-World) ===
Accuracy: 86.18%
Classification Report:
              precision    recall  f1-score   support

          -1       1.00      0.00      0.00       609
           1       0.86      1.00      0.93      3791

    accuracy                           0.86      4400
   macro avg       0.93      0.50      0.46      4400
weighted avg       0.88      0.86      0.80      4400

Confusion Matrix:
[[   1  608]
 [   0 3791]]

=== Multi-Class Classification (Open-World) ===
Accuracy: 19.57%
Classification Report:
              precision    recall  f1-score   support

          -1       0.15      0.87      0.25       609
           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        46
           2       0.00      0.00      0.00        43
           3       0.00      0.00      0.00        47
           4       0.00      0.00      0.00        43
           5       0.00      0.00      0.00        40
           6       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### RF

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# 1. 데이터 준비
# 'Label' 컬럼을 포함하여, 'X'는 피처들, 'y'는 라벨로 설정
X = pd.concat([mon_features[['X5', 'X6', 'X7', 'X8', 'X9']], unmon_features[['X5', 'X6', 'X7', 'X8', 'X9']]], ignore_index=True)
y = pd.concat([mon_features['Label'], unmon_features['Label']], ignore_index=True)

# 2. 데이터 분할 (80% 훈련, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 3. Random Forest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier.fit(X_train, y_train)

# 4. 예측
y_pred_bin = rf_classifier.predict(X_test)

# 5. 성능 평가 (Binary Classification)
print("=== Binary Classification (Monitored vs. Unmonitored) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bin):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_bin))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bin))

# === Multi-Class Classification (Monitored Classes) ===
# 여기서는 Multi-Class Classification을 위해 'Label'을 각각의 Monitored 클래스별로 다르게 설정했다고 가정
# Multi-Class 모델을 위한 예시로 'Label'이 Monitored에서 다른 범주들을 가지게 설정
y_multi = y  # 여기에 Multi-Class 라벨을 추가하는 부분 (기존 'Label' 컬럼을 사용)

# Multi-Class 분할
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X, y_multi, test_size=0.2, random_state=42)

# 6. Multi-Class Random Forest 모델 학습
rf_classifier_multi = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier_multi.fit(X_train_multi, y_train_multi)

# 7. Multi-Class 예측
y_pred_multi = rf_classifier_multi.predict(X_test_multi)

# 8. 성능 평가 (Multi-Class Classification)
print("\n=== Multi-Class Classification (Monitored Classes) ===")
print(f"Accuracy: {accuracy_score(y_test_multi, y_pred_multi):.4f}")
print("Classification Report:")
print(classification_report(y_test_multi, y_pred_multi))
print("Confusion Matrix:")
print(confusion_matrix(y_test_multi, y_pred_multi))


=== Binary Classification (Monitored vs. Unmonitored) ===
Accuracy: 0.8607
Classification Report:
              precision    recall  f1-score   support

          -1       0.49      0.12      0.20       609
           1       0.87      0.98      0.92      3791

    accuracy                           0.86      4400
   macro avg       0.68      0.55      0.56      4400
weighted avg       0.82      0.86      0.82      4400

Confusion Matrix:
[[  75  534]
 [  79 3712]]

=== Multi-Class Classification (Monitored Classes) ===
Accuracy: 0.8636
Classification Report:
              precision    recall  f1-score   support

          -1       0.47      0.12      0.19       590
           1       0.88      0.98      0.93      3810

    accuracy                           0.86      4400
   macro avg       0.67      0.55      0.56      4400
weighted avg       0.82      0.86      0.83      4400

Confusion Matrix:
[[  69  521]
 [  79 3731]]
