## 1. Load the raw dataset
* Code that converts `mon_standard.pkl` into arrays



In [1]:
import pickle

# Labelling configuration: with USE_SUBLABEL off, URL_PER_SITE consecutive URLs share one label.
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950

# Read the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
print("Loading datafile...")
with open("../datasets/mon_standard.pkl", 'rb') as fi:  # path relative to the working directory
    data = pickle.load(fi)

X1 = []  # one list per instance: absolute value of every entry (the timestamps)
X2 = []  # one list per instance: +512 for a positive entry, -512 otherwise (direction * 512)
y = []   # one label per instance, e.g. [0, 0, ..., 94, 94] when labelling per site

# Walk the URLs in order; every sample of a URL becomes one instance in X1 / X2 / y.
for i in range(TOTAL_URLS):
    # Per-URL label, or the index of the site the URL belongs to.
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        time_seq = [abs(c) for c in sample]
        size_seq = [512 if c > 0 else -512 for c in sample]
        X1.append(time_seq)
        X2.append(size_seq)
        y.append(label)
size = len(y)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 19000


## 2. Implement candidate features

In [None]:
# Candidate features: one empty list per feature; each list is meant to
# receive one value per instance.

# Features over all packets
total_num, total_sum_dir, total_avg = [], [], []

# Features over incoming packets
inpkt_num, inpkt_avg, inpkt_sum = [], [], []
inpkt_num_frac_total = []
inpkt_avg_ordering, inpkt_std_ordering = [], []
inpkt_num_frac_outpkt = []
inpkt_sum_firstn = []

# Features over outgoing packets
outpkt_num, outpkt_avg, outpkt_sum = [], [], []
outpkt_num_frac_total = []
outpkt_avg_ordering, outpkt_std_ordering = [], []
outpkt_num_frac_inpkt = []
outpkt_sum_firstn = []

# Per-second packet statistics (all / incoming / outgoing)
pkt_avg_sec, pkt_std_sec, pkt_max_sec = [], [], []
inpkt_avg_sec, inpkt_std_sec, inpkt_max_sec = [], [], []
outpkt_avg_sec, outpkt_std_sec, outpkt_max_sec = [], [], []

In [None]:
import numpy as np

In [None]:
# Number of leading packets inspected by the *_sum_firstn features.
# NOTE(review): the previous version of this cell appended undefined names for
# these two features; counting the incoming / outgoing packets among the first
# FIRST_N packets is an assumed definition -- confirm the intent and the value.
FIRST_N = 30


def _mean_or_zero(values):
    """Return the mean of *values* as a float, or 0.0 when *values* is empty."""
    return float(np.mean(values)) if len(values) else 0.0


def _std_or_zero(values):
    """Return the population standard deviation of *values*, or 0.0 when empty."""
    return float(np.std(values)) if len(values) else 0.0


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Compute one value per feature for every instance and append it to the
# matching feature list, so all lists end up with one entry per instance.
for times, dirs in zip(X1, X2):  # pair each timestamp sequence with its direction sequence
    times_arr = np.asarray(times, dtype=float)
    is_in = np.asarray(dirs) < 0  # a negative direction value marks an incoming packet
    is_out = ~is_in               # everything else is treated as outgoing
    n_pkts = len(times)

    # Total packets
    total_num.append(n_pkts)
    total_sum_dir.append(sum(dirs))
    total_avg.append(_mean_or_zero(times_arr))

    # Timestamps and positions (index within the trace) of each direction
    in_times, out_times = times_arr[is_in], times_arr[is_out]
    in_index, out_index = np.flatnonzero(is_in), np.flatnonzero(is_out)
    n_in, n_out = len(in_index), len(out_index)

    # Incoming packets
    inpkt_num.append(n_in)
    inpkt_avg.append(_mean_or_zero(in_times))
    inpkt_sum.append(float(in_times.sum()))
    inpkt_num_frac_total.append(_ratio_or_zero(n_in, n_pkts))
    inpkt_avg_ordering.append(_mean_or_zero(in_index))
    inpkt_std_ordering.append(_std_or_zero(in_index))

    # Outgoing packets
    outpkt_num.append(n_out)
    outpkt_avg.append(_mean_or_zero(out_times))
    outpkt_sum.append(float(out_times.sum()))
    outpkt_num_frac_total.append(_ratio_or_zero(n_out, n_pkts))
    outpkt_avg_ordering.append(_mean_or_zero(out_index))
    outpkt_std_ordering.append(_std_or_zero(out_index))

    # Direction ratios; 0.0 stands in for a zero denominator so the column stays finite
    inpkt_num_frac_outpkt.append(_ratio_or_zero(n_in, n_out))
    outpkt_num_frac_inpkt.append(_ratio_or_zero(n_out, n_in))

    # Packets of each direction among the first FIRST_N packets (see note on FIRST_N)
    inpkt_sum_firstn.append(int(is_in[:FIRST_N].sum()))
    outpkt_sum_firstn.append(int(is_out[:FIRST_N].sum()))

    # Packets per one-second interval [k, k + 1), from second 0 up to the
    # interval holding the latest timestamp of this trace.
    # Assumes timestamps are non-negative (np.bincount rejects negative bins).
    if n_pkts:
        second = np.floor(times_arr).astype(int)
        n_bins = int(second.max()) + 1
        pkt_num_sec = np.bincount(second, minlength=n_bins)
        inpkt_num_sec = np.bincount(second[is_in], minlength=n_bins)
        outpkt_num_sec = np.bincount(second[is_out], minlength=n_bins)
    else:
        # An empty trace contributes a single empty interval.
        pkt_num_sec = inpkt_num_sec = outpkt_num_sec = np.zeros(1, dtype=int)

    pkt_avg_sec.append(float(pkt_num_sec.mean()))
    pkt_std_sec.append(float(pkt_num_sec.std()))
    pkt_max_sec.append(int(pkt_num_sec.max()))
    inpkt_avg_sec.append(float(inpkt_num_sec.mean()))
    inpkt_std_sec.append(float(inpkt_num_sec.std()))
    inpkt_max_sec.append(int(inpkt_num_sec.max()))
    outpkt_avg_sec.append(float(outpkt_num_sec.mean()))
    outpkt_std_sec.append(float(outpkt_num_sec.std()))
    outpkt_max_sec.append(int(outpkt_num_sec.max()))



## 3. Data Preprocessing

In [None]:
import pandas as pd

In [None]:
# Column name -> feature list, in the order the columns should appear in the table.
# NOTE: this rebinds `data`, which held the unpickled dataset up to this point.
data = dict(
    # all packets
    total_num=total_num,
    total_sum_dir=total_sum_dir,
    total_avg=total_avg,
    # incoming packets
    inpkt_num=inpkt_num,
    inpkt_avg=inpkt_avg,
    inpkt_sum=inpkt_sum,
    inpkt_num_frac_total=inpkt_num_frac_total,
    inpkt_avg_ordering=inpkt_avg_ordering,
    inpkt_std_ordering=inpkt_std_ordering,
    inpkt_num_frac_outpkt=inpkt_num_frac_outpkt,
    inpkt_sum_firstn=inpkt_sum_firstn,
    # outgoing packets
    outpkt_num=outpkt_num,
    outpkt_avg=outpkt_avg,
    outpkt_sum=outpkt_sum,
    outpkt_num_frac_total=outpkt_num_frac_total,
    outpkt_avg_ordering=outpkt_avg_ordering,
    outpkt_std_ordering=outpkt_std_ordering,
    outpkt_num_frac_inpkt=outpkt_num_frac_inpkt,
    outpkt_sum_firstn=outpkt_sum_firstn,
    # per-second statistics
    pkt_avg_sec=pkt_avg_sec,
    pkt_std_sec=pkt_std_sec,
    pkt_max_sec=pkt_max_sec,
    inpkt_avg_sec=inpkt_avg_sec,
    inpkt_std_sec=inpkt_std_sec,
    inpkt_max_sec=inpkt_max_sec,
    outpkt_avg_sec=outpkt_avg_sec,
    outpkt_std_sec=outpkt_std_sec,
    outpkt_max_sec=outpkt_max_sec,
)
# One column per feature.
df = pd.DataFrame(data)

In [None]:
# Preview the first five rows of the feature table.
print(df.head(n=5))

## 4. Random forest (RF) for assessing feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
# Hold out 20% of the instances; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=1)

In [None]:
# Bare expression so the notebook displays it: (number of rows, number of columns) of df.
df.shape

In [None]:
# Random forest used to rank the candidate features; the seed fixes the result.
clf = RandomForestClassifier(
    n_estimators=20,
    criterion="entropy",
    max_depth=100,
    min_samples_split=2,
    max_features="sqrt",
    random_state=0,
)

In [None]:
# 5-fold cross-validation over the full feature table; print each fold's
# score followed by their arithmetic mean.
scores = cross_val_score(estimator=clf, X=df, y=y, cv=5)
print(scores)
print(sum(scores) / len(scores))

In [None]:
# Fit on the training split (fit returns the estimator itself) and keep
# the per-feature importance scores.
imp_score = clf.fit(X_train, y_train).feature_importances_

In [None]:
# Positions of the five largest importance scores, most important first.
top_indices = np.argsort(imp_score)[::-1][:5]
print(top_indices)
# Map positions back to feature names through the DataFrame columns: the
# classifier was fitted on a split of `df`, so the scores follow its column
# order. (`data` is a plain dict and has no `feature_names` attribute.)
top_feature_names = [df.columns[int(x)] for x in top_indices]
print(top_feature_names)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Importance score per feature, labelled with the DataFrame column names and
# sorted from most to least important. (`data` is a plain dict without a
# `feature_names` attribute; the scores follow the column order of `df`.)
feature_imp = pd.Series(clf.feature_importances_, index=df.columns).sort_values(ascending=False)

# Horizontal bar chart: one bar per feature, longest bar on top.
plt.figure(figsize=(10,6))
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.tight_layout()