In [None]:
# Move two directories up so that the relative "data/output" location used by
# the next cell is resolved against the new working directory (presumably the
# project root — TODO confirm).
# NOTE(review): this cell is not idempotent; re-running it without restarting
# the kernel climbs another two levels and breaks the paths built below.
%cd ../..

In [None]:
import os
import pandas as pd

# Build the path of the input CSV from the current working directory (as set by
# the `%cd` cell above) and load it.
# `base_path` is reused later when the train/test splits are written, and `df`
# is the frame that every following cell reads and rebinds.
base_path = os.getcwd()
path = os.path.join(base_path, "data", "output", "concatenated.csv")
df = pd.read_csv(path)

In [None]:
import numpy as np

# Per-column summary of the numeric features: value range, number of distinct
# values and column total. A column whose `nunique` is 1 holds a single
# distinct (non-null) value.
num_df = df.select_dtypes(include=[np.number])

# One Series per metric, each indexed by column name.
mins, maxs = num_df.min(), num_df.max()
nunique = num_df.nunique()
sums = num_df.sum()

# Assemble the metrics side by side, one row per feature.
stats = pd.DataFrame(dict(min=mins, max=maxs, nunique=nunique, sum=sums))

print(stats)

In [None]:
# Columns listed here are treated as constant (see the `nunique` column of the
# summary table printed above) and are removed from the working frame.
cols = [
    'fwd_URG_flag_count',
    'bwd_URG_flag_count',
    'flow_CWR_flag_count',
    'flow_ECE_flag_count'
]

# `df` is rebound in place, so a second execution of this cell would find the
# columns already gone; errors="ignore" keeps the cell re-runnable instead of
# raising KeyError.
df = df.drop(columns=cols, errors="ignore")

In [None]:
import matplotlib.pyplot as plt

# Initial class-distribution overview: one bar chart plus the raw counts for
# each of the three label columns.

# chart 1: attack vs benign
attack_counts = df['attack'].value_counts()
fig, ax = plt.subplots()
attack_counts.plot(kind='bar', ax=ax)
ax.set_title('Attack vs Benign')
ax.set_xlabel('Attack')
ax.set_ylabel('Count')
plt.show()
print("Attack vs Benign counts:\n", attack_counts, "\n")

# chart 2: service distribution
service_counts = df['service'].value_counts()
fig, ax = plt.subplots()
service_counts.plot(kind='bar', ax=ax)
ax.set_title('Service Distribution')
ax.set_xlabel('Service')
ax.set_ylabel('Count')
plt.show()
print("Service distribution counts:\n", service_counts)

# chart 3: attack type distribution
# Kept in its own variable so the binary counts from chart 1 are not
# overwritten, and titled after the column that is actually plotted.
attack_type_counts = df['attack_type'].value_counts()
fig, ax = plt.subplots()
attack_type_counts.plot(kind='bar', ax=ax)
ax.set_title('Attack Type Distribution')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Count')
plt.show()
print("Attack type counts:\n", attack_type_counts, "\n")




> “The attacks revshell and Server-Side Request Forgery (SSRF) are usually only successful when the victim server creates a new connection  
> to a host specified in one of the attacks. This results in that proper detection is only guaranteed when at least two flows are analyzed.”  
> — Lanfer *et al.*, 2025  

Because this pipeline performs **flow-level classification** (each flow is classified on its own), it cannot reliably detect the following attack types, which are therefore excluded below:

- Reverse Shell attacks  
  - `revshell_http`  
  - `revshell_https`  
- Server-Side Request Forgery (SSRF) attacks  
  - `ssrf_http`  
  - `ssrf_https`  


In [None]:
# Attack types removed from the dataset before modelling.
# NOTE(review): the reason for excluding the xss_* types is not stated anywhere
# in this notebook — confirm and document it.
exclude_attacks = [
    'revshell_https', 'revshell_http',
    'ssrf_https', 'ssrf_http',
    'xss_http', 'xss_https',
    'smtp_enum',  # removed as well due to its very low sample count
]

# df without excluded attacks (.copy() so the column assignment below acts on
# an independent frame rather than on a slice of the previous one)
df = df[~df['attack_type'].isin(exclude_attacks)].copy()

# Map that folds the http/https variants and other closely related labels into
# a single attack type; labels not listed here are left unchanged.
merge_map = {
    'bruteforce_http': 'bruteforce',
    'bruteforce_https': 'bruteforce',
    'sql_injection_http': 'sql_injection',
    'sql_injection_https': 'sql_injection',
    'dos_http': 'dos',
    'dos_https': 'dos',
    'ssh_login_successful': 'ssh_login',
    'hostsweep_Pn': 'hostsweep',
    'hostsweep_sn': 'hostsweep'
}

# merge attacks
df['attack_type'] = df['attack_type'].replace(merge_map)

# chart 4: attack type distribution after exclusion and merging
attack_counts = df['attack_type'].value_counts()
fig, ax = plt.subplots()
attack_counts.plot(kind='bar', ax=ax)
ax.set_title('Attack Type Distribution (after exclusion and merging)')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Count')
plt.show()
print("Attack type counts:\n", attack_counts, "\n")

In [None]:
# Absolute correlation above which a feature pair is reported.
CORR_THRESHOLD = 0.9

# select only numeric columns
num_df = df.select_dtypes(include=[np.number])

# compute correlation matrix
corr_matrix = num_df.corr()

# plot heatmap
plt.figure(figsize=(16, 12))
plt.imshow(corr_matrix, aspect='auto', interpolation='nearest', cmap='coolwarm')
plt.colorbar(label='r')
plt.xticks(ticks=np.arange(len(corr_matrix.columns)), labels=corr_matrix.columns, rotation=90)
plt.yticks(ticks=np.arange(len(corr_matrix.index)), labels=corr_matrix.index)
plt.title('Correlation Heatmap of Numeric Features')
plt.tight_layout()
plt.show()

# Keep only the strict upper triangle (k=1 excludes the diagonal): every
# unordered feature pair then appears exactly once and self-correlations are
# left out.
mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

# stack into a series of pairwise correlations indexed by (feature, feature)
corr_pairs = corr_matrix.where(mask).stack()

# select the pairs whose absolute correlation exceeds the threshold; NaN
# entries compare as False and are therefore never selected
high_corr = corr_pairs[corr_pairs.abs() > CORR_THRESHOLD].sort_values(ascending=False)

print(f"Feature pairs with absolute r > {CORR_THRESHOLD}:\n")
for (left_feat, right_feat), r in high_corr.items():
    print(f"{left_feat:30s} <-> {right_feat:30s} : r = {r:.3f}")

# extract all feature names from the two index levels
feat1 = high_corr.index.get_level_values(0)
feat2 = high_corr.index.get_level_values(1)

# build a sorted list of unique features
high_corr_features = sorted(set(feat1).union(feat2))

print(f"Features appearing in any |r| > {CORR_THRESHOLD} pair:")
print(high_corr_features)


In [None]:
# Hand-picked feature columns earmarked for removal (presumably chosen from the
# highly correlated pairs printed above — TODO confirm). Entries are grouped by
# name for readability; the order is unchanged.
# NOTE(review): nothing in the visible notebook applies this list to `df`, so
# the splits written below still contain these columns — confirm whether a
# `df.drop(columns=drop_cols)` step is missing or done downstream.
drop_cols = [
    # packet counts / rates
    'bwd_data_pkts_tot', 'bwd_pkts_per_sec', 'flow_pkts_per_sec',
    # header sizes
    'fwd_header_size_tot', 'fwd_header_size_max', 'bwd_header_size_max',
    # flag counters
    'flow_FIN_flag_count', 'flow_SYN_flag_count', 'flow_ACK_flag_count',
    # window size
    'bwd_init_window_size',
]

In [None]:
def make_zero_day_splits(df,
                         splits: dict,
                         n_benign_zero_day: int = 220000,
                         train_samples: tuple = (500000, 500000),
                         test_samples: tuple = (199800, 200),
                         random_state: int = 1304,
                         output_dir=None):
    """Write one train/test CSV pair per zero-day scenario.

    For every entry of ``splits`` the rows whose ``attack_type`` is listed as
    zero-day are held out of training and only appear in the test set. A random
    sample of benign rows is moved from the training pool to the test pool as
    well, so the two files never share a row. Both pools are then sub-sampled
    to fixed class sizes, shuffled and written to
    ``train_<split_name>.csv`` / ``test_<split_name>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input flows. Must contain an ``attack_type`` column (with the value
        ``'benign'`` for benign flows) and an ``attack`` column holding the
        labels ``'benign'`` / ``'attack'``. The frame itself is not modified.
    splits : dict
        Maps a split name to the list of ``attack_type`` values treated as
        zero-day attacks for that split.
    n_benign_zero_day : int
        Number of benign rows moved from the training pool to the test pool.
    train_samples : tuple
        ``(n_benign, n_attack)`` rows drawn for the training file.
    test_samples : tuple
        ``(n_benign, n_attack)`` rows drawn for the test file.
    random_state : int
        Seed used for every sampling and shuffling step.
    output_dir : str, optional
        Directory the CSV files are written to; created if missing. When
        omitted, ``<base_path>/data/output`` is used, where ``base_path`` is
        the notebook-level variable set when the data was loaded.

    Raises
    ------
    ValueError
        If a requested sample is larger than the number of available rows.
    """
    if output_dir is None:
        # Backward-compatible default: same location the notebook always used.
        output_dir = os.path.join(base_path, "data", "output")
    os.makedirs(output_dir, exist_ok=True)

    # Labels missing from this mapping become NaN and therefore fall into
    # neither class when sampling below.
    mapping = {'benign': 0, 'attack': 1}
    n_benign_tr, n_attack_tr = train_samples
    n_benign_te, n_attack_te = test_samples

    def _draw(frame, n, what, split_name):
        # Sample without replacement, failing with a message that names the
        # split and the class that ran short.
        if n > len(frame):
            raise ValueError(
                f"Split '{split_name}': requested {n} {what} rows "
                f"but only {len(frame)} are available."
            )
        return frame.sample(n=n, random_state=random_state)

    for split_name, zero_day_attacks in splits.items():
        # split off zero-day attack flows for test; everything else is train
        is_zero_day = df['attack_type'].isin(zero_day_attacks)
        df_test = df[is_zero_day].copy()
        df_train = df[~is_zero_day].copy()

        # move a benign sample from the train pool to the test pool
        df_train_benign = df_train[df_train['attack_type'] == 'benign']
        df_benign_zero_day = _draw(df_train_benign, n_benign_zero_day,
                                   "held-out benign", split_name)
        df_train = df_train.drop(df_benign_zero_day.index)
        df_test = pd.concat([df_test, df_benign_zero_day])

        # binary encode attack column
        df_train['attack'] = df_train['attack'].map(mapping)
        df_test['attack'] = df_test['attack'].map(mapping)

        # fixed class sizes & shuffle for train
        benign_train = _draw(df_train[df_train['attack'] == 0], n_benign_tr,
                             "benign train", split_name)
        attack_train = _draw(df_train[df_train['attack'] == 1], n_attack_tr,
                             "attack train", split_name)
        df_train_final = pd.concat([benign_train, attack_train]).sample(
            frac=1, random_state=random_state
        )

        # fixed class sizes & shuffle for test
        benign_test = _draw(df_test[df_test['attack'] == 0], n_benign_te,
                            "benign test", split_name)
        attack_test = _draw(df_test[df_test['attack'] == 1], n_attack_te,
                            "attack test", split_name)
        df_test_final = pd.concat([benign_test, attack_test]).sample(
            frac=1, random_state=random_state
        )

        # save to csv
        train_path = os.path.join(output_dir, f"train_{split_name}.csv")
        test_path = os.path.join(output_dir, f"test_{split_name}.csv")
        df_train_final.to_csv(train_path, index=False)
        df_test_final.to_csv(test_path, index=False)

        # report
        print(f"Split '{split_name}' -> train: {df_train_final.shape}, "
              f"test: {df_test_final.shape}")


In [None]:
# Zero-day scenarios: split name -> attack types that are withheld from the
# training file and only appear in the test file of that split.
splits = dict(
    split_1=['ftp_login', 'ftp_version', 'smtp_version'],
    split_2=['sql_injection', 'dos', 'ssh_login'],
)

# Writes one train/test CSV pair per entry using the function's defaults.
make_zero_day_splits(df, splits)