In [1]:
import os
import csv
import dpkt
import pcapml_fe

from tqdm import tqdm

# pcapML dataset to load (a single pcapng file that is iterated sample by sample)
PCAPML_PATH = "traffic.pcapng"

# Directory that receives one pcap file per sample
PCAP_DIR = "pcaps"

# CSV file that records the label of each pcap written to PCAP_DIR
LABELS_CSV = "labels.csv"

# Create the output directory up front; exist_ok=True makes re-running this cell safe
os.makedirs(PCAP_DIR, exist_ok=True)

# extract the class label from a sample's metadata string
def extract_label(sample):
    """Return the label encoded at the start of ``sample.metadata``.

    The metadata is treated as a comma-separated string. Blank fields are
    ignored, the first non-blank field is taken, and everything from the
    first ``-`` onwards in that field is dropped.

    Raises:
        IndexError: if the metadata contains no non-blank field.
    """
    fields = (piece.strip() for piece in sample.metadata.split(","))
    non_blank = [field for field in fields if field]

    # Indexing (rather than next(..., default)) keeps the IndexError on empty metadata.
    first_field = non_blank[0]

    # partition() returns the whole string as the head when no "-" is present.
    head, _, _ = first_field.partition("-")
    return head.strip()

# Split the pcapML dataset into one pcap per sample and record each
# sample's label in LABELS_CSV as an (Item, Label) row.
with open(LABELS_CSV, "w", newline="") as labels_file:
    label_writer = csv.writer(labels_file)
    label_writer.writerow(["Item", "Label"])

    for sample in tqdm(pcapml_fe.sampler(PCAPML_PATH), desc="Samples"):
        # Resolve the label first so a bad sample fails before any file is created.
        label = extract_label(sample)
        pcap_name = str(sample.sid) + ".pcap"
        pcap_path = os.path.join(PCAP_DIR, pcap_name)

        packets = sample.packets

        # Earliest timestamp in the sample; stays +inf when there are no packets.
        first_ts = min((packet.ts for packet in packets), default=float("inf"))

        with open(pcap_path, "wb") as pcap_file:
            pcap_writer = dpkt.pcap.Writer(pcap_file)
            for packet in packets:
                # dpkt requires a non-negative timestamp, so shift each one
                # relative to the first packet and clamp at zero.
                relative_ts = max(0, packet.ts - first_ts)
                pcap_writer.writepkt(packet.raw_bytes, ts=relative_ts)

        label_writer.writerow([pcap_name, label])



Samples: 25798it [00:15, 1687.48it/s]


In [None]:
# The packet count of 7 is arbitrary; the point is to use the same count for every sample so the training matrix has a fixed size
# Example command
# nprint -4 -t -c 7 -P pcaps/273992.pcap | head

I am not sure whether different nprint versions use different command-line options, so for reference here is the help menu from the version I am running (1.2.1):
```
-4, --ipv4                 include ipv4 headers
-6, --ipv6                 include ipv6 headers
-A, --absolute_timestamps  include absolute timestmap field
-c, --count=INTEGER        number of packets to parse (if not all)
-C, --csv_file=FILE        csv (hex packets) infile
-d, --device=STRING        device to capture from if live capture
-e, --eth                  include eth headers
-f, --filter=STRING        filter for libpcap
-F, --fill_int=INT8_T      integer to fill missing bits with
-h, --nprint_filter_help   print regex possibilities
-i, --icmp                 include icmp headers
-N, --nPrint_file=FILE     nPrint infile
-O, --write_index=INTEGER  Output file Index (first column) Options:
                            
                            0: source IP (default)
                            
                            1: destination IP
                            
                            2: source port
                            
                            3: destination port
                            
                            4: flow (5-tuple)
                            
                            5: wlan tx mac
-p, --payload=PAYLOAD_SIZE include n bytes of payload
-P, --pcap_file=FILE       pcap infile
-r, --radiotap             include radiotap headers
-R, --relative_timestamps  include relative timestamp field
-S, --stats                print stats about packets processed when finished
-t, --tcp                  include tcp headers
-u, --udp                  include udp headers
-V, --verbose              print human readable packets with nPrints
-w, --wlan                 include wlan headers
-W, --write_file=FILE      file for output, else stdout
-x, --nprint_filter=STRING regex to filter bits out of nPrint. nprint -h for
                            details
-?, --help                 Give this help list
    --usage                Give a short usage message
    --version              Print program version
```

In [None]:
import subprocess
import numpy as np
import io
import csv
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Inputs: directory of per-sample pcaps and the CSV with "Item"/"Label" columns
PCAP_DIR = "pcaps"
LABELS_CSV = "labels.csv"
# Outputs: feature matrix, integer class ids, and the "id,label" mapping file
OUT_X = "X_nprint.npy"
OUT_Y = "y_nprint.npy"
OUT_LABEL_MAP = "label_map_nprint.txt"

# Number of packets kept per sample (passed to nprint via -c and used for padding/truncation)
N_PACKETS = 7

def nprint_vector(pcap_path, n_packets=N_PACKETS):
    """Run nprint on one pcap and return its features as a flat vector.

    Executes ``nprint -4 -t -c <n_packets> -P <pcap_path>`` and parses the CSV
    written to stdout. The first CSV row is treated as the header and the
    first column (the index) is discarded. Exactly ``n_packets`` rows are
    kept: extra rows are truncated, and short rows or missing packets are
    padded with ``-1``.

    Args:
        pcap_path: Path of the pcap file handed to nprint via ``-P``.
        n_packets: Number of packets requested from nprint and number of
            per-packet rows in the result before flattening.

    Returns:
        A 1-D ``np.int8`` array of length ``n_packets * n_features``, or
        ``None`` when nprint cannot be launched, exits with a non-zero
        status, produces undecodable output, or emits no data rows.

    Raises:
        ValueError: if a feature value in the nprint output is not an integer.
    """
    # same command as the example above
    cmd = ["nprint", "-4", "-t", "-c", str(n_packets), "-P", pcap_path]
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except (subprocess.SubprocessError, OSError, UnicodeDecodeError):
        # Best effort per sample: a failed or missing nprint run just skips
        # this pcap. Catching only these types (instead of a bare except)
        # keeps KeyboardInterrupt/SystemExit and programming errors visible.
        return None

    text = result.stdout.strip()
    if not text:
        return None

    # nprint outputs csv
    rows = list(csv.reader(io.StringIO(text)))
    if len(rows) <= 1:
        # header only (or nothing parseable): no packets to featurise
        return None

    header = rows[0]
    # discarding the index col
    n_features = len(header) - 1
    data_rows = rows[1:]

    packet_feats = []
    # Want uniform sized training/testing sets, so truncate at n_packets
    for row in data_rows[:n_packets]:
        # drop index
        feat_vals = row[1:]

        # Pad a short row out to the header width
        if len(feat_vals) < n_features:
            feat_vals += ["-1"] * (n_features - len(feat_vals))

        packet_feats.append([int(v) for v in feat_vals])

    # Fewer than n_packets packets in this sample: pad with rows of -1
    while len(packet_feats) < n_packets:
        packet_feats.append([-1] * n_features)

    # Return the features as a flattened array
    return np.array(packet_feats, dtype=np.int8).flatten()

def _worker(job):
    """Turn one ``(pcap_path, label)`` job into ``(feature_vector, label)``.

    The label is passed through unchanged so results can be paired with
    their class after being computed in a worker thread. The vector is
    whatever ``nprint_vector`` returns for the path, which may be ``None``.
    """
    path, label = job
    return nprint_vector(path), label

def build_nprint_dataset_parallel():
    """Build the nprint feature matrix and label vector for every listed pcap.

    Reads ``LABELS_CSV`` (columns ``Item`` and ``Label``), runs ``_worker`` on
    each pcap under ``PCAP_DIR`` using a pool of 8 threads, and collects the
    results. Samples whose feature vector is ``None`` are skipped. Each
    distinct label string gets the next unused integer id the first time it
    is encountered.

    Returns:
        A tuple ``(X, y, labels_seen)`` where ``X`` stacks one flattened
        feature vector per kept sample, ``y`` is an ``np.int64`` array of
        class ids aligned with the rows of ``X``, and ``labels_seen`` maps
        label string -> class id.

    Raises:
        ValueError: if no sample produced a feature vector.
    """
    # newline="" is the csv module's documented way to open CSV files, so that
    # line endings inside the file are handled by the csv reader itself.
    with open(LABELS_CSV, newline="") as f:
        rows = list(csv.DictReader(f))

    jobs = [(os.path.join(PCAP_DIR, r["Item"]), r["Label"]) for r in rows]

    X_list = []
    y_list = []
    labels_seen = {}

    # Multithreaded because running nprint once per pcap sequentially was very slow
    with ThreadPoolExecutor(max_workers=8) as ex:
        for vec, label_str in tqdm(ex.map(_worker, jobs), total=len(jobs)):
            if vec is None:
                continue

            # create a new label mapping every time we see a new label
            if label_str not in labels_seen:
                labels_seen[label_str] = len(labels_seen)

            X_list.append(vec)
            y_list.append(labels_seen[label_str])

    if not X_list:
        # np.stack([]) would also raise ValueError, but with a message that
        # says nothing about the real cause (every nprint run was skipped).
        raise ValueError(
            f"no usable samples: none of the {len(jobs)} pcaps listed in "
            f"{LABELS_CSV!r} produced an nprint feature vector"
        )

    # One row per sample; each row is that sample's flattened per-packet nprint features
    X = np.stack(X_list)
    # These are the corresponding labels (as integer class ids)
    y = np.array(y_list, dtype=np.int64)
    return X, y, labels_seen

# Build the dataset once and persist it to disk, since extraction is slow
# and later cells can simply reload the saved arrays.
X, y, label_map = build_nprint_dataset_parallel()
np.save(OUT_X, X)
np.save(OUT_Y, y)

# Persist the label map as one "id,label" line per class
with open(OUT_LABEL_MAP, "w") as label_map_file:
    label_map_file.writelines(
        f"{class_id},{name}\n" for name, class_id in label_map.items()
    )

# Quick sanity summary of what was produced
print("X shape:", X.shape)
print("y shape:", y.shape)
print("labels:", label_map)


100%|██████████| 25798/25798 [10:11<00:00, 42.21it/s]  


X shape: (25798, 6720)
y shape: (25798,)
labels: {'avtech': 0, 'huawei': 1, 'roku': 2, 'axis': 3, 'h3c': 4, 'lancom': 5, 'mikrotik': 6, 'dell': 7, 'juniper': 8, 'cisco': 9, 'zte': 10, 'nec': 11, 'adtran': 12, 'ubiquoss': 13, 'chromecast': 14}


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Reload the feature matrix and class ids saved to disk earlier
X, y = np.load("X_nprint.npy"), np.load("y_nprint.npy")

# 80/20 train/test split, stratified on the class ids, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# First trying with a random forest classifier
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# balanced_accuracy_score is the average of per-class recall, which differs
# from the plain accuracy shown in classification_report, so label it as such.
acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced accuracy:", acc)
print(classification_report(y_test, y_pred))

Accuracy: 0.8940645321678519
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       430
           1       0.80      0.75      0.78       282
           2       0.85      0.94      0.89       481
           3       0.85      0.95      0.90       531
           4       0.88      0.81      0.84       276
           5       0.94      0.92      0.93       285
           6       0.81      0.67      0.73       272
           7       0.85      0.87      0.86       290
           8       0.96      0.92      0.94       289
           9       0.97      0.94      0.95       290
          10       0.96      0.94      0.95       285
          11       0.98      0.99      0.98       290
          12       0.98      0.98      0.98       290
          13       0.98      0.93      0.95       295
          14       0.99      0.98      0.99       574

    accuracy                           0.90      5160
   macro avg       0.91      0.89      0.90      51

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Second model: a two-hidden-layer MLP fed with variance-scaled features
clf = Pipeline([
    # Scale each feature by its std only (with_mean=False leaves values uncentred)
    ("scaler", StandardScaler(with_mean=False)),
    ("mlp", MLPClassifier(
        hidden_layer_sizes=(256,128),
        activation="relu",
        solver="adam",
        max_iter=30,
        random_state=42,
        early_stopping=True,
        verbose=True,
        batch_size=256,
    )),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# balanced_accuracy_score is the average of per-class recall, which differs
# from the plain accuracy shown in classification_report, so label it as such.
acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced accuracy:", acc)
print(classification_report(y_test, y_pred))

Iteration 1, loss = 3.78500820
Validation score: 0.610950
Iteration 2, loss = 1.13859623
Validation score: 0.730136
Iteration 3, loss = 0.90572200
Validation score: 0.773740
Iteration 4, loss = 0.74436790
Validation score: 0.804748
Iteration 5, loss = 0.65368186
Validation score: 0.805233
Iteration 6, loss = 0.59900762
Validation score: 0.823159
Iteration 7, loss = 0.54989130
Validation score: 0.843992
Iteration 8, loss = 0.51773694
Validation score: 0.829942
Iteration 9, loss = 0.48238573
Validation score: 0.833333
Iteration 10, loss = 0.46053538
Validation score: 0.866764
Iteration 11, loss = 0.44340787
Validation score: 0.815891
Iteration 12, loss = 0.42075833
Validation score: 0.814438
Iteration 13, loss = 0.39405591
Validation score: 0.860950
Iteration 14, loss = 0.38429296
Validation score: 0.875969
Iteration 15, loss = 0.35442227
Validation score: 0.868217
Iteration 16, loss = 0.37086689
Validation score: 0.811047
Iteration 17, loss = 0.37177727
Validation score: 0.846899
Iterat