# Purpose of this notebook

This notebook produces the NumPy feature and label arrays required by `ml_test.ipynb` from labelled 20-second pcap captures.

In [1]:
import os
import numpy as np
import re
from scapy.all import *
from datetime import datetime as dt
from matplotlib import pyplot as plt



In [2]:
# Settings shared by the cells below.
TIMEFRAME = 20  # Number of seconds per captured frame
NAME = "AWS_trial_NO_GUI_30k"  # Dataset name; used in the pcap directory path and the output file names
MIN_PACKETS = 200  # Captures with this many packets or fewer are skipped

# List of Client IP addresses used in the dataset.
# A packet whose source address is in this list is counted in direction 0, any other in direction 1.
CLIENTS = ["192.168.254.0", "192.168.254.1", "192.168.254.2", "172.31.51.124"]

Run the cell below to download the pcaps from S3.

In [3]:
!mkdir ~/Documents/Uni/UNB_Datasets/pcaps/{NAME};
!aws s3 cp s3://pcaps-for-wfa ~/Documents/Uni/UNB_Datasets/pcaps/{NAME} --recursive;

mkdir: /Users/michaelkearney/Documents/Uni/UNB_Datasets/pcaps/AWS_trial_NO_GUI_30k: File exists
download: s3://pcaps-for-wfa/0101.co.jp-VM1-140-12-2023_03_21__17_40_57.pcap to pcaps/AWS_trial_NO_GUI_30k/0101.co.jp-VM1-140-12-2023_03_21__17_40_57.pcap
download: s3://pcaps-for-wfa/010shangpu.com-VM1-112-11-2023_03_21__08_04_40.pcap to pcaps/AWS_trial_NO_GUI_30k/010shangpu.com-VM1-112-11-2023_03_21__08_04_40.pcap
download: s3://pcaps-for-wfa/0123movies.com-VM1-138-15-2023_03_21__09_13_30.pcap to pcaps/AWS_trial_NO_GUI_30k/0123movies.com-VM1-138-15-2023_03_21__09_13_30.pcap
download: s3://pcaps-for-wfa/01thy.top-VM1-182-12-2023_03_20__03_34_50.pcap to pcaps/AWS_trial_NO_GUI_30k/01thy.top-VM1-182-12-2023_03_20__03_34_50.pcap
download: s3://pcaps-for-wfa/0101.co.jp-VM1-155-15-2023_03_20__04_02_23.pcap to pcaps/AWS_trial_NO_GUI_30k/0101.co.jp-VM1-155-15-2023_03_20__04_02_23.pcap
download: s3://pcaps-for-wfa/01bzvip7.cc-VM1-109-11-2023_03_20__02_44_44.pcap to pcaps/AWS_trial_NO_GUI_30k/01bzvip

In [4]:
# Directory holding the downloaded pcaps. Built from "~" so it resolves to the same
# location the download cell writes to, whichever user account runs the notebook.
PATH = os.path.expanduser(f"~/Documents/Uni/UNB_Datasets/pcaps/{NAME}")

In [5]:
# os.listdir returns entries in arbitrary order; sort so that the order of the
# generated features/labels is the same on every run.
dir_list = sorted(os.listdir(PATH))
# Full paths of every ".pcap" file in the dataset directory.
clips = [os.path.join(PATH, clip) for clip in dir_list if clip.endswith(".pcap")]

In [6]:
def extract_features_from_clips(clip):
    """Bin the IP packets of one capture into a (time, length, direction) histogram.

    Args:
        clip: Sequence of packets (as returned by ``rdpcap``). Packets without an
            IP layer are ignored.

    Returns:
        Array of shape ``(TIMEFRAME * 10, 150, 2)`` where entry ``[t, l, d]`` is the
        number of IP packets that fell into time bin ``t`` (0.1 s steps measured
        from the first packet of the capture), length bin ``l`` (10-byte steps,
        lengths above 1500 are treated as 1500) and direction ``d`` (0 when the
        source address is in ``CLIENTS``, 1 otherwise). An empty capture yields
        an all-zero array.
    """
    n_time_bins = TIMEFRAME * 10
    n_len_bins = 150
    max_len = 1500

    matrix = np.zeros([n_time_bins, n_len_bins, 2])
    if len(clip) == 0:
        return matrix

    start_time = clip[0].time
    for pkt in clip:
        if IP not in pkt:
            continue

        length = min(pkt[IP].len, max_len)  # Packets over 1500 are rounded down to 1500
        direction = 0 if pkt[IP].src in CLIENTS else 1

        # Elapsed time rounded to 0.1 s, then expressed as a whole number of tenths.
        elapsed = round(pkt.time - start_time, 1)
        time_bin = int(round(elapsed * 10)) - 1
        len_bin = length // 10 - 1

        # Clamp both indices into range. Without the lower clamp an elapsed time of
        # 0.0 (always the case for the first packet) or a length below 10 gives
        # index -1, which NumPy treats as the LAST bin of the axis. The upper clamp
        # puts anything at or beyond TIMEFRAME seconds into the final time bin.
        time_bin = min(max(time_bin, 0), n_time_bins - 1)
        len_bin = min(max(len_bin, 0), n_len_bins - 1)

        matrix[time_bin, len_bin, direction] += 1
    return matrix
    
    

In [7]:
# The capture file names seen in this dataset look like
# "<label>-VM<n>-<n>-<n>-<timestamp>.pcap". Matching the fixed suffix keeps labels
# that themselves contain hyphens intact instead of cutting them at the first "-".
CAPTURE_NAME_RE = re.compile(r"^(.+)-VM\d+-\d+-\d+-[^-]+\.pcap$")

features_list = []
labels_list = []

for file in clips:
    clip = rdpcap(file)
    # Only captures with more than MIN_PACKETS packets contribute a sample.
    if len(clip) > MIN_PACKETS:
        features_list.append(extract_features_from_clips(clip))
        filename = os.path.basename(file)
        name_match = CAPTURE_NAME_RE.match(filename)
        # File names that do not follow the pattern above keep the original rule:
        # the label is the text before the first hyphen.
        label = name_match.group(1) if name_match else filename.split("-")[0]
        # label = re.split(r"\d", label)[0]   # Uncomment for use with ICSX dataset
        # label = re.split("_[AB]", label)[0]    # Uncomment for use with ICSX dataset
        labels_list.append(label)
    # Drop the parsed capture before loading the next one.
    del clip


In [8]:
# Stack the per-capture results: one feature matrix and one label per kept capture.
features, labels = (np.array(items) for items in (features_list, labels_list))

# Report the resulting dimensions.
print(f"features shape {features.shape}\nlabels shape {labels.shape}")

features shape (11497, 200, 150, 2)
labels shape (11497,)


In [9]:
# Number of kept captures per distinct label.
unique, label_counts = np.unique(labels, return_counts=True)

# One row per label: [label, count]. Stacking the counts with the labels produces a
# string array, so the counts are displayed as text.
counts = np.array([unique, label_counts]).transpose()
counts

array([['010shangpu.com', '1'],
       ['0123movies.com', '1'],
       ['05wang.com', '1'],
       ...,
       ['zztt15.com', '1'],
       ['zztt30.com', '1'],
       ['zztt33.com', '2']], dtype='<U33')

In [10]:
# save = input("Do you want to save? y/n") == "y"
save = True

if save:
    # Timestamp the output files so repeated runs do not overwrite each other.
    date_time_format = '%Y_%m_%d__%H_%M_%S'
    current_datetime = dt.now()
    current_datetime_string = current_datetime.strftime(date_time_format)

    # open() does not create missing parent directories; make sure "data/" exists.
    os.makedirs("data", exist_ok=True)

    with open(f"data/my_{TIMEFRAME}_sec_features-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, features)

    with open(f"data/my_{TIMEFRAME}_sec_labels-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, labels)