# Purpose of this notebook

This notebook produces the NumPy feature and label arrays required by `ml_test.ipynb` from labeled 20-second pcaps (or from pre-computed `.npy` arrays when `COLLECT_FROM_ARRAYS` is `True`).

In [1]:
import os
import numpy as np
import re
from tqdm import tqdm
from scapy.all import *
from datetime import datetime as dt
from matplotlib import pyplot as plt



In [1]:
# --- Notebook configuration -------------------------------------------------
TIMEFRAME = 20          # Number of seconds per captured frame

# Dataset name: used as the sub-directory under the dataset folder and as part
# of the output file names written by the save cell.
NAME = "AWS_21k"

# A clip is kept only when its packet count (pcap mode) or the sum of its
# array (array mode) is strictly greater than this value.
MIN_PACKETS = 1

# CLIENTS = [
#     "192.168.254.0",
#     "192.168.254.1",
#     "192.168.254.2",
#     "172.31.51.124"
# ]
                          # List of Client IP addresses used in the dataset.

# Compared against the first three characters of each packet's source IP:
# a match is counted in direction index 0, anything else in direction index 1.
CLIENT_SUBNET = "172"


# True  -> load pre-computed .npy feature arrays from the "arrays" directory.
# False -> read raw captures from the "pcaps" directory and extract features.
COLLECT_FROM_ARRAYS = True


In [2]:
# Source folder (and S3 bucket prefix) depends on whether we read
# pre-computed arrays or raw packet captures.
dir_name = "arrays" if COLLECT_FROM_ARRAYS else "pcaps"

Run the cell below to download the dataset from S3 (requires the AWS CLI). Skip it if the files are already on disk.

In [3]:
# Create the local target directory. `-p` also creates missing parents and
# does not fail when the directory already exists, so the cell can be re-run.
!mkdir -p ~/Documents/Uni/UNB_Datasets/{dir_name}/{NAME};
# Recursively copy the whole "<dir_name>-for-wfa" bucket into that directory.
!aws s3 cp s3://{dir_name}-for-wfa ~/Documents/Uni/UNB_Datasets/{dir_name}/{NAME} --recursive;

download: s3://arrays-for-wfa/010shangpu.com-WFA-winclient-3-118-14-2023_04_04__18_25_06.npy to arrays/AWS_21k/010shangpu.com-WFA-winclient-3-118-14-2023_04_04__18_25_06.npy
download: s3://arrays-for-wfa/0123movie.net-WFA_winclinet_4-131-10-2023_04_06__14_05_43.npy to arrays/AWS_21k/0123movie.net-WFA_winclinet_4-131-10-2023_04_06__14_05_43.npy
download: s3://arrays-for-wfa/01bzvip7.cc-WFA_winclinet_2-123-15-2023_04_03__23_10_57.npy to arrays/AWS_21k/01bzvip7.cc-WFA_winclinet_2-123-15-2023_04_03__23_10_57.npy
download: s3://arrays-for-wfa/0xkji3.xyz-EC2AMAZ-31JQHDO-131-10-2023_04_06__23_41_23.npy to arrays/AWS_21k/0xkji3.xyz-EC2AMAZ-31JQHDO-131-10-2023_04_06__23_41_23.npy
download: s3://arrays-for-wfa/05wang.com-WFA_winclinet_2-139-12-2023_04_03__13_02_24.npy to arrays/AWS_21k/05wang.com-WFA_winclinet_2-139-12-2023_04_03__13_02_24.npy
download: s3://arrays-for-wfa/01bzvip7.cc-WFA_winclinet_2-136-11-2023_04_04__10_40_25.npy to arrays/AWS_21k/01bzvip7.cc-WFA_winclinet_2-136-11-2023_04_04_

In [5]:
# Local dataset directory. Resolved from the current user's home directory so
# it points at the same location the S3 download cell writes to ("~/Documents/...")
# instead of a path hard-coded to one machine.
path = os.path.expanduser(f"~/Documents/Uni/UNB_Datasets/{dir_name}/{NAME}")



In [13]:
# File extensions to collect depend on the input mode: pre-computed arrays are
# ".npy"; raw captures are packet-capture files.
# NOTE(review): the capture extensions are assumed (".pcap"/".pcapng") - confirm
# against the contents of the pcaps directory.
clip_extensions = (".npy",) if COLLECT_FROM_ARRAYS else (".pcap", ".pcapng")

# os.listdir returns entries in arbitrary order; sort so that the order of the
# resulting feature/label arrays is reproducible between runs.
dir_list = sorted(os.listdir(path))

# Full paths of every clip file to process.
clips = [os.path.join(path, clip) for clip in dir_list if clip.endswith(clip_extensions)]

In [14]:
if not COLLECT_FROM_ARRAYS:
    def extract_features_from_clips(clip):
        """Turn one packet capture into a (time, length, direction) histogram.

        Parameters
        ----------
        clip : sequence of scapy packets
            Must be non-empty; the timestamp of ``clip[0]`` is the time origin.

        Returns
        -------
        numpy.ndarray of shape ``(TIMEFRAME * 10, 150, 2)``
            Packet counts. Only packets with an IP layer are counted.
            * axis 0 - arrival time since the first packet, in 0.1 s steps,
              capped at ``TIMEFRAME`` seconds.
            * axis 1 - IP length in 10-byte steps, capped at 1500 bytes.
            * axis 2 - 0 when the source IP starts with ``CLIENT_SUBNET``,
              otherwise 1.

        Raises
        ------
        IndexError
            If ``clip`` is empty.
        """
        max_ip_length = 1500                 # Longer packets are rounded down to this
        matrix = np.zeros([TIMEFRAME * 10, 150, 2])
        start_time = clip[0].time

        for pkt in clip:
            if IP not in pkt:
                continue

            length = min(pkt[IP].len, max_ip_length)
            direction = 0 if pkt[IP].src[:3] == CLIENT_SUBNET else 1

            # Offset from the first packet, rounded to 0.1 s and capped at the
            # frame length (TIMEFRAME rather than a hard-coded 20, so the cap
            # always matches the matrix size).
            time_round = min(round(pkt.time - start_time, 1), TIMEFRAME)

            # Bins keep the "- 1" offset, but are clamped at 0: without the
            # clamp an offset of 0.0 s (always the case for the first packet)
            # produced index -1, which numpy wraps around to the LAST bin.
            # NOTE(review): arrays produced before this fix may have such
            # packets counted in the last time bin - confirm before mixing data.
            time_bin = max(int(time_round * 10) - 1, 0)
            length_bin = max(int(length / 10) - 1, 0)

            matrix[time_bin, length_bin, direction] += 1

        return matrix
        
    

In [15]:
# Accumulators for the per-clip feature matrices and their labels; the two
# lists stay index-aligned.
features_list = []
labels_list = []

for clip_path in tqdm(clips):
    # `clips` already holds full paths, so they are used as-is in both modes
    # (re-joining them with `path` would break for a relative `path`).
    if COLLECT_FROM_ARRAYS:
        with open(clip_path, 'rb') as f:
            clip = np.load(f)
        # Pre-computed arrays: keep the clip when its total count is large enough.
        keep = np.sum(clip) > MIN_PACKETS
    else:
        clip = rdpcap(clip_path)
        # Raw captures: keep the clip when it has enough packets.
        keep = len(clip) > MIN_PACKETS

    if keep:
        if COLLECT_FROM_ARRAYS:
            features_list.append(clip)
        else:
            features_list.append(extract_features_from_clips(clip))

        # The label is the part of the file name before the first "-".
        label = os.path.basename(clip_path).split("-")[0]
        # label = re.split("\d", label)[0]   # Uncomment for use with ICSX dataset
        # label = re.split("_[AB]", label)[0]    # Uncomment for use with ICSX dataset
        labels_list.append(label)

    # Drop the reference so a large capture is not kept alive between iterations.
    del clip


In [16]:
# Stack the per-clip results into single arrays (one row per kept clip).
labels = np.asarray(labels_list)
features = np.asarray(features_list)

# Quick sanity check: both arrays must have the same leading dimension.
print(f"features shape {features.shape}\nlabels shape {labels.shape}")

features shape (5176, 200, 150, 2)
labels shape (5176,)


In [17]:
# Distinct labels and the number of clips carrying each of them.
unique, counts = np.unique(labels, return_counts=True)

# Pair every label with its count, one row per label. Stacking strings with
# integers yields a string array, so the counts are shown as text.
counts = np.array([unique, counts]).T
counts

array([['01bzvip7.cc', '1'],
       ['05wang.com', '1'],
       ['0xkji3.xyz', '1'],
       ...,
       ['zto.com', '1'],
       ['zulily.com', '1'],
       ['zztt33.com', '1']], dtype='<U27')

In [18]:
# save = input("Do you want to save? y/n") == "y"
save = True

if save:
    # Timestamp suffix so repeated runs never overwrite earlier outputs.
    date_time_format = '%Y_%m_%d__%H_%M_%S'
    current_datetime = dt.now()
    current_datetime_string = current_datetime.strftime(date_time_format)

    # The output folder is relative to the notebook's working directory;
    # create it if missing so the writes below cannot fail on a fresh checkout.
    os.makedirs("data", exist_ok=True)

    # Features and labels are written as a pair sharing the same suffix.
    with open(f"data/my_{TIMEFRAME}_sec_features-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, features)

    with open(f"data/my_{TIMEFRAME}_sec_labels-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, labels)