# Purpose of this notebook

This notebook produces the NumPy arrays required by `ml_test.ipynb` from labeled 20-second pcaps.

In [22]:
import os
import numpy as np
import re
from scapy.all import *
from datetime import datetime as dt
from matplotlib import pyplot as plt

In [23]:
# Number of seconds per captured frame
TIMEFRAME = 20

# Name of the dataset; used for the pcap directory and the saved file names
NAME = "AWS_trial_NO_GUI"

# Captures must contain more than this many packets to be used
MIN_PACKETS = 20

# List of Client IP addresses used in the dataset.
CLIENTS = [
    "192.168.254.0",
    "192.168.254.1",
    "192.168.254.2",
    "172.31.51.124",
]

Run the cell below to download the pcaps from S3.

In [24]:
!mkdir ~/Documents/Uni/UNB_Datasets/pcaps/{NAME};
!aws s3 cp s3://pcaps-for-wfa ~/Documents/Uni/UNB_Datasets/pcaps/{NAME} --recursive;

download: s3://pcaps-for-wfa/3dmgame.com-VM1-93-11-2023_03_12__18_03_34.pcap to pcaps/AWS_trial_NO_GUI/3dmgame.com-VM1-93-11-2023_03_12__18_03_34.pcap
download: s3://pcaps-for-wfa/52pojie.cn-VM1-99-10-2023_03_12__17_56_40.pcap to pcaps/AWS_trial_NO_GUI/52pojie.cn-VM1-99-10-2023_03_12__17_56_40.pcap
download: s3://pcaps-for-wfa/1688.com-VM1-73-14-2023_03_12__18_05_13.pcap to pcaps/AWS_trial_NO_GUI/1688.com-VM1-73-14-2023_03_12__18_05_13.pcap
download: s3://pcaps-for-wfa/babayu.tv-VM1-104-13-2023_03_12__17_57_08.pcap to pcaps/AWS_trial_NO_GUI/babayu.tv-VM1-104-13-2023_03_12__17_57_08.pcap
download: s3://pcaps-for-wfa/365ball.com-VM1-244-13-2023_03_12__18_01_54.pcap to pcaps/AWS_trial_NO_GUI/365ball.com-VM1-244-13-2023_03_12__18_01_54.pcap
download: s3://pcaps-for-wfa/biccamera.com-VM1-126-14-2023_03_12__17_48_42.pcap to pcaps/AWS_trial_NO_GUI/biccamera.com-VM1-126-14-2023_03_12__17_48_42.pcap
download: s3://pcaps-for-wfa/beautifulbeachtab.com-VM1-126-15-2023_03_12__17_44_08.pcap to pcaps

In [25]:
# Directory holding the pcaps for this dataset. Built from the home directory so it
# matches the `~/Documents/...` location the S3 download cell writes to, for any user.
PATH = os.path.expanduser(f"~/Documents/Uni/UNB_Datasets/pcaps/{NAME}")

In [26]:
# os.listdir returns entries in arbitrary order; sort so that the order of the
# samples in the saved arrays is reproducible between runs.
dir_list = sorted(os.listdir(PATH))
# Full paths of every pcap file in the dataset directory.
clips = [os.path.join(PATH, clip) for clip in dir_list if clip.endswith(".pcap")]

In [27]:
def extract_features_from_clips(clip):
    """Build a histogram of packet counts by time, IP length and direction.

    Args:
        clip: Sequence of scapy packets (e.g. the result of ``rdpcap``).
            Packets without an IP layer are ignored.

    Returns:
        ``numpy.ndarray`` of shape ``(TIMEFRAME * 10, 150, 2)``:
          * axis 0 - time bin, from the offset to the first packet rounded to
            a tenth of a second; offsets of ``TIMEFRAME`` seconds or more
            fall into the last bin,
          * axis 1 - IP length bin in steps of 10 bytes, lengths above 1500
            are treated as 1500,
          * axis 2 - direction: 0 when the source address is in ``CLIENTS``,
            1 otherwise.
        An empty ``clip`` yields an all-zero array.
    """
    n_time_bins = TIMEFRAME * 10
    n_length_bins = 150
    matrix = np.zeros([n_time_bins, n_length_bins, 2])
    if len(clip) == 0:
        return matrix

    start_time = clip[0].time
    for pkt in clip:
        if IP not in pkt:
            continue
        length = min(pkt[IP].len, 1500)  # Packets over 1500 are rounded down to 1500
        direction = 0 if pkt[IP].src in CLIENTS else 1
        time_round = round(pkt.time - start_time, 1)
        # Clamp both indices into range. Without the lower bound, an offset that
        # rounds to 0.0 (e.g. the first packet) or a length below 10 produced
        # index -1, which silently wrapped around to the LAST bin.
        time_idx = min(max(int(time_round * 10) - 1, 0), n_time_bins - 1)
        length_idx = min(max(int(length / 10) - 1, 0), n_length_bins - 1)
        matrix[time_idx][length_idx][direction] += 1
    return matrix
    
    

In [28]:
features_list = []
labels_list = []

for file in clips:
    clip = rdpcap(file)
    # Only captures with more than MIN_PACKETS packets are used.
    if len(clip) > MIN_PACKETS:
        features_list.append(extract_features_from_clips(clip))
        # The label is the part of the file name before the first "-".
        # basename() is used instead of slicing by len(PATH) so this still works
        # when PATH ends with a path separator.
        # NOTE(review): a site name that itself contains "-" is cut at its first
        # hyphen here - confirm whether that is acceptable for this dataset.
        label = os.path.basename(file).split("-")[0]
        # label = re.split(r"\d", label)[0]   # Uncomment for use with ICSX dataset
        # label = re.split("_[AB]", label)[0]    # Uncomment for use with ICSX dataset
        labels_list.append(label)
    # Drop the reference to the parsed capture before the next file is read.
    del clip


In [29]:
# Convert the per-clip lists into arrays (one entry per clip along axis 0).
features, labels = np.array(features_list), np.array(labels_list)

# Report both shapes so a mismatch between samples and labels is easy to spot.
print(f"features shape {features.shape}\nlabels shape {labels.shape}")

features shape (83, 200, 150, 2)
labels shape (83,)


In [30]:
# Distinct labels and how many clips carry each of them.
unique, counts = np.unique(labels, return_counts=True)

# Pair each label with its count, one (label, count) row per label, and display it.
counts = np.array([unique, counts]).transpose()
counts

array([['1688.com', '1'],
       ['365ball.com', '1'],
       ['3dmgame.com', '1'],
       ['52pojie.cn', '1'],
       ['aliexpress.com', '1'],
       ['amazon.com', '2'],
       ['anhdep24.com', '1'],
       ['asos.com', '1'],
       ['babayu.tv', '1'],
       ['beautifulbeachtab.com', '1'],
       ['betaseries.com', '1'],
       ['biccamera.com', '1'],
       ['bnu.edu.cn', '1'],
       ['cdsc.com.np', '1'],
       ['cmu.edu', '1'],
       ['csdn.net', '1'],
       ['delfi.lt', '1'],
       ['diandongwajueji.com', '1'],
       ['etsy.com', '2'],
       ['etsystatic.com', '1'],
       ['fashionnova.com', '1'],
       ['fruitmail.net', '1'],
       ['ga', '1'],
       ['geneanet.org', '1'],
       ['google.com', '1'],
       ['hentai2read.com', '1'],
       ['hlsplayer.net', '1'],
       ['ikman.lk', '1'],
       ['imdb.com', '1'],
       ['javfinder.li', '1'],
       ['jcrew.com', '1'],
       ['kelatv.com', '1'],
       ['link1s.com', '1'],
       ['lnk.to', '1'],
       ['loawa.com'

In [31]:
# save = input("Do you want to save? y/n") == "y"
save = True

if save:
    # Timestamp the file names so repeated runs do not overwrite earlier results.
    date_time_format = '%Y_%m_%d__%H_%M_%S'
    current_datetime = dt.now()
    current_datetime_string = current_datetime.strftime(date_time_format)

    # open(..., "wb") does not create missing directories; make sure the
    # output directory exists instead of failing with FileNotFoundError.
    os.makedirs("data", exist_ok=True)

    with open(f"data/my_{TIMEFRAME}_sec_features-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, features)

    with open(f"data/my_{TIMEFRAME}_sec_labels-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, labels)