# Purpose of this notebook

This notebook produces the NumPy arrays required by `ml_test.ipynb` from labeled 20-second pcaps.

In [1]:
import os
import numpy as np
import re
from scapy.all import *
from datetime import datetime as dt
from matplotlib import pyplot as plt



In [2]:
# Number of seconds per captured frame.
TIMEFRAME = 20

# Where the clips are saved, and the tag embedded in the saved array file names.
PATH = "pcaps/big_capture"
NAME = "100_plus_additional_large"

# A clip is only processed when it holds more than this many packets.
MIN_PACKETS = 20

# List of client IP addresses used in the dataset.
CLIENTS = ["192.168.254.0", "192.168.254.1", "192.168.254.2"]

In [3]:
# Gather the path of every entry in PATH whose name ends in ".pcap".
dir_list = os.listdir(PATH)
clips = [
    os.path.join(PATH, entry)
    for entry in dir_list
    if entry.endswith(".pcap")
]

In [4]:
def extract_features_from_clips(clip):
    """Build a time / size / direction packet histogram for one capture.

    Every packet that has an IP layer is counted in a 3-D matrix indexed as
    ``[time bin, length bin, direction]``:

    * time bin: the packet timestamp relative to the timestamp of the first
      packet in ``clip``, rounded to 0.1 s (``TIMEFRAME * 10`` bins).
    * length bin: the IP ``len`` field, capped at 1500, in steps of 10
      (150 bins).
    * direction: 0 when the IP source address is listed in ``CLIENTS``,
      otherwise 1.

    Bin indices are clamped to the valid range. Without the clamp, a packet
    at offset ~0 s (always including the first packet) produced index -1 and
    was silently counted in the *last* time bin, and a packet past the end of
    the window raised ``IndexError``.

    Args:
        clip: Sequence of scapy packets (for example the result of ``rdpcap``).

    Returns:
        numpy.ndarray of shape ``(TIMEFRAME * 10, 150, 2)`` holding packet
        counts. All zeros when ``clip`` is empty.
    """
    n_time_bins = TIMEFRAME * 10
    n_length_bins = 150
    matrix = np.zeros([n_time_bins, n_length_bins, 2])

    # Nothing to bin, and clip[0] below would raise on an empty capture.
    if len(clip) == 0:
        return matrix

    start_time = clip[0].time
    for pkt in clip:
        if IP not in pkt:
            continue

        # Packets over 1500 are rounded down to 1500.
        length = min(pkt[IP].len, 1500)
        direction = 0 if pkt[IP].src in CLIENTS else 1
        time_round = round(pkt.time - start_time, 1)

        # Same binning as before (tenths of a second / tens of bytes, shifted
        # by one), but clamped so that an index of -1 no longer wraps around
        # to the last bin and an index past the end no longer raises.
        time_bin = min(max(int(time_round * 10) - 1, 0), n_time_bins - 1)
        length_bin = min(max(int(length / 10) - 1, 0), n_length_bins - 1)

        matrix[time_bin, length_bin, direction] += 1
    return matrix
    
    

In [5]:
# Build one feature matrix and one label for every clip that holds more than
# MIN_PACKETS packets.
features_list = []
labels_list = []

for file in clips:
    clip = rdpcap(file)
    if len(clip) > MIN_PACKETS:
        features_list.append(extract_features_from_clips(clip))
        # The label is the part of the file name before the first "-".
        # basename() replaces slicing off len(PATH) + 1 characters, which cut
        # into the file name whenever PATH ended with a path separator.
        label = os.path.basename(file).split("-")[0]
        # label = re.split(r"\d", label)[0]   # Uncomment for use with ICSX dataset
        # label = re.split("_[AB]", label)[0]    # Uncomment for use with ICSX dataset
        labels_list.append(label)
    # Drop the reference to the parsed capture before reading the next file.
    del clip


In [6]:
# Turn the accumulated per-clip results into NumPy arrays and report their shapes.
features, labels = np.array(features_list), np.array(labels_list)

print(f"features shape {features.shape}\nlabels shape {labels.shape}")

features shape (18937, 200, 150, 2)
labels shape (18937,)


In [7]:
# Count how many clips carry each label, then display the result as
# (label, count) rows. Combining both arrays yields a single string-typed array.
unique, counts = np.unique(labels, return_counts=True)

counts = np.transpose(np.asarray((unique, counts)))
counts

array([['0101.co.jp', '2'],
       ['0123movie.net', '1'],
       ['0123movies.com', '3'],
       ...,
       ['zzvips.com', '3'],
       ['zzzfun.com', '2'],
       ['zzzmh.cn', '2']], dtype='<U32')

In [8]:
# Write the feature and label arrays to timestamped .npy files under data/.
# save = input("Do you want to save? y/n") == "y"
save = True

if save:
    date_time_format = '%Y_%m_%d__%H_%M_%S'
    current_datetime = dt.now()
    current_datetime_string = current_datetime.strftime(date_time_format)

    # Make sure the output directory exists; otherwise open() raises
    # FileNotFoundError and the arrays computed above are never written.
    os.makedirs("data", exist_ok=True)

    # Both files share the same timestamp so a features/labels pair can be matched up.
    with open(f"data/my_{TIMEFRAME}_sec_features-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, features)

    with open(f"data/my_{TIMEFRAME}_sec_labels-{NAME}-{current_datetime_string}.npy", "wb") as f:
        np.save(f, labels)