In [None]:
# Load library
import os
import subprocess
import shlex
import shutil

from scapy.all import *
from PIL import Image
import numpy as np

In [None]:
# Setup variables
# Configuration shared by the cells below; change values here and re-run the notebook.
IS_RESET = False  # when True, the reset cell deletes SPLIT_DATA and IMG_DATA
IMAGE_SIZE = 39  # side length of the square image; pkt2vec keeps IMAGE_SIZE*IMAGE_SIZE bytes
ROOT_DATA = './data'  # root directory scanned for the original captures by the split step
SPLIT_DATA = './splited_data'  # output of the split step and input of the image step
PAYLOAD_MIN = 1184  # minimum TCP/UDP payload length (bytes) for a packet to be used
FMT = '08d'  # format spec for the running image number used as the PNG file name
COUNT = 1000  # threshold of qualifying packets per pcap file checked by stop_filter
# Output root for the images; the directory name encodes PAYLOAD_MIN and COUNT.
IMG_DATA = f'./img_data_{PAYLOAD_MIN}_f{COUNT}'

In [None]:
# Optional clean slate: when IS_RESET is set, remove the generated output
# directories (split captures first, then images) if they exist.
if IS_RESET:
    for generated_dir in (SPLIT_DATA, IMG_DATA):
        if os.path.exists(generated_dir):
            shutil.rmtree(generated_dir)

In [None]:
def pkt2vec(pkt):
    """Render the IP layer of ``pkt`` as a square, 3-channel uint8 image array.

    The serialized IP layer is read as a flat byte sequence, truncated or
    zero-padded on the right to exactly ``IMAGE_SIZE * IMAGE_SIZE`` bytes,
    laid out row by row in an ``IMAGE_SIZE`` x ``IMAGE_SIZE`` grid, and
    replicated into three identical channels.

    Args:
        pkt: packet indexable by ``'IP'``; the caller must make sure the
            layer is present.

    Returns:
        numpy.ndarray of shape ``(IMAGE_SIZE, IMAGE_SIZE, 3)``, dtype uint8.
    """
    n_pixels = IMAGE_SIZE * IMAGE_SIZE
    # Slice the bytes first so that no more than n_pixels values are converted.
    ip_bytes = raw(pkt['IP'])[:n_pixels]
    # Map the bytes directly to uint8 instead of round-tripping through a hex
    # string and a Python-level int() loop.
    arr = np.frombuffer(ip_bytes, dtype=np.uint8)
    # Right-pad short packets with zeros (np.pad returns a new, writable array).
    arr = np.pad(arr, (0, n_pixels - arr.size), 'constant', constant_values=0)
    fv = arr.reshape(-1, IMAGE_SIZE)
    # Same grayscale plane in all three channels.
    return np.stack((fv, fv, fv), axis=2)

In [None]:
# read pcap files
def read_pcap(root_dir, ext=('.pcap', '.pcapng')):
    """Walk ``root_dir`` and yield ``(label, path)`` for every capture file.

    Directories are visited with an explicit stack (no recursion). Entries
    whose name starts with '.' are skipped, both files and directories.
    The label of a file is the name of the directory that contains it.

    Args:
        root_dir: directory to start from.
        ext: suffix, or tuple of suffixes, a file name must end with.

    Yields:
        Tuples ``(label, path)`` where ``path`` is the entry path as reported
        by ``os.scandir``.
    """
    pending = [root_dir]
    while pending:
        current_dir = pending.pop()
        with os.scandir(current_dir) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue  # hidden entry: neither yielded nor descended into
                if entry.is_file():
                    if entry.name.endswith(ext):
                        parent = os.path.dirname(entry.path)
                        yield os.path.basename(parent), entry.path
                elif entry.is_dir():
                    pending.append(entry.path)

In [None]:
def split_pcap(src, dst):
    """Run ``PcapSplitter`` on ``src`` in ``connection`` mode, writing into ``dst``.

    ``dst`` is created if it does not exist. The external ``PcapSplitter``
    executable must be available on PATH.

    Args:
        src: path of the capture file to split.
        dst: output directory handed to PcapSplitter via ``-o``.

    Returns:
        None. A non-zero exit status is reported on stdout but does not raise,
        so one bad capture does not abort a batch.
    """
    os.makedirs(dst, exist_ok=True)
    # Build argv as a list instead of formatting a command line and re-parsing
    # it with shlex: paths containing spaces or quotes would otherwise be
    # split into several arguments.
    cmd = ['PcapSplitter', '-f', str(src), '-o', str(dst), '-m', 'connection']
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f'PcapSplitter exited with status {result.returncode} for {src}')

In [None]:
# Do not re-run!
## Commented out on purpose, as a safeguard against accidental re-runs.
# pcap_path = os.path.abspath(os.path.expanduser(ROOT_DATA))
# for label, path in read_pcap(pcap_path):
#     dst = os.path.abspath(os.path.expanduser(os.path.join(SPLIT_DATA, label)))
#     split_pcap(path, dst)

In [None]:
def pkt2img(base, label, cnt):
    """Build a per-packet callback that stores qualifying packets as PNG files.

    A packet qualifies when it has an IP layer carrying TCP or UDP whose
    payload is at least PAYLOAD_MIN bytes long. Each qualifying packet is
    converted with pkt2vec and saved in ``base`` under the running number
    kept for ``label`` in ``cnt``.

    Args:
        base: directory the PNG files are written to.
        label: key into ``cnt``; also shown in the progress output.
        cnt: dict mapping label -> number of images written so far; it is
            updated in place.

    Returns:
        A function taking one packet and returning None.
    """
    def process_pkt(pkt):
        if not pkt.haslayer('IP'):
            return
        ip = pkt['IP']
        # TCP takes precedence over UDP; anything else is ignored.
        l4 = next((proto for proto in ('TCP', 'UDP') if ip.haslayer(proto)), None)
        if l4 is None:
            return
        payload_len = len(raw(ip[l4].payload))
        if payload_len < PAYLOAD_MIN:
            return

        fv = pkt2vec(pkt)
        num = cnt.get(label, 0)
        cnt[label] = num + 1  # bumped before saving, so a failed save still consumes the number
        dst = os.path.join(base, f'{num:{FMT}}.png')
        Image.fromarray(fv).save(dst)

        # Progress: rewrite the current line in place, except when num % 100 == 1,
        # where a full line is emitted.
        if num % 100 != 1:
            print(f'\r{label}: {num} Processed', end='')
        else:
            print(f'{label}: {num} Processed')
    return process_pkt

def stop_filter():
    """Build the ``stop_filter`` callback that ends ``sniff`` after COUNT packets.

    The callback applies the same criteria as pkt2img (IP layer, TCP or UDP,
    payload of at least PAYLOAD_MIN bytes) and counts qualifying packets in
    the module-level ``current`` variable, which the caller must reset to 0
    before each capture file.

    Returns:
        A function taking one packet and returning True when sniffing should
        stop, False otherwise.
    """
    def process_pkt(pkt):
        global current
        if not pkt.haslayer('IP'):
            return False
        ip = pkt['IP']
        if ip.haslayer('TCP'):
            l4 = 'TCP'
        elif ip.haslayer('UDP'):
            l4 = 'UDP'
        else:
            return False
        if len(raw(ip[l4].payload)) < PAYLOAD_MIN:
            return False
        current += 1
        # sniff() calls prn before stop_filter for every packet, so the packet
        # counted here has already been handled by prn. Stop ON the COUNT-th
        # qualifying packet; the former `current > COUNT` test let one extra
        # packet through.
        return current >= COUNT
    return process_pkt


# Convert every split capture into images, one output folder per label.
splited_path = os.path.abspath(os.path.expanduser(SPLIT_DATA))
cnt = {}     # label -> images written so far; shared across all files of a label
current = 0  # per-file counter advanced by the stop_filter callback

for label, path in read_pcap(splited_path):
    base = os.path.abspath(os.path.expanduser(os.path.join(IMG_DATA, label)))
    os.makedirs(base, exist_ok=True)
    current = 0  # start counting afresh for this capture file
    sniff(
        offline=path,
        store=False,
        prn=pkt2img(base, label, cnt),
        stop_filter=stop_filter(),
    )
    print()  # terminate the in-place progress line left by pkt2img
print(cnt)