In [None]:
# Load libraries
import os
import pickle

import matplotlib.pyplot as plt
# %matplotlib widget
%matplotlib inline

from scapy.all import *

import pandas as pd

# Calculate packet size
## Archive

In [None]:
# read pcap files
def read_pcap(root_dir, ext=('.pcap', '.pcapng')):
    """Walk `root_dir` and lazily yield every capture file found beneath it.

    Entries whose name starts with '.' are ignored, both files and
    directories, so hidden directories are never descended into.

    Args:
        root_dir: Directory where the walk starts.
        ext: Suffix, or tuple of suffixes, a file name must end with
            (handed straight to `str.endswith`).

    Yields:
        `(label, path)` tuples, where `label` is the name of the directory
        that directly contains the file and `path` is the file's path as
        reported by `os.scandir`.
    """
    pending = [root_dir]
    while pending:
        # Directories are taken from the end of the list (most recent first).
        current = pending.pop()
        with os.scandir(current) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue
                if entry.is_file():
                    if entry.name.endswith(ext):
                        # The containing directory's name doubles as the label.
                        parent = os.path.dirname(entry.path)
                        yield os.path.basename(parent), entry.path
                elif entry.is_dir():
                    pending.append(entry.path)

In [None]:
SPLIT_DATA = './dataset/pcap_data_splited'

def pkt2img(label, cnt):
    """Create a per-packet callback that tallies L4 payload sizes for `label`.

    Args:
        label: Key under which the tallies are stored in `cnt`.
        cnt: Dict updated in place; after processing it maps
            `label -> {payload size in bytes: number of packets}`.

    Returns:
        A one-argument function taking a packet. Packets without an IP
        layer, or whose IP layer carries neither TCP nor UDP, are ignored.
    """
    def process_pkt(pkt):
        if not pkt.haslayer('IP'):
            return
        ip = pkt['IP']
        # TCP is checked first, so it wins if both layers are present.
        l4 = next((proto for proto in ('TCP', 'UDP') if ip.haslayer(proto)), None)
        if l4 is None:
            return
        # The label's bucket is created before the payload is measured.
        sizes = cnt.setdefault(label, dict())
        size = len(raw(ip[l4].payload))
        sizes[size] = sizes.get(size, 0) + 1
    return process_pkt

# Resolve the dataset root to an absolute path (a leading "~" is expanded).
splited_path = os.path.abspath(os.path.expanduser(SPLIT_DATA))
# cnt maps label -> {payload size: packet count}; the sniff callbacks fill it in place.
cnt = dict()
for label, path in read_pcap(splited_path):
    # end='\r' returns the cursor to the line start so each message overwrites the last.
    print(f'Current processing: {label} {path}', end='\r')
    # Read the capture from disk and run the callback on every packet;
    # store=False asks scapy not to keep the packets after the callback.
    sniff(offline=path, prn=pkt2img(label, cnt), store=False)

# Convert each label's {size: count} dict into a DataFrame ordered by size.
cntl = dict()
for label, value in cnt.items():
    cntl[label] = pd.DataFrame(sorted(value.items()), columns=['size', 'count'])
    
# with open('packetsize.pickle', 'wb') as f:
#     pickle.dump(cntl, f)

# Draw Packet size chart
## Restore packet size pickle

In [None]:
# import pickle

# with open('packetsize.pickle', 'rb') as f:
#     cntl = pickle.load(f)

In [None]:
# Payload sizes (bytes) at or above this value are redrawn in a darker colour.
HIGHLIGHT_MIN_SIZE = 152

fig1 = plt.figure(figsize=(16, 4.5), tight_layout=True)

lkey = len(cntl.keys())
# The grid has two rows. Round the column count up (and never let it reach 0)
# so an odd number of labels, or a single label, still gets one cell each;
# with `lkey//2` columns add_subplot failed in those cases.
ncols = max(1, (lkey + 1) // 2)

for i, label in enumerate(sorted(cntl.keys())):
    sizes = cntl[label]
    ax = fig1.add_subplot(2, ncols, i+1)
    # `use_line_collection` is intentionally not passed: it was removed from
    # Axes.stem in Matplotlib 3.8, and line collections are the default
    # behaviour since 3.3.
    ax.stem(sizes['size'], sizes['count'],
            markerfmt='None', linefmt='darkgray')
    td = sizes[sizes['size']>=HIGHLIGHT_MIN_SIZE]
    # Skip the overlay when no size reaches the threshold, so stem is never
    # called with empty data.
    if not td.empty:
        ax.stem(td['size'], td['count'],
                markerfmt='None', linefmt='dimgray')
    # NOTE(review): both this ratio and the "(…k)" in the x label count rows,
    # i.e. distinct payload sizes, not packets. If packet totals were meant,
    # weight by the 'count' column instead — confirm.
    orate = round(td.shape[0] / sizes.shape[0], 2)
    ax.text(1500, 10**4*1.5, orate, color='black')
    ax.set_yscale('log')
    ax.set_xlim([0, 2000])
    ax.set_xticks([0, 500, 1000, 1500, 2000])
    ax.set_ylim([1, 10**5*1.1])
    ax.set_yticks([10**x for x in range(1, 6)])
    ax.set_xlabel(f'{label} ({round(sizes.shape[0]/1000, 2)}k)', size='x-large')
    ax.margins(10, 1000)
    # Only the first subplot of each row keeps its y tick labels.
    if i%ncols != 0:
        ax.set_yticklabels([])

In [None]:
# Payload-size cut-off (bytes) for the second tally below.
# NOTE(review): the heading used to say 1148 while the filter used 1184. The
# filter value is kept and the heading now follows it — confirm which number
# was intended.
SIZE_THRESHOLD = 1184

# Per-label packet totals (sum of the 'count' column).
print('The number of packets by applications')
for label in sorted(cntl.keys()):
    print(f'{label} {sum(cntl[label]["count"])}')
# Per-label totals restricted to payload sizes at or above the cut-off.
print(f'The number of packets over size {SIZE_THRESHOLD} by applications')
for label in sorted(cntl.keys()):
    td = cntl[label][cntl[label]['size']>=SIZE_THRESHOLD]
    print(f'{label} {sum(td["count"])}')
# Grand total across every label. The previous line only used whichever label
# the loop above ended on, and nested single quotes inside a single-quoted
# f-string, which is a SyntaxError before Python 3.12.
total_packets = sum(sum(frame["count"]) for frame in cntl.values())
print(f'The sum of packets {total_packets}')