In [1]:
import os
import os.path
import sys
import pandas as pd
import numpy as np
from importlib import reload

In [4]:
# Local working directory and the base name shared by every derived file.
LOCAL_PATH = 'data'
TRACE_NAME = 'youtube'

# File names derived from TRACE_NAME: the raw capture, the per-packet feature
# CSVs, and the per-flow feature CSVs.
RAW_TRACE = '{trace_name}.pcapng'.format(trace_name=TRACE_NAME)
TCP_TRACE_FEATURE_FILE = '{trace_name}_tcp_pkt.csv'.format(trace_name=TRACE_NAME)
UDP_TRACE_FEATURE_FILE = '{trace_name}_udp_pkt.csv'.format(trace_name=TRACE_NAME)
TCP_FLOW_FEATURE_FILE = '{trace_name}_tcp_flow.csv'.format(trace_name=TRACE_NAME)
UDP_FLOW_FEATURE_FILE = '{trace_name}_udp_flow.csv'.format(trace_name=TRACE_NAME)

# S3 location of the raw trace, used only when the trace is not already present
# under LOCAL_PATH. These names must exist because the download cell reads them;
# set the environment variables below or replace the defaults in place.
BUCKET_NAME = os.environ.get('TRACE_S3_BUCKET', '')  # replace with your bucket name
KEY = os.environ.get('TRACE_S3_KEY', '')  # replace with your object key

In [5]:
# Download the raw trace from S3 only when it is not already present locally.
raw_trace_path = os.path.join(LOCAL_PATH, RAW_TRACE)
if not os.path.exists(raw_trace_path):
    # Look the S3 coordinates up defensively: they may be undefined (left
    # commented out in the configuration cell) or empty. Fail with a readable
    # message instead of a NameError or an opaque boto3 validation error.
    bucket_name = globals().get('BUCKET_NAME')
    object_key = globals().get('KEY')
    if not bucket_name or not object_key:
        raise ValueError(
            "{path} is missing and cannot be downloaded: set BUCKET_NAME and KEY "
            "in the configuration cell first.".format(path=raw_trace_path)
        )

    # makedirs also handles a nested LOCAL_PATH and an already-existing directory.
    os.makedirs(LOCAL_PATH, exist_ok=True)

    # Imported lazily so boto3 is only required when a download is needed.
    import boto3
    import botocore

    s3 = boto3.resource('s3')

    try:
        s3.Bucket(bucket_name).download_file(object_key, raw_trace_path)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # Only reported, not raised: the trace file is still missing after
            # this cell finishes.
            print("The object does not exist.")
        else:
            raise

In [6]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.udp_generate(os.path.join(LOCAL_PATH, RAW_TRACE) , os.path.join(LOCAL_PATH, UDP_TRACE_FEATURE_FILE))
%time packet_feature.tcp_generate(os.path.join(LOCAL_PATH, RAW_TRACE) , os.path.join(LOCAL_PATH, TCP_TRACE_FEATURE_FILE))

Conversion done
CPU times: user 34.4 ms, sys: 37.6 ms, total: 72 ms
Wall time: 7.77 s
Conversion done
CPU times: user 1.33 s, sys: 104 ms, total: 1.43 s
Wall time: 8.61 s


In [12]:
def to_csv(trace_df, extract_func, file, features, max_packets_per_flow, time_delta_threshold):
    """Extract flow features from a packet trace and write them to a CSV file.

    Args:
        trace_df: Packet-level DataFrame handed to ``extract_func`` unchanged.
        extract_func: Callable invoked as ``extract_func(trace_df, 1.0,
            upsampled=True, max_packets_per_flow=..., time_delta_threshold=...)``;
            its result must expose ``.shape`` and support column selection.
        file: Destination path (or buffer) passed to ``DataFrame.to_csv``.
        features: Column names to write, in output order.
        max_packets_per_flow: Forwarded to ``extract_func``.
        time_delta_threshold: Forwarded to ``extract_func``.

    The CSV is written without the index. If the extractor yields no rows, a
    header-only CSV with the ``features`` columns is written instead.
    """
    flow_df = extract_func(
        trace_df,
        1.0,
        upsampled=True,
        max_packets_per_flow=max_packets_per_flow,
        time_delta_threshold=time_delta_threshold,
    )
    # An empty result may lack the feature columns, so build the header from
    # `features` rather than selecting from the extracted frame.
    if flow_df.shape[0] != 0:
        selected_df = flow_df[features]
    else:
        selected_df = pd.DataFrame(columns=features)
    selected_df.to_csv(file, index=False)

In [14]:
# Aggregate the per-packet feature CSVs into per-flow feature CSVs (TCP and UDP).
from python import flow_feature
reload(flow_feature)

# Columns written to each flow-feature CSV, in output order.
FEATURES = ['avg(pkt_len)', 'stddev(pkt_len)', 'fb_ratio', 'inter_arrival_time', 'pkt_count', 'duration']

# Extraction settings forwarded unchanged to flow_feature.*_generate via to_csv.
# NOTE(review): the unit of TIME_DELTA_THRESHOLD is not visible here - confirm
# against flow_feature.
MAX_PACKETS_PER_FLOW = 1000
TIME_DELTA_THRESHOLD = 1

tcp_trace_df = pd.read_csv(os.path.join(LOCAL_PATH, TCP_TRACE_FEATURE_FILE))
udp_trace_df = pd.read_csv(os.path.join(LOCAL_PATH, UDP_TRACE_FEATURE_FILE))

# Pair every trace with its extractor and output file explicitly instead of
# dispatching on object identity inside the loop. This also removes the old
# fall-through branch, whose bare `raise` had no active exception to re-raise.
for trace_df, extract_func, flow_file in [
    (tcp_trace_df, flow_feature.tcp_generate, TCP_FLOW_FEATURE_FILE),
    (udp_trace_df, flow_feature.udp_generate, UDP_FLOW_FEATURE_FILE),
]:
    to_csv(
        trace_df,
        extract_func,
        os.path.join(LOCAL_PATH, flow_file),
        FEATURES,
        max_packets_per_flow=MAX_PACKETS_PER_FLOW,
        time_delta_threshold=TIME_DELTA_THRESHOLD,
    )



[A[A



tcp flows, max pkt per flow->1000:   0%|          | 0/65980 [00:00<?, ?it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   0%|          | 1/65980 [00:00<1:51:04,  9.90it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   0%|          | 268/65980 [00:00<1:17:33, 14.12it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   1%|          | 465/65980 [00:00<54:17, 20.11it/s]  [A[A[A[A



tcp flows, max pkt per flow->1000:   1%|          | 794/65980 [00:00<37:54, 28.66it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   2%|▏         | 1102/65980 [00:00<26:31, 40.77it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   2%|▏         | 1396/65980 [00:00<18:35, 57.90it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   3%|▎         | 1662/65980 [00:00<13:04, 81.96it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   3%|▎         | 1956/65980 [00:00<09:13, 115.70it/s][A[A[A[A



tcp flows, max pkt per flow->1000:   3%|▎         | 2218/65980 [00:00<0

tcp flows, max pkt per flow->1000:  31%|███       | 20288/65980 [00:08<00:23, 1968.65it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  31%|███▏      | 20659/65980 [00:08<00:19, 2291.23it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  32%|███▏      | 20949/65980 [00:08<00:18, 2443.98it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  32%|███▏      | 21257/65980 [00:08<00:17, 2604.74it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  33%|███▎      | 21568/65980 [00:08<00:16, 2738.02it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  33%|███▎      | 21857/65980 [00:09<00:15, 2764.76it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  34%|███▎      | 22144/65980 [00:09<00:16, 2686.39it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  34%|███▍      | 22421/65980 [00:09<00:16, 2596.59it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  34%|███▍      | 22687/65980 [00:09<00:17, 2482.56it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  35%|███▍  

tcp flows, max pkt per flow->1000:  66%|██████▌   | 43220/65980 [00:17<00:07, 2848.38it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  66%|██████▌   | 43527/65980 [00:17<00:07, 2908.10it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  66%|██████▋   | 43846/65980 [00:17<00:07, 2985.75it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  67%|██████▋   | 44165/65980 [00:17<00:07, 3043.31it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  67%|██████▋   | 44527/65980 [00:17<00:06, 3195.72it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  68%|██████▊   | 44871/65980 [00:17<00:06, 3263.48it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  69%|██████▊   | 45201/65980 [00:17<00:06, 3177.26it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  69%|██████▉   | 45522/65980 [00:18<00:06, 3107.66it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  70%|██████▉   | 45868/65980 [00:18<00:06, 3204.13it/s][A[A[A[A



tcp flows, max pkt per flow->1000:  70%|██████