In [5]:
import os
import os.path
import sys
import pandas as pd
import numpy as np

In [6]:
# Notebook configuration: edit these two paths to point at your own capture.
TRACE_FILE_NAME = 'data/bilibili.pcapng' # input: raw packet trace (.pcapng or .pcap) that tshark reads
TRACE_FEATURE_FILE_NAME = 'data/bilibili.csv' # output: per-packet feature .csv; extraction is skipped when this file already exists

In [7]:
import subprocess

# Extract per-packet TCP fields from the raw capture into a CSV, once.
# The CSV doubles as a cache: if it already exists, tshark is not run again.
if not os.path.exists(TRACE_FEATURE_FILE_NAME):
    # Fields exported for every TCP packet; with header=y they become the CSV header row.
    tshark_fields = [
        'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
        'frame.time_relative', 'tcp.seq', 'tcp.ack',
        'tcp.flags.ack', 'tcp.flags.syn', 'tcp.flags.fin', 'tcp.stream',
    ]
    # Argument list instead of a shell string: file names containing spaces or
    # shell metacharacters are passed to tshark verbatim and nothing is interpreted.
    tshark_args = ['tshark', '-r', TRACE_FILE_NAME, '-Y', 'tcp', '-T', 'fields']
    for field_name in tshark_fields:
        tshark_args += ['-e', field_name]
    tshark_args += ['-Eheader=y', '-Eseparator=,']

    # Write to a temporary file and rename only on success, so a failed or
    # interrupted run never leaves a truncated CSV that the cache check above
    # would later mistake for a valid extraction.
    partial_file_name = TRACE_FEATURE_FILE_NAME + '.part'
    try:
        with open(partial_file_name, 'wb') as feature_file:
            tshark_command = subprocess.Popen(tshark_args, stdout=feature_file, stderr=subprocess.PIPE)
            _, err_data = tshark_command.communicate()
        err_data = err_data.decode('utf-8', errors='replace')
        if tshark_command.returncode != 0:
            raise RuntimeError('tshark exited with status {}: {}'.format(tshark_command.returncode, err_data.strip()))
        os.replace(partial_file_name, TRACE_FEATURE_FILE_NAME)
    finally:
        # Only present if tshark could not be started or failed part-way.
        if os.path.exists(partial_file_name):
            os.remove(partial_file_name)
    if err_data != '':
        # tshark succeeded but printed warnings; surface them instead of dropping them.
        print(err_data, file=sys.stderr)

In [8]:
# Load the per-packet feature table and add "ip:port" endpoint labels for
# the sending and receiving side of every packet.
trace_df = pd.read_csv(TRACE_FEATURE_FILE_NAME).assign(
    src_addr=lambda packets: packets['ip.src'] + ":" + packets['tcp.srcport'].map(str),
    dst_addr=lambda packets: packets['ip.dst'] + ":" + packets['tcp.dstport'].map(str),
)

In [10]:
# Size of the loaded packet table as (rows, columns).
trace_df.shape

(98181, 14)

In [11]:
# Column dtypes as parsed by read_csv, plus the two derived address columns.
trace_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
tcp.len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
src_addr                object
dst_addr                object
dtype: object

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
trace_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream
count,98181.0,98181.0,98181.0,98181.0,98181.0,98181.0,98181.0,98181.0,98181.0,98181.0
mean,12676.712276,43150.949125,974.612552,130.849073,17551480.0,4773411.0,0.997586,0.001222,0.0033,40.864597
std,22855.641473,22853.048173,543.610293,88.58776,15605730.0,11252960.0,0.049072,0.034939,0.057351,17.156651
min,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,443.0,55371.0,813.0,43.046007,507991.0,1186.0,1.0,0.0,0.0,23.0
50%,443.0,55371.0,1300.0,172.259552,15458870.0,1540.0,1.0,0.0,0.0,56.0
75%,443.0,55392.0,1300.0,211.174497,30988440.0,2473.0,1.0,0.0,0.0,56.0
max,55418.0,55418.0,1300.0,358.903394,50413830.0,50413800.0,1.0,1.0,1.0,88.0


In [13]:
# Preview the first rows of the packet table.
trace_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,src_addr,dst_addr
0,47.91.74.133,172.16.26.207,443,55329,31,0.0,1,1,1,0,0,0,47.91.74.133:443,172.16.26.207:55329
1,47.91.74.133,172.16.26.207,443,55329,0,4e-06,32,1,1,0,1,0,47.91.74.133:443,172.16.26.207:55329
2,172.16.26.207,47.91.74.133,55329,443,0,5.1e-05,1,32,1,0,0,0,172.16.26.207:55329,47.91.74.133:443
3,172.16.26.207,47.91.74.133,55329,443,0,8.2e-05,1,33,1,0,0,0,172.16.26.207:55329,47.91.74.133:443
4,172.16.26.207,47.91.74.133,55329,443,31,0.000138,1,33,1,0,0,0,172.16.26.207:55329,47.91.74.133:443


In [14]:
from tqdm import tqdm
def to_feature_df(raw_trace_df, sampling_rate=1.0, upsampled=False):
    """Aggregate a per-packet TCP trace into one feature row per ``tcp.stream``.

    Parameters
    ----------
    raw_trace_df : pandas.DataFrame
        One row per packet. Must contain the columns ``tcp.stream``,
        ``tcp.len``, ``frame.time_relative``, ``src_addr`` and ``dst_addr``.
        The frame is not modified.
    sampling_rate : float, default 1.0
        Fraction of packets that were kept when the trace was sampled. It is
        used in the progress-bar label and, when ``upsampled`` is true, as the
        divisor that scales counts and byte sums.
    upsampled : bool, default False
        When true, packet counts and byte sums (per flow and per direction)
        are divided by ``sampling_rate``. Averages, min/max and standard
        deviations are never scaled.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``tcp.stream`` with the whole-flow columns
        ``avg/stddev/min/max(tcp_pkt_len)``, ``tot_pkt``, ``tot_byte``,
        ``rel_start``, ``duration`` followed by, for each direction
        (``forw`` then ``back``), ``avg/std/min/max(<dir>_pkt_len)``,
        ``count(<dir>_pkt)`` and ``sum(<dir>_pkt_len)``. Statistics that are
        undefined (e.g. std of a single packet, any statistic of an empty
        direction) are reported as ``-1``.

    Raises
    ------
    ValueError
        If ``upsampled`` is true and ``sampling_rate`` is not positive, or if
        a stream does not involve exactly two distinct endpoints.

    Notes
    -----
    The forward direction of a stream is the direction of its earliest packet
    (smallest ``frame.time_relative``), so the forward/backward assignment is
    reproducible across runs.
    """
    if upsampled and not sampling_rate > 0:
        raise ValueError('sampling_rate must be positive when upsampled=True, got {!r}'.format(sampling_rate))

    def or_missing(value):
        # -1 is the sentinel for statistics that are undefined for the given packets.
        return -1 if pd.isnull(value) else value

    def direction_stats(pkt_lens, label):
        """Statistics of the packet lengths travelling in one direction of a flow."""
        total = pkt_lens.sum()
        if pd.isnull(total):
            total = -1
        elif upsampled:
            total = total / sampling_rate
        count = pkt_lens.count()
        if upsampled:
            count = count / sampling_rate
        return {
            'avg(' + label + ')': or_missing(pkt_lens.mean()),
            'std(' + label + ')': or_missing(pkt_lens.std()),
            'min(' + label + ')': or_missing(pkt_lens.min()),
            'max(' + label + ')': or_missing(pkt_lens.max()),
            # label[0:8] keeps only "forw_pkt" / "back_pkt" for the count column name.
            'count(' + label[0:8] + ')': count,
            'sum(' + label + ')': total,
        }

    def two_way_stats(stream_df):
        """Forward and backward packet-length statistics for a single stream."""
        endpoints = pd.unique(stream_df[['src_addr', 'dst_addr']].values.ravel())
        if len(endpoints) != 2:
            raise ValueError('expected exactly 2 endpoints in a TCP stream, found {}: {}'.format(len(endpoints), list(endpoints)))
        # Rows arrive in time order (see the sort below), so the first row's
        # sender defines "forward". A set would order the endpoints by string
        # hash, which changes from run to run.
        forward_addr = stream_df['src_addr'].iloc[0]
        backward_addr = endpoints[0] if endpoints[1] == forward_addr else endpoints[1]
        stats = direction_stats(stream_df.loc[stream_df['src_addr'] == forward_addr, 'tcp.len'], 'forw_pkt_len')
        stats.update(direction_stats(stream_df.loc[stream_df['src_addr'] == backward_addr, 'tcp.len'], 'back_pkt_len'))
        return pd.Series(stats)

    # Stable sort by capture time: returns a copy (input untouched) and
    # guarantees that each group's first row is the stream's earliest packet.
    ordered_df = raw_trace_df.sort_values('frame.time_relative', kind='mergesort')
    by_stream = ordered_df.groupby('tcp.stream')
    pkt_len = by_stream['tcp.len']
    rel_time = by_stream['frame.time_relative']

    tcp_flow_df = pd.DataFrame({
        'avg(tcp_pkt_len)': pkt_len.mean(),
        'stddev(tcp_pkt_len)': pkt_len.std().fillna(-1),
        'min(tcp_pkt_len)': pkt_len.min(),
        'max(tcp_pkt_len)': pkt_len.max(),
        'tot_pkt': pkt_len.count(),
        'tot_byte': pkt_len.sum(),
        'rel_start': rel_time.min(),
    })
    tcp_flow_df['duration'] = rel_time.max() - tcp_flow_df['rel_start']

    if upsampled:
        # Scale totals back up to estimate the unsampled trace.
        tcp_flow_df['tot_pkt'] /= sampling_rate
        tcp_flow_df['tot_byte'] /= sampling_rate
        progress_label = '{} samp rate with upsampling'.format(sampling_rate)
    else:
        progress_label = '{} samp rate no upsampling'.format(sampling_rate)

    per_stream_packets = by_stream[['tcp.len', 'src_addr', 'dst_addr']]
    # The progress bar is cosmetic: use it when tqdm is installed, otherwise
    # fall back to a plain apply so the features can still be computed.
    try:
        from tqdm import tqdm
    except ImportError:
        two_way_flow_df = per_stream_packets.apply(two_way_stats)
    else:
        tqdm.pandas(desc=progress_label)
        two_way_flow_df = per_stream_packets.progress_apply(two_way_stats)

    return pd.concat([tcp_flow_df, two_way_flow_df], axis=1)

In [None]:
from tqdm import tqdm_notebook
# Export the flow features of the sampled trace twice: first with raw counts,
# then with counts and byte sums scaled up by the sampling rate.
for use_upsampling, csv_name_template in (
    (False, 'packet_rand_{PERCENT}%_no_upsampling.csv'),
    (True, 'packet_rand_{PERCENT}%_with_upsampling.csv'),
):
    flow_features = to_feature_df(sampled_df, sampling_rate, upsampled=use_upsampling)
    csv_path = os.path.join(LOCAL_PATH, csv_name_template.format(PERCENT=sampling_percent))
    flow_features.to_csv(csv_path)