In [1]:
import numpy as np
import pandas as pd
from importlib import reload

In [2]:
# define constants
TRACE_FILE_NAME = 'data/APP_DOWNLOAD.pcapng' # replace with your raw trace .pcapng/.pcap file name
TRACE_TCP_PACKET_FEATURE_FILE_NAME = 'data/APP_DOWNLOAD_tcp_pkt.csv' # replace with your favorite tcp packet feature .csv file name
TRACE_UDP_PACKET_FEATURE_FILE_NAME = 'data/APP_DOWNLOAD_udp_pkt.csv' # replace with your favorite udp packet feature .csv file name
TRACE_PACKET_FEATURE_FILE_NAME = 'data/APP_DOWNLOAD_pkt.csv' # replace with your favorite packet feature .csv file name
LOCAL_IP = '172.16.26.207' # your local ip

## Extract TCP packets

In [3]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.tcp_generate(TRACE_FILE_NAME,TRACE_TCP_PACKET_FEATURE_FILE_NAME)

Packet feature file already exists.
CPU times: user 112 µs, sys: 72 µs, total: 184 µs
Wall time: 261 µs


In [4]:
# read in packet feature csv file and do some transformation
import ipaddress
tcp_pkt_feature_df = pd.read_csv(TRACE_TCP_PACKET_FEATURE_FILE_NAME)
record_num = tcp_pkt_feature_df.shape[0]
tcp_pkt_feature_df['remote_ip'] = tcp_pkt_feature_df.apply(lambda row:row['ip.dst'] if row['ip.src'] == LOCAL_IP else row['ip.src'],axis=1) if record_num > 0 else None
tcp_pkt_feature_df['remote_ip2num'] = tcp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
tcp_pkt_feature_df['protocol'] = 'tcp' if record_num > 0 else None
tcp_pkt_feature_df['is_tcp'] = 1 if record_num > 0 else None
tcp_pkt_feature_df['is_udp'] = 0 if record_num > 0 else None
tcp_pkt_feature_df.rename(columns={'tcp.len':'pkt_len'},inplace=True)

In [5]:
# view the shape of the dataset: (number of records, number of features)
tcp_pkt_feature_df.shape

(351074, 17)

In [6]:
# view the data types for each feature
tcp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [7]:
# view the statistical features of each numerical feature
tcp_pkt_feature_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip2num,is_tcp,is_udp
count,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0,351074.0
mean,7105.10183,42534.89817,1105.121798,69.490494,166443700.0,28275380.0,0.999997,6e-06,0.0,0.0,301818316.0,1.0,0.0
std,17269.940138,17269.940138,449.55112,19.122326,123884100.0,81207580.0,0.001688,0.002387,0.0,0.0,0.0,0.0,0.0
min,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,301818316.0,1.0,0.0
25%,80.0,49560.0,1288.0,63.618433,48839380.0,2341.0,1.0,0.0,0.0,0.0,301818316.0,1.0,0.0
50%,80.0,49560.0,1288.0,72.962957,161884900.0,2341.0,1.0,0.0,0.0,0.0,301818316.0,1.0,0.0
75%,80.0,49560.0,1288.0,81.773172,274930400.0,2341.0,1.0,0.0,0.0,0.0,301818316.0,1.0,0.0
max,49560.0,49560.0,1288.0,90.545018,387975900.0,387969500.0,1.0,1.0,0.0,0.0,301818316.0,1.0,0.0


In [8]:
# view the first 5 records
tcp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,172.16.26.207,17.253.97.204,49560,80,0,0.0,0,0,0,1,0,0,17.253.97.204,301818316,tcp,1,0
1,17.253.97.204,172.16.26.207,80,49560,0,0.003427,0,1,1,1,0,0,17.253.97.204,301818316,tcp,1,0
2,172.16.26.207,17.253.97.204,49560,80,0,0.003474,1,1,1,0,0,0,17.253.97.204,301818316,tcp,1,0
3,172.16.26.207,17.253.97.204,49560,80,559,0.004402,1,1,1,0,0,0,17.253.97.204,301818316,tcp,1,0
4,17.253.97.204,172.16.26.207,80,49560,0,0.008544,1,560,1,0,0,0,17.253.97.204,301818316,tcp,1,0


## Extract UDP packets

In [10]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.udp_generate(TRACE_FILE_NAME,TRACE_UDP_PACKET_FEATURE_FILE_NAME)

KeyboardInterrupt: 

In [11]:
# read in packet feature csv file and do some transformation
import ipaddress
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
record_num = udp_pkt_feature_df.shape[0]
udp_pkt_feature_df['remote_ip'] = udp_pkt_feature_df.apply(lambda row:row['ip.dst'] if row['ip.src'] == LOCAL_IP else row['ip.src'],axis=1) if record_num > 0 else None
udp_pkt_feature_df['remote_ip2num'] = udp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
udp_pkt_feature_df['protocol'] = 'udp' if record_num > 0 else None
udp_pkt_feature_df['is_tcp'] = 0 if record_num > 0 else None
udp_pkt_feature_df['is_udp'] = 1 if record_num > 0 else None
udp_pkt_feature_df.rename(columns={'udp.length':'pkt_len'},inplace=True)

EmptyDataError: No columns to parse from file

In [None]:
# view the shape of the dataset: (number of records, number of features)
udp_pkt_feature_df.shape

In [None]:
# view the data types for each feature
udp_pkt_feature_df.dtypes

In [None]:
# view the statistical features of each numerical feature
udp_pkt_feature_df.describe()

In [None]:
# view the first 5 records
udp_pkt_feature_df.head()

## Combine TCP with UDP packets

In [13]:
# combine dataframes
pkt_feature_df = tcp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']].append(udp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']],ignore_index=True)

In [14]:
# shape
pkt_feature_df.shape

(351074, 4)

In [15]:
# column types
pkt_feature_df.dtypes

remote_ip2num    int64
is_tcp           int64
is_udp           int64
pkt_len          int64
dtype: object

In [16]:
# describe
pkt_feature_df.describe()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
count,351074.0,351074.0,351074.0,351074.0
mean,301818316.0,1.0,0.0,1105.121798
std,0.0,0.0,0.0,449.55112
min,301818316.0,1.0,0.0,0.0
25%,301818316.0,1.0,0.0,1288.0
50%,301818316.0,1.0,0.0,1288.0
75%,301818316.0,1.0,0.0,1288.0
max,301818316.0,1.0,0.0,1288.0


In [17]:
# head 5 records
pkt_feature_df.head()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
0,301818316,1,0,0
1,301818316,1,0,0
2,301818316,1,0,0
3,301818316,1,0,559
4,301818316,1,0,0


In [18]:
# write to csv
pkt_feature_df.to_csv(TRACE_PACKET_FEATURE_FILE_NAME, index=False)