In [1]:
import numpy as np
import pandas as pd
from importlib import reload

In [2]:
# define constants
TRACE_FILE_NAME = 'data/LOL_AI.pcapng' # replace with your raw trace .pcapng/.pcap file name
TRACE_TCP_PACKET_FEATURE_FILE_NAME = 'data/LOL_AI_tcp_pkt.csv' # replace with your favorite tcp packet feature .csv file name
TRACE_UDP_PACKET_FEATURE_FILE_NAME = 'data/LOL_AI_udp_pkt.csv' # replace with your favorite udp packet feature .csv file name
TRACE_PACKET_FEATURE_FILE_NAME = 'data/LOL_AI_pkt.csv' # replace with your favorite packet feature .csv file name
LOCAL_IP = '172.16.26.207' # your local ip

## Extract TCP packets

In [3]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.tcp_generate(TRACE_FILE_NAME,TRACE_TCP_PACKET_FEATURE_FILE_NAME)

Conversion done
CPU times: user 4.22 ms, sys: 7.77 ms, total: 12 ms
Wall time: 5.64 s


In [4]:
# read in packet feature csv file and do some transformation
import ipaddress
tcp_pkt_feature_df = pd.read_csv(TRACE_TCP_PACKET_FEATURE_FILE_NAME)
record_num = tcp_pkt_feature_df.shape[0]
tcp_pkt_feature_df['remote_ip'] = tcp_pkt_feature_df.apply(lambda row:row['ip.dst'] if row['ip.src'] == LOCAL_IP else row['ip.src'],axis=1) if record_num > 0 else None
tcp_pkt_feature_df['remote_ip2num'] = tcp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
tcp_pkt_feature_df['protocol'] = 'tcp' if record_num > 0 else None
tcp_pkt_feature_df['is_tcp'] = 1 if record_num > 0 else None
tcp_pkt_feature_df['is_udp'] = 0 if record_num > 0 else None
tcp_pkt_feature_df.rename(columns={'tcp.len':'pkt_len'},inplace=True)

In [5]:
# view the shape of the dataset: (number of records, number of features)
tcp_pkt_feature_df.shape

(0, 17)

In [6]:
# view the data types for each feature
tcp_pkt_feature_df.dtypes

ip.src                 object
ip.dst                 object
tcp.srcport            object
tcp.dstport            object
pkt_len                object
frame.time_relative    object
tcp.seq                object
tcp.ack                object
tcp.flags.ack          object
tcp.flags.syn          object
tcp.flags.fin          object
tcp.stream             object
remote_ip              object
remote_ip2num          object
protocol               object
is_tcp                 object
is_udp                 object
dtype: object

In [7]:
# view the statistical features of each numerical feature
tcp_pkt_feature_df.describe()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
count,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
unique,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# view the first 5 records
tcp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp


## Extract UDP packets

In [9]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.udp_generate(TRACE_FILE_NAME,TRACE_UDP_PACKET_FEATURE_FILE_NAME)

Conversion done
CPU times: user 4.26 ms, sys: 10.1 ms, total: 14.4 ms
Wall time: 6.62 s


In [10]:
# read in packet feature csv file and do some transformation
import ipaddress
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
record_num = udp_pkt_feature_df.shape[0]
udp_pkt_feature_df['remote_ip'] = udp_pkt_feature_df.apply(lambda row:row['ip.dst'] if row['ip.src'] == LOCAL_IP else row['ip.src'],axis=1) if record_num > 0 else None
udp_pkt_feature_df['remote_ip2num'] = udp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
udp_pkt_feature_df['protocol'] = 'udp' if record_num > 0 else None
udp_pkt_feature_df['is_tcp'] = 0 if record_num > 0 else None
udp_pkt_feature_df['is_udp'] = 1 if record_num > 0 else None
udp_pkt_feature_df.rename(columns={'udp.length':'pkt_len'},inplace=True)

In [11]:
# view the shape of the dataset: (number of records, number of features)
udp_pkt_feature_df.shape

(90721, 12)

In [12]:
# view the data types for each feature
udp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
udp.srcport              int64
udp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
udp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [13]:
# view the statistical features of each numerical feature
udp_pkt_feature_df.describe()

Unnamed: 0,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip2num,is_tcp,is_udp
count,90721.0,90721.0,90721.0,90721.0,90721.0,90721.0,90721.0,90721.0
mean,31246.701602,23437.298398,118.94159,846.325453,0.0,3225464000.0,0.0,1.0
std,21847.933608,21847.933608,179.081308,386.548324,0.0,0.0,0.0,0.0
min,5148.0,5148.0,16.0,0.0,0.0,3225464000.0,0.0,1.0
25%,5148.0,5148.0,26.0,522.687446,0.0,3225464000.0,0.0,1.0
50%,49536.0,5148.0,42.0,848.139542,0.0,3225464000.0,0.0,1.0
75%,49536.0,49536.0,105.0,1174.963175,0.0,3225464000.0,0.0,1.0
max,49536.0,49536.0,1004.0,1515.88633,0.0,3225464000.0,0.0,1.0


In [14]:
# view the first 5 records
udp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,172.16.26.207,192.64.172.62,49536,5148,60,0.0,0,192.64.172.62,3225463870,udp,0,1
1,192.64.172.62,172.16.26.207,5148,49536,48,0.025155,0,192.64.172.62,3225463870,udp,0,1
2,172.16.26.207,192.64.172.62,49536,5148,26,0.025456,0,192.64.172.62,3225463870,udp,0,1
3,172.16.26.207,192.64.172.62,49536,5148,24,0.489487,0,192.64.172.62,3225463870,udp,0,1
4,192.64.172.62,172.16.26.207,5148,49536,18,0.51992,0,192.64.172.62,3225463870,udp,0,1


## Combine TCP with UDP packets

In [15]:
# combine dataframes
pkt_feature_df = tcp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']].append(udp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']],ignore_index=True)

In [17]:
# shape
pkt_feature_df.shape

(90721, 4)

In [20]:
# column types
pkt_feature_df.dtypes

remote_ip2num    object
is_tcp           object
is_udp           object
pkt_len          object
dtype: object

In [18]:
# describe
pkt_feature_df.describe()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
count,90721,90721,90721,90721
unique,1,1,1,982
top,3225463870,0,1,26
freq,90721,90721,90721,22708


In [19]:
# head 5 records
pkt_feature_df.head()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
0,3225463870,0,1,60
1,3225463870,0,1,48
2,3225463870,0,1,26
3,3225463870,0,1,24
4,3225463870,0,1,18


In [22]:
# write to csv
pkt_feature_df.to_csv(TRACE_PACKET_FEATURE_FILE_NAME)