In [1]:
import numpy as np
import pandas as pd
from importlib import reload

In [2]:
# define constants
TRACE_FILE_NAME = 'data/tencent_game_na.pcapng' # replace with your raw trace .pcapng/.pcap file name
TRACE_TCP_PACKET_FEATURE_FILE_NAME = 'data/tencent_game_na_tcp_pkt.csv' # replace with your favorite tcp packet feature .csv file name
TRACE_UDP_PACKET_FEATURE_FILE_NAME = 'data/tencent_game_na_udp_pkt.csv' # replace with your favorite udp packet feature .csv file name
TRACE_PACKET_FEATURE_FILE_NAME = 'data/tencent_game_na_pkt.csv' # replace with your favorite packet feature .csv file name
# LOCAL_IP = '172.16.26.207' # your local ip

## Extract TCP packets

In [3]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.tcp_generate(TRACE_FILE_NAME,TRACE_TCP_PACKET_FEATURE_FILE_NAME)

Conversion done
CPU times: user 2.6 ms, sys: 6.17 ms, total: 8.77 ms
Wall time: 9.21 s


In [4]:
# read in packet feature csv file and do some transformation
import ipaddress
tcp_pkt_feature_df = pd.read_csv(TRACE_TCP_PACKET_FEATURE_FILE_NAME)
filterer = tcp_pkt_feature_df.apply(lambda row:(not pd.isnull(row['ip.src']) and ipaddress.IPv4Address(row['ip.src']).is_global) or (not pd.isnull(row['ip.dst']) and ipaddress.IPv4Address(row['ip.dst']).is_global),axis=1)
tcp_pkt_feature_df = tcp_pkt_feature_df[filterer]
record_num = tcp_pkt_feature_df.shape[0]
tcp_pkt_feature_df['remote_ip'] = tcp_pkt_feature_df.apply(lambda row:row['ip.dst'] if ipaddress.IPv4Address(row['ip.dst']).is_global else row['ip.src'],axis=1) if record_num > 0 else None
tcp_pkt_feature_df['remote_ip2num'] = tcp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
tcp_pkt_feature_df['protocol'] = 'tcp' if record_num > 0 else None
tcp_pkt_feature_df['is_tcp'] = 1 if record_num > 0 else None
tcp_pkt_feature_df['is_udp'] = 0 if record_num > 0 else None
tcp_pkt_feature_df.rename(columns={'tcp.len':'pkt_len'},inplace=True)

In [5]:
# view the shape of the dataset: (number of records, number of features)
tcp_pkt_feature_df.shape

(52802, 17)

In [6]:
# view the data types for each feature
tcp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [7]:
# view the statistical features of each numerical feature
tcp_pkt_feature_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip2num,is_tcp,is_udp
count,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0,52802.0
mean,17299.196621,26272.557384,762.424075,65.674473,309360.5,176994.0,0.990644,0.00714,0.005738,74.305443,1530230000.0,1.0,0.0
std,20903.021846,20906.41876,668.026356,184.472434,403889.9,338120.4,0.096272,0.084196,0.075535,28.333318,870406900.0,0.0,0.0
min,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,288029700.0,1.0,0.0
25%,443.0,443.0,0.0,22.638198,749.0,508.0,1.0,0.0,0.0,58.0,400411300.0,1.0,0.0
50%,443.0,36444.0,1368.0,24.453401,103807.0,990.0,1.0,0.0,0.0,68.0,2163119000.0,1.0,0.0
75%,40288.0,45140.0,1368.0,27.152311,538190.0,200599.2,1.0,0.0,0.0,79.0,2163119000.0,1.0,0.0
max,63704.0,63704.0,1380.0,1338.640061,1883614.0,1883638.0,1.0,1.0,1.0,232.0,3736744000.0,1.0,0.0


In [8]:
# view the first 5 records
tcp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,128.238.147.112,17.248.135.181,63660,443,1368,0.0,1,1,1,0,0,0,17.248.135.181,301500341,tcp,1,0
1,128.238.147.112,17.248.135.181,63660,443,101,1e-06,1369,1,1,0,0,0,17.248.135.181,301500341,tcp,1,0
2,128.238.147.112,17.248.135.181,63660,443,1368,0.000274,1470,1,1,0,0,0,17.248.135.181,301500341,tcp,1,0
3,17.248.135.181,128.238.147.112,443,63660,0,0.001493,1,1470,1,0,0,0,128.238.147.112,2163118960,tcp,1,0
4,128.238.147.112,17.248.135.181,63660,443,824,0.001524,2838,1,1,0,0,0,17.248.135.181,301500341,tcp,1,0


## Extract UDP packets

In [9]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.udp_generate(TRACE_FILE_NAME,TRACE_UDP_PACKET_FEATURE_FILE_NAME,True)

Conversion done


No error


CPU times: user 4.27 ms, sys: 5.45 ms, total: 9.73 ms
Wall time: 7.6 s


In [10]:
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
udp_pkt_feature_df

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,udp.length,frame.time_relative,udp.stream
0,128.238.147.3,224.0.0.2,1985,1985,28,0.097179,0
1,128.238.147.2,224.0.0.2,1985,1985,28,0.100842,1
2,128.238.147.145,128.238.147.255,37481,1534,16,0.574101,2
3,128.238.147.3,224.0.0.2,1985,1985,28,3.096920,0
4,128.238.147.2,224.0.0.2,1985,1985,28,3.101557,1
5,128.238.147.137,255.255.255.255,17500,17500,152,4.347565,3
6,128.238.147.137,128.238.147.255,17500,17500,152,4.347699,4
7,128.238.147.112,128.238.1.69,64703,53,40,4.791367,5
8,128.238.147.112,128.238.1.69,51917,53,45,4.791480,6
9,128.238.147.112,128.238.1.69,61129,53,49,4.791661,7


In [11]:
# read in packet feature csv file and do some transformation
import ipaddress
def filter_illegal(row):
    try:
        ipaddress.IPv4Address(row['ip.src'])
        ipaddress.IPv4Address(row['ip.dst'])
        return (not pd.isnull(row['ip.src']) and ipaddress.IPv4Address(row['ip.src']).is_global) or (not pd.isnull(row['ip.dst']) and ipaddress.IPv4Address(row['ip.dst']).is_global)
    except ValueError as e:
        print(e)
        return False                                                                                
                                                                                                
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
filterer = udp_pkt_feature_df.apply(filter_illegal,axis=1)
udp_pkt_feature_df = udp_pkt_feature_df[filterer]
record_num = udp_pkt_feature_df.shape[0]
udp_pkt_feature_df['remote_ip'] = udp_pkt_feature_df.apply(lambda row:row['ip.dst'] if ipaddress.IPv4Address(row['ip.dst']).is_global else row['ip.src'],axis=1) if record_num > 0 else None
udp_pkt_feature_df['remote_ip2num'] = udp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
udp_pkt_feature_df['protocol'] = 'udp' if record_num > 0 else None
udp_pkt_feature_df['is_tcp'] = 0 if record_num > 0 else None
udp_pkt_feature_df['is_udp'] = 1 if record_num > 0 else None
udp_pkt_feature_df.rename(columns={'udp.length':'pkt_len'},inplace=True)

Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
E

In [12]:
# view the shape of the dataset: (number of records, number of features)
udp_pkt_feature_df.shape

(43819, 12)

In [13]:
# view the data types for each feature
udp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
udp.srcport              int64
udp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
udp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [14]:
# view the statistical features of each numerical feature
udp_pkt_feature_df.describe()

Unnamed: 0,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip2num,is_tcp,is_udp
count,43819.0,43819.0,43819.0,43819.0,43819.0,43819.0,43819.0,43819.0
mean,25193.044866,22211.606997,173.944271,716.771043,83.376252,2550868000.0,0.0,1.0
std,17599.80157,17467.249729,115.121193,338.9694,15.466526,398763100.0,0.0,0.0
min,53.0,53.0,16.0,0.097179,0.0,301798500.0,0.0,1.0
25%,6651.0,6651.0,102.0,429.984005,85.0,2163119000.0,0.0,1.0
50%,41764.0,6651.0,102.0,724.518986,85.0,2838481000.0,0.0,1.0
75%,41764.0,41764.0,294.0,1002.336581,85.0,2838481000.0,0.0,1.0
max,65437.0,65437.0,532.0,1338.676886,214.0,4026532000.0,0.0,1.0


In [15]:
# view the first 5 records
udp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,128.238.147.3,224.0.0.2,1985,1985,28,0.097179,0,224.0.0.2,3758096386,udp,0,1
1,128.238.147.2,224.0.0.2,1985,1985,28,0.100842,1,224.0.0.2,3758096386,udp,0,1
2,128.238.147.145,128.238.147.255,37481,1534,16,0.574101,2,128.238.147.255,2163119103,udp,0,1
3,128.238.147.3,224.0.0.2,1985,1985,28,3.09692,0,224.0.0.2,3758096386,udp,0,1
4,128.238.147.2,224.0.0.2,1985,1985,28,3.101557,1,224.0.0.2,3758096386,udp,0,1


## Combine TCP with UDP packets

In [16]:
# combine dataframes
pkt_feature_df = tcp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']].append(udp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']],ignore_index=True)

In [17]:
# shape
pkt_feature_df.shape

(96621, 4)

In [18]:
###### column types
pkt_feature_df.dtypes

remote_ip2num    int64
is_tcp           int64
is_udp           int64
pkt_len          int64
dtype: object

In [19]:
# describe
pkt_feature_df.describe()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
count,96621.0,96621.0,96621.0,96621.0
mean,1993104000.0,0.546486,0.453514,495.5401
std,862733400.0,0.497837,0.497837,579.406738
min,288029700.0,0.0,0.0,0.0
25%,2163119000.0,0.0,0.0,32.0
50%,2163119000.0,1.0,0.0,102.0
75%,2838481000.0,1.0,1.0,1368.0
max,4026532000.0,1.0,1.0,1380.0


In [20]:
# head 5 records
pkt_feature_df.head()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
0,301500341,1,0,1368
1,301500341,1,0,101
2,301500341,1,0,1368
3,2163118960,1,0,0
4,301500341,1,0,824


In [21]:
# write to csv
pkt_feature_df.to_csv(TRACE_PACKET_FEATURE_FILE_NAME, index=False)