In [4]:
import numpy as np
import pandas as pd
from importlib import reload

In [51]:
# define constants
TRACE_FILE_NAME = 'data/Skype_HongKong.pcapng' # replace with your raw trace .pcapng/.pcap file name
TRACE_TCP_PACKET_FEATURE_FILE_NAME = 'data/Skype_HongKong_tcp_pkt.csv' # replace with your favorite tcp packet feature .csv file name
TRACE_UDP_PACKET_FEATURE_FILE_NAME = 'data/Skype_HongKong_udp_pkt.csv' # replace with your favorite udp packet feature .csv file name
TRACE_PACKET_FEATURE_FILE_NAME = 'data/Skype_HongKong_pkt.csv' # replace with your favorite packet feature .csv file name
LOCAL_IP = '172.16.26.207' # your local ip

## Extract TCP packets

In [6]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.tcp_generate(TRACE_FILE_NAME,TRACE_TCP_PACKET_FEATURE_FILE_NAME)

Packet feature file already exists.
CPU times: user 202 µs, sys: 813 µs, total: 1.02 ms
Wall time: 1.79 ms


In [40]:
# read in packet feature csv file and do some transformation
import ipaddress
tcp_pkt_feature_df = pd.read_csv(TRACE_TCP_PACKET_FEATURE_FILE_NAME)
filterer = tcp_pkt_feature_df.apply(lambda row:(not pd.isnull(row['ip.src']) and ipaddress.IPv4Address(row['ip.src']).is_global) or (not pd.isnull(row['ip.dst']) and ipaddress.IPv4Address(row['ip.dst']).is_global),axis=1)
tcp_pkt_feature_df = tcp_pkt_feature_df[filterer]
record_num = tcp_pkt_feature_df.shape[0]
tcp_pkt_feature_df['remote_ip'] = tcp_pkt_feature_df.apply(lambda row:row['ip.dst'] if ipaddress.IPv4Address(row['ip.dst']).is_global else row['ip.src'],axis=1) if record_num > 0 else None
tcp_pkt_feature_df['remote_ip2num'] = tcp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
tcp_pkt_feature_df['protocol'] = 'tcp' if record_num > 0 else None
tcp_pkt_feature_df['is_tcp'] = 1 if record_num > 0 else None
tcp_pkt_feature_df['is_udp'] = 0 if record_num > 0 else None
tcp_pkt_feature_df.rename(columns={'tcp.len':'pkt_len'},inplace=True)

In [41]:
# view the shape of the dataset: (number of records, number of features)
tcp_pkt_feature_df.shape

(3678, 17)

In [42]:
# view the data types for each feature
tcp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [43]:
# view the statistical features of each numerical feature
tcp_pkt_feature_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip2num,is_tcp,is_udp
count,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0,3678.0
mean,29043.695487,25612.809951,265.895867,252.438116,7407.414899,7616.953235,0.96112,0.05329,0.050843,232.175095,1239089000.0,1.0,0.0
std,24214.322178,24538.058716,469.979511,243.678426,15084.97928,15551.34728,0.193335,0.224641,0.219707,243.597741,910123100.0,0.0,0.0
min,80.0,80.0,0.0,2.271637,0.0,0.0,0.0,0.0,0.0,2.0,224026400.0,1.0,0.0
25%,443.0,443.0,0.0,16.402931,208.0,235.0,1.0,0.0,0.0,38.0,301562400.0,1.0,0.0
50%,49491.0,40033.0,0.0,130.084942,1848.0,2395.0,1.0,0.0,0.0,117.0,883073300.0,1.0,0.0
75%,49590.0,49571.0,311.0,529.345451,5549.75,5657.0,1.0,0.0,0.0,512.0,1752893000.0,1.0,0.0
max,63232.0,63232.0,1460.0,660.412777,93784.0,94093.0,1.0,1.0,1.0,693.0,3427780000.0,1.0,0.0


In [44]:
# view the first 5 records
tcp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
24,90.217.83.107,192.168.1.163,62530,6769,0,2.271637,1,1,1,0,1,2,90.217.83.107,1524192107,tcp,1,0
25,192.168.1.163,90.217.83.107,6769,62530,0,2.271725,1,2,1,0,0,2,90.217.83.107,1524192107,tcp,1,0
26,192.168.1.163,90.217.83.107,6769,62530,0,2.271783,1,2,1,0,1,2,90.217.83.107,1524192107,tcp,1,0
27,90.217.83.107,192.168.1.163,62530,6769,0,2.38149,2,2,1,0,0,2,90.217.83.107,1524192107,tcp,1,0
64,52.162.161.12,192.168.1.163,443,49445,1448,3.253662,1,1,1,0,0,5,52.162.161.12,883073292,tcp,1,0


## Extract UDP packets

In [62]:
# convert raw trace to readable udp and tcp packet feature csv file
from python import packet_feature
reload(packet_feature)
%time packet_feature.udp_generate(TRACE_FILE_NAME,TRACE_UDP_PACKET_FEATURE_FILE_NAME,True)

Conversion done


No error


CPU times: user 4.78 ms, sys: 5.41 ms, total: 10.2 ms
Wall time: 26 s


In [65]:
# read in packet feature csv file and do some transformation
import ipaddress
def filter_illegal(row):
    try:
        ipaddress.IPv4Address(row['ip.src'])
        ipaddress.IPv4Address(row['ip.dst'])
        return (not pd.isnull(row['ip.src']) and ipaddress.IPv4Address(row['ip.src']).is_global) or (not pd.isnull(row['ip.dst']) and ipaddress.IPv4Address(row['ip.dst']).is_global)
    except ValueError as e:
        print(e)
        return False                                                                                
                                                                                                
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
filterer = udp_pkt_feature_df.apply(filter_illegal,axis=1)
udp_pkt_feature_df = udp_pkt_feature_df[filterer]
record_num = udp_pkt_feature_df.shape[0]
udp_pkt_feature_df['remote_ip'] = udp_pkt_feature_df.apply(lambda row:row['ip.dst'] if ipaddress.IPv4Address(row['ip.dst']).is_global else row['ip.src'],axis=1) if record_num > 0 else None
udp_pkt_feature_df['remote_ip2num'] = udp_pkt_feature_df.apply(lambda row:int(ipaddress.IPv4Address(row['remote_ip'])),axis=1) if record_num > 0 else None
udp_pkt_feature_df['protocol'] = 'udp' if record_num > 0 else None
udp_pkt_feature_df['is_tcp'] = 0 if record_num > 0 else None
udp_pkt_feature_df['is_udp'] = 1 if record_num > 0 else None
udp_pkt_feature_df.rename(columns={'udp.length':'pkt_len'},inplace=True)

Expected 4 octets in '192.168.1.163,90.217.83.107'
Expected 4 octets in '192.168.1.1,192.168.1.163'
Expected 4 octets in '192.168.1.1,192.168.1.163'
Expected 4 octets in '192.168.1.1,192.168.1.163'
Expected 4 octets in '192.168.1.1,192.168.1.163'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'na

In [66]:
# view the shape of the dataset: (number of records, number of features)
udp_pkt_feature_df.shape

(365042, 12)

In [67]:
# view the data types for each feature
udp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
udp.srcport              int64
udp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
udp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [68]:
# view the statistical features of each numerical feature
udp_pkt_feature_df.describe()

Unnamed: 0,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip2num,is_tcp,is_udp
count,365042.0,365042.0,365042.0,365042.0,365042.0,365042.0,365042.0,365042.0
mean,14406.164847,18850.348343,760.436632,373.753011,93.012634,1750116000.0,0.0,1.0
std,12992.739403,12960.373591,369.584589,166.696518,2.767278,73403550.0,0.0,0.0
min,123.0,80.0,11.0,0.531698,0.0,27796900.0,0.0,1.0
25%,3480.0,3480.0,551.0,229.997368,93.0,1747766000.0,0.0,1.0
50%,3480.0,29750.0,889.0,373.524868,93.0,1747766000.0,0.0,1.0
75%,29750.0,29750.0,1057.0,517.88526,93.0,1747766000.0,0.0,1.0
max,64888.0,62902.0,1321.0,663.10677,211.0,4026532000.0,0.0,1.0


In [69]:
# view the first 5 records
udp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,192.168.1.1,239.255.255.250,52257,1900,183,0.531698,0,239.255.255.250,4026531834,udp,0,1
7,192.168.1.163,90.217.83.107,6769,28331,26,6.203742,4,90.217.83.107,1524192107,udp,0,1
8,90.217.83.107,192.168.1.163,28331,6769,34,6.310376,4,90.217.83.107,1524192107,udp,0,1
26,192.168.1.163,239.255.255.250,51602,1900,141,8.15595,13,239.255.255.250,4026531834,udp,0,1
27,192.168.1.163,239.255.255.250,51602,1900,140,8.155951,13,239.255.255.250,4026531834,udp,0,1


## Combine TCP with UDP packets

In [70]:
# combine dataframes
pkt_feature_df = tcp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']].append(udp_pkt_feature_df[['remote_ip2num','is_tcp','is_udp','pkt_len']],ignore_index=True)

In [71]:
# shape
pkt_feature_df.shape

(368720, 4)

In [72]:
###### column types
pkt_feature_df.dtypes

remote_ip2num    int64
is_tcp           int64
is_udp           int64
pkt_len          int64
dtype: object

In [73]:
# describe
pkt_feature_df.describe()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
count,368720.0,368720.0,368720.0,368720.0
mean,1745019000.0,0.009975,0.990025,755.503564
std,127175800.0,0.099376,0.099376,373.962863
min,27796900.0,0.0,0.0,0.0
25%,1747766000.0,0.0,1.0,526.0
50%,1747766000.0,0.0,1.0,885.0
75%,1747766000.0,0.0,1.0,1056.0
max,4026532000.0,1.0,1.0,1460.0


In [74]:
# head 5 records
pkt_feature_df.head()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
0,1524192107,1,0,0
1,1524192107,1,0,0
2,1524192107,1,0,0
3,1524192107,1,0,0
4,883073292,1,0,1448


In [75]:
# write to csv
pkt_feature_df.to_csv(TRACE_PACKET_FEATURE_FILE_NAME, index=False)