In [1]:
from datetime import datetime, timezone
from functools import lru_cache

from pandas import DataFrame

from data_collection.parse_pcap import pcap_to_pandas, send_rates

## Load a Packet Capture into Pandas

In [2]:
# Parse the capture with pcap_to_pandas (this can take a few minutes),
# then preview the first four rows.
pcap_path = '/Users/feamster/Downloads/example-20200314.pcap'
pcap = pcap_to_pandas(pcap_path)
pcap.head(4)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_dst_int,ip_src,ip_src_int,is_dns,length,mac_dst,mac_dst_int,mac_src,mac_src_int,port_dst,port_src,protocol,time,time_normed
0,2020-03-14 20:17:15.939347,,,192.168.1.13,3232236000.0,162.255.36.123,2734630000.0,False,883,3c:15:c2:d9:d3:50,66064161035088,08:02:8e:92:27:27,8807074899751,63010.0,8801.0,UDP,1584235000.0,0.0
1,2020-03-14 20:17:15.939351,,,192.168.1.13,3232236000.0,162.255.36.123,2734630000.0,False,494,3c:15:c2:d9:d3:50,66064161035088,08:02:8e:92:27:27,8807074899751,63010.0,8801.0,UDP,1584235000.0,4e-06
2,2020-03-14 20:17:15.939353,,,192.168.1.13,3232236000.0,162.255.36.123,2734630000.0,False,525,3c:15:c2:d9:d3:50,66064161035088,08:02:8e:92:27:27,8807074899751,63010.0,8801.0,UDP,1584235000.0,6e-06
3,2020-03-14 20:17:15.939354,,,192.168.1.13,3232236000.0,162.255.36.123,2734630000.0,False,292,3c:15:c2:d9:d3:50,66064161035088,08:02:8e:92:27:27,8807074899751,63010.0,8801.0,UDP,1584235000.0,7e-06


In [3]:
# Dimensions of the parsed capture as (rows, columns).
pcap.shape

(3365, 18)

In [4]:
# Show the first two rows whose source address is 192.168.1.13.
sent_by_host = pcap['ip_src'] == '192.168.1.13'
pcap.loc[sent_by_host].head(2)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_dst_int,ip_src,ip_src_int,is_dns,length,mac_dst,mac_dst_int,mac_src,mac_src_int,port_dst,port_src,protocol,time,time_normed
15,2020-03-14 20:17:15.962859,,,162.255.36.123,2734630000.0,192.168.1.13,3232236000.0,False,99,08:02:8e:92:27:27,8807074899751,3c:15:c2:d9:d3:50,66064161035088,8801.0,58575.0,UDP,1584235000.0,0.023512
43,2020-03-14 20:17:16.062898,,,162.255.36.123,2734630000.0,192.168.1.13,3232236000.0,False,99,08:02:8e:92:27:27,8807074899751,3c:15:c2:d9:d3:50,66064161035088,8801.0,58575.0,UDP,1584235000.0,0.123551


**Example:** Create a pandas Series of the unique destination IP addresses in the trace.

In [5]:
# Collect the distinct destination addresses, wrap them in a one-column
# DataFrame, and keep that column (label 0) as a Series.
dst_ip_values = pcap['ip_dst'].unique()
unique_dst_ip = DataFrame(dst_ip_values)[0]
print(unique_dst_ip)

0       192.168.1.13
1     162.255.36.123
2      172.217.8.174
3      17.253.25.203
4       157.240.2.53
5               None
6       172.217.1.46
7       34.194.201.2
8     74.125.124.189
9      35.186.224.53
10     18.211.118.21
11       192.168.1.1
12      199.232.77.7
13     17.248.132.59
14      172.217.1.42
15     23.21.193.169
16      96.17.11.144
17      172.217.9.74
Name: 0, dtype: object


## Basic Analysis of Traffic Using Pandas

Define a reverse lookup function.

In [6]:
from dns import resolver
from dns import reversename

# Sanity check: fetch the PTR record for one known address and print the
# first answer.
addr = reversename.from_address('34.193.201.2')
ptr_answers = resolver.query(addr, "PTR")
print(ptr_answers[0])

ec2-34-193-201-2.compute-1.amazonaws.com.


In [7]:
def reverse_lookup(ip):
    """Return the reverse-DNS (PTR) name for an IP address.

    Parameters
    ----------
    ip : str or None
        IP address in text form. A missing value (anything whose ``str()``
        is ``'None'``) is passed through as the string ``'None'``.

    Returns
    -------
    str
        The first PTR answer converted to text, ``'None'`` for a missing
        address, or ``'N/A'`` when the address cannot be converted to a
        reverse name or the lookup fails for any reason.
    """
    if str(ip) == 'None':
        return 'None'
    try:
        # Building the reverse name happens inside the try block as well, so
        # a malformed address yields 'N/A' for that value instead of raising
        # and aborting a whole Series.apply() run.
        addr = reversename.from_address(ip)
        # dnspython 2.x deprecates query() in favour of resolve(); fall back
        # to query() so the code still runs where resolve() does not exist.
        lookup = getattr(resolver, 'resolve', None) or resolver.query
        return str(lookup(addr, "PTR")[0])
    except Exception:
        # Deliberately best-effort: any lookup failure is reported as 'N/A'.
        return 'N/A'

Use the pandas 'apply' function to create a new column with the DNS names associated with each destination. 

Then look at the unique destination names in the trace.

In [8]:
# Memoize the lookup so that each distinct destination address triggers a
# single DNS query, instead of one query per packet row.
cached_reverse_lookup = lru_cache(maxsize=None)(reverse_lookup)
pcap['name_dst'] = pcap['ip_dst'].apply(cached_reverse_lookup)

In [9]:
# Collect the distinct resolved destination names, wrap them in a one-column
# DataFrame, and keep that column (label 0) as a Series.
dst_name_values = pcap['name_dst'].unique()
unique_dst_name = DataFrame(dst_name_values)[0]
print(unique_dst_name)

0                                                   N/A
1                                 zoomny123mmr.zoom.us.
2                            ord37s08-in-f14.1e100.net.
3                        uschi5-vip-bx-003.aaplimg.com.
4                   whatsapp-cdn-shv-01-ort2.fbcdn.net.
5                                                  None
6                            ord37s07-in-f46.1e100.net.
7                            ord37s07-in-f14.1e100.net.
8             ec2-34-194-201-2.compute-1.amazonaws.com.
9               53.224.186.35.bc.googleusercontent.com.
10           ec2-18-211-118-21.compute-1.amazonaws.com.
11                           ord37s07-in-f42.1e100.net.
12           ec2-23-21-193-169.compute-1.amazonaws.com.
13    a96-17-11-144.deploy.static.akamaitechnologies...
14                           ord38s09-in-f10.1e100.net.
Name: 0, dtype: object


Write functions that sum the `length` field per destination, so that we know how many bytes in total were sent to each destination, grouped either by IP address or by name.

In [10]:
def volume_stats_by_ip(pcap):
    """Sum the ``length`` column per destination IP, largest total first.

    Returns a DataFrame indexed by ``ip_dst`` with a single ``length``
    column holding the summed value for each address.
    """
    dst_and_length = pcap[['ip_dst', 'length']]
    totals = dst_and_length.groupby('ip_dst').sum()
    return totals.sort_values(by='length', ascending=False)


def volume_stats_by_name(pcap):
    """Sum the ``length`` column per destination name, largest total first.

    Returns a DataFrame indexed by ``name_dst`` with a single ``length``
    column holding the summed value for each name.
    """
    name_and_length = pcap[['name_dst', 'length']]
    totals = name_and_length.groupby('name_dst').sum()
    return totals.sort_values(by='length', ascending=False)

In [11]:
# Summed packet length per destination IP address, largest total first.
volume_stats_by_ip(pcap)

Unnamed: 0_level_0,length
ip_dst,Unnamed: 1_level_1
192.168.1.13,1538859
162.255.36.123,250577
172.217.1.46,6984
17.248.132.59,2648
74.125.124.189,2316
199.232.77.7,1965
172.217.8.174,974
35.186.224.53,909
17.253.25.203,762
18.211.118.21,567


In [12]:
# Summed packet length per resolved destination name, largest total first.
volume_stats_by_name(pcap)

Unnamed: 0_level_0,length
name_dst,Unnamed: 1_level_1
,1545870
zoomny123mmr.zoom.us.,250577
ord37s07-in-f14.1e100.net.,4718
ord37s07-in-f46.1e100.net.,2266
ord37s08-in-f14.1e100.net.,974
53.224.186.35.bc.googleusercontent.com.,909
uschi5-vip-bx-003.aaplimg.com.,762
ec2-18-211-118-21.compute-1.amazonaws.com.,567
a96-17-11-144.deploy.static.akamaitechnologies.com.,432
ec2-34-194-201-2.compute-1.amazonaws.com.,378
