### Import Libraries

In [0]:
import sys
from scapy.all import *
import numpy as np
import pandas as pd

import pickle
import datetime
import sys
import os
import ipaddress
import netaddr

### Upload Files from Local Machine (for Google Collaboratory)

Here is code to upload a data file from the local home machine. For the purposes of this notebook, you should upload a packet capture ("pcap") file. A pcap file is a packet capture file that is generated from a network traffic capture, using a tool such as [Wireshark](https://wireshark.org) or tcpdump.

In [10]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving google_home.pcap to google_home.pcap
User uploaded file "google_home.pcap" with length 77630488 bytes


### Example of Using Scapy PcapReader to parse pcap.

In [0]:
def load_pcap(pcap_file):
  for pkt in PcapReader(pcap_file):
    if IP in pkt:
      print(pkt[IP].dst)
      break

In [23]:
load_pcap('google_home.pcap')

255.255.255.255


### Create a Pandas Data Frame from a Pcap

In [0]:
def pcap_to_dict(pcap_file):
    '''Parses a pcap file into a list of dicts.
    Arguments:
      pcap_file: string filepath of pcap file
    Returns:
      List of dicts with one dict per packet in pcap file.
        The dicts have the following key/value pairs:
          "time"     : time the packet was receieved in seconds since epoch
          "datetime" : time the packet was received as a datetime object
          "length"   : length of packet in bytes
          "mac_src"  : source MAC address
          "mac_dst"  : destination MAC address
          "ip_src"   : source IP address
          "ip_dst"   : destination IP address
          "protocol" : 'TCP', 'UDP', 'ICMP', or None
          "port_src" : source port
          "port_dst" : destination port
          "is_dns"   : True if packet is DNS packet, else False
          "dns_query" : string DNS query
          "dns_resp" : string DNS response'''
          
    data = []
    with PcapReader(pcap_file) as pcap_reader:
        for i, pkt in enumerate(pcap_reader):
            pkt_dict = {}
            #if i % 1000 == 0 and i != 0: print(i)
            try:
                if Ether not in pkt:
                    continue
                
                pkt_dict["time"] = pkt.time
                pkt_dict["datetime"] = datetime.datetime.fromtimestamp(pkt.time)
                pkt_dict["length"] = len(pkt)
                pkt_dict["mac_dst"] = pkt[Ether].dst
                pkt_dict["mac_src"] = pkt[Ether].src
                pkt_dict["ip_dst"] = None
                pkt_dict["ip_src"] = None
                pkt_dict["protocol"] = None
                pkt_dict["port_dst"] = None
                pkt_dict["port_src"] =  None
                pkt_dict["is_dns"] = False
                pkt_dict["dns_query"] = None
                pkt_dict["dns_resp"] = None
                
                if IP in pkt:
                    pkt_dict["ip_dst"] = pkt[IP].dst
                    pkt_dict["ip_src"] = pkt[IP].src
                                
                if TCP in pkt:
                    pkt_dict["port_dst"] = pkt[TCP].dport
                    pkt_dict["port_src"] = pkt[TCP].sport
                    pkt_dict["protocol"] = 'TCP'
                elif UDP in pkt:
                    pkt_dict["port_dst"] = pkt[UDP].dport
                    pkt_dict["port_src"] = pkt[UDP].sport
                    pkt_dict["protocol"] = 'UDP'
                elif ICMP in pkt:
                    pkt_dict["protocol"] = 'ICMP'
                    
                if DNSQR in pkt:
                    pkt_dict["is_dns"] = True
                    pkt_dict["dns_query"] = pkt[DNSQR].qname
                if DNSRR in pkt:
                    pkt_dict["is_dns"] = True
                    pkt_dict["dns_resp"] = pkt[DNSRR].rrname
                
                data.append(pkt_dict)
            except:
                continue
    return data
          
          

In [0]:
pdict = pcap_to_dict('google_home.pcap')
ppd = pd.DataFrame(pdict)


In [30]:
ppd.head(10)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2017-12-07 20:48:41,,,,,False,113,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.156083
1,2017-12-07 20:48:41,,,,,False,135,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.255304
2,2017-12-07 20:48:41,,,,,False,169,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.25635
3,2017-12-07 20:48:41,,,,,False,113,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.300373
4,2017-12-07 20:48:41,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512679721.303218
5,2017-12-07 20:48:41,,,,,False,78,33:33:ff:2f:e0:6e,a4:77:33:2f:e0:6e,,,,1512679721.412617
6,2017-12-07 20:48:41,,,255.255.255.255,0.0.0.0,False,393,ff:ff:ff:ff:ff:ff,a4:77:33:2f:e0:6e,67.0,68.0,UDP,1512679721.432318
7,2017-12-07 20:48:42,,,,,False,70,33:33:00:00:00:02,a4:77:33:2f:e0:6e,,,,1512679722.414223
8,2017-12-07 20:48:43,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512679723.204444
9,2017-12-07 20:48:44,,,172.24.1.51,172.24.1.1,False,62,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,ICMP,1512679724.186461


In [32]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving example.pcap to example.pcap
User uploaded file "example.pcap" with length 2001696 bytes


In [0]:
ex_pdict = pcap_to_dict('example.pcap')
ex_ppd = pd.DataFrame(pdict)

In [34]:
ex_ppd.head(10)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2017-12-07 20:48:41,,,,,False,113,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.156083
1,2017-12-07 20:48:41,,,,,False,135,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.255304
2,2017-12-07 20:48:41,,,,,False,169,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.25635
3,2017-12-07 20:48:41,,,,,False,113,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.300373
4,2017-12-07 20:48:41,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512679721.303218
5,2017-12-07 20:48:41,,,,,False,78,33:33:ff:2f:e0:6e,a4:77:33:2f:e0:6e,,,,1512679721.412617
6,2017-12-07 20:48:41,,,255.255.255.255,0.0.0.0,False,393,ff:ff:ff:ff:ff:ff,a4:77:33:2f:e0:6e,67.0,68.0,UDP,1512679721.432318
7,2017-12-07 20:48:42,,,,,False,70,33:33:00:00:00:02,a4:77:33:2f:e0:6e,,,,1512679722.414223
8,2017-12-07 20:48:43,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512679723.204444
9,2017-12-07 20:48:44,,,172.24.1.51,172.24.1.1,False,62,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,ICMP,1512679724.186461


### Exploring the Dataset

Now you have some basic examples for taking a packet capture and loading it into a Pandas dataframe.  You could then explore the dataset.

In [39]:
ex_ppd.head(1)


Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2017-12-07 20:48:41,,,,,False,113,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.156083


In [38]:
ex_ppd.shape

(95178, 13)

In [41]:
# this is basically meaningless for our packet data but just to show an example
ex_ppd.describe()

Unnamed: 0,length,port_dst,port_src
count,95178.0,94522.0,94522.0
mean,799.632646,25290.411354,19468.045619
std,692.241486,22475.68662,22130.960169
min,42.0,53.0,53.0
25%,66.0,443.0,443.0
50%,1434.0,33840.0,443.0
75%,1486.0,48410.0,43478.0
max,1514.0,60991.0,60991.0


### Selecting Data

**Selecting Based on Index**

In [43]:
ex_ppd.iloc[0]

datetime     2017-12-07 20:48:41
dns_query                   None
dns_resp                    None
ip_dst                      None
ip_src                      None
is_dns                     False
length                       113
mac_dst        a4:77:33:2f:e0:6e
mac_src        b8:27:eb:2d:24:15
port_dst                     NaN
port_src                     NaN
protocol                    None
time           1512679721.156083
Name: 0, dtype: object

In [44]:
ex_ppd.iloc[1:4]

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
1,2017-12-07 20:48:41,,,,,False,135,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.255304
2,2017-12-07 20:48:41,,,,,False,169,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512679721.25635
3,2017-12-07 20:48:41,,,,,False,113,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512679721.300373


**Selecting Based on Conditionals**

Select all TCP packets.

In [45]:
ex_ppd[ex_ppd['protocol'] == 'TCP'].head(1)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
19,2017-12-07 20:48:44,,,216.58.219.206,172.24.1.51,False,74,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,80.0,34331.0,TCP,1512679724.431387


Select all packets destined for port 443.

In [47]:
ex_ppd[(ex_ppd['protocol'] == 'TCP') & (ex_ppd['port_dst'] == 443)].head(1)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
52,2017-12-07 20:48:46,,,216.58.219.206,172.24.1.51,False,74,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,443.0,48343.0,TCP,1512679726.327938


### Compute Statistics

Compute Packet Statistics

In [51]:
print('Average Packet Length:', ex_ppd['length'].mean())
print('Minimum Packet Length:', ex_ppd['length'].min())
print('Maximum Packet Length:', ex_ppd['length'].max())
print('Median Packet Length:', ex_ppd['length'].median())


Average Packet Length: 799.632646199752
Minimum Packet Length: 42
Maximum Packet Length: 1514
Median Packet Length: 1434.0


Count the number of times a distinct value appears. For example, we can see how many packets appear on distinct ports.

In [58]:
ex_ppd[ex_ppd['port_dst'] < 2048]['port_dst'].value_counts()

443.0     39501
53.0        239
1900.0       35
80.0         29
68.0          4
67.0          4
123.0         2
Name: port_dst, dtype: int64

Find statistics grouped by particular categories.

Average length by protocol type.

In [59]:
ex_ppd.groupby('protocol')['length'].mean()

protocol
ICMP     98.382716
TCP     812.534564
UDP     283.579359
Name: length, dtype: float64

Average length for packets destined to various destination ports.

In [65]:
ex_ppd[ex_ppd['port_dst'] < 2048].groupby('port_dst')['length'].mean()

port_dst
53.0       80.928870
67.0      399.000000
68.0      342.000000
80.0      103.448276
123.0      90.000000
443.0     277.720336
1900.0    136.000000
Name: length, dtype: float64