In [2]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
"""
Reading the data
https://jon.oberheide.org/blog/2008/10/15/dpkt-tutorial-2-parsing-a-pcap-file/
https://dpkt.readthedocs.io/en/latest/_modules/examples/print_packets.html

"""
import datetime
import socket
import dpkt
import numpy as np
import pandas as pd
from dpkt.compat import compat_ord


def mac_addr(address):
    """Convert a raw MAC address to a readable/printable string

       Args:
           address (bytes, bytearray or str): raw MAC address octets,
               e.g. b'\\x01\\x02\\x03\\x04\\x05\\x06'
       Returns:
           str: colon-separated, zero-padded lowercase hex octets,
               e.g. '01:02:03:04:05:06' (empty string for empty input)
    """
    # Iterating bytes/bytearray yields ints on Python 3, whereas iterating a
    # str yields 1-character strings; normalise both to an int before formatting
    # so that either kind of input works.
    return ':'.join(
        '%02x' % (octet if isinstance(octet, int) else ord(octet))
        for octet in address
    )


def inet_to_str(inet):
    """Render a packed network address as text

        Args:
            inet (inet struct): packed inet network address
        Returns:
            str: Printable/readable IP address
    """
    # Attempt the IPv4 interpretation first; a ValueError from that attempt
    # triggers a second attempt that treats the value as IPv6.
    try:
        readable = socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        readable = socket.inet_ntop(socket.AF_INET6, inet)
    return readable


##################################################################################

# Upper bound on the number of packets to process; the default is large enough
# to mean "read the whole capture". Lower it to sample a big file quickly.
packets_to_read = 10 ** 10

# NOTE(review): machine-specific absolute path -- point this at your own
# capture file before re-running the notebook.
pcap_path = 'C:\\Users\\udayk\\Desktop\\mmj\\sample.pcap'

# Column-oriented accumulator: one list per output column. Every packet whose
# Ethernet payload is a dpkt.ip.IP instance contributes exactly one value to
# every list, so all columns always have the same length.
dpcap = {
    'ts': [],
    'eth_src': [],
    'eth_dst': [],
    'eth_type': [],
    'ip_src': [],
    'ip_dst': [],
    'ip_len': [],
    'ip_ttl': [],
    'ip_df': [],
    'ip_mf': [],
    'ip_offset': [],
    'type': [],
    'tcp_dport': [],
    'http_uri': [],
    'http_method': [],
    'http_version': [],
    'http_headers_ua': [],
}
# Payload class name -> number of frames skipped because they were not dpkt.ip.IP
non_ip_packets = dict()
c = 0

# The context manager guarantees the capture file is closed, even when a
# malformed frame raises part-way through the loop.
with open(pcap_path, 'rb') as f:
    # NOTE(review): the file has a .pcap extension but is read with the pcapng
    # reader -- presumably the capture is really pcapng; confirm.
    pcap = dpkt.pcapng.Reader(f)

    # For each packet in the pcap process the contents
    for timestamp, buf in pcap:
        c += 1
        if c > packets_to_read:
            break

        # Unpack the Ethernet frame (mac src/dst, ethertype)
        eth = dpkt.ethernet.Ethernet(buf)

        # Only IP payloads become rows; everything else is just counted by type.
        if not isinstance(eth.data, dpkt.ip.IP):
            payload_kind = eth.data.__class__.__name__
            non_ip_packets[payload_kind] = non_ip_packets.get(payload_kind, 0) + 1
            continue

        # Now unpack the data within the Ethernet frame (the IP packet)
        ip = eth.data

        # Build the complete row first and append it in one go below, so an
        # early exit can never leave the columns with different lengths.
        row = {
            'ts': str(datetime.datetime.utcfromtimestamp(timestamp)),  # timestamp in UTC
            'eth_src': mac_addr(eth.src),
            'eth_dst': mac_addr(eth.dst),
            'eth_type': eth.type,
            'ip_src': inet_to_str(ip.src),
            'ip_dst': inet_to_str(ip.dst),
            'ip_len': ip.len,
            'ip_ttl': ip.ttl,
            # Fragment flags and offset are all packed into ip.off, so use bitmasks
            'ip_df': bool(ip.off & dpkt.ip.IP_DF),
            'ip_mf': bool(ip.off & dpkt.ip.IP_MF),
            'ip_offset': ip.off & dpkt.ip.IP_OFFMASK,
            'type': ip.data.__class__.__name__,
            # TCP/HTTP columns stay NaN unless the packet provides a value
            'tcp_dport': np.nan,
            'http_uri': np.nan,
            'http_method': np.nan,
            'http_version': np.nan,
            'http_headers_ua': np.nan,
        }

        if isinstance(ip.data, dpkt.tcp.TCP):
            tcp = ip.data
            row['tcp_dport'] = tcp.dport

            # HTTP parsing is attempted on every TCP port (not only 80), which
            # is what the previous always-true condition effectively did.
            # Payloads of two bytes or fewer cannot hold a request line.
            if len(tcp.data) > 2:
                try:
                    http = dpkt.http.Request(tcp.data)
                except Exception:
                    # Best effort: most TCP payloads are not HTTP requests.
                    # Catching Exception (rather than a bare except) still lets
                    # KeyboardInterrupt stop a long-running parse.
                    http = None

                if http is not None:
                    row['http_uri'] = http.uri
                    row['http_method'] = http.method
                    row['http_version'] = http.version
                    # A request without a User-Agent header keeps NaN here
                    row['http_headers_ua'] = http.headers.get('user-agent', np.nan)

        for column, value in row.items():
            dpcap[column].append(value)

df = pd.DataFrame.from_dict(dpcap)
print('non_ip_packets counter:', non_ip_packets)
df[df.http_uri.notnull()].head(20)


non_ip_packets counter: {'IP6': 23, 'ARP': 5}


Unnamed: 0,ts,eth_src,eth_dst,eth_type,ip_src,ip_dst,ip_len,ip_ttl,ip_df,ip_mf,ip_offset,type,tcp_dport,http_uri,http_method,http_version,http_headers_ua
34,2020-03-03 11:42:33.401267,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,81.161.59.127,135,128,True,False,0,TCP,80.0,/poll?push_id=5552113f-32f0-4b96-af61-6d763a4c...,GET,1.1,
384,2020-03-03 11:43:13.353277,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,81.161.59.127,135,128,True,False,0,TCP,80.0,/poll?push_id=5552113f-32f0-4b96-af61-6d763a4c...,GET,1.1,
590,2020-03-03 11:43:53.542295,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,81.161.59.127,135,128,True,False,0,TCP,80.0,/poll?push_id=5552113f-32f0-4b96-af61-6d763a4c...,GET,1.1,


In [4]:
# Preview the first 20 rows of the parsed-packet frame.
df.head(n=20)

Unnamed: 0,ts,eth_src,eth_dst,eth_type,ip_src,ip_dst,ip_len,ip_ttl,ip_df,ip_mf,ip_offset,type,tcp_dport,http_uri,http_method,http_version,http_headers_ua
0,2020-03-03 11:42:25.296309,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,35.190.26.106,41,128,True,False,0,TCP,443.0,,,,
1,2020-03-03 11:42:25.337648,0c:80:63:85:74:1c,e4:02:9b:9a:c8:88,2048,35.190.26.106,192.168.0.108,52,120,False,False,0,TCP,64374.0,,,,
2,2020-03-03 11:42:27.054883,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,34.237.73.95,328,128,True,False,0,TCP,443.0,,,,
3,2020-03-03 11:42:27.286176,0c:80:63:85:74:1c,e4:02:9b:9a:c8:88,2048,34.237.73.95,192.168.0.108,365,237,True,False,0,TCP,50876.0,,,,
4,2020-03-03 11:42:27.339586,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,34.237.73.95,40,128,True,False,0,TCP,443.0,,,,
5,2020-03-03 11:42:27.756439,e4:02:9b:9a:c8:88,b0:2a:43:42:b6:d2,2048,192.168.0.108,192.168.0.103,150,128,True,False,0,TCP,8009.0,,,,
6,2020-03-03 11:42:27.771933,b0:2a:43:42:b6:d2,e4:02:9b:9a:c8:88,2048,192.168.0.103,192.168.0.108,150,64,True,False,0,TCP,50054.0,,,,
7,2020-03-03 11:42:27.818744,e4:02:9b:9a:c8:88,b0:2a:43:42:b6:d2,2048,192.168.0.108,192.168.0.103,40,128,True,False,0,TCP,8009.0,,,,
8,2020-03-03 11:42:28.729336,0c:80:63:85:74:1c,e4:02:9b:9a:c8:88,2048,172.217.160.138,192.168.0.108,484,56,True,False,0,UDP,,,,,
9,2020-03-03 11:42:28.741625,e4:02:9b:9a:c8:88,0c:80:63:85:74:1c,2048,192.168.0.108,172.217.160.138,57,128,True,False,0,UDP,,,,,
