In [1]:
import pandas as pd
from functools import reduce
from pathlib import Path
import argparse
import scapy.all as scapy
from datetime import datetime, timezone

In [2]:
def hex_to_int(hex_str):
    try:
        return int(hex_str, 16)
    except (TypeError, ValueError):
        return None

def tcp_flags_to_str(flags):
    flag_str = []
    if flags & 0x02:  # SYN
        flag_str.append('SYN')
    if flags & 0x10:  # ACK
        flag_str.append('ACK')
    if flags & 0x04:  # RST
        flag_str.append('RST')
    if flags & 0x01:  # FIN
        flag_str.append('FIN')
    if flags & 0x08:  # PSH
        flag_str.append('PSH')
    if flags & 0x20:  # URG
        flag_str.append('URG')
    return ''.join(flag_str)

def extract_packet_data(packet, client_ip):
    if packet.haslayer(scapy.IP):
        ip_layer = packet[scapy.IP]
        proto = ip_layer.proto
        is_downstream = ip_layer.dst == client_ip
        packet_size = ip_layer.len
    elif packet.haslayer(scapy.IPv6):
        ip_layer = packet[scapy.IPv6]
        proto = ip_layer.nh
        is_downstream = ip_layer.dst == client_ip
        packet_size = len(packet)
    else:
        return None
    
    if proto == 6 and packet.haslayer(scapy.TCP):  # TCP
        tcp_layer = packet[scapy.TCP]
        tcp_flag_str = tcp_flags_to_str(tcp_layer.flags)
        packet_data = {
            'ts': float(packet.time),  # Convert EDecimal to float
            'is_downstream': is_downstream,
            'packet_size': packet_size,
            'tcp_seq': tcp_layer.seq,
            'tcp_flags': tcp_flag_str
        }
        return packet_data
    return None

def pcap_to_df(pcap_file, client_ip, start_time, end_time):
    # Convert start and end times to datetime objects with timezone info
    start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S.%f %Z').replace(tzinfo=timezone.utc)
    end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S.%f %Z').replace(tzinfo=timezone.utc)
    
    # Convert datetime objects to Unix timestamps
    start_unix = start_dt.timestamp()
    end_unix = end_dt.timestamp()

    print(f"Filtering packets between {start_unix} and {end_unix}")

    packets = scapy.rdpcap(pcap_file)
    packet_data_list = []
    
    for packet in packets:
        if packet.haslayer(scapy.IP) or packet.haslayer(scapy.IPv6):
            data = extract_packet_data(packet, client_ip)
            if data is not None:
                if start_unix <= data['ts'] <= end_unix:
                    packet_data_list.append(data)
        else:
            print("Packet does not have IP or IPv6 layer")  # Debug: Indicate absence of IP or IPv6 layer
    
    return pd.DataFrame(packet_data_list)

def process_pcap_file(pcap_file_path, output_file_path, client_ip, start_time, end_time):
    df = pcap_to_df(pcap_file_path, client_ip, start_time, end_time)
    if df.empty:
        raise ValueError("No valid TCP packets found in the pcap file within the specified time range.")

    df['ts'] = df['ts'].astype(float)
    df['is_downstream'] = df['is_downstream'].astype(bool)
    df['packet_size'] = df['packet_size'].astype('Int32')
    df['tcp_seq'] = df['tcp_seq'].astype('Int64')
    
    df.to_csv(output_file_path, index=False)

    return df

In [4]:
# Testing the function



pcap_file_path = "../data_aggregation/pcaps_old/ndt-2b7tp_1659651184_0000000000B99C00.pcap.gz"

output_file_path = "out.csv"

client_ip = "2603:8081:2000:60eb::11fd"

start_time = "2022-09-04 03:29:20.357355 UTC"

end_time = "2022-09-04 03:29:35.357775 UTC"

process_pcap_file(pcap_file_path, output_file_path, client_ip, start_time, end_time)

Filtering packets between 1662262160.357355 and 1662262175.357775


Unnamed: 0,ts,is_downstream,packet_size,tcp_seq,tcp_flags
0,1.662262e+09,True,114,3434409402,ACKPSH
1,1.662262e+09,True,114,3434411796,ACKPSH
2,1.662262e+09,True,114,3434414652,ACKPSH
3,1.662262e+09,True,114,3434417508,ACKPSH
4,1.662262e+09,True,114,3434420364,ACK
...,...,...,...,...,...
1924,1.662262e+09,True,114,3437998932,ACKPSH
1925,1.662262e+09,True,114,3438001788,ACK
1926,1.662262e+09,True,114,3437744748,ACK
1927,1.662262e+09,True,114,3437744748,ACK


In [5]:
# TO DO: Filter for start and end time in pcap

import json
import csv

ground_truths = []

# Open the CSV file
with open('../data_aggregation/df_15s_download_info.csv', 'r') as file:
    # Create a DictReader object
    csv_reader = csv.DictReader(file)
    
    # Iterate through each row
    for i, row in enumerate(csv_reader):
        # Access each column value by its header
        pcap_file_path = "../data_aggregation/pcaps/" + row['id'] + ".pcap.gz"
        output_file_path = "../dfs/out" + str(i)  + ".csv"
        client_ip = row['ClientIP']
        start_time = row['StartTime']
        end_time =  row['EndTime']
        ground_truths.append(float(row['MeanThroughputMbps']))
        #process_pcap_file(pcap_file_path, output_file_path, client_ip, start_time, end_time)

json.dump(ground_truths, open("../dfs/ground_truths.json",'w'))

In [None]:
# 