In [1]:
import pyshark
import nest_asyncio
# Apply nest_asyncio's patch once for the whole kernel, before any capture is read.
# NOTE(review): presumably needed because pyshark runs its own asyncio event loop,
# which would conflict with the loop the Jupyter kernel is already running — confirm.
nest_asyncio.apply()

## Preprocess train data

In [None]:
import os
import glob
import pyshark
import pandas as pd

# Convert every capture in the training directory into a per-packet CSV with the
# columns "Time (s)", "Frame Length", "Direction" and "Interarrival".

# Directory containing the .pcap / .pcapng captures to convert
input_directory = "data/Train/pcap"  # Change this to your directory path

# Directory that receives one CSV per capture
output_directory = "data/Train/csv"  # Change this as needed

# Endpoint used to label packet direction (previously hard-coded inline)
SERVER_IP = "127.0.0.1"
SERVER_PORT = 5000

# Per-packet columns, in CSV order. Passing them explicitly to the DataFrame
# constructor guarantees they exist even when a capture yields no records.
RECORD_COLUMNS = ["Time (s)", "Frame Length", "Direction"]


def classify_direction(source_ip, source_port, destination_ip, destination_port,
                       server_ip=SERVER_IP, server_port=SERVER_PORT):
    """Label a packet relative to the server endpoint.

    Args:
        source_ip: Source IP address as a string.
        source_port: Source port as an int, or None when the packet has no TCP/UDP layer.
        destination_ip: Destination IP address as a string.
        destination_port: Destination port as an int, or None (see above).
        server_ip: IP address of the server endpoint.
        server_port: Port of the server endpoint.

    Returns:
        -1 (downlink) when the packet is sent from server_ip:server_port,
        1 (uplink) when it is sent to server_ip:server_port from a different
        source port, and 0 otherwise.
    """
    if source_ip == server_ip and source_port == server_port:
        return -1
    # `None != server_port` is True, so a missing source port needs no special case.
    if (destination_ip == server_ip and destination_port == server_port
            and source_port != server_port):
        return 1
    return 0


def read_packet_records(pcap_path):
    """Read one capture file and return a list of per-packet record dicts.

    Only packets exposing both `frame_info` and `ip` are kept. "Time (s)" is the
    number of seconds elapsed since the first kept packet of the file.

    Args:
        pcap_path: Path of the .pcap / .pcapng file to read.

    Returns:
        A list of dicts keyed by RECORD_COLUMNS; empty when no packet qualifies.
    """
    records = []
    ref_time = None  # sniff_time of the first kept packet of this file
    capture = pyshark.FileCapture(pcap_path, display_filter='ip')
    try:
        for packet in capture:
            if not (hasattr(packet, 'frame_info') and hasattr(packet, 'ip')):
                continue

            if ref_time is None:
                ref_time = packet.sniff_time

            # Ports stay None unless the packet carries a TCP or UDP layer
            source_port = None
            destination_port = None
            if hasattr(packet, 'tcp'):
                source_port = int(packet.tcp.srcport)
                destination_port = int(packet.tcp.dstport)
            elif hasattr(packet, 'udp'):
                source_port = int(packet.udp.srcport)
                destination_port = int(packet.udp.dstport)

            records.append({
                "Time (s)": (packet.sniff_time - ref_time).total_seconds(),
                "Frame Length": int(packet.frame_info.len),
                "Direction": classify_direction(
                    packet.ip.src, source_port, packet.ip.dst, destination_port
                ),
            })
    finally:
        # Close the capture even if a packet fails to parse
        capture.close()
    return records


def records_to_frame(records):
    """Build the output DataFrame and add the "Interarrival" column.

    Args:
        records: List of dicts keyed by RECORD_COLUMNS (may be empty).

    Returns:
        A DataFrame with RECORD_COLUMNS plus "Interarrival", the difference
        between consecutive "Time (s)" values (0 for the first row). An empty
        input yields an empty frame that still has all four columns.
    """
    frame = pd.DataFrame(records, columns=RECORD_COLUMNS).astype(
        {"Time (s)": "float64", "Frame Length": "int64", "Direction": "int64"}
    )
    frame['Interarrival'] = frame['Time (s)'].diff().fillna(0)
    return frame


os.makedirs(output_directory, exist_ok=True)

# Get lists of both pcapng and pcap files
pcapng_files = glob.glob(os.path.join(input_directory, "*.pcapng"))
pcap_files = glob.glob(os.path.join(input_directory, "*.pcap"))
all_files = pcapng_files + pcap_files

# Process each file
for pcap_file in all_files:
    print(f"Processing file: {pcap_file}")

    records = read_packet_records(pcap_file)
    if not records:
        print(f"No IP packets found in {pcap_file}; writing a header-only CSV")
    df = records_to_frame(records)

    # Build the output CSV file name based on the input file's base name
    base_filename = os.path.basename(pcap_file)
    csv_filename = os.path.splitext(base_filename)[0] + ".csv"
    csv_filepath = os.path.join(output_directory, csv_filename)

    # Save the DataFrame to a CSV file
    df.to_csv(csv_filepath, index=False)
    print(f"CSV saved: {csv_filepath}")

## Preprocess test data

In [None]:
import os
import glob
import pyshark
import pandas as pd

# Convert every capture in the test directory into a per-packet CSV with the
# columns "Time (s)", "Frame Length", "Direction" and "Interarrival".

# Directory containing the .pcap / .pcapng captures to convert
input_directory = "data/Test/pcap"  # Change this to your directory path

# Directory that receives one CSV per capture
output_directory = "data/Test/csv"  # Change this as needed

os.makedirs(output_directory, exist_ok=True)

# Get lists of both pcapng and pcap files
pcapng_files = glob.glob(os.path.join(input_directory, "*.pcapng"))
pcap_files = glob.glob(os.path.join(input_directory, "*.pcap"))
all_files = pcapng_files + pcap_files

# Process each file
for pcap_file in all_files:
    print(f"Processing file: {pcap_file}")

    # Per-file state. This cell previously never opened the capture nor reset
    # `data` / `ref_time`, so it either failed with NameError on a fresh kernel
    # or silently reused the closed capture and accumulated rows left behind by
    # an earlier cell.
    capture = pyshark.FileCapture(pcap_file, display_filter='ip')
    data = []
    ref_time = None  # Will store the sniff_time of the first packet as our reference

    try:
        for packet in capture:
            # Only process packets that have both frame and IP layers
            if not (hasattr(packet, 'frame_info') and hasattr(packet, 'ip')):
                continue

            # Set the reference time using the first packet encountered
            if ref_time is None:
                ref_time = packet.sniff_time

            # Calculate elapsed time (in seconds) since the reference time
            elapsed_time = (packet.sniff_time - ref_time).total_seconds()

            # Get the frame length (convert to int)
            frame_length = int(packet.frame_info.len)

            # Get IP addresses from the IP layer (used for the direction label only)
            source_ip = packet.ip.src
            destination_ip = packet.ip.dst

            # Initialize ports as None; try to extract from TCP or UDP if available
            source_port = None
            destination_port = None
            if hasattr(packet, 'tcp'):
                source_port = int(packet.tcp.srcport)
                destination_port = int(packet.tcp.dstport)
            elif hasattr(packet, 'udp'):
                source_port = int(packet.udp.srcport)
                destination_port = int(packet.udp.dstport)

            # Determine the direction based on port-to-port logic:
            #   - If source IP is 127.0.0.1 and source port is 5000, then it's Downlink (-1).
            #   - If destination IP is 127.0.0.1 and destination port is 5000 and source port is not 5000, then it's Uplink (1).
            #   - Otherwise, 0.
            # `None != 5000` is True, so a missing source port needs no special case.
            direction = 0
            if source_ip == "127.0.0.1" and source_port == 5000:
                direction = -1
            elif destination_ip == "127.0.0.1" and destination_port == 5000 and source_port != 5000:
                direction = 1

            # Append the extracted data as a dictionary
            data.append({
                "Time (s)": elapsed_time,
                "Frame Length": frame_length,
                "Direction": direction
            })
    finally:
        # Close the capture even if a packet fails to parse
        capture.close()

    if not data:
        print(f"No IP packets found in {pcap_file}; writing a header-only CSV")

    # Create a DataFrame from the extracted data. Explicit columns and dtypes keep
    # the frame well-formed (and the diff below valid) when `data` is empty.
    df = pd.DataFrame(data, columns=["Time (s)", "Frame Length", "Direction"]).astype(
        {"Time (s)": "float64", "Frame Length": "int64", "Direction": "int64"}
    )

    # Calculate interarrival time by taking the difference between consecutive "Time (s)" values
    df['Interarrival'] = df['Time (s)'].diff().fillna(0)

    # Build the output CSV file name based on the input file's base name
    base_filename = os.path.basename(pcap_file)
    csv_filename = os.path.splitext(base_filename)[0] + ".csv"
    csv_filepath = os.path.join(output_directory, csv_filename)

    # Save the DataFrame to a CSV file
    df.to_csv(csv_filepath, index=False)
    print(f"CSV saved: {csv_filepath}")