In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
from tqdm import trange, tqdm

from pcap_processor import calculate_features



In [2]:
# Load the processed speed-test table from ../data_aggregation/processed_df.csv
df = pd.read_csv('../data_aggregation/processed_df.csv')

# Randomly draw up to 100 rows to work with (the train/test split is done in a
# later cell). The fixed seed makes the draw repeatable across kernel restarts,
# and capping at len(df) avoids the ValueError that DataFrame.sample raises
# when asked for more rows than the frame holds (sampling without replacement).
sample = df.sample(n=min(100, len(df)), random_state=42)

# Column labels available on each sampled row
keys = sample.keys()

In [3]:
# Inputs and labels are collected in lock-step, one entry per sampled row.
packet_capture_data = []  # List of lists of dicts
speed_test_results = []  # List of floats

# Walk the sampled rows, recording the throughput label and the features
# computed from the matching pcap file.
for _, test_row in tqdm(sample.iterrows(), total=len(sample)):
    speed_test_results.append(test_row['MeanThroughputMbps'])

    pcap_path = "../data_aggregation/pcaps/" + test_row['id'] + ".pcap.gz"
    capture_features = calculate_features(
        pcap_dir=pcap_path,
        time_delta_s=1,
        start_time_str=test_row['StartTime'],
        client_ip=test_row['ClientIP'],
        total_time_s=10,
    )
    packet_capture_data.append(capture_features)


# Convert data to tensors: each capture becomes one 2-D float tensor whose rows
# are the value lists of its dicts (feature order follows dict insertion order).
data_tensors = [
    torch.tensor(
        [list(step.values()) for step in capture], dtype=torch.float32
    )
    for capture in packet_capture_data
]
target_tensors = torch.tensor(speed_test_results, dtype=torch.float32)


  0%|          | 0/100 [00:00<?, ?it/s]


TypeError: list.append() takes no keyword arguments

In [None]:

# Sequence regressor: single-layer LSTM followed by a linear read-out
class RNNModel(nn.Module):
    """LSTM encoder whose final hidden state is mapped to the output by a linear layer."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the LSTM and the linear head.

        Args:
            input_size: number of features per time step.
            hidden_size: width of the LSTM hidden state.
            output_size: number of values produced per sequence.
        """
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the sequence batch through the LSTM and project its last hidden state.

        Args:
            x: input batch; with batch_first=True the LSTM expects the batch
                dimension first.

        Returns:
            The linear layer applied to the final hidden state of the last
            LSTM layer.
        """
        # Only the final hidden state is needed; per-step outputs and the
        # cell state are discarded.
        _, (final_hidden, _) = self.rnn(x)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)


# Map-style dataset pairing each capture with its target value
class PacketCaptureDataset(Dataset):
    """Index-aligned pairs of (capture data, target)."""

    def __init__(self, data, targets):
        """Store the two parallel indexable collections without copying them."""
        self.data, self.targets = data, targets

    def __len__(self):
        """Number of examples, taken from the length of the data collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the (data, target) pair stored at position idx."""
        example = self.data[idx]
        label = self.targets[idx]
        return example, label



In [None]:
# Hold out 20% of the examples for evaluation; the fixed random_state keeps
# the partition identical between runs.
split_parts = train_test_split(
    data_tensors,
    target_tensors,
    test_size=0.2,
    random_state=42,
)
train_data, test_data, train_targets, test_targets = split_parts

# Wrap each partition in a dataset
train_dataset = PacketCaptureDataset(data=train_data, targets=train_targets)
test_dataset = PacketCaptureDataset(data=test_data, targets=test_targets)

# Batched loaders: training batches are reshuffled every epoch, evaluation
# keeps the stored order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
# The feature count is read off the first dict of the first capture;
# this assumes all packet captures have the same structure.
first_step_features = packet_capture_data[0][0]
input_size = len(first_step_features)
hidden_size = 64
output_size = 1

# Model, loss (mean squared error) and optimizer (Adam, learning rate 1e-3)
model = RNNModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [None]:
# Train for a fixed number of passes over the training loader.
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    # Sum of the per-batch mean losses seen this epoch (not a per-example mean)
    total_loss = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # squeeze(-1) removes only the trailing output dimension. A bare
        # squeeze() would also drop the batch dimension when a batch holds a
        # single example, leaving a 0-d prediction whose shape no longer
        # matches `targets` (MSELoss then warns and relies on broadcasting).
        loss = criterion(outputs.squeeze(-1), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

In [None]:
# Evaluate on the held-out loader with the model in eval mode and autograd off.
model.eval()
test_loss = 0.0  # running sum of per-example losses across all test batches
num_examples = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        batch_count = inputs.size(0)
        outputs = model(inputs)
        # squeeze(-1) removes only the trailing output dimension, so a batch
        # containing a single example keeps its batch axis and still lines up
        # with `targets`.
        loss = criterion(outputs.squeeze(-1), targets)
        # The criterion (nn.MSELoss with its default reduction) returns the
        # batch mean. Weight it by the batch size so the final division by
        # the example count gives a true per-example average instead of
        # (sum of batch means) / (number of examples).
        test_loss += loss.item() * batch_count
        num_examples += batch_count

# Guard against an empty test loader rather than raising ZeroDivisionError
average_test_loss = test_loss / num_examples if num_examples else float("nan")
print(f"Average Test Loss per Example: {average_test_loss}")

In [None]:
# Share of the test targets that are exactly 0, expressed as a fraction
# between 0 and 1 (not multiplied by 100)
zero_target_count = int((test_targets == 0).sum())
zero_percentage = zero_target_count / len(test_targets)
zero_percentage