In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
from tqdm import trange, tqdm
import joblib

from pcap_processor import calculate_features



In [2]:
# Report whether PyTorch can see a CUDA device and, if so, how many and which one.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch cannot use the GPU.")
else:
    print("CUDA is available. PyTorch can use the GPU.")
    print("Number of GPUs available:", torch.cuda.device_count())
    # Index 0 is the first visible device.
    print("GPU Name:", torch.cuda.get_device_name(0))

CUDA is available. PyTorch can use the GPU.
Number of GPUs available: 1
GPU Name: NVIDIA GeForce RTX 3070 Ti Laptop GPU


In [3]:
# Restore the serialized inputs and regression targets from their .pt files.
data_path = 'data_tensors_15s_0.2s.pt'
target_path = 'target_tensors_15s_0.2s.pt'
data_tensors = torch.load(data_path)
target_tensors = torch.load(target_path)

print("Data tensors and target tensors loaded successfully!")

# Sanity check: sample count, shape of one sample, and shape of the targets.
sample_count = len(data_tensors)
print(f"Number of data samples: {sample_count}")
first_sample_shape = data_tensors[0].shape
print(f"Shape of first data sample: {first_sample_shape}")
target_shape = target_tensors.shape
print(f"Shape of target tensor: {target_shape}")

Data tensors and target tensors loaded successfully!
Number of data samples: 3000
Shape of first data sample: torch.Size([75, 22])
Shape of target tensor: torch.Size([3000])


In [4]:

# LSTM-based sequence model
class RNNModel(nn.Module):
    """Three-layer LSTM followed by a linear head on the final hidden state.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the hidden state of each LSTM layer.
        output_size: number of values produced by the linear head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # The attribute names `rnn` and `fc` become state_dict key prefixes,
        # so they must stay stable for saved weights to load.
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=3,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the LSTM over `x` and project the last layer's final hidden state."""
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is used here.
        final_hidden_states = self.rnn(x)[1][0]
        top_layer_state = final_hidden_states[-1]
        return self.fc(top_layer_state)


# Dataset wrapper that pairs each sample with its target
class PacketCaptureDataset(Dataset):
    """Index-aligned (sample, target) pairs backed by two indexable containers.

    Args:
        data: indexable container of samples; its length defines the dataset size.
        targets: indexable container of targets, indexed in parallel with `data`.
    """

    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        # Size is taken from the sample container only.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.targets[idx]
        return sample, target



In [5]:
# One seed drives both the train/test split and the training shuffle, so a
# fresh "Restart & Run All" sees the same samples in the same batch order.
SEED = 42
BATCH_SIZE = 32

# Hold out 20% of the samples for evaluation.
train_data, test_data, train_targets, test_targets = train_test_split(
    data_tensors, target_tensors, test_size=0.2, random_state=SEED
)

# Create datasets and data loaders
train_dataset = PacketCaptureDataset(train_data, train_targets)
test_dataset = PacketCaptureDataset(test_data, test_targets)

# A dedicated seeded generator makes the shuffled order reproducible instead
# of depending on whatever the global torch RNG state happens to be.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [6]:
# Feature width is read from the first time step of the first training sample
# (assuming all packet captures have the same structure).
first_sequence = train_dataset[0][0]
input_size = len(first_sequence[0])
hidden_size, output_size = 64, 1

model = RNNModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()  # regression loss between predictions and targets
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
# Resume training from the previously saved checkpoint
# (model_weights_15s_0.2s.pt).
model.load_state_dict(torch.load('model_weights_15s_0.2s.pt'))

# started with 20 000 epochs
num_epochs = 3850
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Drop only the trailing output dimension. A bare squeeze() would also
        # collapse the batch dimension of a size-1 batch, leaving a 0-d
        # prediction to be broadcast against a 1-element target.
        loss = criterion(outputs.squeeze(-1), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # total_loss is the sum of the per-batch loss values for this epoch,
    # not a per-sample average.
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 24780.840602874756
Epoch 2, Loss: 35360.05989456177
Epoch 3, Loss: 26356.117393493652
Epoch 4, Loss: 37106.34190559387
Epoch 5, Loss: 30243.52255821228
Epoch 6, Loss: 21484.260959625244
Epoch 7, Loss: 18054.202228546143
Epoch 8, Loss: 13270.51802444458
Epoch 9, Loss: 14409.66647720337
Epoch 10, Loss: 14139.200119018555
Epoch 11, Loss: 12868.416603088379
Epoch 12, Loss: 10977.383734703064
Epoch 13, Loss: 9932.118698120117
Epoch 14, Loss: 8937.363561630249
Epoch 15, Loss: 8249.74599647522
Epoch 16, Loss: 8096.719263076782
Epoch 17, Loss: 8407.116602897644
Epoch 18, Loss: 8138.868124008179
Epoch 19, Loss: 7972.909778594971
Epoch 20, Loss: 7834.973787307739
Epoch 21, Loss: 7853.243414878845
Epoch 22, Loss: 8219.323895454407
Epoch 23, Loss: 7459.3127183914185
Epoch 24, Loss: 7240.797216415405
Epoch 25, Loss: 7342.774250030518
Epoch 26, Loss: 6893.968723297119
Epoch 27, Loss: 7373.516651153564
Epoch 28, Loss: 8808.95948600769
Epoch 29, Loss: 9891.992564201355
Epoch 30, Loss: 1

In [14]:
# Persist the model parameters so a later run can reload them.
checkpoint_path = 'model_weights_15s_0.2s.pt'
torch.save(model.state_dict(), checkpoint_path)

In [15]:
# Mean absolute error of the model over the held-out test samples.
total_absolute_error = 0
num_samples = len(test_targets)

model.eval()  # switch off training-only behaviour before inference
with torch.no_grad():  # evaluation needs no autograd graph
    for i in range(num_samples):
        # unsqueeze(0) adds the batch dimension the model expects.
        predicted_value = model(test_data[i].unsqueeze(0)).item()
        actual_value = test_targets[i].item()
        absolute_error = abs(predicted_value - actual_value)
        total_absolute_error += absolute_error

average_absolute_error = total_absolute_error / num_samples
print(f"Average Absolute Error: {average_absolute_error}")


Average Absolute Error: 31.368160190644364


In [16]:
# get the median absolute error
# (num_samples is the test-set size computed in the mean-absolute-error cell)
absolute_errors = []

model.eval()  # switch off training-only behaviour before inference
with torch.no_grad():  # evaluation needs no autograd graph
    for i in range(num_samples):
        predicted_value = model(test_data[i].unsqueeze(0)).item()
        actual_value = test_targets[i].item()
        absolute_error = abs(predicted_value - actual_value)
        absolute_errors.append(absolute_error)

median_absolute_error = np.median(absolute_errors)
print(f"Median Absolute Error: {median_absolute_error}")

Median Absolute Error: 3.5058929920196533


In [17]:
# print each actual vs prediction value
model.eval()  # switch off training-only behaviour before inference
with torch.no_grad():  # evaluation needs no autograd graph
    for i in range(num_samples):
        predicted_value = model(test_data[i].unsqueeze(0)).item()
        actual_value = test_targets[i].item()
        print(f"Actual: {actual_value}, Predicted: {predicted_value}")

Actual: 0.23467867076396942, Predicted: -2.550067901611328
Actual: 23.574888229370117, Predicted: 17.972332000732422
Actual: 0.3150703012943268, Predicted: 0.023731231689453125
Actual: 0.8776228427886963, Predicted: 1.7831687927246094
Actual: 5.211883068084717, Predicted: 11.158061981201172
Actual: 8.56942367553711, Predicted: 7.251903533935547
Actual: 17.285400390625, Predicted: 16.886247634887695
Actual: 1.7512636184692383, Predicted: 12.557605743408203
Actual: 6.08070182800293, Predicted: 5.031932830810547
Actual: 2.4700632095336914, Predicted: 9.290393829345703
Actual: 0.4156245291233063, Predicted: 2.6052017211914062
Actual: 0.6440844535827637, Predicted: 2.7067604064941406
Actual: 0.1825038343667984, Predicted: 0.19631576538085938
Actual: 20.93109893798828, Predicted: 5.939197540283203
Actual: 43.1968879699707, Predicted: 19.705585479736328
Actual: 11.367669105529785, Predicted: 9.864986419677734
Actual: 13.284963607788086, Predicted: 3.0449256896972656
Actual: 0.8706076741218567

In [18]:
# calculate the average percentage error of the model
total_percentage_error = 0
num_scored = 0  # samples with a non-zero target, where relative error is defined

model.eval()  # switch off training-only behaviour before inference
with torch.no_grad():  # evaluation needs no autograd graph
    for i in range(num_samples):
        predicted_value = model(test_data[i].unsqueeze(0)).item()
        actual_value = test_targets[i].item()
        if actual_value == 0:
            # Relative error is undefined for a zero target; skip it rather
            # than raise ZeroDivisionError.
            continue
        # Divide by the magnitude so a negative target cannot yield a
        # negative "error".
        percentage_error = abs(predicted_value - actual_value) / abs(actual_value)
        total_percentage_error += percentage_error
        num_scored += 1

# With no scorable samples the metric is undefined; report NaN instead of crashing.
average_percentage_error = (
    (total_percentage_error / num_scored) * 100 if num_scored else float("nan")
)
print(f"Average Percentage Error: {average_percentage_error}%")

Average Percentage Error: 658.0238001084548%


In [19]:
# calculate the median percentage error of the model
percentage_errors = []

model.eval()  # switch off training-only behaviour before inference
with torch.no_grad():  # evaluation needs no autograd graph
    for i in range(num_samples):
        predicted_value = model(test_data[i].unsqueeze(0)).item()
        actual_value = test_targets[i].item()
        if actual_value == 0:
            # Relative error is undefined for a zero target; skip it rather
            # than raise ZeroDivisionError.
            continue
        # Divide by the magnitude so a negative target cannot yield a
        # negative "error".
        percentage_error = abs(predicted_value - actual_value) / abs(actual_value)
        percentage_errors.append(percentage_error)

# With no scorable samples the metric is undefined; report NaN.
median_percentage_error = (
    np.median(percentage_errors) * 100 if percentage_errors else float("nan")
)
print(f"Median Percentage Error: {median_percentage_error}%")


Median Percentage Error: 100.42319862395675%
