In [1]:
import torch
from torch.functional import F
from torch import nn

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import pandas as pd
from trace_process import *

# --- Experiment configuration -------------------------------------------------
# Passed to get_flows_index() as its second argument. NANO_TO_MICRO comes from
# the trace_process star-import (presumably a ns<->us factor — TODO confirm).
TIME_DELTA = 1_000 * NANO_TO_MICRO
# Number of packets in the window sliced in front of each flow start.
SEQ_L = 200
# Number of passes over the training data.
N_EPOCHS = 10
# Number of samples the DataLoader groups into one batch.
BATCH_SIZE = 64

In [2]:
# Presumably the number of records in the complete capture; it is not used by
# any visible cell. The (misspelled) name is kept in case other cells read it.
total_recored = 3555634102
# Slice of the file to load: skip `skip` leading rows, then read `chunk` rows.
skip = 0
chunk = 10000000

df = pd.read_csv(
    "../data/1/node-1/packets",
    sep='\t',
    lineterminator='\n',
    header=None,
    index_col=False,
    names=['timestamp', 'size', 'src', 'dest', 'dir'],
    # Exactly one dtype entry per column.
    dtype={
        'timestamp': "int64",
        'size': "int16",
        'src': "category",
        'dest': "category",
        'dir': "int8",
    },
    skiprows=skip,
    nrows=chunk,
)
# Keep only the rows whose `dir` equals 2 and the two columns used downstream.
# A single .loc call selects rows and columns together (no chained indexing).
df = df.loc[df['dir'] == 2, ["timestamp", "size"]]

In [3]:
# Packet rows (timestamp, size) as a plain 2-D NumPy array.
trace = df.to_numpy()
# Split the trace into flows; TIME_DELTA is the helper's second argument.
flows_indexes, flow_sizes = get_flows_index(trace, TIME_DELTA)
# Smallest flow position whose column-0 value exceeds SEQ_L, so that a full
# SEQ_L-packet window exists in front of every flow from here on.
start = np.flatnonzero(flows_indexes[:, 0] > SEQ_L).min()
# View over the whole trace (no copy).
packets = trace[:, :]

In [4]:
# Decision threshold for the binary label: TraceDataset emits 1.0 for a flow
# whose size is strictly greater than this median and 0.0 otherwise.
EM_THRESHOLD = np.median(flow_sizes)

In [5]:
class TraceDataset(torch.utils.data.IterableDataset):
    """Stream ``(packet window, label)`` pairs for next-flow size classification.

    For every flow position ``idx`` in ``[start, len(flows_indexes) - 1)`` the
    sample is the ``seq_len`` packets that precede the flow's first packet
    (``flows_indexes[idx, 0]``), returned as a float32 array with a leading
    axis of length 1. The label is ``1.0`` when the *following* flow
    (``flow_sizes[idx + 1]``) is strictly larger than ``threshold`` and ``0.0``
    otherwise. The last flow has no successor, so it produces no sample.

    Args:
        packet_trace: 2-D array of packets, sliced by row.
        flows_indexes: 2-D array; column 0 holds a row index into
            ``packet_trace`` for each flow.
        flow_sizes: 1-D array of per-flow sizes, aligned with ``flows_indexes``.
        start: first flow position to emit. The caller must pick it so that
            ``flows_indexes[idx, 0] >= seq_len`` for every emitted ``idx``;
            otherwise the window slice would start at a negative index.
        seq_len: window length in packets. ``None`` (default) uses the
            module-level ``SEQ_L``, read when iteration starts.
        threshold: size threshold for the label. ``None`` (default) uses the
            module-level ``EM_THRESHOLD``, read when iteration starts.

    Note:
        ``__iter__`` does not shard by worker, so a DataLoader with
        ``num_workers > 0`` would replay the full stream in every worker.
    """

    def __init__(self, packet_trace, flows_indexes, flow_sizes, start,
                 seq_len=None, threshold=None):
        self.packet_trace = packet_trace
        self.flows_indexes = flows_indexes
        self.flow_sizes = flow_sizes
        self.start = start
        self.seq_len = seq_len
        self.threshold = threshold

    def __iter__(self):
        # Fall back to the notebook-level settings only when nothing explicit
        # was supplied, and resolve them lazily so later edits are picked up.
        seq_len = SEQ_L if self.seq_len is None else self.seq_len
        threshold = EM_THRESHOLD if self.threshold is None else self.threshold

        # Absolute positions: stop one short of the end because the label
        # looks at flow idx + 1.
        last = len(self.flows_indexes) - 1
        for idx in range(int(self.start), last):
            target = self.flows_indexes[idx, 0]
            x = self.packet_trace[target - seq_len:target]
            y = (self.flow_sizes[idx + 1] > threshold) * 1.
            yield x[None, :].astype(np.float32), y

    def __len__(self):
        # Must match the number of samples __iter__ yields.
        return max(0, len(self.flows_indexes) - 1 - int(self.start))

In [6]:
class CNNModel(nn.Module):
    """Small CNN that maps a packet window to a probability in ``[0, 1]``.

    Expected input shape is ``(batch, 1, seq_len, 2)``: one channel, one row
    per packet, two features per packet. ``conv1`` merges the two features, the
    remaining convolutions only slide along the packet axis, and every stage is
    ``conv -> ReLU -> average pool (2, 1) -> batch norm``. The result is
    flattened per sample, passed through one linear unit and a sigmoid.

    Args:
        filters: output channel count of the four convolution stages.
        seq_len: number of packets per window. The default of 200 gives 10
            positions per channel after the last pooling step.

    Raises:
        ValueError: if ``seq_len`` is too short to survive the four stages.
    """

    def __init__(self, filters=(4, 3, 2, 1), seq_len=200):
        super().__init__()
        self.filters = filters
        self.seq_len = seq_len

        self.conv1 = nn.Conv2d(1, filters[0], (1, 2))
        self.bn1 = nn.BatchNorm2d(filters[0])
        self.conv2 = nn.Conv2d(filters[0], filters[1], (5, 1))
        self.bn2 = nn.BatchNorm2d(filters[1])
        self.conv3 = nn.Conv2d(filters[1], filters[2], (4, 1))
        self.bn3 = nn.BatchNorm2d(filters[2])
        self.conv4 = nn.Conv2d(filters[2], filters[3], (3, 1))
        self.bn4 = nn.BatchNorm2d(filters[3])

        # Derive the flattened width from the layers themselves instead of
        # hard-coding it: each stage shrinks the packet axis by the (unpadded)
        # kernel height and then halves it with the (2, 1) average pool.
        length = seq_len
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            length = (length - conv.kernel_size[0] + 1) // 2
            if length < 1:
                raise ValueError(
                    "seq_len=%r is too short for the four conv/pool stages" % (seq_len,)
                )
        self.linear = nn.Linear(filters[3] * length, 1)

    def forward(self, x):
        """Return per-sample probabilities of shape ``(batch, 1)``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in stages:
            x = conv(x)
            x = F.relu(x)
            x = F.avg_pool2d(x, (2, 1))
            x = bn(x)

        # Flatten everything except the batch axis. Unlike view(-1, n), this
        # can never merge samples: a wrong window length fails in the linear
        # layer instead of silently changing the batch size.
        x = x.flatten(1)

        x = self.linear(x)
        return torch.sigmoid(x)

In [7]:
# Streaming dataset over the loaded trace, starting at flow position `start`.
train_dataset = TraceDataset(
    packet_trace=packets,
    flows_indexes=flows_indexes,
    flow_sizes=flow_sizes,
    start=start,
)
# Group the stream into batches of BATCH_SIZE samples.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
)

In [8]:
# Classifier, its binary cross-entropy loss, and an SGD optimiser with momentum
# over all of the classifier's parameters.
model = CNNModel()
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=1e-5,
    momentum=0.9,
)

In [9]:
# Choose the compute device once, then move the model and the loss to that very
# device. get_loss_and_correct() sends every batch to `device`, so routing the
# model through the same object guarantees they can never sit on different GPUs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = criterion.to(device)

In [10]:
def get_loss_and_correct(model, batch, criterion, device):
    """Run one forward pass and score it.

    Args:
        model: callable that maps the float data tensor to one value per
            sample (values in ``[0, 1]``, since they are rounded to 0/1 below).
        batch: ``(data, target)`` pair of tensors.
        criterion: loss callable invoked as ``criterion(output, target)``.
        device: device both tensors are moved to before the forward pass.

    Returns:
        ``(loss, true_num)``: the criterion's result and a 0-dim tensor holding
        how many rounded predictions equal the (integer-cast) target.
    """
    data, target = batch
    data = data.to(device, dtype=torch.float)
    target = target.to(device, dtype=torch.float)

    # Give the output exactly the target's shape. A bare squeeze() would also
    # remove the batch axis when the batch holds a single sample, and the loss
    # would then reject the mismatching shapes.
    output = model(data).reshape(target.shape)

    loss = criterion(output, target)

    pred = torch.round(output).int()
    true_num = pred.eq(target.int()).sum()

    return loss, true_num

def step(loss, optimizer):
    """Perform a single optimisation step for ``loss``.

    Gradients left over from a previous call are cleared first, then ``loss``
    is back-propagated and ``optimizer`` updates the parameters it manages.
    """
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [11]:
# Per-epoch history of the mean training loss and the training accuracy.
train_losses = []
train_accuracies = []

pbar = tqdm(range(N_EPOCHS))

for epoch in pbar:
    # Running totals for this epoch. Both metrics are normalised by the number
    # of samples actually seen rather than by len(train_dataset), because the
    # declared length of an iterable dataset need not match what it yields.
    total_train_loss = 0.0
    total_train_correct = 0.0
    total_train_samples = 0

    model.train()

    for batch in tqdm(train_dataloader, leave=False):
        loss, correct = get_loss_and_correct(model, batch, criterion, device)
        step(loss, optimizer)

        batch_len = len(batch[1])
        # `loss` is the batch mean (BCELoss default reduction), so weight it by
        # the batch size before summing; the last batch may be smaller.
        total_train_loss += loss.item() * batch_len
        total_train_correct += correct.item()
        total_train_samples += batch_len

    # Avoid dividing by zero when the loader yielded nothing.
    denominator = max(total_train_samples, 1)
    mean_train_loss = total_train_loss / denominator
    train_accuracy = total_train_correct / denominator

    train_losses.append(mean_train_loss)
    train_accuracies.append(train_accuracy)

    pbar.set_postfix({'train_loss': mean_train_loss, 'train_accuracy': train_accuracy})

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]



  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

  0%|          | 0/21824 [00:00<?, ?it/s]

In [12]:
# Share of flows whose size is at or below the threshold (roughly the share of
# 0-labels the dataset produces).
np.count_nonzero(flow_sizes <= EM_THRESHOLD) / len(flow_sizes)

0.6090771627565983