This first bit of code takes care of a few things. First, it gets some of the necessary imports out of the way and sets pandas print options. Then, it reads in the sequence csv file and filters out unnecessary columns, leaving 16 total features:

1.   The position of the packet within its sequence. This is used only to parse the csv file back out into sequences.
2.   The timestamp of the packet
3.   The packet length
4.   The source port
5.   The destination port
6.   The 10 possible TCP flags

In [0]:
import pandas as pd
import numpy as np
import time
import os
import random
import psutil
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Read the data and filter out unnecessary columns
data = pd.read_csv("week1mon_seqs.csv", delimiter=',', header=0)
data = data.filter(items=['position','timestamp','protocol','length','src_port','dst_port',
                          'res', 'ns', 'cwr', 'ecn', 'urg', 'ack', 'push', 'reset', 'syn', 'fin'])
print(data)

        position     timestamp protocol  length  src_port  dst_port  res  ns  cwr  ecn  urg  ack  push  reset  syn  fin
0              0     37.294817      TCP      60        79      1024    0   0    0    0    0    1     0      0    1    0
1              1     37.295017      TCP      60      1024        79    0   0    0    0    0    1     0      0    0    0
2              2     37.295563      TCP      60      1024        79    0   0    0    0    0    1     1      0    0    0
3              3     37.307251      TCP      60        79      1024    0   0    0    0    0    1     0      0    0    0
4              4     37.327150      TCP      60        79      1024    0   0    0    0    0    1     0      0    0    0
...          ...           ...      ...     ...       ...       ...  ...  ..  ...  ...  ...  ...   ...    ...  ...  ...
826561         3  25638.793159   TELNET     922        23     13416    0   0    0    0    0    1     1      0    0    0
826562         0  25638.796071      TCP 

# Feature vector encoding

This cell defines two functions to normalize and encode the various features of the dataset. The first function encodes all of the values of a numerical feature as zscores, while the second encodes a text-based feature into a 1 or 0 value that can be interpreted by the model. 

In [0]:
# Encode a numeric feature
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

# Encode a categorical feature
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

data['src_port_type'] = ""
data['dst_port_type'] = ""
data['src_port_type'][data['src_port']<1024] = 'well_known'
data['src_port_type'][data['src_port']>49151] = 'dynamic'
data['src_port_type'][data['src_port_type'] == ""] = 'registered'
data['dst_port_type'][data['dst_port']<1024] = 'well_known'
data['dst_port_type'][data['dst_port']>49151] = 'dynamic'
data['dst_port_type'][data['dst_port_type'] == ""] = 'registered'
data = data.drop(['src_port'], axis=1)
data = data.drop(['dst_port'], axis=1)

encode_numeric_zscore(data, "timestamp")
encode_numeric_zscore(data, "length")
encode_text_dummy(data, "protocol")
encode_text_dummy(data, "src_port_type")
data['src_port_type-dynamic'] = 0
encode_text_dummy(data, "dst_port_type")
data['dst_port_type-dynamic'] = 0

data.dropna(inplace=True,axis=1)
print(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

        position  timestamp    length  res  ns  cwr  ecn  urg  ack  push  reset  syn  fin  protocol-HTTP  protocol-IRC  protocol-POP  protocol-SMTP  protocol-SSHv1  protocol-TCP  protocol-TELNET  src_port_type-registered  src_port_type-well_known  src_port_type-dynamic  dst_port_type-registered  dst_port_type-well_known  dst_port_type-dynamic
0              0  -2.120420 -0.393051    0   0    0    0    0    1     0      0    1    0              0             0             0              0               0             1                0                         0                         1                      0                         1                         0                      0
1              1  -2.120420 -0.393051    0   0    0    0    0    1     0      0    0    0              0             0             0              0               0             1                0                         1                         0                      0                         0               

# Data preparation

This cell begins preparing the data for insertion into the sequence to sequence (seq2seq) model by parsing it into sequences of packets and creating [input, target] pairs. The first for-loop here goes through the dataset and finds the indexes of the start of each sequence and saves them in the "points" list. Therefore, the DataFrame rows with indexes that fall between two of these points are considered a single "sequence" whose length can range from 4 to 320 total packets.

The second for-loop goes through the list of points and uses it to create input and target sequence pairs. The main part of this is handled within a nested for-loop, which reads the data between two break points and stores the first 3 packets as the input and the rest of the sequence as the target. Sequence pairs are stored in this way because the seq2seq model aims to take the first 3 packets of a sequence as input and use them to predict the entire rest of that sequence. While the number of packets in the input remains constant at 3, the number of packets in the target ranges from 1 to 317.

To check that everything has worked correctly, a random [input, target] pair is printed along with the total number of detected sequences.

In [0]:
# Get the starting points of each sequence within the dataset
points = []
for x in range(0, len(data.index)):
  if(data['position'][x] == 0):
    points.append(x)

data = data.drop(['position'], axis=1)
# Get the input and target sequences for the pairs
pairs = []
for x in range(0, len(points)-1):
  count = 0
  input = []
  target = []
  for y in range(points[x], points[x+1]):
    if(count < 3):
      # Get input sequence
      packet = [data['timestamp'][y], data['length'][y], data['res'][y], data['ns'][y], 
                data['cwr'][y], data['ecn'][y], data['urg'][y], data['ack'][y], 
                data['push'][y], data['reset'][y], data['syn'][y], data['fin'][y],
                data['protocol-TCP'][y], data['protocol-TELNET'][y], data['protocol-SMTP'][y], 
                data['protocol-HTTP'][y], data['protocol-SSHv1'][y],data['protocol-IRC'][y], 
                data['protocol-POP'][y], data['src_port_type-well_known'][y], 
                data['src_port_type-registered'][y], data['src_port_type-dynamic'][y],
                data['dst_port_type-well_known'][y], data['dst_port_type-registered'][y],
                data['dst_port_type-dynamic'][y]]
      input.append(packet)
      count = count + 1
    else:
      # Get target sequence
      packet = [data['timestamp'][y], data['length'][y], data['res'][y], data['ns'][y], 
                data['cwr'][y], data['ecn'][y], data['urg'][y], data['ack'][y], 
                data['push'][y], data['reset'][y], data['syn'][y], data['fin'][y],
                data['protocol-TCP'][y], data['protocol-TELNET'][y], data['protocol-SMTP'][y], 
                data['protocol-HTTP'][y], data['protocol-SSHv1'][y],data['protocol-IRC'][y], 
                data['protocol-POP'][y], data['src_port_type-well_known'][y], 
                data['src_port_type-registered'][y], data['src_port_type-dynamic'][y],
                data['dst_port_type-well_known'][y], data['dst_port_type-registered'][y],
                data['dst_port_type-dynamic'][y]]
      target.append(packet)
      count = count + 1

  pair = [input, target]
  pairs.append(pair)

print(random.choice(pairs))
print(len(pairs))

[[[0.48494809173918346, -0.39305105735673224, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0.4849500025490883, -0.39305105735673224, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0.48495008966524905, -0.39305105735673224, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]], [[0.4849502213488567, -0.39305105735673224, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]]]
81895


# Make the tensors

The final step of data preparation requires us to transform the [input, target] pairs into tensors so that the model can read them. This is done simply with the following two functions:

1.   tensor_from_sequence - converts the token vectors into tensors
2.   tensors_from_pair - calls the previous function for the input and target sequences of a given pair and returns the resulting tensors

In [0]:
# Functions to prepare the data for insertion into the model
def tensor_from_sequence(sequence):  # Create a tensor from a sequence using the dictionary
    return torch.tensor(sequence, dtype=torch.float, device=device)

def tensors_from_pair(pair):  # Get an input and target tensor out of a pair
    input_tensor = tensor_from_sequence(pair[0])
    target_tensor = tensor_from_sequence(pair[1])
    return (input_tensor, target_tensor)

The following few segments of code define the two parts of the actual seq2seq model itself: the encoder and the decoder. Each of these two parts is a recurrent neural network (RNN), which is a neural network that performs some operation on a sequence of data and uses the output generated by that operation as input for the next step (recurrence). For these RNNs, we use the **Gated Recurrent Unit** (GRU) architecture, as opposed to the more commonly used **Long Short Term Memory** (LSTM) architecture. This is because, despite being a newer architecture, GRU works similarly to LSTM and has been shown to yield similar results while being slightly more efficient computationally. [This paper](https://arxiv.org/pdf/1412.3555v1.pdf) gives a more in-depth overview of the differences between the two architectures.
# Encoder
In a seq2seq model using an encoder and decoder, the responsibility of the encoder is to encode, or condense, the input sequence into a single vector while retaining the original meaning of that sequence. Upon creation, the encoder takes two parameters called input_size and hidden_size, where input_size is the number of features to be used for each token in a given sequence, and hidden_size is the number of features to be used for the hidden state. For each packet in the input sequence, the encoder will produce two things:



1.   A **vector** (called output_vector in the following code)
2.   A **hidden state** (called hidden_state in the following code)



Following this, a new input sequence and the previous hidden state will be taken as input to do the next step on the next packet in the sequence, and the output vector will be adjusted accordingly and a new hidden state produced. This process is repeated until a final output vector (the **context vector**) is reached, which will be given to the decoder later on. The forward function carries out these tasks in our implementation.

In [0]:
# Recurrent neural network for Encoder of the seq2seq model
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size) # Applies Gated Recurrent Unit (GRU) to input sequence

    def forward(self, input_token, hidden_state):
        input_token = input_token.unsqueeze(0).unsqueeze(0)
        output_vector, hidden_state = self.gru(input_token, hidden_state)
        return output_vector, hidden_state

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Decoder

The decoder, like the encoder, is a recurrent neural network using GRU architecture and takes a number of features and hidden size as parameters. However, the number of features for the decoder does not necessarily have to be equal to the number of features for the encoder. For instance, we could theoretically use the decoder to predict the source and destination IPs of packets (2 features), while passing input sequences with only source IPs (1 feature) to the encoder. In this implementation, however, this is not necessary, so the encoder and decoder take an equal number of features.

The decoder takes the context vector as its initial hidden state, and has a couple "extra" layers compared to the encoder, including the LeakyReLU activation layer. As before, the forward function carries out the necessary steps, taking an input token and hidden state as input, then producing an output vector and new hidden state. The output of each run of the decoder will be a single predicted packet within a larger sequence.

In [0]:
# Recurrent neural network for Decoder of seq2seq model
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(output_size, hidden_size) # Applies GRU
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.LeakyReLU()

    def forward(self, input_token, hidden_state):
        input_token = input_token.unsqueeze(0).unsqueeze(0)
        output_vector = self.relu(input_token)
        output_vector, hidden_state = self.gru(output_vector, hidden_state)
        output_vector = self.out(output_vector[0])
        return output_vector, hidden_state

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Helper functions

In [0]:
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Helper functions to keep track of the time elapsed and time remaining
def as_minutes(sec):
    mins = math.floor(sec / 60)
    sec = sec - (mins * 60)
    return '%dm %ds' % (mins, sec)

def time_since(since, percent):
    now = time.time()
    sec = now-since
    es = sec/(percent)
    rs = es-sec
    return '%sec (- %sec)' % (as_minutes(sec), as_minutes(rs))

# Plot loss vs number of iterations
def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.1) # Put plot ticks at intervals of 0.1
    ax.yaxis.set_major_locator(loc)
    plt.title("Training - Loss vs Num of Iterations")
    plt.xlabel("Number of iterations")
    plt.ylabel("Loss value")
    plt.plot(points)

# Training function
The training function performs the core aspect of machine learning model optimization. In this implementation, it takes several parameters:

1. Input tensor which has the tensor representation of the first 3 packets in the sequence
2. Target tensor
3. The encoder and decoder that the function is going to be training
4. Optimizers for the encoder and decoder to oversee the machine learning process
5. A function (Mean Squared Error loss) to calculate the training loss, which is called "criterion" in the code

The function follows a process of several steps, beginning with preparation, which includes initializing the hidden state of the encoder, wiping the gradients of the optimizers, and getting the lengths of the input and target sequences. Next, we have to run the encoder on all of the tokens in the input sequence, which is handled in a loop. Once the encoder has produced a context vector, it is passed to the decoder along with the decoder's input token – the most recently predicted token if teacher forcing is not being used, and the actual target token if it is. After each prediction, the total loss is increased by comparing the predicted token to the expected token using the loss function. Finally, the optimizers are updated using the results of this iteration of the model, and the average loss is returned.

In [0]:
teacher_forcing_ratio = 0 # Make 0 if don't want teacher forcing

# Training function
# Criterion = negative log likelihood loss (NLLLoss)
# Lines commented out are for attn decoder
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    encoder_hidden = encoder.init_hidden() # Initialize hidden state of the encoder
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0) # length of the input sequence
    target_length = target_tensor.size(0) # length of the target sequence
    loss = 0

# loop through the input tokens w/ encoder and get the final vector/hidden state
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = target_tensor[0]
    decoder_hidden = encoder_hidden # Initialize the hidden state of the decoder
    # Decide whether to use teacher forcing on this run
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Run the decoder for each element of the target sequence
    if use_teacher_forcing:
      for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss = loss + criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing

    else:
      for di in range(target_length):
        #print(decoder_input)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        decoder_input = decoder_output.squeeze()
        loss = loss + criterion(decoder_output, target_tensor[di].unsqueeze(0))

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length

# Training overhead
In order to train the model using an entire training dataset, an overhead function is required to oversee the process. This function takes the encoder and decoder as parameters, along with a few user-defined training variables including the number of iterations to train over, intervals at which to print and plot results, and the learning rate, which determines how much the optimizer will be influenced by each iteration.

The function defines the optimizers for the encoder and decoder, takes a series of random input, target pairs, defines the criterion function, and runs the train function on a different pair for the number of iterations specified by the user. Periodically, progress is printed using the previously defined helper functions, so that the user can keep track of the model's performance. Additionally, "snapshots" of the model's optimized parameters are saved at the 20%, 40%, 55%, 70%, 85%, and 100% milestones in the training process so that these models can be evaluated individually and examined for overfitting and underfitting.

In [0]:
# Repeatedly run the train function and print evaluation info as it goes
def train_iterations(encoder, decoder, n_iterations, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Get the training pairs
    training_pairs = [tensors_from_pair(random.choice(pairs)) for i in range(n_iterations)]
    criterion = nn.MSELoss()

    # Loop to train the model with the specified number of iterations
    for iteration in range(1, n_iterations + 1):
        training_pair = training_pairs[iteration - 1]
        input_tensor = training_pair[0] # Get an input tensor from the pair
        target_tensor = training_pair[1]  # Get a target tensor from the pair

        # Train the model on the pairs
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss_total = loss_total + loss
        plot_loss_total = plot_loss_total + loss

        if(iteration == 40000):
          torch.save(encoder.state_dict(), '/content/encoder40percent.dict')
          torch.save(decoder.state_dict(), '/content/decoder40percent.dict')
        
        if(iteration == 80000):
          torch.save(encoder.state_dict(), '/content/encoder80percent.dict')
          torch.save(decoder.state_dict(), '/content/decoder80percent.dict')

        if(iteration == 100000):
          torch.save(encoder.state_dict(), '/content/encoder100percent.dict')
          torch.save(decoder.state_dict(), '/content/decoder100percent.dict')

        # If it has reached the print interval, print progress information
        if iteration % print_every == 0:
            loss_avg = loss_total/print_every
            loss_total = 0
            print('%s (%d %d%%) Current Loss Value ----> %.4f' % (time_since(start, iteration / n_iterations), iteration, iteration / n_iterations * 100, loss_avg))

        # If it has reached the plot interval, add info to the plot_losses array
        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    print("=================================================================")
    print("-------> MODEL HAS FINISHED TRAINING <-------\n")
    print("Plotting Model......")
    show_plot(plot_losses)

# Train the model

Define a size for the hidden vector, initialize an encoder and decoder, then run the train_iterations function. In our implementation, data points with 25 input features are used to produce predictions with 25 output features.

In [0]:
hidden_size = 256
encoder1 = Encoder(input_size=25,hidden_size=hidden_size).to(device)
decoder1 = Decoder(hidden_size=hidden_size,output_size=25).to(device)

train_iterations(encoder1, decoder1, n_iterations=100000, print_every=100, plot_every=100, learning_rate=0.0001)