In [0]:
import pandas as pd
import numpy as np
import time
import os
import random
import psutil
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Read the data and filter out unnecessary columns
data = pd.read_csv("testing_sequences.csv", delimiter=',', header=0)
data = data.filter(items=['position','timestamp','protocol','length','src_port','dst_port',
                          'res', 'ns', 'cwr', 'ecn', 'urg', 'ack', 'push', 'reset', 'syn', 'fin'])
print(data)

        position     timestamp protocol  length  src_port  dst_port  res  ns  cwr  ecn  urg  ack  push  reset  syn  fin
0              0     30.208283      TCP      60        25      1024    0   0    0    0    0    1     0      0    1    0
1              1     30.208486      TCP      60      1024        25    0   0    0    0    0    1     0      0    0    0
2              2     30.441850     SMTP     145        25      1024    0   0    0    0    0    1     1      0    0    0
3              3     30.456735      TCP      60      1024        25    0   0    0    0    0    1     0      0    0    0
4              4     30.479638     SMTP      78      1024        25    0   0    0    0    0    1     1      0    0    0
...          ...           ...      ...     ...       ...       ...  ...  ..  ...  ...  ...  ...   ...    ...  ...  ...
841078         4  27045.279727      TCP      60     25134        23    0   0    0    0    0    1     0      0    0    0
841079         0  27045.303068   TELNET 

# Feature vector encoding

In [0]:
# Encode a numeric feature
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

# Encode a categorical feature
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

data['src_port_type'] = ""
data['dst_port_type'] = ""
data['src_port_type'][data['src_port']<1024] = 'well_known'
data['src_port_type'][data['src_port']>49151] = 'dynamic'
data['src_port_type'][data['src_port_type'] == ""] = 'registered'
data['dst_port_type'][data['dst_port']<1024] = 'well_known'
data['dst_port_type'][data['dst_port']>49151] = 'dynamic'
data['dst_port_type'][data['dst_port_type'] == ""] = 'registered'
data = data.drop(['src_port'], axis=1)
data = data.drop(['dst_port'], axis=1)

encode_numeric_zscore(data, "timestamp")
encode_numeric_zscore(data, "length")
encode_text_dummy(data, "protocol")
encode_text_dummy(data, "src_port_type")
encode_text_dummy(data, "dst_port_type")

data.dropna(inplace=True,axis=1)
print(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

        position  timestamp    length  res  ns  cwr  ecn  urg  ack  push  reset  syn  fin  protocol-HTTP  protocol-IRC  protocol-POP  protocol-SMTP  protocol-SSHv1  protocol-TCP  protocol-TELNET  src_port_type-dynamic  src_port_type-registered  src_port_type-well_known  dst_port_type-dynamic  dst_port_type-registered  dst_port_type-well_known
0              0  -2.084844 -0.500428    0   0    0    0    0    1     0      0    1    0              0             0             0              0               0             1                0                      0                         0                         1                      0                         1                         0
1              1  -2.084844 -0.500428    0   0    0    0    0    1     0      0    0    0              0             0             0              0               0             1                0                      0                         1                         0                      0                  

# Data preparation

In [0]:
# Get the starting points of each sequence within the dataset
points = []
for x in range(0, len(data.index)):
  if(data['position'][x] == 0):
    points.append(x)

data = data.drop(['position'],axis=1)
# Get the input and target sequences for the pairs
pairs = []
for x in range(0, len(points)-1):
  count = 0
  input = []
  target = []
  for y in range(points[x], points[x+1]):
    if(count < 3):
      # Get input sequence
      packet = [data['timestamp'][y], data['length'][y], data['res'][y], data['ns'][y], 
                data['cwr'][y], data['ecn'][y], data['urg'][y], data['ack'][y], 
                data['push'][y], data['reset'][y], data['syn'][y], data['fin'][y],
                data['protocol-TCP'][y], data['protocol-TELNET'][y], data['protocol-SMTP'][y], 
                data['protocol-HTTP'][y], data['protocol-SSHv1'][y],data['protocol-IRC'][y], 
                data['protocol-POP'][y], data['src_port_type-well_known'][y], 
                data['src_port_type-registered'][y], data['src_port_type-dynamic'][y],
                data['dst_port_type-well_known'][y], data['dst_port_type-registered'][y],
                data['dst_port_type-dynamic'][y]]
      input.append(packet)
      count = count + 1
    else:
      # Get target sequence
      packet = [data['timestamp'][y], data['length'][y], data['res'][y], data['ns'][y], 
                data['cwr'][y], data['ecn'][y], data['urg'][y], data['ack'][y], 
                data['push'][y], data['reset'][y], data['syn'][y], data['fin'][y],
                data['protocol-TCP'][y], data['protocol-TELNET'][y], data['protocol-SMTP'][y], 
                data['protocol-HTTP'][y], data['protocol-SSHv1'][y],data['protocol-IRC'][y], 
                data['protocol-POP'][y], data['src_port_type-well_known'][y], 
                data['src_port_type-registered'][y], data['src_port_type-dynamic'][y],
                data['dst_port_type-well_known'][y], data['dst_port_type-registered'][y],
                data['dst_port_type-dynamic'][y]]
      target.append(packet)
      count = count + 1
      
  pair = [input, target]
  pairs.append(pair)

# Make the Tensors

In [0]:
# Functions to prepare the data for insertion into the model
def tensor_from_sequence(sequence):  # Create a tensor from a sequence using the dictionary
    return torch.tensor(sequence, dtype=torch.float, device=device)

def tensors_from_pair(pair):  # Get an input and target tensor out of a pair
    input_tensor = tensor_from_sequence(pair[0])
    target_tensor = tensor_from_sequence(pair[1])
    return (input_tensor, target_tensor)

# Encoder


In [0]:
# Recurrent neural network for Encoder of the seq2seq model
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size) # Applies Gated Recurrent Unit (GRU) to input sequence

    def forward(self, input_token, hidden_state):
        input_token = input_token.unsqueeze(0).unsqueeze(0)
        output_vector, hidden_state = self.gru(input_token, hidden_state)
        return output_vector, hidden_state

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Decoder 

In [0]:
# Recurrent neural network for Decoder of seq2seq model
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(output_size, hidden_size) # Applies GRU
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.LeakyReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_token, hidden_state):
        input_token = input_token.unsqueeze(0).unsqueeze(0)
        output_vector = self.relu(input_token)
        output_vector, hidden_state = self.gru(output_vector, hidden_state)
        output_vector = self.out(output_vector[0])
        return output_vector, hidden_state

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Load the models

In [0]:
# Load the encoder and set it to eval mode
encoder1 = Encoder(25, 256).to(device)
encoder1.load_state_dict(torch.load('/content/encoder80percent.dict'))
encoder1.eval()

# Load the decoder and set it to eval mode
decoder1 = Decoder(256, 25).to(device)
decoder1.load_state_dict(torch.load('/content/decoder80percent.dict'))
decoder1.eval()

Decoder(
  (gru): GRU(25, 256)
  (out): Linear(in_features=256, out_features=25, bias=True)
  (relu): LeakyReLU(negative_slope=0.01)
  (softmax): LogSoftmax()
)

# Anomaly Detection & Evaluation

In [0]:
# Evaluate the seq2seq model on this dataset with a specified threshold for anomaly detection
def test_model(encoder, decoder, print_every=1000, threshold=1):
  normal_rmse_score = 0
  atk_rmse_score = 0
  rmse_score = 0
  testing_pairs = []
  for x in range(len(pairs)):
    testing_pairs.append(tensors_from_pair(pairs[x]))
  
  count_anomalies = 0 # Total possible anomalous sequences discovered
  loss_func = nn.MSELoss()

  for sample in range(len(testing_pairs)):
    test_pair = testing_pairs[sample]
    input_tensor = test_pair[0]
    target_tensor = test_pair[1]
    loss = test(input_tensor, target_tensor, encoder, decoder, loss_func)
    rmse_score = rmse_score + loss
    if(loss > threshold):
      atk_rmse_score = atk_rmse_score + loss
      count_anomalies = count_anomalies + 1
    elif(loss < threshold):
      normal_rmse_score = normal_rmse_score + loss
    
    if(sample % print_every == 0):
      pct = (sample/len(testing_pairs))
      print('Percent complete: '+'{:.0%}'.format(pct))

  return count_anomalies, rmse_score/len(testing_pairs), (normal_rmse_score/(len(testing_pairs)-count_anomalies)), (atk_rmse_score/count_anomalies)

# Test with a given input, target pair
def test(input_tensor, target_tensor, encoder, decoder, loss_func):
  encoder_hidden = encoder.init_hidden()
  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)
  loss = 0

  # Read until the end of file
  for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

  decoder_input = target_tensor[0]
  decoder_hidden = encoder_hidden

  for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    decoder_input = decoder_output.squeeze()
    loss = loss + torch.sqrt(loss_func(decoder_output, target_tensor[di].unsqueeze(0)))

  return loss.item() / target_length

# Test the model

In [0]:
count_anomalies, rmse_score, normal_rmse_score, atk_rmse_score = test_model(encoder1, decoder1, print_every = 1000, threshold = 0.5)

print('Potentially anomalous sequences: ', count_anomalies)
print('Overall RMSE score: ', rmse_score)
print('Normal RMSE score: ', normal_rmse_score)
print('Attack RMSE score: ', atk_rmse_score)
print("---> Finished Testing Function <---")

Percent complete: 0%
Percent complete: 2%
Percent complete: 3%
Percent complete: 5%
Percent complete: 6%
Percent complete: 8%
Percent complete: 9%
Percent complete: 11%
Percent complete: 12%
Percent complete: 14%
Percent complete: 15%
Percent complete: 17%
Percent complete: 18%
Percent complete: 20%
Percent complete: 22%
Percent complete: 23%
Percent complete: 25%
Percent complete: 26%
Percent complete: 28%
Percent complete: 29%
Percent complete: 31%
Percent complete: 32%
Percent complete: 34%
Percent complete: 35%
Percent complete: 37%
Percent complete: 39%
Percent complete: 40%
Percent complete: 42%
Percent complete: 43%
Percent complete: 45%
Percent complete: 46%
Percent complete: 48%
Percent complete: 49%
Percent complete: 51%
Percent complete: 52%
Percent complete: 54%
Percent complete: 55%
Percent complete: 57%
Percent complete: 59%
Percent complete: 60%
Percent complete: 62%
Percent complete: 63%
Percent complete: 65%
Percent complete: 66%
Percent complete: 68%
Percent complete: