In [1]:
import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import sys
sys.path.append("../../")
from TCN.mnist_pixel.utils import data_generator
from TCN.mnist_pixel.modeltcan import TCAN
import numpy as np
import argparse

In [3]:
def customOneHotEncoder(data, length=200, alphabet='0123456789abcdefghijklmnopqrstuvwxyz,._'):
    """One-hot encode a string into a fixed-size ``length`` x ``len(alphabet)`` matrix.

    The input is right-padded with ``'0'`` characters, or truncated, so that
    exactly ``length`` characters are encoded. With the defaults this yields a
    200 x 39 matrix.

    Args:
        data: The string to encode. Callers are expected to lower-case it
            first; the default alphabet contains no upper-case letters.
        length: Number of characters (rows) in the result. Defaults to 200.
        alphabet: The universe of encodable characters; a character's position
            in this string is the column that is set to 1.

    Returns:
        A list of ``length`` rows, each a list of ``len(alphabet)`` ints.
        A row holds a single 1 at the column of its character. Characters that
        are not in ``alphabet`` (for example ``'-'`` in ``'1e-05'``) produce an
        all-zero row instead of raising ``KeyError``.
    """
    # Pad first, then slice, so the result always has exactly `length` rows.
    dataAdjust = data.ljust(length, '0')[:length]

    # Map each alphabet character to its column index.
    char_to_int = {c: i for i, c in enumerate(alphabet)}
    width = len(alphabet)

    onehot_encoded = []
    for char in dataAdjust:
        letter = [0] * width
        position = char_to_int.get(char)
        # Unknown characters keep their slot in the sequence but carry no signal.
        if position is not None:
            letter[position] = 1
        onehot_encoded.append(letter)
    return onehot_encoded

# takes a .csv filename
def dataPreprocessing(fileName):
    """Load a headerless CSV and turn it into one-hot inputs and string labels.

    The first 41 columns of every row are rendered as text, collapsed into a
    comma-separated lower-case string and encoded with ``customOneHotEncoder``.
    Column 41 is reduced to two classes: ``'normal'`` stays as is, every other
    value becomes ``'attack'``.

    Args:
        fileName: Path of the .csv file to read (read with ``header=None``).

    Returns:
        ``(xArray, yArray)``: ``xArray`` is the stacked encodings with the last
        two axes swapped, i.e. (rows, alphabet size, sequence length);
        ``yArray`` is a 1-D array with one ``'normal'``/``'attack'`` label per row.

    Raises:
        AssertionError: If the number of encoded rows and labels differ.
    """
    frame = pd.read_csv(fileName, header=None)

    # Inputs: pandas renders the feature columns as aligned text, one line per
    # row; whitespace-splitting each line and re-joining gives a comma-separated
    # string per row.
    rendered = frame.iloc[:, :41].to_string(header=False, index=False, index_names=False)
    encoded_rows = [
        customOneHotEncoder(','.join(line.split()).lower())
        for line in rendered.split('\n')
    ]
    # (rows, sequence, alphabet) -> (rows, alphabet, sequence)
    xArray = np.transpose(np.array(encoded_rows), (0, 2, 1))

    # Labels: anything that is not 'normal' is treated as 'attack'.
    frame[41] = np.where(frame[41] == 'normal', 'normal', 'attack')
    yArray = frame[41].to_numpy()

    assert xArray.shape[0] == yArray.shape[0], 'unequal input and label sample size'

    return xArray, yArray

In [15]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

class NSLKDDDataset(Dataset):
    """Map-style dataset that one-hot encodes rows of a headerless CSV on demand.

    For each row, the first 41 columns are joined into one comma-separated,
    lower-cased string and passed to ``customOneHotEncoder``; column 41 is read
    as the textual class label.

    NOTE(review): the 41-feature / label-in-column-41 split is hard-coded.
    Confirm it matches the layout of every CSV this class is used with.
    """

    def __init__(self, fileName):
        """Read the whole CSV at ``fileName`` (no header row) into memory."""
        self.data = pd.read_csv(fileName, header = None)

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensor pair for row ``idx``.

        ``x`` is the transposed one-hot encoding of the row's first 41 columns,
        shaped (alphabet size, sequence length) — (39, 200) with the encoder's
        defaults. ``y`` is a 0-dim integer tensor: 0 when column 41 equals
        ``'normal'``, 1 otherwise.
        """
        # Deliberately no per-sample print here: this method runs once for
        # every sample a DataLoader fetches, so a debug print floods the output.

        # Build the comma-separated, lower-cased text form of the feature columns.
        features = self.data.iloc[[idx], :41].values[0]
        stringLower = ','.join(str(value) for value in features).lower()

        # Encoder output is (sequence, alphabet); transpose to (alphabet, sequence).
        xArray = np.array(customOneHotEncoder(stringLower)).transpose()

        # Binary label: 'normal' -> 0, anything else -> 1.
        yArray = np.where(self.data.iloc[idx, 41] == 'normal', 0, 1)

        return torch.from_numpy(xArray), torch.from_numpy(yArray)

In [16]:
# Batching options shared by the train and test loaders.
params = dict(batch_size=64, shuffle=True)

# Training split: CSV path -> dataset -> batched loader.
fileNameTrain = 'UNSW_NB15_training-set.csv'
datasetTrain = NSLKDDDataset(fileNameTrain)
dataGeneratorTrain = DataLoader(datasetTrain, **params)

# Test split, built the same way (and with the same shuffling option).
fileNameTest = 'UNSW_NB15_testing-set.csv'
datasetTest = NSLKDDDataset(fileNameTest)
dataGeneratorTest = DataLoader(datasetTest, **params)


In [13]:
# Read the UNSW-NB15 training CSV with header=None, so pandas labels the
# columns with integers, then print the frame's text representation.
# NOTE(review): this cell has no import of its own; it needs `pd` to already
# be in the kernel namespace.
a = pd.read_csv('UNSW_NB15_training-set.csv', header = None)
print(a)

          0         1    2  3    4   5   6      7    8              9   ...  \
0          1  0.000011  udp  -  INT   2   0    496    0   90909.090200  ...   
1          2  0.000008  udp  -  INT   2   0   1762    0  125000.000300  ...   
2          3  0.000005  udp  -  INT   2   0   1068    0  200000.005100  ...   
3          4  0.000006  udp  -  INT   2   0    900    0  166666.660800  ...   
4          5  0.000010  udp  -  INT   2   0   2126    0  100000.002500  ...   
...      ...       ...  ... ..  ...  ..  ..    ...  ...            ...  ...   
82327  82328  0.000005  udp  -  INT   2   0    104    0  200000.005100  ...   
82328  82329  1.106101  tcp  -  FIN  20   8  18062  354      24.410067  ...   
82329  82330  0.000000  arp  -  INT   1   0     46    0       0.000000  ...   
82330  82331  0.000000  arp  -  INT   1   0     46    0       0.000000  ...   
82331  82332  0.000009  udp  -  INT   2   0    104    0  111111.107200  ...   

       35  36  37  38  39  40  41  42      43  44  

In [14]:
# Read the KDDTrain+ CSV with header=None, so pandas labels the columns with
# integers, then print the frame's text representation.
# NOTE(review): this cell has no import of its own; it needs `pd` to already
# be in the kernel namespace.
b = pd.read_csv('KDDTrain+.csv', header = None)
print(b)

        0    1         2   3     4     5   6   7   8   9   ...    33    34  \
0        0  tcp  ftp_data  SF   491     0   0   0   0   0  ...  0.17  0.03   
1        0  udp     other  SF   146     0   0   0   0   0  ...  0.00  0.60   
2        0  tcp   private  S0     0     0   0   0   0   0  ...  0.10  0.05   
3        0  tcp      http  SF   232  8153   0   0   0   0  ...  1.00  0.00   
4        0  tcp      http  SF   199   420   0   0   0   0  ...  1.00  0.00   
...     ..  ...       ...  ..   ...   ...  ..  ..  ..  ..  ...   ...   ...   
125968   0  tcp   private  S0     0     0   0   0   0   0  ...  0.10  0.06   
125969   8  udp   private  SF   105   145   0   0   0   0  ...  0.96  0.01   
125970   0  tcp      smtp  SF  2231   384   0   0   0   0  ...  0.12  0.06   
125971   0  tcp    klogin  S0     0     0   0   0   0   0  ...  0.03  0.05   
125972   0  tcp  ftp_data  SF   151     0   0   0   0   0  ...  0.30  0.03   

          35    36    37    38    39    40       41  42  
0    