In [1]:
import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import sys
sys.path.append("../../")
from TCN.mnist_pixel.utils import data_generator
from TCN.mnist_pixel.model import TCN
import numpy as np
import argparse

In [2]:
def customOneHotEncoder(data):
    dataAdjust = data.ljust(200,'0')[:200] # padding if not of length and adjusting the data lenght to get a 200x39 input matrix
    # define universe of possible input values
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz,._'
    # define a mapping of chars to integers
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    # integer encode input data
    integer_encoded = [char_to_int[char] for char in dataAdjust]
    #print(integer_encoded)
    # one hot encode
    onehot_encoded = list()
    for i, value in enumerate(integer_encoded):
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    #print(onehot_encoded) # the real encoding
    return onehot_encoded

# takes a .csv filename
def dataPreprocessing(fileName):
    df = pd.read_csv(fileName, header = None)
    
    #prepare the imput data
    xString = df.iloc[:,:41].to_string(header=False, index=False, index_names = False).split('\n')
    xList = [','.join(ele.split()) for ele in xString] # gives comma separated strings for each row of DataFrame
    xData = []
    for string in xList:
        stringLower = string.lower()
        oneHot = customOneHotEncoder(stringLower)
        xData.append(oneHot)
    xMid = np.array(xData)
    xArray = xMid.transpose(0,2,1) # convert xMid's dim (size, 200, 39) to (size, 39, 200)
    
    #prepare the label data
    df[41] = np.where(df[41]=='normal', 'normal', 'attack') # replacing anything except 'normal' with 'attack'
    Ydf = df[41]
    #labelName = Ydf.unique().tolist().sort() # sorted 38 label names
    #yArray = Ydf.str.get_dummies().to_numpy() # ndarray of shape(rows/lines, 38)
    yArray = Ydf.to_numpy()
    
    assert xArray.shape[0] == yArray.shape[0], 'unequal input and label sample size'
    
    
    return xArray, yArray # return processed array of input and label

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

class NSLKDDDataset(Dataset):
    def __init__(self, fileName):
        self.data = pd.read_csv(fileName, header = None)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # prepare x data
        string = list(','.join('%s' %x for x in y) for y in self.data.iloc[[idx], :41].values)
        stringLower = string[0].lower()
        xData = customOneHotEncoder(stringLower) # Dim (200, 39)
        xMid = np.array(xData)
        xArray = xMid.transpose() # should be now (39, 200)
        
        # prepate y data
        #self.data.iloc[idx, 41] = np.where(self.data.iloc[idx, 41]=='normal', 0, 1) # replacing normals with 0 and anything else with 1
        yArray = np.where(self.data.iloc[idx, 41]=='normal', 0, 1)
        
        #yArray = Ydf.to_numpy()
    
        #assert xArray.shape == yArray.shape, 'unequal input and label sample size'
        
        return torch.from_numpy(xArray), torch.from_numpy(yArray) # returns torch tensor of x and y

In [4]:
params = {'batch_size': 64, 'shuffle': True}
fileNameTrain = 'KDDTrain+.csv'
fileNameTest = 'KDDTest+.csv'
datasetTrain = NSLKDDDataset(fileNameTrain)
datasetTest = NSLKDDDataset(fileNameTest)
dataGeneratorTrain = DataLoader(datasetTrain, **params)
dataGeneratorTest = DataLoader(datasetTest, **params)


In [5]:
root = './data/mnist'
batch_size = 64
n_classes = 2
input_channels = 39
seq_length = int(200)
epochs = 100
steps = 0

In [6]:
#train_loader, test_loader = data_generator(root, batch_size)

permute = torch.Tensor(np.random.permutation(784).astype(np.float64)).long()
channel_sizes = [32] * 6 #hidden nodes times levels 
kernel_size = 5
model = TCN(input_channels, n_classes, channel_sizes, kernel_size=kernel_size, dropout=0.25)


lr = 1e-5
optimizer = getattr(optim, 'Adam')(model.parameters(), lr=lr)

In [7]:
def train(ep):
    global steps
    train_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(dataGeneratorTrain):
        # print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        optimizer.zero_grad()
        data = data.view(-1, input_channels, seq_length)
        data, target = Variable(data), Variable(target)
        #print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        #print(target)
        optimizer.zero_grad()
        #print(data[0])
        data = data.type(torch.FloatTensor)
        output = model(data)
        #print(output.shape)
        target = target.type(torch.LongTensor)
        #loss1 = torch.nn.CrossEntropyLoss()
        loss = F.nll_loss(output, target) # negative log likelihood
        #loss = loss1(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss
        steps += seq_length
        if batch_idx > 0 and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tSteps: {}'.format(
                ep, batch_idx * batch_size, len(dataGeneratorTrain.dataset),
                100. * batch_idx / len(dataGeneratorTrain), train_loss.item()/100, steps))
            train_loss = 0

In [8]:
def test():
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataGeneratorTest:
            model.eval()
            #data = data.view(-1, input_channels, seq_length)
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            #loss1 = torch.nn.CrossEntropyLoss()
            test_loss += F.nll_loss(output, target, size_average=False).item()
            #test_loss += loss1(output, target).item()
            #print(output.data.max(1, keepdim=True)[1])
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        test_loss /= len(dataGeneratorTest.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(dataGeneratorTest.dataset),
            100. * correct / len(dataGeneratorTest.dataset)))
        return test_loss

In [9]:
if __name__ == "__main__":
    for epoch in range(1, epochs+1):
        train(epoch)
        test()
        if epoch % 10 == 0:
            lr /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr



  # Remove the CWD from sys.path while we load stuff.



Test set: Average loss: 0.6951, Accuracy: 9711/22544 (43%)


Test set: Average loss: 0.4165, Accuracy: 18116/22544 (80%)


Test set: Average loss: 0.5319, Accuracy: 17295/22544 (77%)


Test set: Average loss: 0.6145, Accuracy: 17301/22544 (77%)


Test set: Average loss: 0.6389, Accuracy: 17346/22544 (77%)


Test set: Average loss: 0.6647, Accuracy: 18174/22544 (81%)


Test set: Average loss: 0.6809, Accuracy: 18327/22544 (81%)




Test set: Average loss: 0.7459, Accuracy: 18279/22544 (81%)


Test set: Average loss: 0.7256, Accuracy: 18427/22544 (82%)


Test set: Average loss: 0.8012, Accuracy: 18226/22544 (81%)


Test set: Average loss: 0.7838, Accuracy: 18352/22544 (81%)


Test set: Average loss: 0.7948, Accuracy: 18326/22544 (81%)


Test set: Average loss: 0.7919, Accuracy: 18337/22544 (81%)




Test set: Average loss: 0.7983, Accuracy: 18319/22544 (81%)


Test set: Average loss: 0.7995, Accuracy: 18332/22544 (81%)



KeyboardInterrupt: 

In [11]:
from sklearn.metrics import classification_report
def get_metrics(dataLoader):
    total = 0
    correct = 0
    precision = 0
    recall = 0
    f1_score = 0
    accuracy = 0
    with torch.no_grad():
        for data, target in dataLoader: # just 1 batch
            model.eval()
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            #print(predicted)
            total+=target.size(0)
            correct+=(predicted == target).sum().item()
            report = classification_report(target, predicted, output_dict=True)
            precision += report['macro avg']['precision']
            recall += report['macro avg']['recall']
            f1_score += report['macro avg']['f1-score']
            accuracy += report['accuracy']
            print(report)
    print("Precision: {}, Recall: {}, F1-Score: {}, Accuracy: {}, AccuracyCust: {}".format(precision, recall, f1_score, accuracy, correct/total))
    

In [12]:
fileNameVal = 'Ds.csv'
datasetVal = NSLKDDDataset(fileNameVal)
params = {'batch_size': 22544, 'shuffle': True}
dataGeneratorVal = DataLoader(datasetVal, **params)
#RuntimeError: expected scalar type Int but found Float
#get_metrics(dataGeneratorVal)
def val():
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataGeneratorVal:
            model.eval()
            #data = data.view(-1, input_channels, seq_length)
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            #loss1 = torch.nn.CrossEntropyLoss()
            #test_loss += F.nll_loss(output, target, size_average=False).item()
            #test_loss += loss1(output, target).item()
            #print(output.data.max(1, keepdim=True)[1])
            pred = output.data.max(1, keepdim=True)[1]
            print(pred)
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        test_loss /= len(dataGeneratorTest.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(dataGeneratorVal.dataset),
            100. * correct / len(dataGeneratorVal.dataset)))
        return test_loss

val()


  app.launch_new_instance()


tensor([[1],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]])

Test set: Average loss: 0.0000, Accuracy: 18338/22544 (81%)



0.0

In [13]:
get_metrics(dataGeneratorVal)

{'0': {'precision': 0.7219579066204338, 'recall': 0.9219441870044279, 'f1-score': 0.8097865412445729, 'support': 9711}, '1': {'precision': 0.9252686581879128, 'recall': 0.7313176965635471, 'f1-score': 0.8169394150417827, 'support': 12833}, 'accuracy': 0.8134315117104329, 'macro avg': {'precision': 0.8236132824041733, 'recall': 0.8266309417839874, 'f1-score': 0.8133629781431778, 'support': 22544}, 'weighted avg': {'precision': 0.8376910007858641, 'recall': 0.8134315117104329, 'f1-score': 0.8138582600806088, 'support': 22544}}
Precision: 0.8236132824041733, Recall: 0.8266309417839874, F1-Score: 0.8133629781431778, Accuracy: 0.8134315117104329, AccuracyCust: 0.8134315117104329


In [14]:
get_metrics(dataGeneratorTrain)

{'0': {'precision': 0.9411764705882353, 'recall': 1.0, 'f1-score': 0.9696969696969697, 'support': 32}, '1': {'precision': 1.0, 'recall': 0.9375, 'f1-score': 0.967741935483871, 'support': 32}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.9705882352941176, 'recall': 0.96875, 'f1-score': 0.9687194525904204, 'support': 64}, 'weighted avg': {'precision': 0.9705882352941176, 'recall': 0.96875, 'f1-score': 0.9687194525904204, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 22}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 42}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 29}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'supp

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 39}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 25}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 35}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 29}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 36}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 28}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 1.0, 

{'0': {'precision': 0.9473684210526315, 'recall': 0.972972972972973, 'f1-score': 0.9599999999999999, 'support': 37}, '1': {'precision': 0.9615384615384616, 'recall': 0.9259259259259259, 'f1-score': 0.9433962264150944, 'support': 27}, 'accuracy': 0.953125, 'macro avg': {'precision': 0.9544534412955465, 'recall': 0.9494494494494494, 'f1-score': 0.951698113207547, 'support': 64}, 'weighted avg': {'precision': 0.9533464068825912, 'recall': 0.953125, 'f1-score': 0.9529952830188679, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 0.9666666666666667, 'f1-score': 0.983050847457627, 'support': 30}, '1': {'precision': 0.9714285714285714, 'recall': 1.0, 'f1-score': 0.9855072463768115, 'support': 34}, 'accuracy': 0.984375, 'macro avg': {'precision': 0.9857142857142858, 'recall': 0.9833333333333334, 'f1-score': 0.9842790469172193, 'support': 64}, 'weighted avg': {'precision': 0.9848214285714285, 'recall': 0.984375, 'f1-score': 0.9843558093834439, 'support': 64}}
{'0': {'precision': 0.97368421052

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 30}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 34}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.9714285714285714, 'recall': 1.0, 'f1-score': 0.9855072463768115, 'support': 34}, '1': {'precision': 1.0, 'recall': 0.9666666666666667, 'f1-score': 0.983050847457627, 'support': 30}, 'accuracy': 0.984375, 'macro avg': {'precision': 0.9857142857142858, 'recall': 0.9833333333333334, 'f1-score': 0.9842790469172193, 'support': 64}, 'weighted avg': {'precision': 0.9848214285714285, 'recall': 0.984375, 'f1-score': 0.9843558093834439, 'support': 64}}
{'0': {'precision': 0.9736842105263158, 'recall': 1.0, 'f1-score': 0.9866666666666666, 'support': 37}, '1': {'precision': 1.0, 'recall': 0.9629629629629629, 'f1-score': 0.9811320754716981, 'support': 27}, '

{'0': {'precision': 0.9655172413793104, 'recall': 0.9655172413793104, 'f1-score': 0.9655172413793104, 'support': 29}, '1': {'precision': 0.9714285714285714, 'recall': 0.9714285714285714, 'f1-score': 0.9714285714285714, 'support': 35}, 'accuracy': 0.96875, 'macro avg': {'precision': 0.9684729064039409, 'recall': 0.9684729064039409, 'f1-score': 0.9684729064039409, 'support': 64}, 'weighted avg': {'precision': 0.96875, 'recall': 0.96875, 'f1-score': 0.96875, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 38}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 26}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.9696969696969697, 'recall': 1.0, 'f1-score': 0.9846153846153847, 'support': 32}, '1': {'precision': 1.0, 'recall': 0.96875, 'f1-score': 0.9841269841269841, 'support': 32}, 'accu

{'0': {'precision': 0.9375, 'recall': 0.967741935483871, 'f1-score': 0.9523809523809523, 'support': 31}, '1': {'precision': 0.96875, 'recall': 0.9393939393939394, 'f1-score': 0.9538461538461539, 'support': 33}, 'accuracy': 0.953125, 'macro avg': {'precision': 0.953125, 'recall': 0.9535679374389052, 'f1-score': 0.9531135531135531, 'support': 64}, 'weighted avg': {'precision': 0.95361328125, 'recall': 0.953125, 'f1-score': 0.9531364468864469, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 38}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 26}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.96875, 'recall': 1.0, 'f1-score': 0.9841269841269841, 'support': 31}, '1': {'precision': 1.0, 'recall': 0.9696969696969697, 'f1-score': 0.9846153846153847, 'support': 33}, 'accuracy': 0.984375

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 27}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 37}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.9705882352941176, 'recall': 1.0, 'f1-score': 0.9850746268656716, 'support': 33}, '1': {'precision': 1.0, 'recall': 0.967741935483871, 'f1-score': 0.9836065573770492, 'support': 31}, 'accuracy': 0.984375, 'macro avg': {'precision': 0.9852941176470589, 'recall': 0.9838709677419355, 'f1-score': 0.9843405921213604, 'support': 64}, 'weighted avg': {'precision': 0.9848345588235294, 'recall': 0.984375, 'f1-score': 0.9843635307071201, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 0.9705882352941176, 'f1-score': 0.9850746268656716, 'support': 34}, '1': {'precision': 0.967741935483871, 'recall': 1.0, 'f1-score': 0.9836065573770492, 'support': 30}, 'a

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 36}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 28}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.972972972972973, 'recall': 0.9473684210526315, 'f1-score': 0.9599999999999999, 'support': 38}, '1': {'precision': 0.9259259259259259, 'recall': 0.9615384615384616, 'f1-score': 0.9433962264150944, 'support': 26}, 'accuracy': 0.953125, 'macro avg': {'precision': 0.9494494494494494, 'recall': 0.9544534412955465, 'f1-score': 0.951698113207547, 'support': 64}, 'weighted avg': {'precision': 0.9538601101101101, 'recall': 0.953125, 'f1-score': 0.9532547169811321, 'support': 64}}
{'0': {'precision': 0.96875, 'recall': 1.0, 'f1-score': 0.9841269841269841, 'support': 31}, '1': {'precision': 1.0, 'recall': 0.9696969696969697, 'f1-score': 0.9846153846153847,

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 33}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 31}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 26}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 38}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64}}
{'0': {'precision': 0.9714285714285714, 'recall': 1.0, 'f1-score': 0.9855072463768115, 'support': 34}, '1': {'precision': 1.0, 'recall': 0.9666666666666667, 'f1-score': 0.983050847457627, 'support': 30}, 'accuracy': 0.984375, 'macro avg': {'precision': 0.9857142857142858, 'recall': 0.9833333333333334, 'f1-score': 0.9842790469172193, 'support': 64},

KeyboardInterrupt: 