In [1]:
import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import sys
sys.path.append("../../")
from TCN.mnist_pixel.utils import data_generator
from TCN.mnist_pixel.model import TCN
import numpy as np
import argparse

In [2]:
def customOneHotEncoder(data):
    dataAdjust = data.ljust(200,'0')[:200] # padding if not of length and adjusting the data lenght to get a 200x39 input matrix
    # define universe of possible input values
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz,._'
    # define a mapping of chars to integers
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    # integer encode input data
    integer_encoded = [char_to_int[char] for char in dataAdjust]
    #print(integer_encoded)
    # one hot encode
    onehot_encoded = list()
    for i, value in enumerate(integer_encoded):
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    #print(onehot_encoded) # the real encoding
    return onehot_encoded

# takes a .csv filename
def dataPreprocessing(fileName):
    df = pd.read_csv(fileName, header = None)
    
    #prepare the imput data
    xString = df.iloc[:,:41].to_string(header=False, index=False, index_names = False).split('\n')
    xList = [','.join(ele.split()) for ele in xString] # gives comma separated strings for each row of DataFrame
    xData = []
    for string in xList:
        stringLower = string.lower()
        oneHot = customOneHotEncoder(stringLower)
        xData.append(oneHot)
    xMid = np.array(xData)
    xArray = xMid.transpose(0,2,1) # convert xMid's dim (size, 200, 39) to (size, 39, 200)
    
    #prepare the label data
    df[41] = np.where(df[41]=='normal', 'normal', 'attack') # replacing anything except 'normal' with 'attack'
    Ydf = df[41]
    #labelName = Ydf.unique().tolist().sort() # sorted 38 label names
    #yArray = Ydf.str.get_dummies().to_numpy() # ndarray of shape(rows/lines, 38)
    yArray = Ydf.to_numpy()
    
    assert xArray.shape[0] == yArray.shape[0], 'unequal input and label sample size'
    
    
    return xArray, yArray # return processed array of input and label

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

class NSLKDDDataset(Dataset):
    def __init__(self, fileName):
        self.data = pd.read_csv(fileName, header = None)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # prepare x data
        string = list(','.join('%s' %x for x in y) for y in self.data.iloc[[idx], :41].values)
        stringLower = string[0].lower()
        xData = customOneHotEncoder(stringLower) # Dim (200, 39)
        xMid = np.array(xData)
        xArray = xMid.transpose() # should be now (39, 200)
        
        # prepate y data
        #self.data.iloc[idx, 41] = np.where(self.data.iloc[idx, 41]=='normal', 0, 1) # replacing normals with 0 and anything else with 1
        yArray = np.where(self.data.iloc[idx, 41]=='normal', 0, 1)
        
        #yArray = Ydf.to_numpy()
    
        #assert xArray.shape == yArray.shape, 'unequal input and label sample size'
        
        return torch.from_numpy(xArray), torch.from_numpy(yArray) # returns torch tensor of x and y

In [4]:
#split contents of kddtrain
#output KDDVal.csv AND kddtrain.csv


In [5]:
params = {'batch_size': 64, 'shuffle': True}
fileNameTrain = 'KDDTrain+.csv'
fileNameTest = 'KDDTest+.csv'
#split kddtrain
#datasetVal = NLSKDDDataset(filenameVal)
dataset = NSLKDDDataset(fileNameTrain)
total_length= 125973
train_length = int(0.8 * total_length)
val_length = total_length - train_length
datasetTrain, datasetVal = torch.utils.data.random_split(dataset, [train_length, val_length])#125973 
datasetTest = NSLKDDDataset(fileNameTest)
dataGeneratorTrain = DataLoader(datasetTrain, **params)
dataGeneratorTest = DataLoader(datasetTest, **params)
dataGeneratorVal = DataLoader(datasetVal, **params)

print(len(dataGeneratorTrain.dataset), len(dataGeneratorVal.dataset))


100778 25195


In [6]:
root = './data/mnist'
batch_size = 64
n_classes = 2
input_channels = 39
seq_length = int(200)
epochs = 100
steps = 0

In [7]:
#train_loader, test_loader = data_generator(root, batch_size)

permute = torch.Tensor(np.random.permutation(784).astype(np.float64)).long()
channel_sizes = [32] * 6 #hidden nodes times levels 
kernel_size = 5
model = TCN(input_channels, n_classes, channel_sizes, kernel_size=kernel_size, dropout=0.25)


lr = 1e-5
optimizer = getattr(optim, 'Adam')(model.parameters(), lr=lr)

In [8]:
def train(ep):
    global steps
    train_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(dataGeneratorTrain):
        # print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        optimizer.zero_grad()
        data = data.view(-1, input_channels, seq_length)
        data, target = Variable(data), Variable(target)
        #print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        #print(target)
        optimizer.zero_grad()
        #print(data[0])
        data = data.type(torch.FloatTensor)
        output = model(data)
        #print(output.shape)
        target = target.type(torch.LongTensor)
        #loss1 = torch.nn.CrossEntropyLoss()
        loss = F.nll_loss(output, target) # negative log likelihood
        #loss = loss1(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss
        steps += seq_length
        if batch_idx > 0 and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tSteps: {}'.format(
                ep, batch_idx * batch_size, len(dataGeneratorTrain.dataset),
                100. * batch_idx / len(dataGeneratorTrain), train_loss.item()/100, steps))
            train_loss = 0

In [9]:
def test():
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataGeneratorVal:
            model.eval()
            #data = data.view(-1, input_channels, seq_length)
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            #loss1 = torch.nn.CrossEntropyLoss()
            test_loss += F.nll_loss(output, target, size_average=False).item()
            #test_loss += loss1(output, target).item()
            #print(output.data.max(1, keepdim=True)[1])
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        test_loss /= len(dataGeneratorTest.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(dataGeneratorVal.dataset),
            100. * correct / len(dataGeneratorVal.dataset)))
        return test_loss

In [10]:
if __name__ == "__main__":
    for epoch in range(1, epochs+1):
        train(epoch)
        test()
        if epoch % 10 == 0:
            lr /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr



  # Remove the CWD from sys.path while we load stuff.



Test set: Average loss: 0.7627, Accuracy: 13357/25195 (53%)


Test set: Average loss: 0.2021, Accuracy: 23612/25195 (94%)


Test set: Average loss: 0.1374, Accuracy: 24346/25195 (97%)


Test set: Average loss: 0.1191, Accuracy: 24417/25195 (97%)


Test set: Average loss: 0.1077, Accuracy: 24444/25195 (97%)


Test set: Average loss: 0.0962, Accuracy: 24498/25195 (97%)


Test set: Average loss: 0.0747, Accuracy: 24597/25195 (98%)


Test set: Average loss: 0.0528, Accuracy: 24849/25195 (99%)




Test set: Average loss: 0.0456, Accuracy: 24933/25195 (99%)


Test set: Average loss: 0.0413, Accuracy: 24955/25195 (99%)


Test set: Average loss: 0.0408, Accuracy: 24954/25195 (99%)


Test set: Average loss: 0.0404, Accuracy: 24956/25195 (99%)


Test set: Average loss: 0.0401, Accuracy: 24954/25195 (99%)



KeyboardInterrupt: 

In [11]:
from sklearn.metrics import classification_report
def get_metrics(dataLoader):
    total = 0
    correct = 0
    precision = 0
    recall = 0
    f1_score = 0
    accuracy = 0
    precisionList = []
    recallList = []
    f1_scoreList = []
    accuracyList = []
    with torch.no_grad():
        for data, target in dataLoader: # just 1 batch
            model.eval()
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            #print(predicted)
            total+=target.size(0)
            correct+=(predicted == target).sum().item()
            report = classification_report(target, predicted, output_dict=True)
            precision += report['macro avg']['precision']
            precisionList.append(report['macro avg']['precision'])
            recall += report['macro avg']['recall']
            recallList.append(report['macro avg']['recall'])
            f1_score += report['macro avg']['f1-score']
            f1_scoreList.append(report['macro avg']['f1-score'])
            accuracy += report['accuracy']
            accuracyList.append(report['accuracy'])
            #print(report)
    #print("Precision: {}, Recall: {}, F1-Score: {}, Accuracy: {}, AccuracyCust: {}".format(precision/total, recall/total, f1_score/total, accuracy/total, correct/total))
    return precisionList, recallList, f1_scoreList, accuracyList

In [12]:
fileNameVal = 'Ds.csv'
datasetVal = NSLKDDDataset(fileNameVal)
params = {'batch_size': 22544, 'shuffle': True}
dataGeneratorVal = DataLoader(datasetVal, **params)
#RuntimeError: expected scalar type Int but found Float
#get_metrics(dataGeneratorVal)
def val():
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataGeneratorTest:
            model.eval()
            #data = data.view(-1, input_channels, seq_length)
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            #loss1 = torch.nn.CrossEntropyLoss()
            #test_loss += F.nll_loss(output, target, size_average=False).item()
            #test_loss += loss1(output, target).item()
            #print(output.data.max(1, keepdim=True)[1])
            pred = output.data.max(1, keepdim=True)[1]
            #print(pred)
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        test_loss /= len(dataGeneratorTest.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(dataGeneratorVal.dataset),
            100. * correct / len(dataGeneratorVal.dataset)))
        return test_loss

val()


  app.launch_new_instance()



Test set: Average loss: 0.0000, Accuracy: 18349/22544 (81%)



0.0

In [13]:
get_metrics(dataGeneratorTrain)

([1.0,
  0.9880952380952381,
  1.0,
  1.0,
  0.986842105263158,
  1.0,
  1.0,
  1.0,
  0.9848484848484849,
  0.9833333333333334,
  0.9687194525904204,
  1.0,
  1.0,
  0.986842105263158,
  1.0,
  0.967741935483871,
  0.9722222222222222,
  0.9861111111111112,
  0.9848484848484849,
  1.0,
  0.98,
  0.9676113360323887,
  0.9827586206896552,
  1.0,
  0.9857142857142858,
  1.0,
  0.9838709677419355,
  0.9543589743589743,
  1.0,
  0.9827586206896552,
  1.0,
  1.0,
  0.9875,
  0.9857142857142858,
  0.9655172413793103,
  1.0,
  1.0,
  0.9807692307692308,
  0.9833333333333334,
  0.9642857142857143,
  1.0,
  0.9671794871794872,
  0.9864864864864865,
  0.9848484848484849,
  0.9696969696969697,
  1.0,
  1.0,
  1.0,
  0.9687194525904204,
  1.0,
  1.0,
  1.0,
  1.0,
  0.9848484848484849,
  0.9848484848484849,
  1.0,
  0.9827586206896552,
  0.9782608695652174,
  1.0,
  0.9714285714285714,
  1.0,
  1.0,
  1.0,
  0.9848484848484849,
  0.9687194525904204,
  1.0,
  1.0,
  0.9861111111111112,
  0.984848484

In [15]:
get_metrics(dataGeneratorVal)

([0.8230120314303593],
 [0.8265459602989258],
 [0.8138184762114316],
 [0.8139194464158978])

In [16]:
pList, rList, fList, accList = get_metrics(dataGeneratorTest)

In [17]:
pArr = np.array(pList)
pArr.mean()


0.8226786962065603

In [18]:
rArr = np.array(rList)
rArr.mean()


0.8264853246357611

In [19]:
fArr = np.array(fList)
fArr.mean()


0.8111749478177027

In [20]:
accArr = np.array(accList)
accArr.mean()



0.8136508498583569