In [1]:
import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append("../../")
from TCN.mnist_pixel.model import TCN

In [2]:
def customOneHotEncoder(data):
    dataAdjust = data.ljust(200,'0')[:200] # padding if not of length and adjusting the data lenght to get a 200x39 input matrix
    # define universe of possible input values
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz,._'
    # define a mapping of chars to integers
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    int_to_char = dict((i, c) for i, c in enumerate(alphabet))
    # integer encode input data
    integer_encoded = [char_to_int[char] for char in dataAdjust]
    #print(integer_encoded)
    # one hot encode
    onehot_encoded = list()
    for i, value in enumerate(integer_encoded):
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    #print(onehot_encoded) # the real encoding
    return onehot_encoded

In [3]:
def dataPreprocessing(df):
    
    #prepare the imput data
    xString = df.iloc[:,:41].to_string(header=False, index=False, index_names = False).split('\n')
    xList = [','.join(ele.split()) for ele in xString] # gives comma separated strings for each row of DataFrame
    xData = []
    for string in xList:
        stringLower = string.lower()
        oneHot = customOneHotEncoder(stringLower)
        xData.append(oneHot)
    xMid = np.array(xData)
    xArray = xMid.transpose(0,2,1) # convert xMid's dim (size, 200, 39) to (size, 39, 200)
    
    #prepare the label data
    df.iloc[:, 41] = np.where(df.iloc[:, 41]=='normal', 0, 1) # replacing normals with 0 and anything else with 1
    Ydf = df.iloc[:, 41]
    #labelName = Ydf.unique().tolist().sort() # sorted 38 label names
    #yArray = Ydf.str.get_dummies().to_numpy() # ndarray of shape(rows/lines, 38)
    yArray = Ydf.to_numpy()
    
    assert xArray.shape[0] == yArray.shape[0], 'unequal input and label sample size'
    
    
    return xArray, yArray # return processed array of input and label

In [4]:
def datasetTorch(x, y):
    xTorch = torch.tensor(x)
    yTorch = torch.tensor(y)
    Dataset = torch.utils.data.TensorDataset(xTorch, yTorch)
    return Dataset

In [18]:
def get_metrics(dataLoader):
    total = 0
    correct = 0
    precision = 0
    recall = 0
    f1_score = 0
    accuracy = 0
    with torch.no_grad():
        for data, target in dataLoader: # just 1 batch
            model.eval()
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            #total+=target.size[0]
            #correct+=(predicted == target).sum().item()
            report = classification_report(target, predicted)
            print(report)
            '''precision += report['macro avg']['precision']
            recall += report['macro avg']['recall']
            f1_score += report['macro avg']['f1_score']
            accuracy += report['accuracy']'''
    #print("Precision: {}, Recall: {}, F1-Score: {}, Accuracy: {}, AccuracyCust: {}".format(precision, recall, f1_score, accuracy))

In [14]:
def train(ep,dataGeneratorTrain):
    global steps
    train_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(dataGeneratorTrain):
        # print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        optimizer.zero_grad()
        data = data.view(-1, input_channels, seq_length)
        data, target = Variable(data), Variable(target)
        #print('data Shape: {} target shape: {} data type: {}'.format(data.shape, target.shape, type(data)))
        #print(target)
        optimizer.zero_grad()
        #print(data[0])
        data = data.type(torch.FloatTensor)
        output = model(data)
        #print(output.shape)
        target = target.type(torch.LongTensor)
        #loss1 = torch.nn.CrossEntropyLoss()
        loss = F.nll_loss(output, target) # negative log likelihood
        #loss = loss1(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss
        steps += seq_length
        if batch_idx > 0 and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tSteps: {}'.format(
                ep, batch_idx * batch_size, len(dataGeneratorTrain.dataset),
                100. * batch_idx / len(dataGeneratorTrain), train_loss.item()/100, steps))
            train_loss = 0

In [7]:
def test(dataGeneratorTest):
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in dataGeneratorTest:
            model.eval()
            #data = data.view(-1, input_channels, seq_length)
            data = data.type(torch.FloatTensor)
            target = target.type(torch.LongTensor)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            #loss1 = torch.nn.CrossEntropyLoss()
            test_loss += F.nll_loss(output, target, size_average=False).item()
            #test_loss += loss1(output, target).item()
            #print(output.data.max(1, keepdim=True)[1])
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        test_loss /= len(dataGeneratorTest.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(dataGeneratorTest.dataset),
            100. * correct / len(dataGeneratorTest.dataset)))
        return test_loss

In [8]:
batch_size = 64
n_classes = 2
input_channels = 39
seq_length = int(200)
epochs = 10
steps = 0

In [9]:
channel_sizes = [32] * 6 #hidden nodes times levels 
kernel_size = 5
model = TCN(input_channels, n_classes, channel_sizes, kernel_size=kernel_size, dropout=0.05)


lr = 1e-5
optimizer = getattr(optim, 'Adam')(model.parameters(), lr=lr)

In [10]:
# splitting the whole data into train and test data
totDf = pd.read_csv('KDDTrain+.csv', header = None)
trainDf, testDf = train_test_split(totDf, test_size = 0.2, random_state = 2)

# generating test dataset
xTest, yTest = dataPreprocessing(testDf)
DatasetTest = datasetTorch(xTest, yTest)
DataGeneratorTest = DataLoader(DatasetTest, batch_size = len(DatasetTest), shuffle = True)

# add model TCN or TCAN

# generating folds for cross validation
xTrain, yTrain = dataPreprocessing(trainDf)
kf= KFold(n_splits=8)
kf.get_n_splits(xTrain) # splitting up the traindata
fold = 0
for trainIndex, valIndex in kf.split(xTrain):
    #print("Train: ", trainIndex, "TEST: ", valIndex)
    fold += 1
    print("Now running fold {}.".format(fold))
    X_train, X_val = xTrain[trainIndex], xTrain[valIndex]
    Y_train, Y_val = yTrain[trainIndex], yTrain[valIndex]
    # generating train dataset
    DatasetTrain = datasetTorch(X_train, Y_train)
    DataGeneratorTrain = DataLoader(DatasetTrain, batch_size = 64, shuffle = True)
    # generating validation dataset
    DatasetVal = datasetTorch(X_val, Y_val)
    DataGeneratorVal = DataLoader(DatasetVal, batch_size = len(DatasetVal), shuffle = True)

    # train function on DataGeneratorTrain
    # test function on DataGenertatorVal and average the accuracy for all the validation folds
    
    # make code below a function
    if __name__ == "__main__":
        for epoch in range(1, epochs+1):
            train(epoch, DataGeneratorTrain)
            test(DataGeneratorTest)
            if epoch % 10 == 0:
                lr /= 10
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
    
# get_metric function on DataGeneratorTest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)




  data, target = Variable(data, volatile=True), Variable(target)



Test set: Average loss: 0.6907, Accuracy: 3398/6452 (53%)


Test set: Average loss: 0.6880, Accuracy: 3398/6452 (53%)


Test set: Average loss: 0.6799, Accuracy: 3398/6452 (53%)


Test set: Average loss: 0.6455, Accuracy: 4352/6452 (67%)


Test set: Average loss: 0.4460, Accuracy: 5883/6452 (91%)


Test set: Average loss: 0.1924, Accuracy: 5954/6452 (92%)


Test set: Average loss: 0.1514, Accuracy: 6189/6452 (96%)


Test set: Average loss: 0.1307, Accuracy: 6205/6452 (96%)


Test set: Average loss: 0.1173, Accuracy: 6220/6452 (96%)


Test set: Average loss: 0.1069, Accuracy: 6230/6452 (97%)


Test set: Average loss: 0.1060, Accuracy: 6229/6452 (97%)


Test set: Average loss: 0.1051, Accuracy: 6230/6452 (97%)


Test set: Average loss: 0.1043, Accuracy: 6230/6452 (97%)


Test set: Average loss: 0.1034, Accuracy: 6230/6452 (97%)


Test set: Average loss: 0.1025, Accuracy: 6228/6452 (97%)


Test set: Average loss: 0.1017, Accuracy: 6230/6452 (97%)


Test set: Average loss: 0.1009, Accurac

KeyboardInterrupt: 

In [19]:
get_metrics(DataGeneratorTest)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      3398
           1       0.97      0.96      0.96      3054

    accuracy                           0.97      6452
   macro avg       0.97      0.96      0.97      6452
weighted avg       0.97      0.97      0.97      6452



In [21]:
fileNameTest = 'KDDTest+.csv'
testDf = pd.read_csv(fileNameTest, header = None)
xTest, yTest = dataPreprocessing(testDf)
DatasetTest = datasetTorch(xTest, yTest)
DataGeneratorTest = DataLoader(DatasetTest, batch_size = len(DatasetTest), shuffle = True)

In [22]:
get_metrics(DataGeneratorTest)

              precision    recall  f1-score   support

           0       0.67      0.91      0.77      3594
           1       0.91      0.67      0.77      4902

    accuracy                           0.77      8496
   macro avg       0.79      0.79      0.77      8496
weighted avg       0.81      0.77      0.77      8496

