In [1]:
import pandas as pd
import numpy as np

def customOneHotEncoder(data, length=200):
    """One-hot encode a string over a fixed 39-character alphabet.

    The input is right-padded with '0' and/or truncated so that exactly
    ``length`` characters are encoded. Each character becomes a list of
    39 integers containing a single 1 at that character's index in the
    alphabet ``0-9``, ``a-z``, ``,``, ``.``, ``_``.

    Note that the padding character '0' is also the digit zero (index 0),
    so padding is indistinguishable from literal trailing zeros.

    Parameters
    ----------
    data : str
        Text to encode. It must already be lower-case; upper-case letters
        are not part of the alphabet.
    length : int, optional
        Number of characters (rows) in the result. Defaults to 200, which
        yields a 200x39 matrix.

    Returns
    -------
    list[list[int]]
        ``length`` rows, each a one-hot list of 39 integers.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    KeyError
        If ``data`` (after padding/truncation) contains a character that
        is not in the alphabet.
    """
    if length < 0:
        raise ValueError('length must be non-negative')

    # universe of possible input values; a character's position here is its one-hot index
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz,._'
    char_to_int = {char: index for index, char in enumerate(alphabet)}

    # pad short inputs with '0' and cut long ones so every input has the same size
    dataAdjust = data.ljust(length, '0')[:length]

    onehot_encoded = []
    for position, char in enumerate(dataAdjust):
        try:
            hot_index = char_to_int[char]
        except KeyError:
            # keep the KeyError type, but say which character failed and where
            raise KeyError(
                f'unsupported character {char!r} at position {position}; '
                f'allowed characters are {alphabet!r}'
            ) from None
        letter = [0] * len(alphabet)
        letter[hot_index] = 1
        onehot_encoded.append(letter)
    return onehot_encoded

# takes a .csv filename
def dataPreprocessing(fileName):
    """Read a header-less CSV and return one-hot inputs plus binary labels.

    The first 41 columns of each row are rendered as text, joined with
    commas, lower-cased and passed to ``customOneHotEncoder``. Column 41
    is used as the label: the value 'normal' is kept and every other
    value is replaced with 'attack'.

    Parameters
    ----------
    fileName : str
        Path of the CSV file to load (read with ``header=None``).

    Returns
    -------
    xArray : numpy.ndarray
        Encoded inputs with the last two axes swapped, i.e.
        (size, 39, 200) for the encoder's 200x39 output.
    yArray : numpy.ndarray
        One 'normal' / 'attack' label per row.
    """
    frame = pd.read_csv(fileName, header=None)

    # prepare the input data: one text line per DataFrame row
    rendered = frame.iloc[:, :41].to_string(header=False, index=False, index_names=False)
    encodedRows = []
    for line in rendered.split('\n'):
        # to_string separates columns with whitespace; rejoin the fields with commas
        rowText = ','.join(line.split())
        encodedRows.append(customOneHotEncoder(rowText.lower()))

    # stacked shape is (size, 200, 39); swap the last two axes to get (size, 39, 200)
    xArray = np.transpose(np.array(encodedRows), (0, 2, 1))

    # prepare the label data: anything except 'normal' becomes 'attack'
    frame[41] = np.where(frame[41] == 'normal', 'normal', 'attack')
    yArray = frame[41].to_numpy()

    assert xArray.shape[0] == yArray.shape[0], 'unequal input and label sample size'

    return xArray, yArray  # processed arrays of inputs and labels

In [2]:
# CSV file to preprocess (relative path)
fileName = 'Ds.csv'
# x: one-hot encoded inputs, y: 'normal' / 'attack' labels
x, y = dataPreprocessing(fileName)

In [3]:
# sanity check of the input tensor: (rows, alphabet size, encoded string length)
x.shape

(22544, 39, 200)

In [4]:
# sanity check of the labels: one entry per input row
y.shape

(22544,)