In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [3]:
# Load the labelled XSS dataset. The 'utf-8-sig' codec decodes UTF-8 and drops a leading
# byte-order mark if the file starts with one.
# NOTE(review): the recorded preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, presumably
# row indices that were written into the CSV; the visible cells only use 'Sentence' and 'Label'.
df = pd.read_csv('./data/XSS_dataset.csv', encoding='utf-8-sig')
df.head()  # preview the first rows

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [4]:
# Print the 'Label' column to inspect the target values (the recorded output shows int64 0/1 labels).
print(df['Label'])

0        0
1        1
2        0
3        0
4        0
        ..
13681    1
13682    1
13683    1
13684    0
13685    0
Name: Label, Length: 13686, dtype: int64


In [4]:
def data2char_index(X, max_len, lowercase=False):
    """
    Encode each string as a fixed-length row of character indices.

    Every character is replaced by its position in a fixed 70-character alphabet (space, a-z, 0-9 and
    punctuation). Characters that are not in the alphabet are skipped. Each encoded sequence is then
    truncated at the end to max_len entries, or padded at the end with zeros up to max_len.

    Notes:
    - The alphabet contains lowercase letters only, so by default uppercase letters are skipped
      (e.g. "<SCRIPT>" encodes like "<>"). Pass lowercase=True to fold the input to lowercase first.
    - Index 0 is both the space character and the padding value, so trailing spaces and padding are
      indistinguishable in the output.

    Args:
    - X: iterable of strings
    - max_len: integer, number of columns in the output; if None, the longest encoded sequence is used
    - lowercase: bool, lowercase each string before encoding (default False keeps the original behaviour)

    Returns:
    - X_char: numpy int32 array of shape (len(X), max_len), where each element is the index of a
              character in the alphabet string (or 0 for padding)
    """
    alphabet = " abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

    # One dictionary lookup per character instead of two linear scans of the alphabet.
    # '-' occurs twice in the alphabet; setdefault keeps its first position, exactly like str.index.
    char_to_index = {}
    for position, symbol in enumerate(alphabet):
        char_to_index.setdefault(symbol, position)

    encoded = []
    for text in X:
        if lowercase:
            text = text.lower()
        encoded.append([char_to_index[ch] for ch in text if ch in char_to_index])

    if max_len is None:
        max_len = max((len(seq) for seq in encoded), default=0)

    # Zero-filled matrix gives the post-padding for free; each row is filled from the left with the
    # (post-truncated) sequence.
    X_char = np.zeros((len(encoded), max_len), dtype=np.int32)
    for row, seq in enumerate(encoded):
        kept = seq[:max_len]
        if kept:
            X_char[row, :len(kept)] = kept
    return X_char

In [5]:
# Raw payload strings and their integer class labels.
data, label = df['Sentence'].values, df['Label'].values

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
trainX, testX, y_train, y_test = train_test_split(
    data,
    label,
    random_state=42,
    test_size=0.2,
)

# Encode both splits as character-index matrices with 1000 columns each.
x_train, x_test = (
    data2char_index(split, max_len=1000) for split in (trainX, testX)
)

In [1]:
# Shape of the encoded training matrix.
# NOTE(review): the recorded output is a NameError and this cell carries execution count 1, so it
# was last run on a fresh kernel before the cell that defines x_train. Re-run the notebook top to
# bottom (Restart Kernel -> Run All) to refresh it.
x_train.shape

NameError: name 'x_train' is not defined

In [7]:
# Shape of the encoded test matrix: (number of test samples, max_len).
x_test.shape

(2738, 1000)

In [8]:
def get_charcnn_model(max_len, vocab_size=70, embed_dim=80):
    """
    Build a character-level convolutional neural network (CNN) for binary XSS classification.

    The returned model is NOT compiled; the caller must call model.compile(...) before training.

    The model takes as input a tensor of shape (max_len,) holding character indices. The indices are
    mapped to vectors by an embedding layer and fed to three parallel 1D convolutional branches
    (32 filters each, kernel sizes 5, 10 and 15, 'same' padding, ReLU), each followed by max pooling
    (pool sizes 12, 11 and 10 respectively). The pooled outputs are concatenated along the sequence
    axis (axis=1), flattened, passed through dropout (0.2) and two fully connected (Dense) layers
    with ReLU activation (1024 and 128 units), and finally a single-unit output layer with sigmoid
    activation.

    Args:
    - max_len: integer, length of each input sequence of character indices
    - vocab_size: integer, number of distinct character indices (embedding input_dim); the default
                  of 70 is the value originally hard-coded here
    - embed_dim: integer, size of each character embedding vector (default 80)

    Returns:
    - model: uncompiled Keras model mapping (batch, max_len) indices to (batch, 1) probabilities
    """
    main_input = tf.keras.layers.Input(shape=(max_len,))

    # The sequence length is already fixed by the Input layer, so the deprecated `input_length`
    # argument is not passed to Embedding.
    # NOTE(review): trainable=False freezes the embedding at its random initialisation, so the
    # character vectors are never learned. Kept as in the original; confirm this is intended.
    embedder = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        trainable=False
    )
    embed = embedder(main_input)

    # Three parallel branches that differ only in (kernel size, pool size).
    branches = []
    for kernel_size, pool_size in ((5, 12), (10, 11), (15, 10)):
        conv = tf.keras.layers.Conv1D(
            32, kernel_size, padding='same', strides=1, activation='relu'
        )(embed)
        branches.append(tf.keras.layers.MaxPooling1D(pool_size=pool_size)(conv))

    # Every branch has 32 channels, so the branches can be stacked along the sequence axis even
    # though their pooled lengths differ.
    cnn = tf.keras.layers.concatenate(branches, axis=1)
    flat = tf.keras.layers.Flatten()(cnn)
    drop = tf.keras.layers.Dropout(0.2)(flat)
    dense1 = tf.keras.layers.Dense(1024, activation='relu')(drop)
    dense2 = tf.keras.layers.Dense(128, activation='relu')(dense1)
    main_output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)
    model = tf.keras.Model(inputs=main_input, outputs=main_output)
    return model

In [9]:
# Build the network for 1000-character inputs (the same max_len used to encode x_train / x_test).
model = get_charcnn_model(max_len=1000)
# Binary cross-entropy matches the single sigmoid output unit; accuracy is tracked during training.
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)
model.summary()  # print the layer-by-layer architecture and parameter counts

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1000, 80)     5600        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 1000, 32)     12832       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 1000, 32)     25632       ['embedding[0][0]']              
                                                                                              

In [10]:
# Training hyper-parameters.
batch_size = 128
num_epoch = 5

# Validation metrics are computed on the last 10% of the training data rather than on
# (x_test, y_test). The test set is reserved for the final evaluation, so it must not also be the
# data watched during training. train_test_split has already shuffled the training samples, so
# taking the tail as validation data is unbiased.
model_log = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    verbose=1,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Sigmoid scores for the test set; scores strictly above 0.5 become class 1, everything else class 0.
pred = model.predict(x_test)
y_pred = (pred > 0.5).astype(np.int64)



In [12]:
# Score the thresholded predictions against the held-out labels with each metric in turn.
accuracy, precision, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)
print(" Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

 Accuracy : 0.9989043097151206 
 Precision : 1.0 
 Recall : 0.9979702300405954
