In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import torch.backends.cudnn as cudnn

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the labelled XSS samples; the first CSV column becomes the row index.
df = pd.read_csv("XSS_dataset.csv", index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [5]:
# Sequential 80/20 split: the leading 80% of rows are used for training and
# the remaining rows are held out for testing (no shuffling is applied).
split_at = int(len(df) * 0.8)
train_df, test_df = df[:split_at], df[split_at:]
train_df

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0
...,...,...
10943,\t </span> </li>,0
10944,"<li><a href=""/wiki/William_Whewell"" title=""Wil...",0
10945,"<li><a href=""/wiki/Niklas_Luhmann"" title=""Nikl...",0
10946,"<sub onmousedown=""alert(1)"">test</sub>",1


In [9]:
# Take the first training row as a single (sentence, label) sample.
# An alternative XSS sample for manual checks: <tt onmouseover="alert(1)">test</tt>
sentence_flask, label_flask = (
    train_df['Sentence'].values[0],
    train_df['Label'].values[0],
)

<li><a href="/wiki/File:Socrates.png" class="image"><img alt="Socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/27px-Socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/36px-Socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/Portal:Philosophy" title="Portal:Philosophy">Philosophy&#32;portal </a> </li> </ul>


In [7]:
# Character vocabulary. A character's id is the position of its FIRST
# occurrence in this string ('-' appears twice, so only position 37 is used).
_ALPHABET = " abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# Built once at definition time so each lookup is a dict access instead of two
# linear scans of the alphabet per character.
_CHAR_TO_INDEX = {ch: _ALPHABET.index(ch) for ch in _ALPHABET}


def data2char_index(data, max_len):
    """Encode text as a fixed-length list of character ids.

    Args:
        data: Iterable of single characters (normally a str).
        max_len: Length of the returned list.

    Returns:
        A list of ints. Characters missing from the alphabet (including all
        uppercase letters, since only lowercase ones are listed) are skipped.
        Shorter results are right-padded with 0 and longer ones are truncated
        to the first ``max_len`` ids. Note that 0 is also the id of the space
        character, so padding and spaces are indistinguishable.
    """
    mat = [_CHAR_TO_INDEX[ch] for ch in data if ch in _CHAR_TO_INDEX]
    shortfall = max_len - len(mat)
    if shortfall > 0:
        mat += [0] * shortfall
    elif shortfall < 0:
        mat = mat[:max_len]
    return mat

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a frame with 'Sentence' and 'Label' columns.

    Each item is a pair of tensors: the sentence encoded by
    ``data2char_index`` to ``max_len`` character ids, and its label.
    """

    def __init__(self, df, max_len) -> None:
        # Frame holding the samples and the fixed encoding length.
        self.max_len = max_len
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        row_count = len(self.df)
        return row_count

    def __getitem__(self, index):
        """Return ``(encoded_sentence_tensor, label_tensor)`` for one row position."""
        text = self.df['Sentence'].values[index]
        target = self.df['Label'].values[index]
        encoded = data2char_index(text, self.max_len)
        return torch.tensor(encoded), torch.tensor(target)


# Single-sample walkthrough: take the first training row ...
sentence_flask = train_df['Sentence'].values[0]
# An alternative XSS sample for manual checks: <tt onmouseover="alert(1)">test</tt>
label_flask = train_df['Label'].values[0]
# Fixed encoded length shared by the walkthrough and both datasets below.
max_len = 1000
# ... encode the sentence as character ids (the label needs no preprocessing) ...
processed_sentence_flask = data2char_index(sentence_flask, max_len)
processed_label_flask = label_flask

# ... and convert both to tensors.
torched_sentence_flask = torch.tensor(processed_sentence_flask)
torched_label_flask = torch.tensor(processed_label_flask)

# Reuse max_len instead of repeating the literal 1000 so the encoded length
# cannot drift between the walkthrough and the datasets.
trainDataset = Dataset(train_df, max_len)
testDataset = Dataset(test_df, max_len)
len(trainDataset), len(testDataset)

(10948, 2738)

In [8]:
# Training batches are reshuffled every epoch.
trainGenerator = torch.utils.data.DataLoader(trainDataset, batch_size=128, shuffle=True)
# Evaluation metrics do not depend on sample order, so the test loader keeps
# the dataset order; this makes evaluation runs reproducible batch-for-batch.
testGenerator = torch.utils.data.DataLoader(testDataset, batch_size=128, shuffle=False)
# Sanity check: fetch one training batch and show the tensor shapes.
data, label = next(iter(trainGenerator))
print(data.shape)
print(label.shape)

torch.Size([128, 1000])
torch.Size([128])


In [13]:
class TextCNN(torch.nn.Module):
    """Convolutional text classifier over sequences of token/character ids.

    The input ids are embedded, convolved with one Conv2d per kernel size
    (each kernel spans the full embedding width), max-pooled over the sequence
    axis, concatenated, passed through dropout and projected to class logits.

    Args:
        vocab_size: Number of distinct input ids (embedding rows).
        embedding_size: Width of each embedding vector.
        num_classes: Number of output logits.
        kernel_sizes: Sequence of convolution heights (in sequence positions).
        num_kernels: Output channels per convolution.
        dropout: Dropout probability applied before the final linear layer.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_size, num_classes, kernel_sizes, num_kernels, dropout=0.5):
        super().__init__()
        # Attribute names are part of the saved-checkpoint format; keep them stable.
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.convs = torch.nn.ModuleList(
            [torch.nn.Conv2d(1, num_kernels, (k, embedding_size)) for k in kernel_sizes])
        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(len(kernel_sizes) * num_kernels, num_classes)

    def forward(self, x):
        """Map ids of shape [batch_size, sequence_length] to logits [batch_size, num_classes]."""
        # [batch_size, 1, sequence_length, embedding_size]: add a channel axis for Conv2d.
        embedded = self.embedding(x).unsqueeze(1)
        pooled = []
        for conv in self.convs:
            # [batch_size, num_kernels, sequence_length - kernel_size + 1]
            feature_map = torch.nn.functional.relu(conv(embedded)).squeeze(3)
            # Global max over the sequence axis -> [batch_size, num_kernels]
            pooled.append(
                torch.nn.functional.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        # [batch_size, num_kernels * len(kernel_sizes)]
        features = torch.cat(pooled, 1)
        return self.fc(self.dropout(features))

# vocab_size=70 equals the number of characters in data2char_index's alphabet.
model = TextCNN(
    vocab_size=70,
    embedding_size=64,
    num_classes=2,
    kernel_sizes=[3, 4, 5],
    num_kernels=128,
)
# Smoke test: run one encoded sentence (batch of 1) through the untrained model.
smoke_input = torch.tensor(data2char_index('hello world', 1000)).unsqueeze(0)
model(smoke_input)

tensor([[ 0.2185, -0.8211]], grad_fn=<AddmmBackward0>)

In [10]:
# Training configuration: number of passes over the training set,
# the classification loss, and the optimizer over all model parameters.
epochs = 10
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [15]:
# Train on the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    for data, label in tqdm(trainGenerator):
        data = data.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        output = model(data)
        l = loss(output, label)
        l.backward()
        optimizer.step()
    # NOTE: this is the loss of the last batch only, not an epoch average.
    print(f'epoch: {epoch}, loss: {l}', end="")
    # The whole model/optimizer objects are pickled (not just state_dicts);
    # later cells read the 'model' entry directly, so the format is kept as is.
    torch.save({'epoch': epoch,
                    'model': model,
                    'optimizer': optimizer},
                   'checkpoint_textcnn.pth.tar')
    # --- evaluation pass ---
    model.eval()
    right_num = 0
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time without changing the predictions.
    with torch.no_grad():
        for data, label in testGenerator:
            data = data.to(device)
            label = label.to(device)
            output = model(data)
            right_num += (torch.argmax(output, dim=1) == label).sum().item()
    print(f"Test accuracy: {right_num / len(testDataset)}")

100%|██████████| 86/86 [00:03<00:00, 24.44it/s]


epoch: 0, loss: 2.6998521207133308e-05Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.87it/s]


epoch: 1, loss: 0.0001227237080456689Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.76it/s]


epoch: 2, loss: 0.0003590828273445368Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.92it/s]


epoch: 3, loss: 2.535591193009168e-05Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 26.04it/s]


epoch: 4, loss: 0.00012331618927419186Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.87it/s]


epoch: 5, loss: 8.4545390564017e-06Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.54it/s]


epoch: 6, loss: 0.00036836578510701656Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.47it/s]


epoch: 7, loss: 0.0029694533441215754Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 26.05it/s]


epoch: 8, loss: 0.0003497801080811769Test accuracy: 0.9989043097151206


100%|██████████| 86/86 [00:03<00:00, 25.55it/s]


epoch: 9, loss: 0.0012010738719254732Test accuracy: 0.9989043097151206


# Evaluation: accuracy, precision and recall on the test set

In [12]:
class Eval():
    """Running confusion-matrix counts for a binary classifier (positive class = 1)."""

    def __init__(self):
        # tp/fp/fn/tn: true positives, false positives, false negatives, true negatives.
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0

    def add(self, pred, label):
        """Record one (prediction, label) pair.

        Pairs whose values are not both in {0, 1} are ignored.
        """
        if pred == 1 and label == 1:
            self.tp += 1
        elif pred == 1 and label == 0:
            self.fp += 1
        elif pred == 0 and label == 1:
            self.fn += 1
        elif pred == 0 and label == 0:
            self.tn += 1

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def accuracy(self):
        """Fraction of recorded pairs predicted correctly (0.0 if nothing was recorded)."""
        return self._ratio(self.tp + self.tn, self.tp + self.fp + self.fn + self.tn)

    def precision(self):
        """tp / (tp + fp); 0.0 when no positive prediction was recorded."""
        return self._ratio(self.tp, self.tp + self.fp)

    def recall(self):
        """tp / (tp + fn); 0.0 when no positive label was recorded."""
        return self._ratio(self.tp, self.tp + self.fn)

In [18]:
# Rebind `model` to a placeholder so the trained network is no longer reachable
# through this name; the following cell re-creates `model` from the saved
# checkpoint (presumably to check that the checkpoint alone is sufficient).
model = 0

In [19]:
# NOTE: `eval` shadows the Python builtin of the same name; the variable name
# is kept so any other cell that reads it keeps working.
eval = Eval()
# Use the GPU when available, otherwise evaluate on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The checkpoint stores the pickled module itself under 'model'. torch.load
# unpickles arbitrary objects, so only load checkpoints from a trusted source.
# NOTE(review): on PyTorch versions where torch.load defaults to
# weights_only=True this call needs weights_only=False -- confirm against the
# installed version.
model = torch.load('checkpoint_textcnn.pth.tar', map_location=device)['model']
model.to(device)
model.eval()
# Inference only: disable autograd bookkeeping.
with torch.no_grad():
    for data, label in testGenerator:
        data = data.to(device)
        output = model(data).argmax(dim=1).cpu()
        # Convert to plain Python ints so Eval.add compares ints, not 0-dim tensors.
        for pred, l in zip(output.tolist(), label.tolist()):
            eval.add(pred, l)
print(f"accuracy: {eval.accuracy()}")
print(f"precision: {eval.precision()}")
print(f"recall: {eval.recall()}")

accuracy: 0.9989043097151206
precision: 1.0
recall: 0.9979550102249489


In [10]:
# This cell repeats the TextCNN definition from earlier in the notebook,
# presumably so torch.load can resolve the class when the pickled model is
# loaded in a fresh kernel.
class TextCNN(torch.nn.Module):
    """Convolutional text classifier over sequences of token/character ids.

    The input ids are embedded, convolved with one Conv2d per kernel size
    (each kernel spans the full embedding width), max-pooled over the sequence
    axis, concatenated, passed through dropout and projected to class logits.

    Args:
        vocab_size: Number of distinct input ids (embedding rows).
        embedding_size: Width of each embedding vector.
        num_classes: Number of output logits.
        kernel_sizes: Sequence of convolution heights (in sequence positions).
        num_kernels: Output channels per convolution.
        dropout: Dropout probability applied before the final linear layer.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_size, num_classes, kernel_sizes, num_kernels, dropout=0.5):
        super().__init__()
        # Attribute names are part of the saved-checkpoint format; keep them stable.
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.convs = torch.nn.ModuleList(
            [torch.nn.Conv2d(1, num_kernels, (k, embedding_size)) for k in kernel_sizes])
        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(len(kernel_sizes) * num_kernels, num_classes)

    def forward(self, x):
        """Map ids of shape [batch_size, sequence_length] to logits [batch_size, num_classes]."""
        # [batch_size, 1, sequence_length, embedding_size]: add a channel axis for Conv2d.
        embedded = self.embedding(x).unsqueeze(1)
        pooled = []
        for conv in self.convs:
            # [batch_size, num_kernels, sequence_length - kernel_size + 1]
            feature_map = torch.nn.functional.relu(conv(embedded)).squeeze(3)
            # Global max over the sequence axis -> [batch_size, num_kernels]
            pooled.append(
                torch.nn.functional.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        # [batch_size, num_kernels * len(kernel_sizes)]
        features = torch.cat(pooled, 1)
        return self.fc(self.dropout(features))
    
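# Note: `mat` in data2char_index below is the list of alphabet indices (one per kept character), padded or truncated to max_len.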
# Character vocabulary. A character's id is the position of its FIRST
# occurrence in this string ('-' appears twice, so only position 37 is used).
_ALPHABET = " abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# Built once at definition time so each lookup is a dict access instead of two
# linear scans of the alphabet per character.
_CHAR_TO_INDEX = {ch: _ALPHABET.index(ch) for ch in _ALPHABET}


def data2char_index(data, max_len):
    """Encode text as a fixed-length list of character ids.

    Args:
        data: Iterable of single characters (normally a str).
        max_len: Length of the returned list.

    Returns:
        A list of ints. Characters missing from the alphabet (including all
        uppercase letters, since only lowercase ones are listed) are skipped.
        Shorter results are right-padded with 0 and longer ones are truncated
        to the first ``max_len`` ids. Note that 0 is also the id of the space
        character, so padding and spaces are indistinguishable.
    """
    mat = [_CHAR_TO_INDEX[ch] for ch in data if ch in _CHAR_TO_INDEX]
    shortfall = max_len - len(mat)
    if shortfall > 0:
        mat += [0] * shortfall
    elif shortfall < 0:
        mat = mat[:max_len]
    return mat

def modelService(sentence=None, max_len=1000, checkpoint_path='checkpoint_textcnn.pth.tar'):
    """Classify one sentence with the saved TextCNN checkpoint and print the result.

    Args:
        sentence: Text to classify. When None (the default) the first row of
            the notebook-level ``train_df`` is used, which is what the original
            zero-argument version always did.
        max_len: Encoded length passed to ``data2char_index``. Defaults to 1000,
            the value used throughout the notebook.
        checkpoint_path: File to load; its 'model' entry must hold the pickled
            module, as written by the training cell.

    Raises:
        TypeError: If ``sentence`` is not a string.

    Prints the predicted class index as a 1-element tensor. Returns None.
    """
    if sentence is None:
        sentence = train_df['Sentence'].values[0]
    if not isinstance(sentence, str):
        raise TypeError(f"sentence must be a str, got {type(sentence).__name__}")

    # Use the GPU when available, otherwise run on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # torch.load unpickles arbitrary objects; only load trusted checkpoints.
    model = torch.load(checkpoint_path, map_location=device)['model']
    model.to(device)
    model.eval()

    # Encode the sentence and add a batch axis: [1, max_len].
    data = torch.tensor(data2char_index(sentence, max_len)).unsqueeze(0).to(device)
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        output = model(data).argmax(dim=1).cpu()
    print(output)

# modelService needs no arguments; passing data2char_index (a function) here
# was an error, since it is not a valid input for the service.
modelService()

tensor([0])
