# import libraries

In [1]:
import torch
from torch import nn as nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm

# Read data in CSV file

In [2]:
dataset = pd.read_csv('../input/malicious-urls-dataset/malicious_phish.csv')
dataset

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


# class and functions to get label from index and index from label

In [3]:
class Label_Index:
    def __init__(self, dataset):
        self.labels = dataset['type'].unique()
        self.label_index = {label: index for index, label in enumerate(self.labels)}
        self.index_label = {index: label for index, label in enumerate(self.labels)}
    
    def indexes_labels(self, dataset):
        return dataset['type'].map(self.index_label)

    def labels_indexes(self, dataset):
        return dataset['type'].map(self.label_index)
    def __call__(self, label):
        return self.label_index[label]

label_index = Label_Index(dataset)
label_index('phishing')

0

In [4]:
class Char_Index:
    def __init__(self, urls) -> None:
        self.char_index = {}
        self.index_char = {}
        for url in urls:
            for char in url:
                if char not in self.char_index:
                    self.char_index[char] = len(self.char_index)
                    self.index_char[len(self.index_char)] = char
    
    def string_indexes(self, string):
        return [self.char_index[char] for char in string]

char_index = Char_Index(dataset['url'])
char_index.string_indexes(dataset.url[0]), len(char_index.char_index)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4, 6, 10, 9, 0, 1], 333)

# split data into train and test in ratio of 80:20

In [5]:
# shuffle data
dataset = dataset.sample(frac=1).reset_index(drop=True)


train_data = dataset[:int(len(dataset)*0.8)]
test_data = dataset[int(len(dataset)*0.8):].reset_index(drop=True)

In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, char_index: Char_Index, label_index: Label_Index) -> None:
        self.df = df
        self.char_index = char_index
        self.label_index = label_index

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        url = self.df.url[index]
        label = self.label_index(self.df.type[index])
        return torch.tensor(self.char_index.string_indexes(url)), torch.tensor(label)

trainDataset = Dataset(train_data, char_index, label_index)
testDataset = Dataset(test_data, char_index, label_index)
len(trainDataset), len(testDataset)

(520952, 130239)

# function to pad the tensors to make them equal length and generate batch for training

In [7]:
def collate_fn(batch):
    urls, labels = zip(*batch)
    urls = nn.utils.rnn.pad_sequence(urls, batch_first=True)
    return urls, torch.tensor(labels)

trainGenerator = torch.utils.data.DataLoader(trainDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
testGenerator = torch.utils.data.DataLoader(testDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
for inputs, labels in trainGenerator:
    print(inputs, labels)
    break

tensor([[13, 22,  1,  ...,  0,  0,  0],
        [26, 26, 26,  ...,  0,  0,  0],
        [66, 13, 15,  ...,  0,  0,  0],
        ...,
        [ 0, 13, 27,  ...,  0,  0,  0],
        [ 0,  3,  6,  ...,  0,  0,  0],
        [19, 20, 20,  ...,  0,  0,  0]]) tensor([1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 2, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 2, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 0, 1,
        1, 0, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2,
        0, 2, 0, 0, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 2, 1, 0,
        2, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 3, 1, 1, 1, 1, 0, 2, 2, 1, 0, 0, 1, 0,
        1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
        2, 1, 1, 3, 1, 0, 0, 3, 0, 0, 1, 0, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1,
        1, 2, 1, 1, 2, 1, 

In [8]:
teststr = "https://web.whatsapp.com/"
#teststr = Char_Index(dataset['url'])
#teststr.string_indexes("www.web.whatsapp.com"), len(teststr.char_index)
predictDataset = Dataset(teststr, char_index, label_index)

In [9]:
len(predictDataset)

25

# implementation of GRU and forward pass

In [10]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers=1, dropout=0.2, bidirectional=False):
        super(GRU, self).__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)   # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        if self.bidirectional == True:
            h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)   # [num_layers*2, batch_size, hidden_size]
        else:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # [num_layers, batch_size, hidden_size]
        out, _ = self.gru(x, h0)    # [batch_size, seq_len, hidden_size]
        if self.bidirectional == True:
            out = out[:, -1, :self.hidden_size] + out[:, 0, self.hidden_size:]  # [batch_size, hidden_size]
        return self.fc(out)
gru_model = GRU(len(char_index.char_index), 128, 128, len(label_index.labels), bidirectional=True, num_layers=1)
gru_model(inputs)

  "num_layers={}".format(dropout, num_layers))


tensor([[-0.2747, -0.1274,  0.3276,  0.0098],
        [ 0.1230, -0.0037,  0.1447,  0.2633],
        [-0.1264, -0.2434,  0.1468, -0.0941],
        ...,
        [-0.1006, -0.1365,  0.0906, -0.0488],
        [ 0.1369, -0.1260,  0.0284, -0.0267],
        [-0.2293,  0.1638,  0.2110,  0.2513]], grad_fn=<AddmmBackward0>)

In [12]:
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()
epochs = 10

# training for 10 epochs with binary cross entropy loss

In [None]:
best_right = 0
early_stop = 0
gru_model.to('cuda')
for epoch in range(epochs):
    loss_value = 0.0
    gru_model.train()
    for inputs, label in tqdm(trainGenerator):
        inputs = inputs.cuda()
        label = label.cuda()
        optimizer.zero_grad()
        output = gru_model(inputs)
        l = loss(output, label)
        l.backward()
        loss_value += l.item()
        optimizer.step()
    print(f'epoch: {epoch+1}, train loss: {loss_value/len(trainGenerator)}')

    # eval
    gru_model.cuda()
    gru_model.eval()
    loss_value = 0.0
    right_num = 0
    for inputs, label in testGenerator:
        inputs = inputs.cuda()
        label = label.cuda()
        output = gru_model(inputs)
        l = loss(output, label)
        loss_value += l.item()
        right_num += (torch.argmax(output, dim=1) == label).sum().item()
    print(f'\t test loss: {loss_value/len(testGenerator)}, test acc: {right_num/len(testDataset)}')
    
    # save model or early stop
    if right_num > best_right:
        best_right = right_num
        torch.save(gru_model.state_dict(), './gru_model.pth')
        print('save model')
        early_stop = 0
    else:
        early_stop += 1
        if early_stop > 3:
            print('early stop')
            break

100%|██████████| 2035/2035 [01:54<00:00, 17.71it/s]

epoch: 1, train loss: 0.15189851038249472





	 test loss: 0.08408992221919281, test acc: 0.9739862867497447
save model


100%|██████████| 2035/2035 [01:54<00:00, 17.81it/s]


epoch: 2, train loss: 0.06922656444864718
	 test loss: 0.06623309007196272, test acc: 0.9788696166278918
save model


100%|██████████| 2035/2035 [01:54<00:00, 17.79it/s]

epoch: 3, train loss: 0.055749132745745944





	 test loss: 0.06626128961564516, test acc: 0.9795606538747994
save model


100%|██████████| 2035/2035 [01:54<00:00, 17.82it/s]

epoch: 4, train loss: 0.04752531959068863





	 test loss: 0.059648715245056244, test acc: 0.9810041539016731
save model


100%|██████████| 2035/2035 [01:53<00:00, 17.86it/s]

epoch: 6, train loss: 0.03670043167646709





	 test loss: 0.056870798430312244, test acc: 0.9825628268030313


100%|██████████| 2035/2035 [01:53<00:00, 17.92it/s]

epoch: 7, train loss: 0.03296187347709602





	 test loss: 0.056324206159060385, test acc: 0.9830004837260728
save model


100%|██████████| 2035/2035 [01:52<00:00, 18.10it/s]

epoch: 8, train loss: 0.029956169159057796





	 test loss: 0.05620339370473546, test acc: 0.9831924385168805
save model


100%|██████████| 2035/2035 [01:54<00:00, 17.85it/s]

epoch: 9, train loss: 0.027142286673188208





	 test loss: 0.05821270885747402, test acc: 0.9831694039419836


100%|██████████| 2035/2035 [01:53<00:00, 17.96it/s]

epoch: 10, train loss: 0.025118042731826957





	 test loss: 0.06204727367993832, test acc: 0.982201951796313
