# import libraries

In [26]:
import torch
from torch import nn as nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm

# Read data in CSV file

In [35]:
dataset = pd.read_excel('/kaggle/input/url-data/data_imbal - 55000.xlsx')
dataset

Unnamed: 0,type,url
0,1,https://www.ujhyjhujhyjhyuj.ga/CC_POSTALE/f2a83/
1,1,https://stitch-statichosting-prod.s3.amazonaws...
2,0,https://www.reservoirgroup.com/careers/
3,0,https://www.camosy.com/themes/juicy/images/doo...
4,0,https://www.liveapps.com.au/
...,...,...
54995,0,https://www.knaack.com/support/literature
54996,0,https://plus.google.com/107396682494324876800
54997,0,https://www.jaegers.com/daily-pricing
54998,0,https://www.benhet.be/batonrouge/voorwaarden.html


In [36]:
dataset.type = dataset.type.map( {1:'phishing' ,0:'benign'} )

In [37]:
dataset

Unnamed: 0,type,url
0,phishing,https://www.ujhyjhujhyjhyuj.ga/CC_POSTALE/f2a83/
1,phishing,https://stitch-statichosting-prod.s3.amazonaws...
2,benign,https://www.reservoirgroup.com/careers/
3,benign,https://www.camosy.com/themes/juicy/images/doo...
4,benign,https://www.liveapps.com.au/
...,...,...
54995,benign,https://www.knaack.com/support/literature
54996,benign,https://plus.google.com/107396682494324876800
54997,benign,https://www.jaegers.com/daily-pricing
54998,benign,https://www.benhet.be/batonrouge/voorwaarden.html


In [17]:
dataset1 = pd.read_csv('/kaggle/input/malicious-urls-dataset/malicious_phish.csv')
dataset1

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


# class and functions to get label from index and index from label

In [38]:
class Label_Index:
    def __init__(self, dataset):
        self.labels = dataset['type'].unique()
        self.label_index = {label: index for index, label in enumerate(self.labels)}
        self.index_label = {index: label for index, label in enumerate(self.labels)}
    
    def indexes_labels(self, dataset):
        return dataset['type'].map(self.index_label)

    def labels_indexes(self, dataset):
        return dataset['type'].map(self.label_index)
    def __call__(self, label):
        return self.label_index[label]

label_index = Label_Index(dataset)
label_index('phishing')

0

In [39]:
class Char_Index:
    def __init__(self, urls) -> None:
        self.char_index = {}
        self.index_char = {}
        for url in urls:
            for char in url:
                if char not in self.char_index:
                    self.char_index[char] = len(self.char_index)
                    self.index_char[len(self.index_char)] = char
    
    def string_indexes(self, string):
        return [self.char_index[char] for char in string]

char_index = Char_Index(dataset['url'])
char_index.string_indexes(dataset.url[0]), len(char_index.char_index)

([0,
  1,
  1,
  2,
  3,
  4,
  5,
  5,
  6,
  6,
  6,
  7,
  8,
  9,
  0,
  10,
  9,
  0,
  8,
  9,
  0,
  10,
  9,
  0,
  10,
  8,
  9,
  7,
  11,
  12,
  5,
  13,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  5,
  22,
  23,
  12,
  24,
  25,
  5],
 446)

# split data into train and test in ratio of 80:20

In [40]:
# shuffle data
dataset = dataset.sample(frac=1).reset_index(drop=True)


train_data = dataset[:int(len(dataset)*0.8)]
test_data = dataset[int(len(dataset)*0.8):].reset_index(drop=True)

In [41]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, char_index: Char_Index, label_index: Label_Index) -> None:
        self.df = df
        self.char_index = char_index
        self.label_index = label_index

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        url = self.df.url[index]
        label = self.label_index(self.df.type[index])
        return torch.tensor(self.char_index.string_indexes(url)), torch.tensor(label)

trainDataset = Dataset(train_data, char_index, label_index)
testDataset = Dataset(test_data, char_index, label_index)
len(trainDataset), len(testDataset)

(44000, 11000)

# function to pad the tensors to make them equal length and generate batch for training

In [42]:
def collate_fn(batch):
    urls, labels = zip(*batch)
    urls = nn.utils.rnn.pad_sequence(urls, batch_first=True)
    return urls, torch.tensor(labels)

trainGenerator = torch.utils.data.DataLoader(trainDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
testGenerator = torch.utils.data.DataLoader(testDataset, batch_size=256, shuffle=True, collate_fn=collate_fn, num_workers = 2)
for inputs, labels in trainGenerator:
    print(inputs, labels)
    break

tensor([[0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        ...,
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0]]) tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 

# implementation of GRU and forward pass

In [45]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers=1, dropout=0.2, bidirectional=False):
        super(GRU, self).__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)   # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        if self.bidirectional == True:
            h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)   # [num_layers*2, batch_size, hidden_size]
        else:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # [num_layers, batch_size, hidden_size]
        out, _ = self.gru(x, h0)    # [batch_size, seq_len, hidden_size]
        if self.bidirectional == True:
            out = out[:, -1, :self.hidden_size] + out[:, 0, self.hidden_size:]  # [batch_size, hidden_size]
        return self.fc(out)
gru_model = GRU(len(char_index.char_index), 128, 128, len(label_index.labels), bidirectional=True, num_layers=1)
gru_model(inputs)

  "num_layers={}".format(dropout, num_layers))


tensor([[ 0.0660, -0.3531],
        [ 0.0638, -0.3536],
        [ 0.0644, -0.3531],
        [ 0.0646, -0.3539],
        [ 0.0655, -0.3529],
        [ 0.0638, -0.3537],
        [ 0.0647, -0.3530],
        [ 0.0643, -0.3534],
        [ 0.0643, -0.3534],
        [ 0.0641, -0.3532],
        [ 0.0649, -0.3528],
        [ 0.0654, -0.3532],
        [ 0.0648, -0.3533],
        [ 0.0657, -0.3536],
        [ 0.0648, -0.3535],
        [ 0.0655, -0.3526],
        [ 0.0649, -0.3528],
        [ 0.0652, -0.3530],
        [ 0.0646, -0.3537],
        [ 0.0571, -0.3530],
        [ 0.0646, -0.3535],
        [ 0.0649, -0.3533],
        [ 0.0658, -0.3529],
        [ 0.0641, -0.3533],
        [ 0.0640, -0.3535],
        [ 0.0648, -0.3537],
        [ 0.0659, -0.3532],
        [ 0.0661, -0.3530],
        [ 0.0635, -0.3537],
        [ 0.0647, -0.3535],
        [ 0.0640, -0.3533],
        [ 0.0637, -0.3538],
        [ 0.0649, -0.3526],
        [ 0.0639, -0.3535],
        [ 0.0642, -0.3447],
        [ 0.0641, -0

In [46]:
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()
epochs = 10

# training for 10 epochs with binary cross entropy loss

In [47]:
best_right = 0
early_stop = 0
gru_model.to('cuda')
for epoch in range(epochs):
    loss_value = 0.0
    gru_model.train()
    for inputs, label in tqdm(trainGenerator):
        inputs = inputs.cuda()
        label = label.cuda()
        optimizer.zero_grad()
        output = gru_model(inputs)
        l = loss(output, label)
        l.backward()
        loss_value += l.item()
        optimizer.step()
    print(f'epoch: {epoch+1}, train loss: {loss_value/len(trainGenerator)}')

    # eval
    gru_model.cuda()
    gru_model.eval()
    loss_value = 0.0
    right_num = 0
    for inputs, label in testGenerator:
        inputs = inputs.cuda()
        label = label.cuda()
        output = gru_model(inputs)
        l = loss(output, label)
        loss_value += l.item()
        right_num += (torch.argmax(output, dim=1) == label).sum().item()
    print(f'\t test loss: {loss_value/len(testGenerator)}, test acc: {right_num/len(testDataset)}')
    
    # save model or early stop
    if right_num > best_right:
        best_right = right_num
        torch.save(gru_model.state_dict(), './gru_model.pth')
        print('save model')
        early_stop = 0
    else:
        early_stop += 1
        if early_stop > 3:
            print('early stop')
            break

100%|██████████| 172/172 [00:11<00:00, 15.35it/s]

epoch: 1, train loss: 0.17109107923542344





	 test loss: 0.11030160999575327, test acc: 0.9550909090909091
save model


100%|██████████| 172/172 [00:10<00:00, 15.70it/s]


epoch: 2, train loss: 0.09288722033027647
	 test loss: 0.0727890415420366, test acc: 0.9733636363636363
save model


100%|██████████| 172/172 [00:11<00:00, 15.62it/s]

epoch: 3, train loss: 0.06387150250808445





	 test loss: 0.07143506624324378, test acc: 0.9751818181818181
save model


100%|██████████| 172/172 [00:11<00:00, 15.43it/s]

epoch: 4, train loss: 0.04870552796502273





	 test loss: 0.06045731585905995, test acc: 0.977909090909091
save model


100%|██████████| 172/172 [00:11<00:00, 15.39it/s]

epoch: 5, train loss: 0.03729057630394087





	 test loss: 0.05084578437340814, test acc: 0.9826363636363636
save model


100%|██████████| 172/172 [00:11<00:00, 15.42it/s]

epoch: 6, train loss: 0.02942201602651716





	 test loss: 0.04953553555749877, test acc: 0.9832727272727273
save model


100%|██████████| 172/172 [00:11<00:00, 15.34it/s]


epoch: 7, train loss: 0.023449591190424248
	 test loss: 0.05172349003598441, test acc: 0.9837272727272727
save model


100%|██████████| 172/172 [00:10<00:00, 15.76it/s]

epoch: 8, train loss: 0.01736031415964325





	 test loss: 0.05703064751659715, test acc: 0.9825454545454545


100%|██████████| 172/172 [00:11<00:00, 14.87it/s]

epoch: 9, train loss: 0.01330723195048428





	 test loss: 0.06317677106275114, test acc: 0.982


100%|██████████| 172/172 [00:10<00:00, 15.84it/s]

epoch: 10, train loss: 0.010193069962423218





	 test loss: 0.06717484483365403, test acc: 0.9830909090909091
