In [11]:
import pandas as pd


# Load the interim phishing-URL CSV into a DataFrame (path is relative to the
# notebook's working directory).
data = pd.read_csv("../../data/interim/phishing_site_urls.csv")

In [12]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [13]:
# Number of rows in the dataset.
len(data.index)

549346

In [14]:
# Distinct values present in the Label column (checked before encoding them).
data.Label.unique()

array(['bad', 'good'], dtype=object)

In [15]:
# Encode the string labels as integers: "bad" -> 1, "good" -> 2.
# NOTE(review): the codes are 1/2 rather than 0/1 -- confirm the downstream
# training code expects that.
mapping = dict(bad=1, good=2)
data["Label"] = data.Label.replace(to_replace=mapping)

In [16]:
# Preview again to check that Label now holds the integer codes.
data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
3,mail.printakid.com/www.online.americanexpress....,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,1


In [17]:
# Persist the integer-labelled frame. to_csv defaults to index=True, so the row
# index is written as an extra unnamed leading column.
data.to_csv("../../data/interim/phishing_site_urls_int_label.csv")

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


class URLTranDataset(Dataset):
    """Map-style dataset of tokenized URLs with integer labels.

    The CSV at ``filepath`` is read, its rows are shuffled, and every URL is
    tokenized once up front via ``preprocess``. Each item is a dict holding
    that row's slice of every encoding field plus a ``"label"`` entry.

    Args:
        filepath: Path of a CSV with a ``URL`` (or legacy ``url``) column and
            a ``Label`` column whose values can be cast to ``int``.
        tokenizer: Tokenizer callable forwarded unchanged to ``preprocess``.
        random_state: Optional seed for the row shuffle. ``None`` (the
            default) keeps the original unseeded behaviour.
    """

    def __init__(self, filepath, tokenizer, random_state=None):
        # Zero-argument super() actually reaches the parent initialiser;
        # ``super(URLTranDataset)`` only built an unbound super object.
        super().__init__()
        self.df = pd.read_csv(filepath)
        self.df = self.df.sample(frac=1.0, random_state=random_state)  # shuffle samples
        # The CSVs produced by split_data name the column "URL"; fall back to
        # the lowercase spelling the original code expected.
        url_column = "URL" if "URL" in self.df.columns else "url"
        self.url_data = self.df[url_column].values.tolist()
        self.labels = self.df.Label.astype(int).values.tolist()
        self.encodings = preprocess(self.url_data, tokenizer)

    def __getitem__(self, idx):
        """Return the encoding fields for row ``idx`` plus its ``"label"``."""
        obs_dict = {k: v[idx] for k, v in self.encodings.items()}
        obs_dict["label"] = self.labels[idx]
        return obs_dict

    def __len__(self):
        """Return the number of samples (one label per encoded URL)."""
        # labels has one entry per row that was tokenized, and does not rely
        # on attribute-style access to the encodings container.
        return len(self.labels)


def preprocess(url_data, tokenizer, max_length=128):
    """Tokenize URLs and attach a copy of the token ids as MLM labels.

    Args:
        url_data: The URL strings to tokenize, passed to ``tokenizer`` as-is.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``truncation=True`` and ``padding=True``. Its result must support
            item get/set and expose an ``"input_ids"`` tensor.
        max_length: Truncation length forwarded to the tokenizer. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer output with an extra ``"mlm_labels"`` entry: a detached
        clone of ``"input_ids"``, so later edits to the ids do not touch it.
    """
    inputs = tokenizer(
        url_data,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding=True,
    )

    # Key access works for attribute-style encodings and plain mappings alike.
    inputs["mlm_labels"] = inputs["input_ids"].detach().clone()
    return inputs


def masking_step(
    inputs, mask_prob=0.15, mask_token_id=103, special_token_ids=(101, 102, 0)
):
    """Randomly overwrite token ids with the mask id, in place.

    Each position is selected independently with probability ``mask_prob``;
    positions holding any id in ``special_token_ids`` are never selected.

    Args:
        inputs: Integer tensor of token ids. It is modified in place.
        mask_prob: Per-position selection probability (default 0.15).
        mask_token_id: Id written into selected positions (default 103).
        special_token_ids: Ids that must stay untouched. The defaults are the
            values that were previously hard-coded; in the standard BERT
            vocabulary they would be [CLS]=101, [SEP]=102 and [PAD]=0 --
            confirm against the tokenizer actually in use.

    Returns:
        The same ``inputs`` tensor object, after masking.
    """
    # Draw on the tensor's own device so the comparison also works off-CPU.
    rand = torch.rand(inputs.shape, device=inputs.device)

    # Candidate positions: selected by chance and not holding a protected id.
    mask_arr = rand < mask_prob
    for token_id in special_token_ids:
        mask_arr &= (inputs != token_id)

    # One boolean-mask assignment replaces the per-row index loop.
    inputs[mask_arr] = mask_token_id
    return inputs


def split_data(dataset_path, test_size=0.33, random_state=42):
    """Read a labelled URL CSV and split it into train and test frames.

    Args:
        dataset_path: Path of a CSV containing ``URL`` and ``Label`` columns;
            ``Label`` is cast to ``int``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.33, the
            value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        ``(train_df, test_df)``: two DataFrames with columns ``URL`` and
        ``Label`` and a fresh 0..n-1 index.
    """
    df_final = pd.read_csv(dataset_path)
    urls = df_final.URL
    labels = df_final.Label.astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        urls, labels, test_size=test_size, random_state=random_state
    )

    # Build from the raw arrays so the shuffled source index is dropped.
    train_df = pd.DataFrame({"URL": X_train.values, "Label": y_train.values})
    test_df = pd.DataFrame({"URL": X_test.values, "Label": y_test.values})
    return train_df, test_df

In [None]:
# Split the integer-labelled CSV into train and test frames using split_data's
# default settings.
train_df, test_df = split_data("../../data/interim/phishing_site_urls_int_label.csv")

In [None]:
# Write the training split to disk. to_csv defaults to index=True, so the row
# index is written as an extra unnamed leading column.
train_df.to_csv("../../data/final-csv/train.csv")

In [None]:
# Write the test split to disk. to_csv defaults to index=True, so the row
# index is written as an extra unnamed leading column.
test_df.to_csv("../../data/final-csv/test.csv")

In [None]:
# Reference: https://www.kaggle.com/code/nadergo/classifying-hate-speech-with-a-pytorch-transformer/notebook