**Introduction**
The Phishing Detection Neural Network is a machine learning model trained to identify phishing websites.
It uses dense layers and ReLU activations to analyze encoded URLs, distinguishing between safe and malicious links. 
This model is key for enhancing online security against phishing threats.


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import ast
import psutil
from tqdm import tqdm

# Display available memory
available_memory = psutil.virtual_memory().available
print(f"Available Memory: {available_memory / (1024 * 1024)} MB")


Available Memory: 2725.26171875 MB


In [2]:
class CharEncoder:
    def __init__(self, max_length):
        self.char_to_int = {'<PAD>': 0}
        self.int_to_char = {0: '<PAD>'}
        self.max_length = max_length

    def fit(self, urls):
        unique_chars = set(''.join(urls))
        for i, char in enumerate(unique_chars, start=1):
            self.char_to_int[char] = i
            self.int_to_char[i] = char

    def encode(self, url):
        encoded = [self.char_to_int.get(char, 0) for char in url]
        return encoded + [self.char_to_int['<PAD>']] * (self.max_length - len(encoded))


In [3]:
chunk_size = 5000
processed_data_path = 'new_processed_data.csv'  
max_length = 0

# Calculate total number of chunks for progress percentage
total_chunks = sum(1 for _ in pd.read_csv('new_data_urls.csv', chunksize=chunk_size))

url_chunks = pd.read_csv('new_data_urls.csv', chunksize=chunk_size)
for chunk in tqdm(url_chunks, total=total_chunks, desc="Calculating max length", unit="chunk"):
    max_length = max(max_length, chunk['url'].str.len().max())

encoder = CharEncoder(max_length=max_length)

url_chunks = pd.read_csv('new_data_urls.csv', chunksize=chunk_size)
for chunk_no, chunk in enumerate(tqdm(url_chunks, total=total_chunks, desc="Encoding URLs", unit="chunk")):
    chunk['encoded'] = chunk['url'].apply(encoder.encode)
    mode = 'w' if chunk_no == 0 else 'a'
    header = mode == 'w'
    chunk.to_csv(processed_data_path, mode=mode, header=header, index=False)


Calculating max length: 100%|██████████| 165/165 [00:01<00:00, 129.13chunk/s]
Encoding URLs: 100%|██████████| 165/165 [08:24<00:00,  3.06s/chunk]


In [10]:
class PhishingDetectionNN(nn.Module):
    def __init__(self, input_size):
        super(PhishingDetectionNN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.network(x.float())


In [11]:
def data_generator(file_path, batch_size, input_size):
    # Process the file in chunks only once per epoch
    for chunk in pd.read_csv(file_path, chunksize=batch_size):
        try:
            features_list = [ast.literal_eval(row) for row in chunk['encoded']]
            if not all(len(f) == input_size for f in features_list):
                continue

            features = np.array(features_list, dtype=np.int32)
            labels = chunk['status'].values.astype(np.float32)

            yield torch.tensor(features), torch.tensor(labels).unsqueeze(1)

        except ValueError as e:
            print(f"Error processing chunk: {e}")
            break


In [12]:
def save_model(model, path):
    torch.save(model.state_dict(), path)


In [13]:
def load_model(model, path):
    model.load_state_dict(torch.load(path))
    model.eval()  # Set the model to evaluation mode


In [32]:

batch_size = 5000
def train_model(model, data_path, epochs, batch_size, input_size, save_path=None):
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    train_gen = data_generator(data_path, batch_size, input_size)

    for epoch in tqdm(range(epochs), desc="Training Epochs", unit="epoch"):
        print(f"Starting epoch {epoch + 1}...")
        model.train()
        train_gen = data_generator(data_path, batch_size, input_size)

        for batch_no, (inputs, labels) in enumerate(train_gen):
            print(f"Processing batch {batch_no + 1}...")  # Debugging print

            if inputs.nelement() == 0:
                print("Empty batch, breaking out of loop")  # Debugging print
                break

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            print(f"Batch {batch_no + 1} processed, Loss: {loss.item()}")
            print(f"Batch {batch_no + 1} size: {inputs.size(0)}")  
            print(f"Batch {batch_no + 1} memory: {inputs.element_size() * inputs.nelement() / (1024 * 1024)} MB")
        save_model(model, f"{save_path}_epoch_{epoch+1}.pt")
        print(f"Model saved after epoch {epoch+1}")


def evaluate(model, file_path, batch_size, input_size):
    model.eval()
    predictions, targets = [], []
    test_generator = data_generator(file_path, batch_size, input_size)
       
    with torch.no_grad():
        for inputs, labels in test_generator:
            #print(f"Processing batch {inputs + 1}...")  # Debug: Batch processing print
            if inputs.nelement() == 0:
                print("Empty batch, skipping...")  # Debug: Empty batch check
                break
            outputs = model(inputs)
            predicted = (outputs > 0.5).float().view(-1)
            predictions.extend(predicted.numpy())
            targets.extend(labels.view(-1).numpy())
            #print(f"Batch {batch_no + 1}: Inputs shape: {inputs.shape}, Labels shape: {labels.shape}")
            #print(f"Batch {batch_no + 1}: Outputs: {outputs[:5]}")  # Print first 5 outputs
            #print(f"Batch {batch_no + 1}: Predicted: {predicted[:5]}")  # Print first 5 predictions
            #print(f"Batch {batch_no + 1}: Actual: {labels[:5]}")  # Print first 5 actual labels
            
    
    return accuracy_score(targets, predictions), precision_score(targets, predictions), recall_score(targets, predictions), f1_score(targets, predictions)


In [15]:
def split_data(file_path, test_size=0.2):
    iter_csv = pd.read_csv(file_path, iterator=True, chunksize=chunk_size)
    df = pd.concat([chunk for chunk in iter_csv], ignore_index=True)
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
    train_df.to_csv('train_data.csv', index=False)
    test_df.to_csv('test_data.csv', index=False)
    return 'train_data.csv', 'test_data.csv' 
    



In [16]:
train_data_path, test_data_path = split_data(processed_data_path)


In [18]:
train_data_sample = pd.read_csv(train_data_path, nrows=10)
test_data_sample = pd.read_csv(test_data_path, nrows=10)
print("Sample from Training Data:")
print(train_data_sample)
print("\nSample from Testing Data:")
print(test_data_sample)

Sample from Training Data:
                                                 url  status  \
0  websters-online-dictionary.org/definitions/cha...       1   
1                   https://id-auoneon-jp.nwbd1tn.cn       0   
2  https://otomati-srl.com/shop/zohoUpdated/zoh/z...       0   
3  http://boutiquell.almteam-consulting.com/js/ma...       0   
4                               transfer-society.com       0   
5  http://thecommitmentproject.net/wp-content/the...       0   
6  https://www.etc-meisia.elbwx.shop/btatxkemg.ph...       0   
7                       https://smbc-card.ugzkz.com/       0   
8                        http://www.daskeyboard.com/       1   
9  http://travelsardinia.com/wp-includes/js/2015f...       0   

                                             encoded  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [19]:
# Test the data_generator function

input_size = max_length
your_data_path = train_data_path  # or the path to your data
test_gen = data_generator(your_data_path, batch_size, input_size=input_size)
test_batch_features, test_batch_labels = next(iter(test_gen))

print(f"Features batch shape: {test_batch_features.shape}")
print(f"Labels batch shape: {test_batch_labels.shape}")

# Optionally, inspect the actual data
print(test_batch_features)
print(test_batch_labels)


Features batch shape: torch.Size([5000, 3992])
Labels batch shape: torch.Size([5000, 1])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)
tensor([[1.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [0.]])


In [20]:

model = PhishingDetectionNN(input_size)
train_model(model, train_data_path, epochs=1, batch_size=5000, input_size=input_size, save_path="phishing_model")



Training Epochs:   0%|          | 0/1 [00:00<?, ?epoch/s]

Starting epoch 1...
Processing batch 1...
Batch 1 processed, Loss: 0.6929675340652466
Batch 1 size: 5000
Batch 1 memory: 76.141357421875 MB
Processing batch 2...
Batch 2 processed, Loss: 0.6914961934089661
Batch 2 size: 5000
Batch 2 memory: 76.141357421875 MB
Processing batch 3...
Batch 3 processed, Loss: 0.6916424036026001
Batch 3 size: 5000
Batch 3 memory: 76.141357421875 MB
Processing batch 4...
Batch 4 processed, Loss: 0.6918656229972839
Batch 4 size: 5000
Batch 4 memory: 76.141357421875 MB
Processing batch 5...
Batch 5 processed, Loss: 0.69230055809021
Batch 5 size: 5000
Batch 5 memory: 76.141357421875 MB
Processing batch 6...
Batch 6 processed, Loss: 0.6923937797546387
Batch 6 size: 5000
Batch 6 memory: 76.141357421875 MB
Processing batch 7...
Batch 7 processed, Loss: 0.6922711133956909
Batch 7 size: 5000
Batch 7 memory: 76.141357421875 MB
Processing batch 8...
Batch 8 processed, Loss: 0.6934009790420532
Batch 8 size: 5000
Batch 8 memory: 76.141357421875 MB
Processing batch 9...


Training Epochs: 100%|██████████| 1/1 [2:23:44<00:00, 8624.11s/epoch]

Processing batch 132...
Batch 132 processed, Loss: 0.6932032108306885
Batch 132 size: 2608
Batch 132 memory: 39.71533203125 MB
Model saved after epoch 1





In [35]:
accuracy, precision, recall, f1 = evaluate(model, test_data_path, batch_size=5000, input_size=input_size)
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Accuracy: 0.5187284826218659, Precision: 0.5187284826218659, Recall: 1.0, F1 Score: 0.683108914539294


In [36]:
model = PhishingDetectionNN(input_size)
load_model(model, "phishing_model_epoch_1.pt")
train_model(model, train_data_path, epochs=2, batch_size=5000, input_size=input_size, save_path="phishing_model")


Training Epochs:   0%|          | 0/2 [00:00<?, ?epoch/s]

Starting epoch 1...
Processing batch 1...
Batch 1 processed, Loss: 0.6929185390472412
Batch 1 size: 5000
Batch 1 memory: 76.141357421875 MB
Processing batch 2...
Batch 2 processed, Loss: 0.6915481686592102
Batch 2 size: 5000
Batch 2 memory: 76.141357421875 MB
Processing batch 3...
Batch 3 processed, Loss: 0.6916806101799011
Batch 3 size: 5000
Batch 3 memory: 76.141357421875 MB
Processing batch 4...
Batch 4 processed, Loss: 0.6918901801109314
Batch 4 size: 5000
Batch 4 memory: 76.141357421875 MB
Processing batch 5...
Batch 5 processed, Loss: 0.692298173904419
Batch 5 size: 5000
Batch 5 memory: 76.141357421875 MB
Processing batch 6...
Batch 6 processed, Loss: 0.6923834085464478
Batch 6 size: 5000
Batch 6 memory: 76.141357421875 MB
Processing batch 7...
Batch 7 processed, Loss: 0.6922657489776611
Batch 7 size: 5000
Batch 7 memory: 76.141357421875 MB
Processing batch 8...
Batch 8 processed, Loss: 0.6933441758155823
Batch 8 size: 5000
Batch 8 memory: 76.141357421875 MB
Processing batch 9...

Training Epochs:  50%|█████     | 1/2 [2:07:31<2:07:31, 7651.80s/epoch]

Processing batch 132...
Batch 132 processed, Loss: 0.6932108402252197
Batch 132 size: 2608
Batch 132 memory: 39.71533203125 MB
Model saved after epoch 1
Starting epoch 2...
Processing batch 1...
Batch 1 processed, Loss: 0.6929239630699158
Batch 1 size: 5000
Batch 1 memory: 76.141357421875 MB
Processing batch 2...
Batch 2 processed, Loss: 0.6914844512939453
Batch 2 size: 5000
Batch 2 memory: 76.141357421875 MB
Processing batch 3...
Batch 3 processed, Loss: 0.691641092300415
Batch 3 size: 5000
Batch 3 memory: 76.141357421875 MB
Processing batch 4...
Batch 4 processed, Loss: 0.6918773651123047
Batch 4 size: 5000
Batch 4 memory: 76.141357421875 MB
Processing batch 5...
Batch 5 processed, Loss: 0.692298173904419
Batch 5 size: 5000
Batch 5 memory: 76.141357421875 MB
Processing batch 6...
Batch 6 processed, Loss: 0.6923810839653015
Batch 6 size: 5000
Batch 6 memory: 76.141357421875 MB
Processing batch 7...
Batch 7 processed, Loss: 0.6922648549079895
Batch 7 size: 5000
Batch 7 memory: 76.14135

Training Epochs: 100%|██████████| 2/2 [4:14:30<00:00, 7635.38s/epoch]  

Processing batch 132...
Batch 132 processed, Loss: 0.6932109594345093
Batch 132 size: 2608
Batch 132 memory: 39.71533203125 MB
Model saved after epoch 2





In [37]:
accuracy, precision, recall, f1 = evaluate(model, test_data_path, batch_size=5000, input_size=input_size)
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Accuracy: 0.5187284826218659, Precision: 0.5187284826218659, Recall: 1.0, F1 Score: 0.683108914539294
