In [1]:
import sys
import os

# Remember the directory the kernel started in. The os.chdir() call below changes
# what os.getcwd() returns, so without this a re-run of the cell would climb one
# more directory level every time it is executed.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

parent_dir = os.path.abspath(os.path.join(NOTEBOOK_START_DIR, os.pardir))

# Make modules in the parent directory importable; skip the append when the entry
# is already present so repeated runs do not pile up duplicates in sys.path
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Change the current working directory to the parent directory
os.chdir(parent_dir)

# Optional: create a test Parquet sample

In [28]:
import pandas as pd
import numpy as np

# Parquet sources that are merged before sampling
input_files = [
    'testdata/malware_bp.parquet',
    'testdata/misp_2402.parquet',
    'testdata/umbrella_benign_FINISHED.parquet',
]

# Size of the random sample to draw
n_rows = 20

# Load every source and stack them into one frame with a fresh 0..N-1 index
dataframes = list(map(pd.read_parquet, input_files))
combined_df = pd.concat(dataframes, ignore_index=True)

# Draw the sample; the fixed random_state keeps the selection reproducible
selected_rows = combined_df.sample(n=n_rows, random_state=1)

# Persist the sample as a new Parquet file, then show it
sample_path = 'testdata/sample.parquet'
selected_rows.to_parquet(sample_path)
print(selected_rows)





                                  domain_name                label  \
259954                         p24.dynect.net  benign_2310:unknown   
358356                     policy.revasia.com  benign_2310:unknown   
258589                      investvoyager.com  benign_2310:unknown   
331248                              foxtv.com  benign_2310:unknown   
59056                               griml.com   misp_2310:phishing   
...                                       ...                  ...   
70781      honduraaslogin--interbanco.repl.co   misp_2310:phishing   
322033                           alexmods.com  benign_2310:unknown   
239317                       mx0.mail.ovh.net  benign_2310:unknown   
216287                   automate.isgtech.com  benign_2310:unknown   
91573   my-business-106590-107005.square.site   misp_2310:phishing   

        dns_has_dnskey  dns_A_count  dns_AAAA_count  dns_MX_count  \
259954             0.0            0               0             0   
358356             0.

In [6]:
# Show only the label column of the sampled rows
label_column = selected_rows["label"]
print(label_column)

260277             benign_2310:unknown
215554              misp_2310:phishing
199363              misp_2310:phishing
604467             benign_2310:unknown
454427             benign_2310:unknown
605213             benign_2310:unknown
121365                 malware:unknown
370528             benign_2310:unknown
168811    phishing_since_2402:phishing
327895             benign_2310:unknown
118768                 malware:unknown
531382             benign_2310:unknown
267407             benign_2310:unknown
561547             benign_2310:unknown
130976    phishing_since_2402:phishing
459505             benign_2310:unknown
67756                  malware:unknown
164475    phishing_since_2402:phishing
569688             benign_2310:unknown
108328                 malware:unknown
Name: label, dtype: object


# Make a test run

In [2]:
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()
domain_data = pd.read_parquet(test_dataset)

# Feed the preprocessor one record at a time, each wrapped in a one-row DataFrame
for index, row in domain_data.iterrows():
    domain_name, original_label = row['domain_name'], row['label']
    print(f"{index}: {domain_name} - {original_label}")

    # NDF is given the row without its domain_name and label columns
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    print(ndf)
    

260277: sos.oregon.gov - benign_2310:unknown
<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5178, 0.5000, 0.5227, 0.5384, 0.5104, 0.7311, 0.5000, 0.5622,
         0.5000, 0.5611, 0.5938, 0.6713, 0.6608, 0.5014, 0.5014, 0.6554, 0.6225,
         0.6225, 0.5709, 0.5139, 0.5580, 0.6879, 0.5622, 0.5498, 0.5700, 0.6825,
         0.5005, 0.5008, 0.5051, 0.5000, 0.5000, 0.6344, 0.6742, 0.5870, 0.6672,
         0.5000, 0.7311, 0.5000, 0.5000, 0.5250, 0.5589, 0.7311, 0.5498, 0.5482,
         0.5710, 0.6397, 0.5826, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5167,
         0.6792, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5263, 0.5000, 0.5000, 0.5000, 0.5000, 0.5927, 0.5000,
         0.5442, 0.5405, 0.6705, 0.5000, 0.5000, 0.5000, 0.5440, 0.6225, 0.5326,
         0.6225, 0.5000, 0.5000, 0.5156, 0.5416, 0.5277, 0.5654, 0.5000, 0.5000,
      

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5178, 0.5277, 0.5227, 0.5384, 0.5826, 0.7311, 0.5000, 0.5000,
         0.5000, 0.5481, 0.6374, 0.7021, 0.6608, 0.5012, 0.5014, 0.6713, 0.6225,
         0.6792, 0.5709, 0.5277, 0.5546, 0.6911, 0.5622, 0.5498, 0.5732, 0.6819,
         0.5016, 0.5100, 0.5017, 0.5018, 0.5000, 0.6255, 0.6755, 0.5727, 0.6719,
         0.5826, 0.7311, 0.5000, 0.5000, 0.5695, 0.5075, 0.6225, 0.5357, 0.5558,
         0.5743, 0.6247, 0.5826, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5167,
         0.6792, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5184, 0.5000, 0.5000, 0.5826, 0.5416, 0.5927, 0.7311,
         0.6995, 0.5472, 0.6642, 0.5000, 0.5000, 0.5000, 0.5440, 0.6055, 0.5541,
         0.6713, 0.5000, 0.5000, 0.5312, 0.5709, 0.5000, 0.6037, 0.5000, 0.5000,
         0.5312, 0.6642, 0.5000, 0.5000, 0.5234, 0.

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5534, 0.5826, 0.5227, 0.5384, 0.5208, 0.7311, 0.5000, 0.5000,
         0.5000, 0.5481, 0.6080, 0.7021, 0.6608, 0.5014, 0.5015, 0.6225, 0.6971,
         0.6225, 0.6055, 0.5139, 0.5781, 0.6835, 0.5622, 0.5498, 0.5606, 0.6869,
         0.5015, 0.5067, 0.5017, 0.5000, 0.5000, 0.5543, 0.6893, 0.5722, 0.6688,
         0.5277, 0.7311, 0.5000, 0.5000, 0.5939, 0.5021, 0.6225, 0.5783, 0.6455,
         0.6370, 0.5733, 0.5622, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5498,
         0.7311, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5184, 0.5000, 0.5000, 0.5000, 0.5416, 0.5927, 0.7311,
         0.6995, 0.5472, 0.6400, 0.5000, 0.5000, 0.5000, 0.5440, 0.6055, 0.5434,
         0.6391, 0.5000, 0.5000, 0.5156, 0.5357, 0.5000, 0.5654, 0.5000, 0.5000,
         0.5312, 0.6400, 0.5000, 0.5000, 0.5234, 0.

In [9]:
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()

# Alternative loader, left disabled:
#   import pyarrow.parquet as pq
#   domain_data = pq.read_table(test_dataset).to_pandas()
domain_data = pd.read_parquet(test_dataset)

# Whole-batch run: drop the domain_name and label columns and preprocess all rows at once
domain_data = domain_data.drop(columns=['domain_name', 'label'])

ndf = pp.NDF(domain_data, 'phishing')
print(type(ndf))
print(ndf)
    

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5178, 0.5000,  ..., 0.6134, 0.6112, 0.5000],
        [0.5000, 0.5178, 0.5277,  ..., 0.6088, 0.6163, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5000, 0.5000, 0.5000],
        ...,
        [0.5000, 0.5178, 0.5277,  ..., 0.6088, 0.6163, 0.5000],
        [0.5000, 0.5000, 0.5000,  ..., 0.5630, 0.6331, 0.5000],
        [0.5000, 0.5178, 0.5000,  ..., 0.5000, 0.5000, 0.5000]]), 'labels': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], 'dimension': 179, 'feature_names': Index(['dns_has_dnskey', 'dns_A_count', 'dns_AAAA_count', 'dns_MX_count',
       'dns_NS_count', 'dns_TXT_count', 'dns_SOA_count', 'dns_CNAME_count',
       'dns_zone_level', 'dns_zone_digit_count',
       ...
       'rdap_ip_v6_count', 'rdap_ip_shortest_v4_prefix_len',
       'rdap_ip_longest_v4_prefix_len', 'rdap_ip_shortest_v6_prefix_len',
       'rdap_ip_longest_v6_pref

# Testing prediction success on the sample dataset

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import datetime
import pyarrow.parquet as pq
import joblib
import math

def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    A value that already is a perfect square is returned unchanged
    (e.g. 196 -> 196, 179 -> 196).

    Args:
        n: Non-negative number. Positive ``int`` values are handled with exact
            integer arithmetic; any other input uses the floating-point formula.

    Returns:
        int: The perfect square ``k * k`` with the smallest ``k`` such that ``k * k >= n``.

    Raises:
        TypeError: For negative input, because the fractional power yields a
            complex number that ``math.ceil`` cannot round.
    """
    if isinstance(n, int) and n > 0:
        # isqrt(n - 1) + 1 is ceil(sqrt(n)) computed exactly; the float-based
        # n**0.5 loses precision for large integers and can return a square
        # smaller than n.
        root = math.isqrt(n - 1) + 1
        return root * root
    # Zero, floats and other numeric types keep the original float-based behaviour
    return math.ceil(n**0.5)**2

class Net(nn.Module):
    """CNN with three 3x3 convolutions followed by four fully connected layers.

    Expects single-channel square inputs of shape
    ``(batch, 1, side_size, side_size)`` and returns a ``(batch, 2)`` tensor of
    raw class scores (no softmax is applied here).
    """

    def __init__(self, side_size):
        """Create the layers for inputs whose height and width equal ``side_size``."""
        super().__init__()

        # Convolutional stack: 1 -> 32 -> 64 -> 128 channels, batch norm after each
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(128)

        # One dropout module shared by every place dropout is applied in forward()
        self.dropout = nn.Dropout(0.5)

        # Each unpadded 3x3 convolution trims 2 from the side length, so three
        # of them leave (side_size - 6) per side across 128 channels
        flattened_size = 128 * (side_size - 6) ** 2
        self.fc1 = nn.Linear(flattened_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 2)
        print("CNN model created")

    def forward(self, x):
        """Run the network on ``x`` and return the two raw class scores per sample."""
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in conv_stages:
            x = F.relu(norm(conv(x)))

        # Flatten everything except the batch dimension before the dense layers
        hidden = self.dropout(torch.flatten(x, 1))

        # Dropout follows fc1 and fc2 only; fc3 feeds the output layer directly
        hidden = self.dropout(F.relu(self.fc1(hidden)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        hidden = F.relu(self.fc3(hidden))
        return self.fc4(hidden)
    
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'
pp = Preprocessor()

# Preprocess the sample once, only to learn how many feature columns NDF produces
domain_data = pd.read_parquet(test_dataset).drop(columns=['domain_name', 'label'])
preprocessed_data = pp.NDF(domain_data, 'phishing')
sample_features = preprocessed_data["features"]
feature_size = sample_features.shape[1]

# The CNN takes square inputs: use the side of the smallest square holding all features
desired_size = next_perfect_square(feature_size)
side_size = int(desired_size**0.5)

# Rebuild the architecture, restore the trained weights on the CPU, switch to eval mode
model = Net(side_size=side_size)
trained_weights = torch.load('models/phishing_cnn_model.pth', map_location=torch.device('cpu'))
model.load_state_dict(trained_weights)
model.eval()


test_dataset = 'testdata/sample3.parquet'

pp = Preprocessor()
domain_data = pd.read_parquet(test_dataset)

# Score every record individually and compare the prediction with its original label
for index, row in domain_data.iterrows():
    domain_name, original_label = row['domain_name'], row['label']

    # The preprocessor gets a one-row frame without the domain_name and label columns
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])
    preprocessed_data = pp.NDF(feature_vector, 'phishing')

    # Zero-pad the feature vector up to a perfect-square length and fold it into
    # a (batch, 1, side, side) tensor before handing it to the model
    data_tensor = preprocessed_data['features']
    feature_size = data_tensor.shape[1]
    desired_size = next_perfect_square(feature_size)
    padding = desired_size - feature_size
    data_tensor_padded = (
        F.pad(data_tensor, (0, padding), 'constant', 0) if padding > 0 else data_tensor
    )
    side_size = int(np.sqrt(desired_size))
    data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size)

    # Inference without gradient tracking: softmax probabilities plus the top class index
    with torch.no_grad():
        outputs = model(data_tensor_reshaped)
        probabilities = F.softmax(outputs, dim=1)
        probabilities_np = probabilities.detach().cpu().numpy()
        probabilities_np_rounded = np.round(probabilities_np, decimals=3)
        predicted_np = torch.max(outputs, 1).indices.detach().cpu().numpy()

    # Class index 1 is reported as MALIGN, anything else as BENIGN
    predicted = "MALIGN" if predicted_np[0] == 1 else "BENIGN"

    # A prediction is OK when it agrees with whether the label starts with "benign"
    label_is_benign = original_label.startswith("benign")
    result = "OK" if label_is_benign == (predicted == "BENIGN") else "WRONG"

    print(f"{result} | {domain_name} - {original_label} - predict: {predicted} | {probabilities_np_rounded} ")
        

CNN model created
OK | p24.dynect.net - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | policy.revasia.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | investvoyager.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | foxtv.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
WRONG | griml.com - misp_2310:phishing - predict: BENIGN | [[1. 0.]] 
OK | epdg.epc.mnc001.mcc250.pub.3gppnetwork.org - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | www.convert-me.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | static-lax3-2.xx.fbcdn.net - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | samsung.cn - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | pottery-barn.o3ae.net - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | velaw.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | katapult.com - benign_2310:unknown - predict: BENIGN | [[1. 0.]] 
OK | sindonews.net - benign_2310:unknown - predict: BENIGN | [[1. 0