In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

# Change the current working directory to the parent directory
os.chdir(parent_dir)

# Optional - Create a testing parquet

In [3]:
create_test_parquet = False

if create_test_parquet:
    import pandas as pd
    import numpy as np
    
    # List of input Parquet files
    input_files = [
        'testdata/malware_bp.parquet',
        'testdata/misp_2402.parquet',
        'testdata/umbrella_benign_FINISHED.parquet'
    ]
    
    # Number of rows to select
    n_rows = 20
    
    # Read all Parquet files into a single DataFrame
    dataframes = [pd.read_parquet(file) for file in input_files]
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Randomly select n_rows rows from the combined DataFrame
    selected_rows = combined_df.sample(n=n_rows, random_state=1)  # random_state for reproducibility
    
    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/sample.parquet')
    
    # Display the selected rows
    print(selected_rows)





# Make a test run of conversion into NDF

In [4]:
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)

for index, row in domain_data.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    print(f"{index}: {domain_name} - {original_label}")
    
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)
    
    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    print(ndf)
    

260277: sos.oregon.gov - benign_2310:unknown
<class 'dict'>
{'name': 'dataset_2024-05-20', 'features': tensor([[0.5000, 0.5178, 0.5000, 0.5227, 0.5384, 0.5104, 0.7311, 0.5000, 0.5622,
         0.5000, 0.5611, 0.5938, 0.6713, 0.6608, 0.5014, 0.5014, 0.6554, 0.6225,
         0.6225, 0.5709, 0.5139, 0.5580, 0.6879, 0.5622, 0.5498, 0.5700, 0.6825,
         0.5005, 0.5008, 0.5051, 0.5000, 0.5000, 0.6344, 0.6742, 0.5870, 0.6672,
         0.5000, 0.7311, 0.5000, 0.5000, 0.5250, 0.5589, 0.7311, 0.5498, 0.5482,
         0.5710, 0.6397, 0.5826, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5167,
         0.6792, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5263, 0.5000, 0.5000, 0.5000, 0.5000, 0.5927, 0.5000,
         0.5442, 0.5405, 0.6705, 0.5000, 0.5000, 0.5000, 0.5440, 0.6225, 0.5326,
         0.6225, 0.5000, 0.5000, 0.5156, 0.5416, 0.5277, 0.5654, 0.5000, 0.5000,
      

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5534, 0.5826, 0.5227, 0.5384, 0.5208, 0.7311, 0.5000, 0.5000,
         0.5000, 0.5481, 0.6080, 0.7021, 0.6608, 0.5014, 0.5015, 0.6225, 0.6971,
         0.6225, 0.6055, 0.5139, 0.5781, 0.6835, 0.5622, 0.5498, 0.5606, 0.6869,
         0.5015, 0.5067, 0.5017, 0.5000, 0.5000, 0.5543, 0.6893, 0.5722, 0.6688,
         0.5277, 0.7311, 0.5000, 0.5000, 0.5939, 0.5021, 0.6225, 0.5783, 0.6455,
         0.6370, 0.5733, 0.5622, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5498,
         0.7311, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5184, 0.5000, 0.5000, 0.5000, 0.5416, 0.5927, 0.7311,
         0.6995, 0.5472, 0.6400, 0.5000, 0.5000, 0.5000, 0.5440, 0.6055, 0.5434,
         0.6391, 0.5000, 0.5000, 0.5156, 0.5357, 0.5000, 0.5654, 0.5000, 0.5000,
         0.5312, 0.6400, 0.5000, 0.5000, 0.5234, 0.

# Testing success on the sample dataset

In [6]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
import datetime
import pyarrow.parquet as pq
import joblib
import math

# Function to calculate the next perfect square greater than a given number
def next_perfect_square(n):
    next_square = math.ceil(n**0.5)**2
    return next_square


class Net(nn.Module):
    def __init__(self, side_size):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 256, 3, 1)
        
        
        self.dropout = nn.Dropout(0.1)
        
        self.fc1 = nn.Linear(256 * (side_size-4)**2, 1024)
        self.fc2 = nn.Linear(1024, 2048)
        self.fc3 = nn.Linear(2048, 1024)
        self.fc4 = nn.Linear(1024, 512)
        self.fc5 = nn.Linear(512, 2)
        print("CNN model created")
    
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'
pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)
domain_data.drop(columns=['domain_name', 'label'], inplace=True)
preprocessed_data = pp.NDF(domain_data, 'phishing')
feature_size = preprocessed_data["features"].shape[1]

desired_size = next_perfect_square(feature_size)  # Next perfect square
side_size = int(desired_size**0.5)

# Load the trained CNN model
model = torch.load('models/phishing_cnn_model.pth')
model.eval()


test_dataset = 'testdata/sample3.parquet'

pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)


for index, row in domain_data.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)
    
    preprocessed_data = pp.NDF(feature_vector, 'phishing')

    # Ensure the features tensor is in the correct shape for prediction
    # Reshape logic based on the error encountered
    data_tensor = preprocessed_data['features']
    feature_size = data_tensor.shape[1]
    desired_size = next_perfect_square(feature_size)
    padding = desired_size - feature_size
    if padding > 0:
        data_tensor_padded = F.pad(data_tensor, (0, padding), 'constant', 0)
    else:
        data_tensor_padded = data_tensor
    side_size = int(np.sqrt(desired_size))
    data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size)

    # Predict and calculate probabilities for the single record
    with torch.no_grad():
        outputs = model(data_tensor_reshaped)
        probabilities = F.softmax(outputs, dim=1)
        probabilities_np = probabilities.detach().cpu().numpy()
        probabilities_np_rounded = np.round(probabilities_np, decimals=3)

        _, predicted = torch.max(outputs, 1)
        predicted_np = predicted.detach().cpu().numpy()
        
    
    predicted = "BENIGN"
    if predicted_np[0] == 1:
        predicted = "MALIGN"
        
    result = "WRONG"
    if (original_label.startswith("benign") and predicted == "BENIGN"):
        result = "OK"
    if (not original_label.startswith("benign") and predicted == "MALIGN"):
        result = "OK"
        

    print(f"{result} | {domain_name} - {original_label} - predict: {predicted} | {probabilities_np_rounded} ")
        

AttributeError: 'collections.OrderedDict' object has no attribute 'eval'