In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    """Switch the working directory to its parent folder, at most once per process.

    The ``DIR_CHANGED`` environment variable acts as the "already done" marker,
    so calling this again (for example by re-running the cell) does not climb
    another level. The starting point is this file's directory when ``__file__``
    is defined, otherwise the current working directory.
    """
    # Guard: an earlier call already moved the process and set the marker
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        start_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined in this context; start from the working directory
        start_dir = os.getcwd()

    os.chdir(os.path.abspath(os.path.join(start_dir, os.pardir)))
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Move the working directory up one level (a no-op on repeated runs).
# Presumably needed so the relative paths used by later cells
# ('testdata/...', 'models/...') resolve against the parent folder - confirm.
change_to_parent_directory()

Current working directory changed to: /mnt/c/Users/petrp/Desktop/FETA/zabijteme/domainradar-clf/classifiers


# Optional: create a test Parquet sample

In [2]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # Source Parquet files that are pooled before sampling
    input_files = ['testdata/misp_2402.parquet', 'testdata/benign_2312.parquet']

    # Size of the random sample
    n_rows = 400

    # Load every source file, then stack them into one frame with a fresh index
    dataframes = list(map(pd.read_parquet, input_files))
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Draw the sample; the fixed seed keeps the selection reproducible
    selected_rows = combined_df.sample(n=n_rows, random_state=1)

    # Persist the sample, then show what was picked
    selected_rows.to_parquet('testdata/sample.parquet')
    print(selected_rows)





                                   domain_name                label  \
200645                        mp4moviez.sydney  benign_2312:unknown   
238746                          tsunagu.ryukyu  benign_2312:unknown   
247069  6036825.global.siteimproveanalytics.io  benign_2312:unknown   
166511                       assalaamair.co.tz  benign_2312:unknown   
284693                             ogorod.guru  benign_2312:unknown   
...                                        ...                  ...   
352476                           enpleinair.sm  benign_2312:unknown   
541535                       static.gem.gov.in  benign_2312:unknown   
427160                              mano.ku.lt  benign_2312:unknown   
264737                               varkom.hr  benign_2312:unknown   
133044                     webguide.nat.gov.tw  benign_2312:unknown   

        dns_has_dnskey  dns_A_count  dns_AAAA_count  dns_MX_count  \
200645             0.0            2               2             0   
238746   

# Make a test run of conversion into NDF

In [3]:
import pandas as pd
from preprocessor import Preprocessor

# Parquet sample to push through the preprocessor
test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()
domain_data = pd.read_parquet(test_dataset)

# Convert the sample one domain at a time, printing each intermediate result
for index, row in domain_data.iterrows():
    domain_name, original_label = row['domain_name'], row['label']
    print(f"{index}: {domain_name} - {original_label}")

    # Single-row frame holding only the features (name and label removed)
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    print(ndf)
    

200645: mp4moviez.sydney - benign_2312:unknown
<class 'dict'>
{'name': 'dataset_2024-05-21', 'features': tensor([[0.5000, 0.5622, 0.5826, 0.5000, 0.5416, 0.5192, 0.7311, 0.5000, 0.5000,
         0.5312, 0.5749, 0.6056, 0.6713, 0.6608, 0.5094, 0.5177, 0.6554, 0.6834,
         0.6713, 0.6225, 0.5192, 0.6049, 0.6823, 0.5709, 0.5312, 0.5840, 0.6901,
         0.5007, 0.5069, 0.5250, 0.5015, 0.5000, 0.5000, 0.5000, 0.5361, 0.6875,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5920, 0.5032, 0.6225, 0.5622, 0.6401,
         0.6422, 0.5606, 0.5416, 0.7311, 0.6225, 0.6225, 0.5568, 0.5568, 0.7311,
         0.5498, 0.5000, 0.5000, 0.6225, 0.7311, 0.5000, 0.5000, 0.5498, 0.6225,
         0.5622, 0.5622, 0.5622, 0.5032, 0.5622, 0.5622, 0.6225, 0.5139, 0.5416,
         0.5374, 0.7311, 0.5000, 0.5000, 0.5000, 0.5678, 0.5000, 0.6634, 0.5603,
         0.6502, 0.5498, 0.5370, 0.5000, 0.5498, 0.5826, 0.5592, 0.6354, 0.5000,
         0.5000, 0.5333, 0.5553, 0.5000, 0.6225, 0.5000, 0.5000, 0.5333, 0.6502,
    

# Evaluating prediction accuracy on the sample dataset

In [4]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import math
from preprocessor import Preprocessor
from models.net import Net  # Net comes from the models.net module; adjust this import if the model class lives elsewhere

# Smallest perfect square that is greater than or equal to a given number
def next_perfect_square(n):
    """Return the smallest perfect square that is not below ``n``.

    A value that is already a perfect square is returned unchanged
    (e.g. 16 -> 16), anything else is rounded up to the next one (10 -> 16).
    """
    root = math.ceil(n ** 0.5)  # side length of the enclosing square
    return root * root

# Load the test dataset
test_dataset = 'testdata/sample.parquet'
pp = Preprocessor()
domain_data = pd.read_parquet(test_dataset)
feature_size = domain_data.shape[1] - 2  # -2 = Domain and Label

# Side length of the square "image" the feature vector is padded into
desired_size = next_perfect_square(feature_size)
side_size = int(desired_size**0.5)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine
state_dict = torch.load('models/phishing_cnn_model_state_dict.pth', map_location=device)

# Initialize the model and load the state dict
# NOTE(review): the model is sized from the raw column count, while each row
# below is reshaped from the width of the NDF output; the two agree only if
# NDF keeps the number of features unchanged - confirm.
model = Net(side_size).to(device)
model.load_state_dict(state_dict)
model.eval()

# Initialize counters for predictions
prediction_ok = 0
prediction_wrong = 0

# Label prefixes that identify the malicious class. "phishing" is included
# because the sample contains labels such as "phishing_since_2402:phishing",
# which otherwise could never be scored as a correct MALIGN prediction.
malicious_prefixes = ("malign", "misp", "phishing")

# Iterate through each row in the dataset
for index, row in domain_data.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']

    # Single-row frame holding only the features (name and label removed)
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    preprocessed_data = pp.NDF(feature_vector, 'phishing')
    # 'features' is already a tensor; as_tensor converts it without the
    # copy-construct warning that torch.tensor() raises on tensor input
    data_tensor = torch.as_tensor(preprocessed_data['features'], dtype=torch.float32)
    feature_size = data_tensor.shape[1]
    desired_size = next_perfect_square(feature_size)
    padding = desired_size - feature_size

    # Zero-pad on the right so the vector fills a complete square
    if padding > 0:
        data_tensor_padded = F.pad(data_tensor, (0, padding), 'constant', 0)
    else:
        data_tensor_padded = data_tensor

    side_size = int(np.sqrt(desired_size))
    # Shape: (batch, channel=1, side, side)
    data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size)
    data_tensor_reshaped = data_tensor_reshaped.to(device)

    with torch.no_grad():
        outputs = model(data_tensor_reshaped)
        probabilities = F.softmax(outputs, dim=1)
        probabilities_np = probabilities.detach().cpu().numpy()
        probabilities_np_rounded = np.round(probabilities_np, decimals=3)

        _, predicted = torch.max(outputs, 1)
        predicted_np = predicted.detach().cpu().numpy()

    # Class index 1 is treated as the malicious class
    predicted_label = "MALIGN" if predicted_np[0] == 1 else "BENIGN"
    result = "WRONG"

    # Each class is checked explicitly. The previous single expression relied
    # on `or`/`and` precedence, so any label starting with "malign" was counted
    # as OK regardless of the prediction.
    is_benign = original_label.startswith("benign")
    is_malicious = original_label.startswith(malicious_prefixes)

    if (is_benign and predicted_label == "BENIGN") or (is_malicious and predicted_label == "MALIGN"):
        result = "OK"
        prediction_ok += 1
    else:
        prediction_wrong += 1

    print(f"Result: {result}")
    print(f"Domain: {domain_name}")
    print(f"Original Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Probabilities: {probabilities_np_rounded}")
    print('-' * 50)

print(f"Total OK Predictions: {prediction_ok}")
print(f"Total WRONG Predictions: {prediction_wrong}")

# Guard against an empty dataset, which would otherwise divide by zero
total_predictions = prediction_ok + prediction_wrong
if total_predictions:
    print(f"Accuracy Ratio: {prediction_ok / total_predictions:.2f}")
else:
    print("Accuracy Ratio: n/a (no rows evaluated)")


CNN model created
Result: OK
Domain: mp4moviez.sydney
Original Label: benign_2312:unknown
Predicted Label: BENIGN
Probabilities: [[1. 0.]]
--------------------------------------------------
Result: OK
Domain: tsunagu.ryukyu
Original Label: benign_2312:unknown
Predicted Label: BENIGN
Probabilities: [[1. 0.]]
--------------------------------------------------
Result: OK
Domain: 6036825.global.siteimproveanalytics.io
Original Label: benign_2312:unknown
Predicted Label: BENIGN
Probabilities: [[1. 0.]]
--------------------------------------------------
Result: OK
Domain: assalaamair.co.tz
Original Label: benign_2312:unknown
Predicted Label: BENIGN
Probabilities: [[0.997 0.003]]
--------------------------------------------------
Result: OK
Domain: ogorod.guru
Original Label: benign_2312:unknown
Predicted Label: BENIGN
Probabilities: [[1. 0.]]
--------------------------------------------------
Result: WRONG
Domain: haaberkorn.github.io
Original Label: phishing_since_2402:phishing
Predicted La