In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    # Check if the directory has already been changed
    if not os.environ.get('DIR_CHANGED'):
        try:
            current_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        os.chdir(parent_dir)
        os.environ['DIR_CHANGED'] = '1'
        print(f"Current working directory changed to: {os.getcwd()}")
    else:
        print("Directory has already been changed.")

# Call the function to change the working directory
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf/classifiers


# Optional - Create a testing parquet

In [2]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np
    
    # List of input Parquet files
    input_files = [
        'testdata/misp_2402.parquet',
        'testdata/benign_2312.parquet'
    ]
    
    # Number of rows to select
    n_rows = 1000
    
    # Read all Parquet files into a single DataFrame
    dataframes = [pd.read_parquet(file) for file in input_files]
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Randomly select n_rows rows from the combined DataFrame
    selected_rows = combined_df.sample(n=n_rows, random_state=1)  # random_state for reproducibility
    
    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/sample.parquet')
    
    # Display the selected rows
    print(selected_rows)





                                   domain_name                label  \
200645                        mp4moviez.sydney  benign_2312:unknown   
238746                          tsunagu.ryukyu  benign_2312:unknown   
247069  6036825.global.siteimproveanalytics.io  benign_2312:unknown   
166511                       assalaamair.co.tz  benign_2312:unknown   
284693                             ogorod.guru  benign_2312:unknown   
...                                        ...                  ...   
522556                          mni.1slogir.vg  benign_2312:unknown   
302846                        monteazul.com.pe  benign_2312:unknown   
109903                         kaducollect.com   misp_2310:phishing   
156130   search-bcarchives.royalbcmuseum.bc.ca  benign_2312:unknown   
432252                     n.cngunadvised.buzz  benign_2312:unknown   

        dns_has_dnskey  dns_A_count  dns_AAAA_count  dns_MX_count  \
200645             0.0            2               2             0   
238746   

# Make a test run of conversion into NDF

In [4]:
import pandas as pd
from preprocessor import Preprocessor

test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)

for index, row in domain_data.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    print(f"{index}: {domain_name} - {original_label}")
    
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)
    
    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    

200645: mp4moviez.sydney - benign_2312:unknown
<class 'dict'>
238746: tsunagu.ryukyu - benign_2312:unknown
<class 'dict'>
247069: 6036825.global.siteimproveanalytics.io - benign_2312:unknown
<class 'dict'>
166511: assalaamair.co.tz - benign_2312:unknown
<class 'dict'>
284693: ogorod.guru - benign_2312:unknown
<class 'dict'>
39856: haaberkorn.github.io - phishing_since_2402:phishing
<class 'dict'>
491855: anuarioseguros.lat - benign_2312:unknown
<class 'dict'>
401138: maavarim.mod.gov.il - benign_2312:unknown
<class 'dict'>
550439: athonor.defensoria-nsjp.gob.mx - benign_2312:unknown
<class 'dict'>
547103: dickofdogcom.z00.monster - benign_2312:unknown
<class 'dict'>
270013: skygx.store - benign_2312:unknown
<class 'dict'>
414224: mop-shoot.tauri.hu - benign_2312:unknown
<class 'dict'>
335453: exidmet.justice.gov.az - benign_2312:unknown
<class 'dict'>
183535: misericordia.mb.ca - benign_2312:unknown
<class 'dict'>
511909: cassady.family - benign_2312:unknown
<class 'dict'>
254001: h109

# Testing success on the sample dataset

In [5]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import math
from preprocessor import Preprocessor
from models.phishing_cnn_net import Net  # Ensure the model definition is available in models.py or adjust import accordingly

# Function to calculate the next perfect square greater than a given number
def next_perfect_square(n):
    next_square = math.ceil(n**0.5)**2
    return next_square

# Load the test dataset
test_dataset = 'testdata/sample.parquet'
pp = Preprocessor()
domain_data = pd.read_parquet(test_dataset)
feature_size = domain_data.shape[1] - 2  # -2 = Domain and Label

# Calculate the desired size for padding
desired_size = next_perfect_square(feature_size)
side_size = int(desired_size**0.5)
print(side_size)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

state_dict = torch.load('models/phishing_cnn_model.pth')

# Initialize the model and load the state dict
model = Net(side_size).to(device)
model.load_state_dict(state_dict)
model.eval()

# Initialize counters for predictions
prediction_ok = 0
prediction_wrong = 0

fp = 0
fn = 0

tp = 0
tn = 0

# Iterate through each row in the dataset
for index, row in domain_data.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)
    # print shape of feature_vector

    
    preprocessed_data = pp.NDF(feature_vector, 'phishing')
    # print shape of preprocessed_data['features']
    data_tensor = torch.tensor(preprocessed_data['features'], dtype=torch.float32)
    feature_size = data_tensor.shape[1]
    desired_size = next_perfect_square(feature_size)
    padding = desired_size - feature_size
    
    if padding > 0:
        data_tensor_padded = F.pad(data_tensor, (0, padding), 'constant', 0)
    else:
        data_tensor_padded = data_tensor
    
    side_size = int(np.sqrt(desired_size))
    data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size)
    data_tensor_reshaped = data_tensor_reshaped.to(device)

    with torch.no_grad():
        outputs = model(data_tensor_reshaped)
        probabilities = F.softmax(outputs, dim=1)
        probabilities_np = probabilities.detach().cpu().numpy()
        probabilities_np_rounded = np.round(probabilities_np, decimals=3)

        _, predicted = torch.max(outputs, 1)
        predicted_np = predicted.detach().cpu().numpy()
    
    # Corrected label assignment
    predicted_label = "MALIGN" if predicted_np[0] > 0.5 else "BENIGN"
    result = "WRONG"
    
    if (original_label.startswith("benign") and predicted_label == "BENIGN") or (original_label.startswith("malign") or original_label.startswith("misp") or original_label.startswith("phish") and predicted_label == "MALIGN"):
        result = "OK"
        prediction_ok += 1
        if predicted_label == "BENIGN":
            tn += 1
        elif predicted_label == "MALIGN":
            tp += 1
    else:
        prediction_wrong += 1
        if predicted_label == "BENIGN":
            fp += 1
        elif predicted_label == "MALIGN":
            fn += 1

    # print(f"Result: {result}")
    # print(f"Domain: {domain_name}")
    # print(f"Original Label: {original_label}")
    # print(f"Predicted Label: {predicted_label}")
    # print(f"Probabilities: {probabilities_np_rounded}")
    # print('-' * 50)

# compute f1 score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Total OK Predictions: {prediction_ok}")
print(f"Total WRONG Predictions: {prediction_wrong}")
print(f"Accuracy Ratio: {prediction_ok / (prediction_ok + prediction_wrong):.2f}")


14
CNN model created
Precision: 0.78
Recall: 1.00
F1 Score: 0.88
Total OK Predictions: 967
Total WRONG Predictions: 33
Accuracy Ratio: 0.97
