In [5]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    """Change the working directory to the parent of the starting directory.

    The starting directory is the one containing ``__file__``. When
    ``__file__`` is not defined (e.g. when the code is run interactively),
    the current working directory is used instead.

    After the change, the ``DIR_CHANGED`` environment variable is set to
    ``'1'``. While that variable is set, further calls in the same process
    only print a notice and do not climb another directory level.
    """
    # Check if the directory has already been changed
    if not os.environ.get('DIR_CHANGED'):
        try:
            current_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            # No __file__ in this namespace: fall back to the current directory.
            current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        os.chdir(parent_dir)
        # Remember the change so re-running does not move up a second time.
        os.environ['DIR_CHANGED'] = '1'
        print(f"Current working directory changed to: {os.getcwd()}")
    else:
        print("Directory has already been changed.")

# Call the function to change the working directory.
# The relative paths used in the cells below (e.g. 'testdata/...') are resolved
# against the working directory, so this cell is meant to run first.
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git


# Optional: create a test Parquet sample

In [2]:
# Set to False to skip regenerating 'testdata/sample.parquet'.
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # Source Parquet files that are pooled before sampling
    input_files = [
        'testdata/misp_2402.parquet',
        'testdata/umbrella_benign_FINISHED.parquet'
    ]

    # Size of the random sample
    n_rows = 100

    # Load every source file and stack the frames under a fresh 0..N-1 index
    dataframes = list(map(pd.read_parquet, input_files))
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Draw the sample; the fixed random_state keeps it reproducible
    selected_rows = combined_df.sample(n=n_rows, random_state=1)

    # Persist the sample for the later cells, then show it
    selected_rows.to_parquet('testdata/sample.parquet')
    print(selected_rows)





                                 domain_name                         label  \
259954                        p24.dynect.net           benign_2310:unknown   
358356                    policy.revasia.com           benign_2310:unknown   
258589                     investvoyager.com           benign_2310:unknown   
331248                             foxtv.com           benign_2310:unknown   
59056                              griml.com            misp_2310:phishing   
...                                      ...                           ...   
311535  ci-bfp.aws-stsd.capitalonegslbex.com           benign_2310:unknown   
23748               maliyeistrgov-tr5319.com  phishing_since_2402:phishing   
173682                          ithinkfi.org           benign_2310:unknown   
138127                       tags.feedad.com           benign_2310:unknown   
133477    edge-038.ussea3.icloud-content.com           benign_2310:unknown   

        dns_has_dnskey  dns_A_count  dns_AAAA_count  dns_MX_cou

# Make a test run of conversion into NDF

In [9]:
import pandas as pd
from preprocessor import Preprocessor

# Sample produced by the optional cell above
test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)

# Convert the sample one row at a time and print what the preprocessor returns.
for index, row in domain_data.iterrows():
    domain_name, original_label = row['domain_name'], row['label']
    print(f"{index}: {domain_name} - {original_label}")

    # Single-row frame with the identifier and label columns removed
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    print(ndf)
    

FileNotFoundError: [Errno 2] No such file or directory: 'boundaries/dga_scaler.joblib'

# Test prediction success on the sample dataset

In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from preprocessor import Preprocessor

# Function to calculate the smallest perfect square that is >= a given number
def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    A value that is already a perfect square is returned unchanged
    (e.g. ``16 -> 16``, ``17 -> 25``).

    Args:
        n: Non-negative number. Non-integer values are rounded up first,
            which does not change the answer because every perfect square
            is an integer.

    Returns:
        int: The smallest ``k * k`` with ``k * k >= n``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    n = math.ceil(n)
    if n < 0:
        raise ValueError("n must be non-negative")
    # Integer square root is exact; float n**0.5 loses precision for large n.
    root = math.isqrt(n)
    if root * root < n:
        root += 1
    return root * root

# Sample dataset and the project preprocessor used by the loop further down
test_dataset = 'testdata/sample.parquet'
pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)

# Every column except the domain name and the label counts as a feature
_, column_count = domain_data.shape
feature_size = column_count - 2

# Side length of the square grid that can hold all features
desired_size = next_perfect_square(feature_size)
side_size = int(desired_size ** 0.5)

class Net(nn.Module):
    """CNN with two convolutions and five fully connected layers, two outputs.

    Expects input of shape ``(batch, 1, side_size, side_size)``. The two 3x3
    convolutions use stride 1 and no padding, so each one trims 2 from the
    height and width; the flattened size fed to ``fc1`` is therefore
    ``256 * (side_size - 4) ** 2``.
    """

    def __init__(self, side_size):
        """Build the layers for a ``side_size`` x ``side_size`` input grid."""
        super().__init__()
        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 256, 3, 1)
        self.dropout = nn.Dropout(0.1)
        # Fully connected classifier head
        flattened_size = 256 * (side_size - 4) ** 2
        self.fc1 = nn.Linear(flattened_size, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 512)
        self.fc5 = nn.Linear(512, 2)
        print("CNN model created")

    def forward(self, x):
        """Return the raw (un-normalised) two-class scores for ``x``."""
        feature_maps = F.relu(self.bn1(self.conv1(x)))
        feature_maps = F.relu(self.conv2(feature_maps))
        hidden = self.dropout(torch.flatten(feature_maps, 1))
        # Four ReLU-activated hidden layers, then a linear output layer
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc5(hidden)

# Load the trained CNN model onto the GPU when one is available, else the CPU.
# NOTE(review): the loaded object is used directly as a module below, so the file
# holds a pickled model rather than a state_dict. Unpickling can execute arbitrary
# code, so only load trusted files. Presumably this is also why the Net class is
# defined in this notebook -- confirm.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = torch.load('models/phishing_cnn_model.pth', map_location=device).to(device)
model.eval()


# Classify every domain in the sample, one row at a time, and compare the
# prediction with the label stored in the dataset.
for index, row in domain_data.iterrows():
    domain_name, original_label = row['domain_name'], row['label']

    # Single-row frame with the identifier and label columns removed
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    preprocessed_data = pp.NDF(feature_vector, 'phishing')

    # Zero-pad the flat feature tensor up to a perfect-square length so that it
    # can be viewed as a single-channel square grid.
    data_tensor = preprocessed_data['features']
    feature_size = data_tensor.shape[1]
    desired_size = next_perfect_square(feature_size)
    padding = desired_size - feature_size
    data_tensor_padded = (
        F.pad(data_tensor, (0, padding), 'constant', 0) if padding > 0 else data_tensor
    )
    side_size = int(np.sqrt(desired_size))

    # Reshape to (batch, 1, side, side) and move to the same device as the model
    data_tensor_reshaped = data_tensor_padded.view(-1, 1, side_size, side_size).to(device)

    # Predict and calculate probabilities for the single record
    with torch.no_grad():
        outputs = model(data_tensor_reshaped)

        probabilities = F.softmax(outputs, dim=1)
        probabilities_np = probabilities.detach().cpu().numpy()
        probabilities_np_rounded = np.round(probabilities_np, decimals=3)

        _, predicted = torch.max(outputs, 1)
        predicted_np = predicted.detach().cpu().numpy()

    # Class index 1 is reported as BENIGN, anything else as MALIGN
    predicted_label = "BENIGN" if predicted_np[0] == 1 else "MALIGN"

    # The prediction is OK when "label starts with 'benign'" and
    # "predicted BENIGN" agree with each other.
    labelled_benign = original_label.startswith("benign")
    result = "OK" if labelled_benign == (predicted_label == "BENIGN") else "WRONG"

    print(f"{result} | {domain_name} - {original_label} - predict: {predicted_label} | {probabilities_np_rounded}")


WRONG | p24.dynect.net - benign_2310:unknown - predict: MALIGN | [[0.948 0.052]]
OK | policy.revasia.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | investvoyager.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | foxtv.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
WRONG | griml.com - misp_2310:phishing - predict: BENIGN | [[0. 1.]]
OK | epdg.epc.mnc001.mcc250.pub.3gppnetwork.org - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | www.convert-me.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | static-lax3-2.xx.fbcdn.net - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
WRONG | samsung.cn - benign_2310:unknown - predict: MALIGN | [[0.999 0.001]]
OK | pottery-barn.o3ae.net - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | velaw.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | katapult.com - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | sindonews.net - benign_2310:unknown - predict: BENIGN | [[0. 1.]]
OK | pod