## Columns to be removed from training/validation

In [5]:
disqualified_columns = ["tls_joint_isoitu_policy_crt_count", "rdap_time_from_last_change", "lex_www_flag"]

# Load Tensorflow and check GPU availability

In [3]:
import numpy as np
import tensorflow as tf
import torch
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

import sys

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load input datasets

In [207]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes import common as com
from pyarrow import Table


def union_tables(tables: [pa.Table]) -> pa.Table:
    union_table = tables[0]
    for table in tables[1:]:
        right_not_in_union = union_table.join(right_table=table, keys='domain_name', join_type='right anti',
                                              coalesce_keys=True, use_threads=True)
        union_table = pa.concat_tables([union_table, right_not_in_union])
    return union_table

# #############################################################
# EDIT this to specify benign / malicious datasets to use     #
# #############################################################
benign_dataset_filenames = [
    '../feature-extraction/floor/benign_2312.parquet',
]
malicious_dataset_filenames = [
    '../feature-extraction/floor/phishing_2406_strict.parquet'
]
# #############################################################
# EDIT this for to set appropriate labels (malware, dga, ...) #
# #############################################################
benign_label = "benign"
malicious_label = "phishing"
# #############################################################



# Unify malicious datasets and benign datasets
schema = (pq.read_table(malicious_dataset_filenames[0])).schema # Use the schema from the first malicious filename
benign_tables = [pq.read_table(filename).cast(schema) for filename in benign_dataset_filenames]
malicious_tables = [pq.read_table(filename).cast(schema) for filename in malicious_dataset_filenames]
malicious = union_tables(malicious_tables)
benign = union_tables(benign_tables)

# Convert pyarrow tables to pandas dataframes
df_benign = benign.to_pandas()
df_malicious = malicious.to_pandas()

# Set appropriate labels
df_benign["label"] = benign_label
df_malicious["label"] = malicious_label
class_map = {benign_label: 0, malicious_label: 1}

# print column count
print(f"Benign columns: {len(df_benign.columns)}")
print(f"Malicious columns: {len(df_malicious.columns)}")



# ===================
# AUTO BALANCING !!!
# Subsample benign to match the size of malicious
# df_benign = df_benign.sample(n=len(df_malicious))
# ===================

# Concatentate benign and malicious
df = pd.concat([df_benign, df_malicious])


def cast_timestamp(df: DataFrame):
    """
    Cast timestamp fields to seconds since epoch.
    """
    for col in df.columns:
        if com.is_timedelta64_dtype(df[col]):
            df[col] = df[col].dt.total_seconds()  # This converts timedelta to float (seconds)
        elif com.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].astype(np.int64) // 10**9  # Converts datetime64 to Unix timestamp (seconds)

    return df

df = cast_timestamp(df)

# Handle NaNs
df.fillna(-1, inplace=True)


# SUBSAMPLE (OPTIONAL)
subsample = 1.0 # 1.0 means no subsample
if subsample < 1.0:
    df = df.sample(frac=subsample)

# Drop the domain name column
df.drop("domain_name", axis=1, inplace=True)

# Remove disqualified columns
for column in disqualified_columns:
    if column in df.columns:
        df.drop(column, axis=1, inplace=True)

labels = df['label'].apply(lambda x: class_map[x]) # y vector
features = df.drop('label', axis=1).copy() # X matrix

print(f"Total samples: {len(df)}")
print(f"Benign count: {len(df_benign)}")
print(f"Malicious count: {len(df_malicious)}")



Benign columns: 178
Malicious columns: 178
Total samples: 626617
Benign count: 462192
Malicious count: 164425


# Data preprocessing

In [208]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
 
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(features)
features = pd.DataFrame(scaled_data, columns=features.columns)

# Save the scaler
joblib.dump(scaler, "boundaries/malware_general_scaler.joblib")

pd.set_option('display.max_columns', None)

# Train-test split

In [209]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
  features,
  labels,
  test_size=0.2,
  random_state=42,
  shuffle=True, 
  stratify=labels
)


# Define the NN model

In [6]:
class Net(nn.Module):
    def __init__(self, feature_size):
        super(Net, self).__init__()


        # Adjust the size calculation based on the number of convolutional layers
        self.fc1 = nn.Linear(feature_size, 2048)
        self.fc2 = nn.Linear(2048, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 1)
        
        # Optionally use dropout
        self.dropout1 = nn.Dropout(0.1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(F.relu(self.fc2(x)))
        x = self.dropout1(F.relu(self.fc3(x)))

        return self.fc4(x) 


In [11]:
def compute_metrics(data_loader, model):
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            true_labels.extend(target.cpu().numpy())
            predictions.extend(torch.sigmoid(output).round().cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    return accuracy, f1, predictions, true_labels

In [8]:
import torch.nn.functional as F
import math

def prepare_dataset(X_train, Y_train, X_test, Y_test):

    # Convert data to torch tensors
    x_train = torch.tensor(X_train.values, dtype=torch.float32)
    y_train = torch.tensor(Y_train.values, dtype=torch.long)
    x_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_test = torch.tensor(Y_test.values, dtype=torch.long)

    return x_train, y_train, x_test, y_test, side_size


# print feature size

In [213]:
x_train, y_train, x_test, y_test, side_size = prepare_dataset(X_train, Y_train, X_test, Y_test)

#x_train = torch.cat((x_train, pickle.load(open('old_phishing_x.pkl', 'rb'))))
#y_train = torch.cat((y_train, pickle.load(open('old_phishing_y.pkl', 'rb'))))




# Train the model

In [214]:
from sklearn.utils import class_weight
import torch.optim as optim

# Hyperparameters
LEARNING_RATE = 0.0005
BATCH_SIZE = 512
EPOCHS = 80


# # Calculate class weights
class_weights = {0: 1.0, 1: 1.0} 
weights = torch.tensor([class_weights[1]], dtype=torch.float).to(device)


model = Net(side_size=14).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=weights)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_data = TensorDataset(x_train.to(device), y_train.float().unsqueeze(1).to(device))  # Ensure y_train is float and of shape (batch_size, 1)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

test_data = TensorDataset(x_test.to(device), y_test.float().unsqueeze(1).to(device))  # Ensure y_test is float and of shape (batch_size, 1)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

epoch_losses = []
epoch_accuracies = []
epoch_f1s = []


# # Training loop
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)  # Move data to the device
        optimizer.zero_grad()
        output = torch.sigmoid(model(data))
        
        # negate the output to match the class weights
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Compute average loss
    avg_loss = running_loss / len(train_loader)
    epoch_losses.append(avg_loss)

    # Evaluate model and store metrics
    train_accuracy, train_f1, _, _ = compute_metrics(train_loader, model)
    epoch_accuracies.append(train_accuracy)
    epoch_f1s.append(train_f1)

    # Enhanced logging
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}, F1 Score: {train_f1:.4f}")




CNN model created
Epoch 1/80 - Loss: 0.1106, Accuracy: 0.9719, F1 Score: 0.9466
Epoch 2/80 - Loss: 0.0762, Accuracy: 0.9800, F1 Score: 0.9611
Epoch 3/80 - Loss: 0.0665, Accuracy: 0.9809, F1 Score: 0.9628
Epoch 4/80 - Loss: 0.0609, Accuracy: 0.9823, F1 Score: 0.9654
Epoch 5/80 - Loss: 0.0566, Accuracy: 0.9847, F1 Score: 0.9703
Epoch 6/80 - Loss: 0.0528, Accuracy: 0.9837, F1 Score: 0.9680
Epoch 7/80 - Loss: 0.0501, Accuracy: 0.9861, F1 Score: 0.9730
Epoch 8/80 - Loss: 0.0472, Accuracy: 0.9856, F1 Score: 0.9725
Epoch 9/80 - Loss: 0.0454, Accuracy: 0.9864, F1 Score: 0.9739
Epoch 10/80 - Loss: 0.0429, Accuracy: 0.9876, F1 Score: 0.9762
Epoch 11/80 - Loss: 0.0408, Accuracy: 0.9889, F1 Score: 0.9786
Epoch 12/80 - Loss: 0.0387, Accuracy: 0.9897, F1 Score: 0.9802
Epoch 13/80 - Loss: 0.0369, Accuracy: 0.9900, F1 Score: 0.9807
Epoch 14/80 - Loss: 0.0353, Accuracy: 0.9906, F1 Score: 0.9819
Epoch 15/80 - Loss: 0.0331, Accuracy: 0.9905, F1 Score: 0.9818
Epoch 16/80 - Loss: 0.0317, Accuracy: 0.9901, 

# Save the model

In [215]:
# save trained model 
torch.save(model.state_dict(), './models/malware_deep.pth')

# Testing trained model

In [12]:
import matplotlib.pyplot as plt


# load model
model = Net(feature_size=173).to(device)
model.load_state_dict(torch.load('./models/malware_deep.pth'))

# Evaluate on the test set
test_accuracy, test_f1, predictions, true_labels = compute_metrics(test_loader, model)
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")

# Plotting the confusion matrix
cm = confusion_matrix(true_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Test Data')
plt.show()

# Plotting the training progress
plt.figure(figsize=(12, 4))

# Plot for Loss
plt.subplot(1, 3, 1)
plt.plot(epoch_losses, linestyle='--', marker='o', color='#2ba7fc', label=f'Loss (Best: {min(epoch_losses):.4f})')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()

# Plot for Accuracy
plt.subplot(1, 3, 2)
plt.plot(epoch_accuracies, linestyle='--', marker='o', color='#61d484', label=f'Accuracy (Best: {max(epoch_accuracies):.4f})')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training Accuracy')
plt.legend()

# Plot for F1 Score
plt.subplot(1, 3, 3)
plt.plot(epoch_f1s, linestyle='--', marker='o', color='#b85e4f', label=f'F1 Score (Best: {max(epoch_f1s):.4f})')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.title('Training F1 Score')
plt.legend()

plt.suptitle('Training Progress')
plt.tight_layout()
plt.show()

# Plotting the testing results
plt.figure(figsize=(6, 4))

metrics = ['Accuracy', 'F1 Score']
values = [test_accuracy, test_f1]
colors = ['#61d484', '#b85e4f']

plt.bar(metrics, values, color=colors)
plt.ylim(0, 1)
for i, v in enumerate(values):
    plt.text(i, v + 0.02, f"{v:.4f}", ha='center', va='bottom')
plt.ylabel('Score')
plt.title('Test Metrics')
plt.show()

# bylo 2k na 2k


CNN model created


NameError: name 'test_loader' is not defined

# Validate the model on a separate Validation dataset

In [9]:
import pyarrow.parquet as pq
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib
from tensorflow.keras.models import load_model
from pandas.core.dtypes import common as com
import numpy as np

def cast_timestamp(df: pd.DataFrame):
    """
    Cast timestamp fields to seconds since epoch.
    """
    for col in df.columns:
        if com.is_timedelta64_dtype(df[col]):
            df[col] = df[col].dt.total_seconds()  # This converts timedelta to float (seconds)
        elif com.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].astype(np.int64) // 10**9  # Converts datetime64 to Unix timestamp (seconds)
    return df

# Load the validation dataset
validation_dataset_filename = '../testdata/validation_malware.parquet'
df_validation = pq.read_table(validation_dataset_filename).to_pandas()

# Cast timestamps and handle NaNs
df_validation = cast_timestamp(df_validation)
df_validation.fillna(-1, inplace=True)

# Remove disqualified columns
for column in disqualified_columns:
    if column in df_validation.columns:
        df_validation.drop(column, axis=1, inplace=True)

# Map the labels
df_validation['label'] = df_validation['label'].map({'benign': 0, 'phishing': 1})

# print number of columns


# Extract features and labels
X_val = df_validation.drop(['label', 'domain_name'], axis=1)
y_val = df_validation['label']

# Load the scaler
scaler = joblib.load("boundaries/malware_general_scaler.joblib")

# Scale the features
y_val = y_val.tolist()
X_val_scaled = scaler.transform(X_val)
# convert to tensor
X_val_scaled = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
# convert pandas series y_val to llist

# print shape of X_val_scaled and y_val_tensor

# print number of columns

# print shape of X_val_scaled and y_val_tensor
print(X_val_scaled.shape, y_val_tensor.shape)



# Load the trained model
model = Net(feature_size=173)
model.load_state_dict(torch.load('./models/malware_deep.pth'))
model.eval()
model.to('cuda:0' if torch.cuda.is_available() else 'cpu')

#### PADDING ###
# Helper function to find the next perfect square



pickle.dump(X_val_scaled, open('old_malware_sequence_x.pkl', 'wb'))
pickle.dump(y_val_tensor, open('old_malware_sequence_y.pkl', 'wb'))


val_data = TensorDataset(X_val_scaled, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

# Function to compute metrics and predictions
def compute_metrics_and_predictions(data_loader, model):
    model.eval()
    true_labels = []
    predictions = []
    probabilities = []

    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            print(torch.sigmoid(output).round())
            print(torch.sigmoid(output))
            input()
            true_labels.extend(target.cpu().numpy())
            predictions.extend(torch.sigmoid(output).round().cpu().numpy())


    return true_labels, predictions





# Get predictions and probabilities
test_true_labels, test_predictions= compute_metrics_and_predictions(val_loader, model)

# Calculate metrics
accuracy = accuracy_score(test_true_labels, test_predictions)
weighted_accuracy = (accuracy_score(np.array(test_true_labels) == 0, np.array(test_predictions) == 0) + accuracy_score(np.array(test_true_labels) == 1, np.array(test_predictions) == 1)) / 2
precision = precision_score(test_true_labels, test_predictions)
recall = recall_score(test_true_labels, test_predictions)
f1 = f1_score(test_true_labels, test_predictions)
tn, fp, fn, tp = confusion_matrix(test_true_labels, test_predictions).ravel()
false_positive_rate = fp / (fp + tn)

# Display metrics
print(f'Accuracy: {accuracy}')
print(f'Weighted Accuracy: {weighted_accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'False Positive Rate: {false_positive_rate}')

# Display confusion matrix
cm = confusion_matrix(test_true_labels, test_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['benign', 'phishing'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Test Data')
plt.show()

# Identify misclassified domain names
misclassified = df_validation.iloc[[i for i, (y_true, y_pred) in enumerate(zip(test_true_labels, test_predictions)) if y_true != y_pred]]
misclassified_domains = misclassified['domain_name'].tolist()
print(f'Misclassified domains: {misclassified_domains}')

# bylo x 1000
# 10 x 


torch.Size([2480, 173]) torch.Size([2480])
CNN model created
torch.Size([2480, 1, 14, 14])
torch.Size([2480])
tensor([[-37.2285],
        [-31.6005],
        [-17.6195],
        [ 72.9909],
        [-19.9294],
        [-45.5747],
        [-28.6610],
        [-10.1254],
        [-12.3164],
        [-28.5029],
        [ -8.3394],
        [ -8.2844],
        [ 90.8862],
        [-12.4166],
        [ -4.5517],
        [ 32.2141],
        [-17.9661],
        [-23.2476],
        [-20.8448],
        [-20.9034],
        [ -6.8471],
        [ -6.2636],
        [-22.9430],
        [ -7.7627],
        [-14.1914],
        [-28.8391],
        [-34.7457],
        [ 63.7602],
        [-16.3150],
        [ -9.0472],
        [-22.5990],
        [-40.9092],
        [-20.7916],
        [-20.5773],
        [-21.7763],
        [ -6.0011],
        [-24.1876],
        [-18.3626],
        [-10.9717],
        [ -9.7090],
        [-16.1542],
        [-16.2990],
        [-39.4854],
        [-11.4905],
        [-

# Make test predictions

In [218]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import joblib
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pyarrow import Table

# Load the model and scaler
model = load_model('dga_binary_model.keras')
scaler = joblib.load("dga_binary_scaler.joblib")

# #############################################################
# EDIT this to specify benign / malicious datasets to use     #
# #############################################################
benign_dataset_filenames = [
    '../feature-extraction/floor/benign_2312_anonymized.parquet',
    '../feature-extraction/floor/umbrella_benign_FINISHED.parquet',
]
malicious_dataset_filenames = [
    '../feature-extraction/floor/lex-dga-830k-pick.parquet'
]
# #############################################################
# EDIT this for to set appropriate labels (malware, dga, ...) #
# #############################################################
benign_label = "benign"
malicious_label = "dga"
# #############################################################

def union_tables(tables: [pa.Table]) -> pa.Table:
    union_table = tables[0]
    for table in tables[1:]:
        right_not_in_union = union_table.join(right_table=table, keys='domain_name', join_type='right anti',
                                              coalesce_keys=True, use_threads=True)
        union_table = pa.concat_tables([union_table, right_not_in_union])
    return union_table

# Unify malicious datasets and benign datasets
schema = (pq.read_table(malicious_dataset_filenames[0])).schema # Use the schema from the first malicious filename
benign_tables = [pq.read_table(filename).cast(schema) for filename in benign_dataset_filenames]
malicious_tables = [pq.read_table(filename).cast(schema) for filename in malicious_dataset_filenames]
malicious = union_tables(malicious_tables)
benign = union_tables(benign_tables)

# Convert pyarrow tables to pandas dataframes
df_benign = benign.to_pandas()
df_malicious = malicious.to_pandas()

# Set appropriate labels
df_benign["label"] = benign_label
df_malicious["label"] = malicious_label
class_map = {benign_label: 0, malicious_label: 1}

# Concatentate benign and malicious
test_df = pd.concat([df_benign, df_malicious])

# Handle NaNs
test_df.fillna(-1, inplace=True)


# Take only N random samples
N = 500
test_df = test_df.sample(n=N, random_state=42)

total_predictions = 0
correct_predictions = 0

for index, row in test_df.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    # Drop "domain_name" and "label" columns
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)

     # Scale the feature vector using the loaded scaler
    scaled_feature_vector = scaler.transform(feature_vector)
    
    # Perform prediction
    prediction = model.predict(scaled_feature_vector, verbose=0)
    
    # Extract the predicted class
    predicted_label = "benign" if prediction < 0.5 else "dga"
    
    # Check if the prediction was correct
    if original_label == predicted_label:
        correct_predictions += 1
    
    total_predictions += 1
    
    # Print the result
    result="WRONG"
    if predicted_label == original_label:
        result="OK"
        
    pred_disp = "!!! DGA !!!"
    if predicted_label == "benign":
        pred_disp = "BENIGN"
        
    
    print(f"{result} | {domain_name} ({original_label}), Predicted: {pred_disp}, Prob: {prediction}")

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")




OSError: No file or directory found at dga_binary_model.keras