# Training NN for the final Decision Making

## Load Tensorflow and check GPU availability

In [8]:
import numpy as np
import tensorflow as tf
import torch
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

import sys

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load input datasets

In [9]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pyarrow import Table

df = pd.read_parquet('data/preliminary_results_x.parquet')

# IMPORTANT: set labels:
df.loc[df['label'].str.startswith(('phishing', 'malware', 'misp', 'dga')), 'label'] = 'malign'
df.loc[df['label'] != 'malign', 'label'] = 'benign'

class_map = {"benign": 0, "malign": 1}

# Handle NaNs
df.fillna(-1, inplace=True)


# SUBSAMPLE (OPTIONAL)
subsample = 1.0 # 1.0 means no subsample
if subsample < 1.0:
    df = df.sample(frac=subsample)

# Drop the domain name column
df.drop("domain_name", axis=1, inplace=True)

labels = df['label'].apply(lambda x: class_map[x]) # y vector
features = df.drop('label', axis=1).copy() # X matrix

print(f"Total samples: {len(df)}")


pd.set_option('display.max_columns', None)
features

Total samples: 100000


Unnamed: 0,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,malware_cnn_result,malware_lgbm_result,malware_xgboost_result,dga_binary_nn_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,total_sum,total_avg,total_prod
54451,0.6,0.225,1.000000,0.833333,1.0,0.750,1.000000,0.833333,1.0,0.777778,1.0,0.008122,0.002815,1.366864e-03,0.066331,0.013630,1.0,0.000191,0.002436,6.067854e-23,1.092264,0.182044,2.825370e-11,1.002627,0.334209,4.655318e-07,2.094891,0.209489,7.981044e-40
247717,1.0,0.750,0.833333,0.291667,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000682,0.000621,3.579364e-04,0.007201,0.002786,0.0,0.000052,0.002233,3.490693e-03,0.011647,0.001941,0.000000e+00,0.002285,0.000762,0.000000e+00,0.017423,0.001742,0.000000e+00
15621,0.6,0.225,1.000000,0.833333,1.0,0.375,1.000000,0.708333,1.0,0.722222,1.0,0.794902,0.967936,9.999437e-01,0.689681,0.941273,0.0,0.000533,0.002408,7.239705e-07,5.393736,0.898956,4.994589e-01,0.002941,0.000980,0.000000e+00,5.396678,0.539668,0.000000e+00
255533,0.9,0.525,0.833333,0.291667,1.0,0.875,0.833333,0.250000,1.0,0.722222,0.0,0.000314,0.000381,3.409265e-07,0.055679,0.000929,0.0,0.000280,0.002364,1.192370e-35,0.057303,0.009551,0.000000e+00,0.002644,0.000881,0.000000e+00,0.059947,0.005995,0.000000e+00
89645,0.9,0.550,0.833333,0.291667,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000668,0.000933,6.330108e-04,0.071273,0.033787,0.0,0.000061,0.002233,5.018924e-04,0.107293,0.017882,0.000000e+00,0.002294,0.000765,0.000000e+00,0.110089,0.011009,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343814,1.0,0.700,0.833333,0.291667,1.0,1.000,0.833333,0.416667,1.0,1.000000,1.0,0.000353,0.000631,1.249553e-04,0.003323,0.000713,1.0,0.000052,0.002233,3.854592e-02,1.005146,0.167524,6.602678e-17,1.002286,0.334095,1.171542e-07,2.045977,0.204598,2.981647e-25
104123,0.6,0.200,1.000000,0.833333,1.0,0.500,0.833333,0.041667,1.0,0.666667,0.0,0.000520,0.000777,2.489036e-04,0.144647,0.010928,1.0,0.000305,0.002364,8.776547e-26,0.157121,0.026187,0.000000e+00,1.002669,0.334223,7.199457e-07,1.159789,0.115979,0.000000e+00
380872,1.0,0.725,0.833333,0.291667,1.0,0.625,1.000000,0.708333,1.0,0.722222,0.0,0.006750,0.010146,1.054552e-02,0.051278,0.027170,0.0,0.677820,0.568785,2.559731e-02,0.105889,0.017648,0.000000e+00,1.246605,0.415535,0.000000e+00,1.378092,0.137809,0.000000e+00
275135,1.0,0.700,0.833333,0.291667,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000425,0.000540,2.749782e-03,0.008847,0.002785,0.0,0.000050,0.002233,7.275866e-01,0.015347,0.002558,0.000000e+00,0.002283,0.000761,0.000000e+00,0.745217,0.074522,0.000000e+00


# Data preprocessing

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
 
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(features)
features = pd.DataFrame(scaled_data, columns=features.columns)

# Save the scaler
joblib.dump(scaler, "boundaries/decision_nn_scaler.joblib")

pd.set_option('display.max_columns', None)
features

Unnamed: 0,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,malware_cnn_result,malware_lgbm_result,malware_xgboost_result,dga_binary_nn_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,total_sum,total_avg,total_prod
0,0.6,0.257143,1.000000,0.952381,1.0,0.750,1.000000,0.833333,1.0,0.777778,1.0,0.007990,0.002456,1.366864e-03,0.066430,0.013656,1.0,0.000149,0.000209,6.067854e-23,0.182268,0.182268,2.856502e-11,0.346461,0.346461,5.223303e-07,0.229910,0.229910,2.206768e-39
1,1.0,0.857143,0.833333,0.333333,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000549,0.000258,3.579364e-04,0.007211,0.002792,0.0,0.000009,0.000000,3.490693e-03,0.001812,0.001812,0.000000e+00,0.000003,0.000003,0.000000e+00,0.001477,0.001477,0.000000e+00
2,0.6,0.257143,1.000000,0.952381,1.0,0.375,1.000000,0.708333,1.0,0.722222,1.0,0.794877,0.969108,9.999437e-01,0.690715,0.943112,0.0,0.000494,0.000181,7.239705e-07,0.900585,0.900585,5.049623e-01,0.000230,0.000230,0.000000e+00,0.592965,0.592965,0.000000e+00
3,0.9,0.600000,0.833333,0.333333,1.0,0.875,0.826087,0.250000,1.0,0.722222,0.0,0.000181,0.000018,3.409265e-07,0.055762,0.000931,0.0,0.000238,0.000135,1.192370e-35,0.009436,0.009436,0.000000e+00,0.000127,0.000127,0.000000e+00,0.006152,0.006152,0.000000e+00
4,0.9,0.628571,0.833333,0.333333,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000535,0.000571,6.330108e-04,0.071380,0.033853,0.0,0.000017,0.000000,5.018924e-04,0.017784,0.017784,0.000000e+00,0.000006,0.000006,0.000000e+00,0.011666,0.011666,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.0,0.800000,0.833333,0.333333,1.0,1.000,0.826087,0.416667,1.0,1.000000,1.0,0.000221,0.000268,1.249553e-04,0.003328,0.000715,1.0,0.000009,0.000000,3.854592e-02,0.167720,0.167720,6.675432e-17,0.346342,0.346342,1.314479e-07,0.224531,0.224531,8.244289e-25
99996,0.6,0.228571,1.000000,0.952381,1.0,0.500,0.826087,0.041667,1.0,0.666667,0.0,0.000387,0.000415,2.489036e-04,0.144864,0.010949,1.0,0.000264,0.000135,8.776547e-26,0.026105,0.026105,0.000000e+00,0.346475,0.346475,8.077846e-07,0.127088,0.127088,0.000000e+00
99997,1.0,0.828571,0.833333,0.333333,1.0,0.625,1.000000,0.708333,1.0,0.722222,0.0,0.006618,0.009798,1.054552e-02,0.051355,0.027223,0.0,0.684465,0.585975,2.559731e-02,0.017550,0.017550,0.000000e+00,0.430960,0.430960,0.000000e+00,0.151092,0.151092,0.000000e+00
99998,1.0,0.800000,0.833333,0.333333,1.0,1.000,1.000000,0.708333,1.0,1.000000,0.0,0.000293,0.000177,2.749782e-03,0.008861,0.002791,0.0,0.000007,0.000000,7.275866e-01,0.002430,0.002430,0.000000e+00,0.000002,0.000002,0.000000e+00,0.081503,0.081503,0.000000e+00


# Train-test split

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
  features,
  labels,
  test_size=0.2,
  random_state=42,
  shuffle=True, 
  stratify=labels
)

# Define the NN model

In [20]:
class Net(nn.Module):
    def __init__(self, feature_size):
        super(Net, self).__init__()


        # Adjust the size calculation based on the number of convolutional layers
        self.fc1 = nn.Linear(feature_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, 1)
        
        # Optionally use dropout
        self.dropout1 = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(F.relu(self.fc2(x)))
        x = self.dropout1(F.relu(self.fc3(x)))

        return self.fc4(x) 

In [17]:
def compute_metrics(data_loader, model):
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            true_labels.extend(target.cpu().numpy())
            predictions.extend(torch.sigmoid(output).round().cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    return accuracy, f1, predictions, true_labels

In [18]:
x_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(Y_train.values, dtype=torch.long)
x_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(Y_test.values, dtype=torch.long)

print("Feature size:", X_train.shape[1])
print("Training samples:", len(X_train))

# Train the model

In [21]:
from sklearn.utils import class_weight
import torch.optim as optim

# Hyperparameters
LEARNING_RATE = 0.0045
BATCH_SIZE = 128
EPOCHS = 15


# # Calculate class weights
class_weights = {0: 1.0, 1: 0.3} 
weights = torch.tensor([class_weights[1]], dtype=torch.float).to(device)


model = Net(x_train.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=weights)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_data = TensorDataset(x_train.to(device), y_train.float().unsqueeze(1).to(device))  # Ensure y_train is float and of shape (batch_size, 1)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

test_data = TensorDataset(x_test.to(device), y_test.float().unsqueeze(1).to(device))  # Ensure y_test is float and of shape (batch_size, 1)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

epoch_losses = []
epoch_accuracies = []
epoch_f1s = []


# # Training loop
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)  # Move data to the device
        optimizer.zero_grad()
        output = model(data)
        
        # negate the output to match the class weights
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Compute average loss
    avg_loss = running_loss / len(train_loader)
    epoch_losses.append(avg_loss)

    # Evaluate model and store metrics
    train_accuracy, train_f1, _, _ = compute_metrics(train_loader, model)
    epoch_accuracies.append(train_accuracy)
    epoch_f1s.append(train_f1)

    # Enhanced logging
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}, F1 Score: {train_f1:.4f}")

Epoch 1/15 - Loss: 0.0531, Accuracy: 0.9671, F1 Score: 0.9487
Epoch 2/15 - Loss: 0.0411, Accuracy: 0.9673, F1 Score: 0.9490
Epoch 3/15 - Loss: 0.0383, Accuracy: 0.9684, F1 Score: 0.9508
Epoch 4/15 - Loss: 0.0373, Accuracy: 0.9670, F1 Score: 0.9483
Epoch 5/15 - Loss: 0.0364, Accuracy: 0.9714, F1 Score: 0.9557
Epoch 6/15 - Loss: 0.0358, Accuracy: 0.9750, F1 Score: 0.9624
Epoch 7/15 - Loss: 0.0355, Accuracy: 0.9739, F1 Score: 0.9598
Epoch 8/15 - Loss: 0.0349, Accuracy: 0.9713, F1 Score: 0.9555
Epoch 9/15 - Loss: 0.0346, Accuracy: 0.9690, F1 Score: 0.9516
Epoch 10/15 - Loss: 0.0341, Accuracy: 0.9742, F1 Score: 0.9604
Epoch 11/15 - Loss: 0.0343, Accuracy: 0.9688, F1 Score: 0.9514
Epoch 12/15 - Loss: 0.0339, Accuracy: 0.9706, F1 Score: 0.9543
Epoch 13/15 - Loss: 0.0336, Accuracy: 0.9733, F1 Score: 0.9587
Epoch 14/15 - Loss: 0.0335, Accuracy: 0.9701, F1 Score: 0.9535
Epoch 15/15 - Loss: 0.0334, Accuracy: 0.9731, F1 Score: 0.9585


# Save the model

In [22]:
# save trained model 
torch.save(model.state_dict(), './models/decision_malware.pth')

# Display training results

In [24]:
import matplotlib.pyplot as plt


# load model
model = Net(feature_size=29).to(device)
model.load_state_dict(torch.load('./models/decision_malware.pth'))

# Evaluate on the test set
test_accuracy, test_f1, predictions, true_labels = compute_metrics(test_loader, model)
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")

# Plotting the confusion matrix
cm = confusion_matrix(true_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Test Data')
plt.show()

# Plotting the training progress
plt.figure(figsize=(12, 4))

# Plot for Loss
plt.subplot(1, 3, 1)
plt.plot(epoch_losses, linestyle='--', marker='o', color='#2ba7fc', label=f'Loss (Best: {min(epoch_losses):.4f})')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()

# Plot for Accuracy
plt.subplot(1, 3, 2)
plt.plot(epoch_accuracies, linestyle='--', marker='o', color='#61d484', label=f'Accuracy (Best: {max(epoch_accuracies):.4f})')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training Accuracy')
plt.legend()

# Plot for F1 Score
plt.subplot(1, 3, 3)
plt.plot(epoch_f1s, linestyle='--', marker='o', color='#b85e4f', label=f'F1 Score (Best: {max(epoch_f1s):.4f})')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.title('Training F1 Score')
plt.legend()

plt.suptitle('Training Progress')
plt.tight_layout()
plt.show()

# Plotting the testing results
plt.figure(figsize=(6, 4))

metrics = ['Accuracy', 'F1 Score']
values = [test_accuracy, test_f1]
colors = ['#61d484', '#b85e4f']

plt.bar(metrics, values, color=colors)
plt.ylim(0, 1)
for i, v in enumerate(values):
    plt.text(i, v + 0.02, f"{v:.4f}", ha='center', va='bottom')
plt.ylabel('Score')
plt.title('Test Metrics')
plt.show()

# bylo 2k na 2k


  model.load_state_dict(torch.load('./models/decision_malware.pth'))


RuntimeError: Error(s) in loading state_dict for Net:
	size mismatch for fc1.weight: copying a param with shape torch.Size([512, 29]) from checkpoint, the shape in current model is torch.Size([512, 173]).

# Evaluate the model

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Evaluate the model
loss_and_metrics = model.evaluate(X_test, Y_test)
print('Loss = ', loss_and_metrics[0])
print('Accuracy = ', loss_and_metrics[1])

# Generate predictions
Y_pred = model.predict(X_test)
Y_pred = np.round(Y_pred).astype(int)  # Convert probabilities to binary predictions

# Calculate additional metrics
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Confusion matrix
cm = confusion_matrix(Y_test, Y_pred)

# False Positive Rate
tn, fp, fn, tp = cm.ravel()
fpr = fp / (fp + tn)

# Display the metrics
print('\n=== RESULTS ===')
print(classification_report(Y_test, Y_pred, target_names=['Benign', 'Malicious'], digits=4))
print('False Positive Rate =', fpr)


# Display the confusion matrix
print('\nConfusion Matrix:')
print(cm)

# Optionally, plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(7, 7))  # Increase figure size for better readability
disp.plot(ax=ax, values_format='d')
for labels in disp.text_:
    for label in labels:
        label.set_fontsize(18) 
plt.show()

# Feature Importance Analysis

In [None]:
import shap

# Ensure that X_train and X_test are DataFrames with the correct column names
# You can set the column names from the 'features' DataFrame like this:
X_train.columns = features.columns
X_test.columns = features.columns

n_samples = 1000

# Convert your training set to a NumPy format if it's not already
background = X_train[:n_samples].to_numpy()

# Use the generic SHAP Explainer interface
explainer = shap.Explainer(model, background)

# Generate SHAP values for the test set
shap_values = explainer(X_test[:n_samples].to_numpy())

# Plotting the summary plot for feature importance
# Use the column names from the 'features' DataFrame as the feature names
shap.summary_plot(shap_values.values, X_test[:n_samples], feature_names=features.columns, max_display=30)


# Make test predictions

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import joblib
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pyarrow import Table

# Load the model and scaler
model = load_model('dga_binary_model.keras')
scaler = joblib.load("dga_binary_scaler.joblib")

# #############################################################
# EDIT this to specify benign / malicious datasets to use     #
# #############################################################
benign_dataset_filenames = [
    '../feature-extraction/floor/lex-benign_2312_anonymized.parquet',
    '../feature-extraction/floor/lex-umbrella_benign_FINISHED.parquet',
]
malicious_dataset_filenames = [
    '../feature-extraction/floor/lex-dga-830k-pick.parquet'
]
# #############################################################
# EDIT this for to set appropriate labels (malware, dga, ...) #
# #############################################################
benign_label = "benign"
malicious_label = "dga"
# #############################################################

def union_tables(tables: [pa.Table]) -> pa.Table:
    union_table = tables[0]
    for table in tables[1:]:
        right_not_in_union = union_table.join(right_table=table, keys='domain_name', join_type='right anti',
                                              coalesce_keys=True, use_threads=True)
        union_table = pa.concat_tables([union_table, right_not_in_union])
    return union_table

# Unify malicious datasets and benign datasets
schema = (pq.read_table(malicious_dataset_filenames[0])).schema # Use the schema from the first malicious filename
benign_tables = [pq.read_table(filename).cast(schema) for filename in benign_dataset_filenames]
malicious_tables = [pq.read_table(filename).cast(schema) for filename in malicious_dataset_filenames]
malicious = union_tables(malicious_tables)
benign = union_tables(benign_tables)

# Convert pyarrow tables to pandas dataframes
df_benign = benign.to_pandas()
df_malicious = malicious.to_pandas()

# Set appropriate labels
df_benign["label"] = benign_label
df_malicious["label"] = malicious_label
class_map = {benign_label: 0, malicious_label: 1}

# Concatentate benign and malicious
test_df = pd.concat([df_benign, df_malicious])

# Handle NaNs
test_df.fillna(-1, inplace=True)


# Take only N random samples
N = 500
test_df = test_df.sample(n=N, random_state=42)

total_predictions = 0
correct_predictions = 0

for index, row in test_df.iterrows():
    domain_name = row['domain_name']
    original_label = row['label']
    
    # Drop "domain_name" and "label" columns
    feature_vector = pd.DataFrame([row])
    feature_vector.drop(columns=['domain_name', 'label'], inplace=True)

     # Scale the feature vector using the loaded scaler
    scaled_feature_vector = scaler.transform(feature_vector)
    
    # Perform prediction
    prediction = model.predict(scaled_feature_vector, verbose=0)
    
    # Extract the predicted class
    predicted_label = "benign" if prediction < 0.5 else "dga"
    
    # Check if the prediction was correct
    if original_label == predicted_label:
        correct_predictions += 1
    
    total_predictions += 1
    
    # Print the result
    result="WRONG"
    if predicted_label == original_label:
        result="OK"
        
    pred_disp = "!!! DGA !!!"
    if predicted_label == "benign":
        pred_disp = "BENIGN"
        
    
    print(f"{result} | {domain_name} ({original_label}), Predicted: {pred_disp}, Prob: {prediction}")

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")


