# Test of the classification pipeline

In [4]:
import os

# Move the process one directory up, at most once per kernel session
def change_to_parent_directory():
    """Switch the working directory to the parent of the current location.

    The starting point is the directory of ``__file__`` when that name is
    defined, otherwise the current working directory. After a successful
    switch the ``DIR_CHANGED`` environment variable is set to ``'1'``; while it
    is set, further calls only print a notice and leave the directory alone.
    """
    # Guard: a previous call already moved us, so do not climb another level
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined in this execution context
        base_dir = os.getcwd()

    os.chdir(os.path.abspath(os.path.join(base_dir, os.pardir)))
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Move the working directory one level up so that the relative paths used by the
# cells below ('testdata/...') are resolved from there. Re-running this cell is
# harmless: once DIR_CHANGED is set the function only prints a notice.
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf


## Optional: Create testing dataset
Note: If you want to do this, set `create_test_parquet` to **True**.

In [2]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # Source Parquet files. Each entry gives the path, the maximum number of
    # rows drawn from that file, and the label written into its "label" column.
    input_files = [
        {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 4000, 'label': 'benign'},
        {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 480, 'label': 'phishing'},
        {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    ]

    # Number of rows to select in total
    n_rows = 5072

    def _load_labelled_sample(file_info, columns=None):
        """Read one source file, cap its row count and overwrite its label.

        Parameters:
            file_info: dict with the keys 'file', 'max_rows' and 'label'.
            columns: optional list of column names. When given, columns missing
                from the file are added filled with None and the result is
                restricted to exactly these columns, in this order.

        Returns:
            The (possibly down-sampled) DataFrame with its "label" column set
            to file_info['label'].
        """
        df = pd.read_parquet(file_info['file'])

        # Down-sample only when the file holds more rows than allowed
        if len(df) > file_info['max_rows']:
            df = df.sample(n=file_info['max_rows'], random_state=1)

        df['label'] = file_info['label']

        if columns is not None:
            for col in columns:
                if col not in df.columns:
                    df[col] = None
            df = df[columns]
        return df

    # The first file defines the column layout every other file is aligned to
    frames = [_load_labelled_sample(input_files[0])]
    all_columns = frames[0].columns.tolist()
    frames.extend(_load_labelled_sample(file_info, all_columns) for file_info in input_files[1:])

    # Concatenate once instead of growing the frame inside the loop
    combined_df = pd.concat(frames, ignore_index=True)

    # Randomly select (and thereby shuffle) up to n_rows rows. n_rows equals the
    # sum of the per-file caps, so a single source file that is shorter than its
    # cap would make sample(n=n_rows) raise ValueError; clamp to what is available.
    selected_rows = combined_df.sample(n=min(n_rows, len(combined_df)), random_state=1)  # random_state for reproducibility

    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/validation.parquet')


  combined_df = pd.concat([combined_df, df], ignore_index=True)


## Run classification

In [5]:
# Parquet file with the labelled dataset to classify. This is the same path the
# optional "Create testing dataset" cell writes its sample to.
test_dataset = 'testdata/validation.parquet'

# Number of domain names passed to the pipeline per call. Any value <= 0 makes
# the evaluation cell classify the whole dataset in a single call.
CHUNK_SIZE = 30

In [6]:
import pandas as pd
import numpy as np
import shap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from classifiers.pipeline import Pipeline
from classifiers.options import PipelineOptions
#import matplotlib.pyplot as plt

# Build the classification pipeline from a default-constructed PipelineOptions
clf_options = PipelineOptions()
clf = Pipeline(clf_options)

# Load the dataset to classify; test_dataset is set in the configuration cell above
input_df = pd.read_parquet(test_dataset)

# Human-readable label shown in the per-domain results table
def map_label(label):
    """Return 'benign' unchanged; wrap any other label as 'malign (<label>)'."""
    return 'benign' if label == 'benign' else f'malign ({label})'

# Collapse the dataset labels into the two classes used for the metrics
def binary_label(label):
    """Return 'negative' for the 'benign' label and 'positive' for anything else."""
    if label == 'benign':
        return 'negative'
    return 'positive'

# Derive the display label and the binary ground-truth class from the raw label
for derived_column, label_mapper in (('mapped_label', map_label), ('binary_label', binary_label)):
    input_df[derived_column] = input_df['label'].apply(label_mapper)

# SHAP JavaScript initialization (currently disabled)
#shap.initjs()

# Chunk count by ceiling division; a non-positive CHUNK_SIZE means one single pass
if CHUNK_SIZE > 0:
    num_chunks = -(-len(input_df) // CHUNK_SIZE)
else:
    num_chunks = 1

# Labels accumulated over all chunks for the overall statistics
total_true_labels, total_pred_labels = [], []

# Column layouts of the results table: header row and data rows
header_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:<10}"
data_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:.6f}"

def _prediction_metrics(true_labels, pred_labels):
    """Compute the binary evaluation metrics reported by this notebook.

    Parameters:
        true_labels: sequence of 'negative' / 'positive' ground-truth classes.
        pred_labels: sequence of 'negative' / 'positive' predictions, same length.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1', the raw counts
        'false_positives' / 'false_negatives', and 'fp_ratio' / 'fn_ratio'.

    The two ratios are relative to the *predicted* class sizes:
    fp_ratio = FP / (TP + FP) and fn_ratio = FN / (TN + FN); each is 0 when its
    denominator is 0.
    """
    # zero_division=0 reports 0 for an undefined score (e.g. a chunk without any
    # positive) explicitly instead of emitting UndefinedMetricWarning.
    score_kwargs = dict(pos_label='positive', average='binary', zero_division=0)

    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=['negative', 'positive']).ravel()
    predicted_positives = tp + fp
    predicted_negatives = tn + fn

    return {
        'accuracy': accuracy_score(true_labels, pred_labels),
        'precision': precision_score(true_labels, pred_labels, **score_kwargs),
        'recall': recall_score(true_labels, pred_labels, **score_kwargs),
        'f1': f1_score(true_labels, pred_labels, **score_kwargs),
        'false_positives': fp,
        'false_negatives': fn,
        'fp_ratio': (fp / predicted_positives) if predicted_positives > 0 else 0,
        'fn_ratio': (fn / predicted_negatives) if predicted_negatives > 0 else 0,
    }


def _print_metrics(metrics, prefix=""):
    """Print a metrics dict from _prediction_metrics, one line per metric.

    prefix is prepended to every metric name (e.g. "Overall ").
    """
    print(f"{prefix}Accuracy: {metrics['accuracy']}")
    print(f"{prefix}Precision: {metrics['precision']}")
    print(f"{prefix}Recall: {metrics['recall']}")
    print(f"{prefix}F1 Score: {metrics['f1']}")
    print(f"{prefix}False Positives: {metrics['false_positives']} ({metrics['fp_ratio'] * 100:.2f}%)")
    print(f"{prefix}False Negatives: {metrics['false_negatives']} ({metrics['fn_ratio'] * 100:.2f}%)")


# Process the dataframe in chunks
for i in range(num_chunks):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        chunk_df = input_df[start_idx:end_idx]
    else:
        # Non-positive CHUNK_SIZE: the single "chunk" is the whole dataset
        chunk_df = input_df

    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    chunk_without_label = chunk_df.drop(columns=['label', 'mapped_label', 'binary_label']) # Label should not be known to classifiers
    chunk_results = clf.classify_domains(chunk_without_label)

    # Collect predictions and true labels; a probability of 0.5 or more counts as positive
    true_labels = chunk_df['binary_label'].values
    mapped_labels = chunk_df['mapped_label'].values
    pred_labels = [
        'negative' if result['aggregate_probability'] < 0.5 else 'positive'
        for result in chunk_results
    ]

    # Results are matched to the labels purely by position, so a differing
    # count would silently misalign them; fail early with a clear message.
    if len(pred_labels) != len(true_labels):
        raise ValueError(
            f"Pipeline returned {len(pred_labels)} results for a chunk of "
            f"{len(true_labels)} domains; results cannot be matched to labels."
        )

    # Update overall statistics
    total_true_labels.extend(true_labels)
    total_pred_labels.extend(pred_labels)

    # Display header for results
    print(header_format_str.format("Res", "Domain Name", "Actual Label", "Predicted", "Probability"))

    # Display results for each domain
    for idx, result in enumerate(chunk_results):
        actual_label = mapped_labels[idx]
        predicted_label = 'benign' if pred_labels[idx] == 'negative' else 'malign'
        domain_name = result['domain_name']
        aggregate_probability = result['aggregate_probability']
        status = "OK" if pred_labels[idx] == true_labels[idx] else "ER"
        print(data_format_str.format(status, domain_name[:50], actual_label, predicted_label, aggregate_probability))

        # Run debug_domain method for misclassified domains
        #if status == "ER":
            #print(f"Debugging misclassified domain: {domain_name}")
            #ndf_data = clf.pp.df_to_NDF(chunk_df, "phishing")  # Convert to NDF
            #debug_data = clf.clf_phishing_cnn.debug_domain(domain_name, ndf_data, chunk_df, n_top_features=10)

            # Print out the top n feature importances and values for each classifier
            #for classifier, data in debug_data.items():
            #    print(f"\nClassifier: {classifier}")
            #    print(f"Top {len(data['top_features'])} features for domain '{domain_name}':")
            #    for feature_info in data['top_features']:
            #        print(f"Feature: {feature_info['feature']}, Value: {feature_info['value']}, SHAP Value: {feature_info['shap_value']}")

                # Display the force plot for phishing_cnn
                #if classifier == "phishing_cnn":
                #    base_value, shap_values, domain_row = data['force_plot_data']
                #    shap.force_plot(base_value, shap_values, domain_row)
                #    plt.show()

    # Calculate and report metrics for the current chunk
    chunk_metrics = _prediction_metrics(true_labels, pred_labels)
    print(f"Chunk {i+1}/{num_chunks} metrics:")
    _print_metrics(chunk_metrics)
    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

# Calculate overall metrics over the labels accumulated from all chunks
overall_metrics = _prediction_metrics(total_true_labels, total_pred_labels)

# Keep the individual result names available for later inspection
overall_accuracy = overall_metrics['accuracy']
overall_precision = overall_metrics['precision']
overall_recall = overall_metrics['recall']
overall_f1 = overall_metrics['f1']
overall_false_positives = overall_metrics['false_positives']
overall_false_negatives = overall_metrics['false_negatives']
overall_fp_ratio = overall_metrics['fp_ratio']
overall_fn_ratio = overall_metrics['fn_ratio']

print("Overall metrics:")
_print_metrics(overall_metrics, prefix="Overall ")


CNN model created
===== Processing chunk 1/170 =====
Res | Domain Name                                        | Actual Label       | Predicted  | Probability
OK  | yps.link                                           | benign             | benign     | 0.027865
OK  | www.sacramentospice.com                            | benign             | benign     | 0.170981
OK  | kasmat.cz                                          | benign             | benign     | 0.056560
OK  | www.stolniceni.cz                                  | benign             | benign     | 0.000000
ER  | loans837113.life                                   | benign             | malign     | 0.790721
OK  | www.jabkolevne.cz                                  | benign             | benign     | 0.000000
OK  | marketbusinessnews.com                             | benign             | benign     | 0.000000
OK  | vh380.timeweb.ru                                   | benign             | benign     | 0.000000
OK  | ih1.redbubble.net   

KeyboardInterrupt: 

## Optional: Generate preliminary results for training the final aggregation classifier

In [3]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file.
# NOTE: test_dataset is defined in the configuration cell of the
# "Run classification" section; run that cell first on a fresh kernel.
input_df = pd.read_parquet(test_dataset)

# Shuffle the rows. The seed matches the random_state=1 used by the other
# sampling steps in this notebook, so repeated runs give the same order.
input_df = input_df.sample(frac=1, random_state=1).reset_index(drop=True)

#preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=False)

2024-06-23 17:41:02.126608: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 17:41:02.126630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 17:41:02.127496: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-23 17:41:02.132311: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-23 17:41:03.282045: E external/local_xla/xla/

CNN model created


In [4]:
# Display settings for the result tables: start from the default row limit, then
# allow up to 1000 rows and 50 columns and print floats with six decimals.
pd.reset_option('display.max_rows')
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 50
pd.set_option('display.float_format', '{:.6f}'.format)

# Show the preliminary results
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,label,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,malware_cnn_result,malware_lgbm_result,malware_xgboost_result,dga_binary_nn_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,total_sum,total_avg,total_prod
0,ftpcontent.worldnow.com,0.600000,0.225000,1.000000,0.750000,1.000000,0.375000,1.000000,0.833333,1.000000,0.722222,benign,-5.229845,0.001774,0.002645,0.001751,0.125077,0.003481,0.000000,0.000274,0.002408,0.000000,-5.095116,-0.849186,-0.000000,0.002682,0.000894,0.000000,-5.092434,-0.509243,-0.000000
1,kamencovejezero.cz,0.950000,0.600000,1.000000,0.750000,1.000000,0.625000,1.000000,0.625000,1.000000,0.666667,benign,-8.180626,0.047848,0.005811,0.003053,0.024633,0.023075,0.000000,0.021636,0.225499,0.130021,-8.076206,-1.346034,-0.000000,0.247135,0.082378,0.000000,-7.699050,-0.769905,-0.000000
2,veletrh-skol.msk.cz,0.600000,0.250000,1.000000,0.833333,1.000000,0.375000,1.000000,0.708333,1.000000,0.666667,benign,-13.687679,0.001122,0.001200,0.000396,0.011503,0.023628,0.000000,0.000320,0.002408,0.000000,-13.649830,-2.274972,-0.000000,0.002728,0.000909,0.000000,-13.647102,-1.364710,-0.000000
3,sun52.art,0.950000,0.675000,1.000000,0.833333,1.000000,0.750000,1.000000,0.750000,1.000000,1.000000,benign,-1.376371,0.399139,0.909088,0.954825,0.070644,0.318882,0.000000,0.452735,0.929244,0.011548,1.276207,0.212701,-0.010742,1.381979,0.460660,0.000000,2.669734,0.266973,-0.000000
4,youngtrailblazer.click,0.900000,0.550000,1.000000,0.833333,1.000000,0.750000,1.000000,0.750000,1.000000,1.000000,malware,-1.414446,0.348457,0.102005,0.474101,0.252974,0.342174,0.000000,0.903315,0.977357,0.005257,0.105266,0.017544,-0.002063,1.880672,0.626891,0.000000,1.991195,0.199119,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5067,prairieedge.com,1.000000,0.700000,1.000000,0.750000,1.000000,0.625000,1.000000,0.583333,1.000000,0.722222,benign,-6.680964,0.000515,0.000699,0.008716,0.027548,0.004519,0.000000,0.087814,0.075112,0.014828,-6.638967,-1.106495,-0.000000,0.162926,0.054309,0.000000,-6.461213,-0.646121,-0.000000
5068,siwa138.org,1.000000,0.750000,1.000000,0.833333,1.000000,0.875000,1.000000,0.833333,1.000000,0.722222,benign,-0.128335,0.877862,0.990018,0.864697,0.060197,0.095688,0.000000,0.941217,0.953016,0.118873,2.760128,0.460021,-0.000556,1.894234,0.631411,0.000000,4.773235,0.477323,-0.000000
5069,meethettich.webex.com,0.900000,0.500000,1.000000,0.833333,1.000000,0.375000,1.000000,0.708333,1.000000,0.666667,benign,-6.388254,0.000666,0.000711,0.010212,0.034424,0.026110,0.000000,0.000302,0.002408,0.000008,-6.316130,-1.052688,-0.000000,0.002710,0.000903,0.000000,-6.313411,-0.631341,-0.000000
5070,readme.so,0.900000,0.525000,1.000000,0.750000,1.000000,0.625000,0.833333,0.291667,1.000000,0.722222,benign,-4.217328,0.155504,0.025938,0.077755,0.179741,0.001354,0.000000,0.049779,0.232481,0.003665,-3.777035,-0.629506,-0.000000,0.282260,0.094087,0.000000,-3.491110,-0.349111,-0.000000


In [5]:
# Remove the row limit so the table below is rendered in full
pd.options.display.max_rows = None
#preliminary_results_df[["domain_name", "label", "badness_probability", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "malware_cnn_result", "malware_xgboost_result", "dga_binary_nn_result"]]

# Domain, label and the individual classifier outputs
classifier_columns = [
    "domain_name",
    "label",
    "phishing_cnn_result",
    "phishing_deepnn_result",
    "phishing_lgbm_result",
    "phishing_xgboost_result",
    "malware_cnn_result",
    "malware_lgbm_result",
    "malware_xgboost_result",
    "dga_binary_nn_result",
]
preliminary_results_df[classifier_columns]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_deepnn_result,phishing_lgbm_result,phishing_xgboost_result,malware_cnn_result,malware_lgbm_result,malware_xgboost_result,dga_binary_nn_result
0,ftpcontent.worldnow.com,benign,-5.229845,0.001751,0.001774,0.002645,0.0,0.000274,0.002408,0.0
1,kamencovejezero.cz,benign,-8.180626,0.003053,0.047848,0.005811,0.0,0.021636,0.225499,0.130021
2,veletrh-skol.msk.cz,benign,-13.687679,0.000396,0.001122,0.0012,0.0,0.00032,0.002408,0.0
3,sun52.art,benign,-1.376371,0.954825,0.399139,0.909088,0.0,0.452735,0.929244,0.011548
4,youngtrailblazer.click,malware,-1.414446,0.474101,0.348457,0.102005,0.0,0.903315,0.977357,0.005257
5,smart-hpl-322.pages.dev,phishing,4.995804,0.999999,0.885206,0.894322,0.0,7.6e-05,0.002233,0.0
6,usahello.org,benign,-10.161005,0.000913,0.029288,0.092739,0.0,0.009758,0.016586,0.070465
7,nppbase2.cup.com.cn,benign,-13.193365,0.000531,0.000456,0.000485,0.0,0.000207,0.002408,0.0
8,abookapart.com,benign,-8.276295,0.289671,0.003417,0.005394,0.0,0.073479,0.114012,0.003019
9,www.chillimanufaktura.cz,benign,-4.136286,0.641983,0.099968,0.140739,0.0,0.001764,0.005576,0.0


In [None]:
# Outputs of the phishing classifiers only
phishing_columns = [
    "domain_name",
    "label",
    "phishing_cnn_result",
    "phishing_deepnn_result",
    "phishing_lgbm_result",
    "phishing_xgboost_result",
    "phishing_dns_nn_result",
    "phishing_rdap_nn_result",
]
preliminary_results_df[phishing_columns]

In [None]:
# Outputs of the malware classifiers only
malware_columns = [
    "domain_name",
    "label",
    "malware_cnn_result",
    "malware_lgbm_result",
    "malware_xgboost_result",
]
preliminary_results_df[malware_columns]

In [None]:
# Final aggregated probability next to the label.
# "badness_probability" is not among the columns of the results displayed
# earlier in this notebook (generated with add_final=False), and selecting a
# missing column raises KeyError. Report what is missing and show the rest.
# NOTE(review): presumably add_final=True adds this column - confirm.
final_columns = ["domain_name", "label", "badness_probability"]
missing_columns = [col for col in final_columns if col not in preliminary_results_df.columns]
if missing_columns:
    print(f"Columns not present in preliminary_results_df: {missing_columns}")
preliminary_results_df[[col for col in final_columns if col not in missing_columns]]