# Generating preliminary results for the final aggregation classifier

In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    # Check if the directory has already been changed
    if not os.environ.get('DIR_CHANGED'):
        try:
            current_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            current_dir = os.getcwd()
        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
        os.chdir(parent_dir)
        os.environ['DIR_CHANGED'] = '1'
        print(f"Current working directory changed to: {os.getcwd()}")
    else:
        print("Directory has already been changed.")

# Call the function to change the working directory
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf


## Optional: Create testing dataset
Note: If you want do to this, set create_test_parquet to **True**

In [2]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    
    input_files = [
        {'file': 'testdata/benign_2312.parquet', 'label': 'benign', 'max_rows': 150000},
        {'file': 'testdata/umbrella_benign_FINISHED.parquet', 'label': 'benign', 'max_rows': 150000},
        {'file': 'testdata/phishing_2406_strict.parquet', 'label': 'phishing', 'max_rows': 100000},
        {'file': 'testdata/malware_2406_strict.parquet', 'label': 'malware', 'max_rows': 100000},
        {'file': 'testdata/lex-dga-830k-pick.parquet', 'label': 'dga', 'max_rows': 100000},
    ]

    # Read the first file to get the initial columns and create the first dataframe
    first_file_info = input_files[0]
    combined_df = pd.read_parquet(first_file_info['file']).head(first_file_info['max_rows'])
    
    # Overwrite the "label" column with the specified label for the first file
    combined_df['label'] = first_file_info['label']
    
    # Get the columns from the first dataframe
    all_columns = combined_df.columns.tolist()

    # Process the remaining files
    for file_info in input_files[1:]:
        df = pd.read_parquet(file_info['file']).head(file_info['max_rows'])
        
        # Overwrite the "label" column with the specified label
        df['label'] = file_info['label']
        
        # Ensure all columns from the first dataframe are present
        for col in all_columns:
            if col not in df.columns:
                df[col] = -1
        
        # Align the dataframe to the columns of the first dataframe
        df = df[all_columns]
        
        # Append the dataframe to the combined dataframe
        combined_df = pd.concat([combined_df, df], ignore_index=True)


    ROW_LIMIT = 100000 # 0 - NO LIMIT

    # Subsample the combined dataframe to take N domains
    if ROW_LIMIT > 0:
        combined_df = combined_df.sample(n=ROW_LIMIT, random_state=42)
    
    # Shuffling the DataFrame
    shuffled_df = combined_df.sample(frac=1)

    # Save the selected rows to a new Parquet file
    shuffled_df.to_parquet('testdata/decision_training_2406.parquet')


  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1
  df[col] = -1


In [3]:
shuffled_df

Unnamed: 0,domain_name,label,dns_has_dnskey,dns_A_count,dns_AAAA_count,dns_MX_count,dns_NS_count,dns_TXT_count,dns_SOA_count,dns_CNAME_count,...,rdap_ip_v4_count,rdap_ip_v6_count,rdap_ip_shortest_v4_prefix_len,rdap_ip_longest_v4_prefix_len,rdap_ip_shortest_v6_prefix_len,rdap_ip_longest_v6_prefix_len,rdap_ip_avg_admin_name_len,rdap_ip_avg_admin_name_entropy,rdap_ip_avg_admin_email_len,rdap_ip_avg_admin_email_entropy
341066,decromph.pubgd.com,phishing,0.0,2,2,0,0,0,0,0,...,2,2,22,22,32,32,24.0,0.163262,0.0,0.000000
577522,sraiyr.com,dga,-1.0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,-1.000000,-1.0,-1.000000
306108,ollx-dispatch.9090897.xyz,phishing,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.000000,0.0,0.000000
212546,apm.tools.planetradio.co.uk,benign,0.0,2,0,0,0,0,0,0,...,2,0,11,14,0,0,13.0,0.249146,23.0,0.149881
494011,xinchaocicdda.org,malware,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515404,mgnedelcdqgeyvb.sh,dga,-1.0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,-1.000000,-1.0,-1.000000
159029,woteu34.xmpp.wargaming.net,benign,0.0,2,0,0,0,0,0,0,...,2,0,24,24,0,0,0.0,0.000000,0.0,0.000000
525001,bbmkrsensinaix.com,dga,-1.0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,-1.000000,-1.0,-1.000000
364791,noreplysky49846.weebly.com,phishing,0.0,0,0,0,0,0,0,1,...,2,0,22,22,0,0,13.0,0.256514,21.0,0.186482


In [4]:
df.dtypes

domain_name                        object
label                              object
dns_has_dnskey                      int64
dns_A_count                         int64
dns_AAAA_count                      int64
                                    ...  
rdap_ip_longest_v6_prefix_len       int64
rdap_ip_avg_admin_name_len          int64
rdap_ip_avg_admin_name_entropy      int64
rdap_ip_avg_admin_email_len         int64
rdap_ip_avg_admin_email_entropy     int64
Length: 178, dtype: object

## Optional: Generate preliminary results for training the final aggregation classifier

In [5]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet('testdata/decision_training_2406.parquet')


# Time measurement start
import time
start_time = time.time()

# DO
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=False, output_file="preliminary_results_2406_c.parquet")

# Time measurement stop
end_time = time.time()
elapsed_time_seconds = end_time - start_time
hours = int(elapsed_time_seconds // 3600)
minutes = int((elapsed_time_seconds % 3600) // 60)
seconds = elapsed_time_seconds % 60

print(f"Elapsed time: {hours} hours, {minutes} minutes, {seconds:.2f} seconds")

2024-07-03 18:30:29.744542: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 18:30:29.744567: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 18:30:29.745327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 18:30:29.750172: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-03 18:30:30.884852: E external/local_xla/xla/

CNN model created
Elapsed time: 0 hours, 0 minutes, 49.17 seconds


In [6]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.options.display.float_format = '{:.6f}'.format
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result,malware_lgbm_result,malware_xgboost_result,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result,dga_binary_deepnn_result,dga_binary_lgbm_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,dga_binary_sum,dga_binary_avg,dga_binary_prod,total_sum,total_avg,total_prod,label
341066,decromph.pubgd.com,0.600000,0.250000,1.000000,0.791667,1.000000,0.750000,1.000000,0.625000,1.000000,1.000000,0.969977,0.401387,0.560131,0.999981,0.186059,0.491978,0.397401,0.534317,0.057370,0.241182,0.756410,0.205106,0.224388,0.688216,0.552658,0.000001,0.000009,4.541230,0.567654,0.004239,2.725329,0.389333,0.000183,0.000010,0.000005,0.000000,7.266569,0.427445,0.000000,phishing
577522,sraiyr.com,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041667,0.041667,0.000000,0.000000,1.000000,0.000649,0.000632,0.000000,0.065006,0.000001,0.169900,0.145029,0.016357,0.079736,0.000000,0.573180,0.000000,0.383097,0.000000,0.971222,0.665956,1.381216,0.172652,0.000000,1.052370,0.150339,0.000000,1.637178,0.818589,0.646791,4.070764,0.239457,0.000000,dga
306108,ollx-dispatch.9090897.xyz,0.600000,0.125000,0.041667,0.000000,0.500000,0.000000,1.000000,0.416667,1.000000,0.000000,0.993279,0.268530,0.226760,0.872379,0.650564,0.059576,0.387495,0.414531,0.089238,0.068186,0.001444,0.892006,0.062825,0.604780,0.444184,0.000000,0.000001,3.873116,0.484139,0.000328,2.162662,0.308952,0.000000,0.000001,0.000001,0.000000,6.035779,0.355046,0.000000,phishing
212546,apm.tools.planetradio.co.uk,0.600000,0.200000,0.833333,0.250000,1.000000,0.750000,1.000000,0.583333,1.000000,0.666667,0.000166,0.000364,0.000435,0.000227,0.069534,0.008931,0.096990,0.817491,0.000431,0.002626,0.000063,0.016815,0.006094,0.185016,0.212463,0.000000,0.000002,0.994137,0.124267,0.000000,0.423507,0.060501,0.000000,0.000002,0.000001,0.000000,1.417647,0.083391,0.000000,benign
494011,xinchaocicdda.org,0.600000,0.125000,0.041667,0.000000,0.500000,0.000000,0.833333,0.000000,1.000000,0.000000,0.108271,0.018477,0.023709,0.002774,0.650564,0.020158,0.387495,0.414531,0.694809,0.908014,0.296402,0.892006,0.024412,0.604780,0.444184,0.036872,0.000506,1.625979,0.203247,0.000000,3.864608,0.552087,0.001094,0.037378,0.018689,0.000019,5.527965,0.325174,0.000000,malware
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515404,mgnedelcdqgeyvb.sh,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041667,0.041667,0.000000,0.000000,1.000000,0.001184,0.001425,0.000000,0.065006,0.000001,0.169900,0.145029,0.006561,0.010841,0.000000,0.573180,0.000000,0.383097,0.000000,0.998037,0.999581,1.382545,0.172818,0.000000,0.973679,0.139097,0.000000,1.997618,0.998809,0.997619,4.353842,0.256108,0.000000,dga
159029,woteu34.xmpp.wargaming.net,0.900000,0.500000,0.041667,0.000000,1.000000,0.625000,1.000000,0.541667,1.000000,0.666667,0.836232,0.000513,0.000654,0.000624,0.007691,0.001752,0.171863,0.203773,0.001645,0.003415,0.000052,0.009876,0.002099,0.238913,0.328852,0.000000,0.000002,1.223102,0.152888,0.000000,0.584853,0.083550,0.000000,0.000002,0.000001,0.000000,1.807957,0.106350,0.000000,benign
525001,bbmkrsensinaix.com,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041667,0.041667,0.000000,0.000000,1.000000,0.001024,0.000680,0.000000,0.065006,0.000001,0.169900,0.145029,0.024336,0.184622,0.000000,0.573180,0.000000,0.383097,0.000000,0.985654,0.692502,1.381640,0.172705,0.000000,1.165235,0.166462,0.000000,1.678157,0.839078,0.682568,4.225032,0.248531,0.000000,dga
364791,noreplysky49846.weebly.com,0.600000,0.200000,1.000000,0.750000,1.000000,0.375000,1.000000,0.708333,1.000000,0.722222,0.993279,0.971019,0.988039,0.999999,0.383450,0.994579,0.390085,0.716209,0.000739,0.002408,0.000040,0.079411,0.005142,0.307506,0.004565,0.000019,0.000005,6.436660,0.804582,0.101536,0.399810,0.057116,0.000000,0.000024,0.000012,0.000000,6.836494,0.402147,0.000000,phishing


In [7]:
preliminary_results_df[["domain_name", "label", "phishing_cnn_result", "phishing_lgbm_result", "malware_cnn_result", "malware_xgboost_result", "dga_binary_nn_result"]]

KeyError: "['malware_cnn_result', 'dga_binary_nn_result'] not in index"