# Test of the classification pipeline

In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    """Move the process one directory level up, at most once per process.

    The starting point is the directory containing this file when ``__file__``
    is defined; otherwise (``__file__`` raises ``NameError``) the current
    working directory is used. After the switch the environment variable
    ``DIR_CHANGED`` is set to ``'1'`` so that subsequent calls only print a
    notice and leave the working directory alone.
    """
    # Guard clause: a previous call already moved us.
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        this_file = os.path.abspath(__file__)
    except NameError:
        # No __file__ available: start from wherever we currently are.
        base_dir = os.getcwd()
    else:
        base_dir = os.path.dirname(this_file)

    os.chdir(os.path.abspath(os.path.join(base_dir, os.pardir)))
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Switch the working directory to the parent folder before any other cell runs.
# Later cells use relative paths such as 'testdata/sample.parquet'
# (presumably relative to the repository root - confirm).
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf


## Optional: Create testing dataset
Note: To do this, set `create_test_parquet` to **True**.

In [2]:
create_test_parquet = False


def build_labelled_sample(frames, n_rows, random_state=1):
    """Merge per-class dataframes into one labelled, randomly ordered sample.

    Args:
        frames: Iterable of ``(dataframe, max_rows, label)`` tuples. Each
            dataframe is randomly down-sampled to ``max_rows`` rows when it is
            longer, and its ``label`` column is overwritten (or created) with
            ``label``. May be a generator, so inputs can be loaded lazily.
        n_rows: Number of rows to draw from the combined data. If fewer rows
            are available, all of them are returned instead of raising.
        random_state: Seed used for every sampling step (reproducibility).

    Returns:
        A DataFrame with exactly the columns of the first frame (after
        labelling). Columns missing from later frames are filled with missing
        values; columns the first frame does not have are dropped.

    Raises:
        ValueError: If ``frames`` yields no dataframe at all.
    """
    import pandas as pd  # local import: nothing is loaded while the flag is off

    reference_columns = None
    parts = []
    for frame, max_rows, label in frames:
        # Limit the number of rows if necessary
        if len(frame) > max_rows:
            frame = frame.sample(n=max_rows, random_state=random_state)

        # assign() returns a new frame, so the caller's dataframe is never mutated
        frame = frame.assign(label=label)

        if reference_columns is None:
            # The first frame defines the column layout of the result
            reference_columns = frame.columns.tolist()
        else:
            # Align to the first frame: add missing columns, drop extra ones
            frame = frame.reindex(columns=reference_columns)

        parts.append(frame)

    if not parts:
        raise ValueError("At least one input dataframe is required.")

    # Concatenate once instead of growing the frame inside the loop
    combined = pd.concat(parts, ignore_index=True)

    # Never ask for more rows than exist (sample() would raise ValueError)
    return combined.sample(n=min(n_rows, len(combined)), random_state=random_state)


if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    input_files = [
        {'file': 'testdata/misp_2402.parquet', 'max_rows': 500, 'label': 'phishing'},
        {'file': 'testdata/benign_2312.parquet', 'max_rows': 2000, 'label': 'benign'},
        {'file': 'testdata/malware_bp.parquet', 'max_rows': 500, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 500, 'label': 'dga'},
    ]

    # Number of rows to select in total
    n_rows = 1000

    # Files are read lazily, one at a time, as the helper consumes the generator
    selected_rows = build_labelled_sample(
        (
            (pd.read_parquet(info['file']), info['max_rows'], info['label'])
            for info in input_files
        ),
        n_rows,
        random_state=1,  # random_state for reproducibility
    )

    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/sample.parquet')

## Run classification

In [3]:
# Specify the parquet file with the dataset for classification.
# The path is relative, so it is resolved against the current working directory.
test_dataset = 'testdata/sample.parquet'

# Number of domain names to classify with each run of the pipeline.
# 0 (or any other non-positive value) = classify the whole dataset in a single run.
CHUNK_SIZE = 30

In [4]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Rows handed to the pipeline per run; a non-positive CHUNK_SIZE means
# "classify everything in a single run"
total_rows = len(input_df)
rows_per_chunk = CHUNK_SIZE if CHUNK_SIZE > 0 else max(total_rows, 1)

# Determine the number of chunks (ceiling division; zero for an empty dataset)
num_chunks = (total_rows + rows_per_chunk - 1) // rows_per_chunk

# Process the dataframe in chunks
for i in range(num_chunks):
    start_idx = i * rows_per_chunk
    # .iloc makes the positional slicing explicit, whatever index the frame carries
    chunk_df = input_df.iloc[start_idx:start_idx + rows_per_chunk]

    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    # Label should not be known to classifiers; errors='ignore' lets unlabelled
    # datasets (no 'label' column) be classified as well
    chunk_without_label = chunk_df.drop(columns=['label'], errors='ignore')
    chunk_results = clf.classify_domains(chunk_without_label)

    for result in chunk_results:
        print(result)

    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")


  from .autonotebook import tqdm as notebook_tqdm
2024-06-12 20:04:04.364946: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 20:04:04.364969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 20:04:04.365794: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-12 20:04:04.370459: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
202

===== Processing chunk 1/34 =====
{'domain_name': 'cocinortemtb.com.do', 'aggregate_probability': 0.007019625511020422, 'aggregate_description': '...', 'classification_results': [{'classification_date': 1718215449174, 'classifier': 'Phishing', 'probability': 0.00015122901994765785, 'description': 'No phishing detected.', 'details': {'CNN phishing classifier': '0.0%', 'LightGBM phishing classifier': '0.03%'}}, {'classification_date': 1718215449174, 'classifier': 'Malware', 'probability': 1.1824990906461608e-06, 'description': 'No malware detected.', 'details': {'CNN malware classifier': '0.0%', 'XGBoost malware classifier': '0.0%'}}, {'classification_date': 1718215449174, 'classifier': 'DGA', 'probability': 2.1868197563890135e-06, 'description': 'No DGA detected.', 'details': {}}]}
{'domain_name': 'beautysalon.melbourne', 'aggregate_probability': 0.07370343059301376, 'aggregate_description': '...', 'classification_results': [{'classification_date': 1718215449174, 'classifier': 'Phishing

## Optional: Generate preliminary results for training the final aggregation classifier

In [5]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file and shuffle its rows.
# A fixed random_state keeps the row order (and therefore the displayed results)
# reproducible across kernel restarts, matching the seed used when the test
# dataset is created.
input_df = (
    pd.read_parquet(test_dataset)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Alternative call (presumably also stores the results in a parquet file - confirm in Pipeline):
#preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=True)

In [6]:
# Widen the pandas display limits and print floats with six decimals
# so the preliminary results can be inspected in full.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 50),
    ('display.float_format', '{:.6f}'.format),
):
    pd.set_option(option_name, option_value)

preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,label,phishing_cnn_result,phishing_lgbm_result,malware_cnn_result,malware_xgboost_result,dga_binary_nn_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,total_sum,total_avg,total_prod,badness_probability,phishing_xgboost_result,phishing_deepnn_result
0,healthsupplements.sg,1.0,0.7,1.0,0.458333,1.0,1.0,0.833333,0.291667,1.0,1.0,benign,1.0,0.000484,1.0,0.0,0.050805,1.000484,0.500242,0.000484,1.0,0.5,0.0,2.051289,0.410258,0.0,0.015227,2.4e-05,0.005414
1,www2.rakuten.co.jplogin.xgwczhh.cn,0.6,0.125,0.041667,0.0,0.625,0.125,0.833333,0.0,1.0,0.0,phishing,1.0,0.01569,1.0,0.910851,0.0,1.01569,0.507845,0.01569,1.910851,0.955425,0.910851,2.92654,0.585308,0.0,0.883275,0.997395,0.998264
2,tacachico.gob.sv,1.0,0.75,1.0,0.458333,1.0,0.625,0.833333,0.291667,1.0,0.722222,benign,0.0,0.000304,0.0,0.005434,0.0,0.000304,0.000152,0.0,0.005434,0.002717,0.0,0.005738,0.001148,0.0,0.009505,0.001192,0.023172
3,saresep.kg,1.0,0.725,1.0,0.458333,1.0,1.0,1.0,0.583333,1.0,0.722222,benign,0.0,0.001073,0.0,2e-06,0.013956,0.001073,0.000536,0.0,2e-06,1e-06,0.0,0.015031,0.003006,0.0,0.007717,0.076984,0.065556
4,ppmn.or.id,1.0,0.725,1.0,0.458333,1.0,1.0,0.958333,0.666667,1.0,1.0,benign,0.0,0.000311,0.0,0.0,0.191635,0.000311,0.000155,0.0,0.0,0.0,0.0,0.191946,0.038389,0.0,0.006808,0.000309,0.003488
5,zosh6.sumy.ua,1.0,0.7,1.0,0.416667,1.0,1.0,1.0,0.708333,1.0,1.0,benign,0.0,0.000293,0.0,0.0,0.113514,0.000293,0.000146,0.0,0.0,0.0,0.0,0.113807,0.022761,0.0,0.005111,8e-06,0.000874
6,wimwkqdpnftgcpjm.eu,0.475,0.0,0.041667,0.0,0.625,0.0,0.833333,0.0,1.0,0.0,dga,1.0,0.000975,1.0,0.99411,0.999998,1.000975,0.500487,0.000975,1.99411,0.997055,0.99411,3.995082,0.799016,0.000969,0.893751,0.02805,0.091273
7,scamnet.wa.gov.au,0.9,0.525,1.0,0.833333,1.0,0.875,0.833333,0.416667,1.0,1.0,benign,0.0,0.000317,0.0,0.0,0.0,0.000317,0.000159,0.0,0.0,0.0,0.0,0.000317,6.3e-05,0.0,0.008227,2.2e-05,0.000152
8,www.dehebin.ru,0.6,0.2,1.0,0.458333,1.0,0.625,0.958333,0.541667,1.0,0.666667,malware,0.0,0.00253,0.0,0.998267,0.0,0.00253,0.001265,0.0,0.998267,0.499134,0.0,1.000797,0.200159,0.0,0.842463,0.99934,0.883488
9,eros0hostz.xyz,1.0,0.725,1.0,0.458333,1.0,0.625,1.0,0.708333,1.0,0.666667,malware,0.0,0.004439,1.0,0.999878,6e-05,0.004439,0.002219,0.0,1.999878,0.999939,0.999878,2.004377,0.400875,0.0,0.305204,0.031164,0.021


In [7]:
# Show the label and badness probability next to the individual classifier results
preliminary_results_df.loc[:, [
    "domain_name",
    "label",
    "badness_probability",
    "phishing_cnn_result",
    "phishing_lgbm_result",
    "malware_cnn_result",
    "malware_xgboost_result",
    "dga_binary_nn_result",
]]

Unnamed: 0,domain_name,label,badness_probability,phishing_cnn_result,phishing_lgbm_result,malware_cnn_result,malware_xgboost_result,dga_binary_nn_result
0,healthsupplements.sg,benign,0.015227,1.0,0.000484,1.0,0.0,0.050805
1,www2.rakuten.co.jplogin.xgwczhh.cn,phishing,0.883275,1.0,0.01569,1.0,0.910851,0.0
2,tacachico.gob.sv,benign,0.009505,0.0,0.000304,0.0,0.005434,0.0
3,saresep.kg,benign,0.007717,0.0,0.001073,0.0,2e-06,0.013956
4,ppmn.or.id,benign,0.006808,0.0,0.000311,0.0,0.0,0.191635
5,zosh6.sumy.ua,benign,0.005111,0.0,0.000293,0.0,0.0,0.113514
6,wimwkqdpnftgcpjm.eu,dga,0.893751,1.0,0.000975,1.0,0.99411,0.999998
7,scamnet.wa.gov.au,benign,0.008227,0.0,0.000317,0.0,0.0,0.0
8,www.dehebin.ru,malware,0.842463,0.0,0.00253,0.0,0.998267,0.0
9,eros0hostz.xyz,malware,0.305204,0.0,0.004439,1.0,0.999878,6e-05


In [8]:
# Show only the domain name, its label and the badness probability
preliminary_results_df.loc[:, [
    "domain_name",
    "label",
    "badness_probability",
]]

Unnamed: 0,domain_name,label,badness_probability
0,healthsupplements.sg,benign,0.015227
1,www2.rakuten.co.jplogin.xgwczhh.cn,phishing,0.883275
2,tacachico.gob.sv,benign,0.009505
3,saresep.kg,benign,0.007717
4,ppmn.or.id,benign,0.006808
5,zosh6.sumy.ua,benign,0.005111
6,wimwkqdpnftgcpjm.eu,dga,0.893751
7,scamnet.wa.gov.au,benign,0.008227
8,www.dehebin.ru,malware,0.842463
9,eros0hostz.xyz,malware,0.305204
