# Test of the classification pipeline

In [1]:
import os

# Helper that moves the working directory one level up, at most once per process
def change_to_parent_directory():
    """Switch the working directory to the parent of the current location.

    The starting location is the directory containing ``__file__`` when that
    name is defined; otherwise (``NameError``) the current working directory
    is used. The environment variable ``DIR_CHANGED`` serves as a marker so
    that calling the function again does not climb another level.
    """
    # Guard: a previous call already moved the working directory
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # `__file__` is not defined in this execution context
        base_dir = os.getcwd()

    os.chdir(os.path.abspath(os.path.join(base_dir, os.pardir)))
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Move the working directory one level up before anything else runs, so that
# relative paths used by later cells (such as 'testdata/sample.parquet')
# resolve against the parent directory. Safe to re-run: the function skips
# the change when DIR_CHANGED is already set.
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf/classifiers


## Optional: Create testing dataset
Note: If you want to do this, set `create_test_parquet` to **True**.

In [2]:
# Set to True to (re)build 'testdata/sample.parquet' from the source datasets below
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    input_files = [
        {'file': 'testdata/misp_2402.parquet', 'max_rows': 500, 'label': 'phishing'},
        {'file': 'testdata/benign_2312.parquet', 'max_rows': 2000, 'label': 'benign'},
        {'file': 'testdata/malware_bp.parquet', 'max_rows': 500, 'label': 'malware'},
        {'file': 'testdata/dga_2310.parquet', 'max_rows': 500, 'label': 'dga'},
    ]

    # Number of rows to select in total
    n_rows = 1000

    # Column layout of the first file; every later file is aligned to it
    all_columns = None

    # Per-file frames, concatenated once after the loop instead of growing
    # the combined frame with a concat on every iteration
    frames = []

    for file_info in input_files:
        df = pd.read_parquet(file_info['file'])

        # Limit the number of rows if necessary
        if len(df) > file_info['max_rows']:
            df = df.sample(n=file_info['max_rows'], random_state=1)

        # Overwrite (or create) the "label" column with the specified label
        df = df.assign(label=file_info['label'])

        if all_columns is None:
            # The first file defines the set and order of columns
            all_columns = df.columns.tolist()
        else:
            # Align to the first file's columns: columns missing here are
            # added as all-missing, extra columns are dropped
            df = df.reindex(columns=all_columns)

        frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)

    # Randomly select n_rows rows from the combined DataFrame
    # (random_state for reproducibility). The sample size is clamped so the
    # cell still works when fewer than n_rows rows were collected.
    selected_rows = combined_df.sample(n=min(n_rows, len(combined_df)), random_state=1)

    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/sample.parquet')

  combined_df = pd.concat([combined_df, df], ignore_index=True)


## Run classification

In [3]:
# Parquet file with the dataset for classification (relative path, resolved
# against the current working directory). This is the same path the optional
# "Create testing dataset" cell writes to.
test_dataset = 'testdata/sample.parquet'

# Number of domain names to classify with each run of the pipeline
# (0 = classify all in a single run; the classification cell treats any
# value that is not greater than 0 the same way)
CHUNK_SIZE = 30

In [4]:
import pandas as pd
import numpy as np
from pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Upper bound on the number of chunks this cell classifies. The default of 1
# keeps the run short (only the first chunk is processed); set to None to
# classify every chunk of the dataset.
MAX_CHUNKS = 1

# Total number of chunks in the dataset (ceiling division); a CHUNK_SIZE that
# is not greater than 0 means the whole dataset is treated as a single chunk
num_chunks = (len(input_df) + CHUNK_SIZE - 1) // CHUNK_SIZE if CHUNK_SIZE > 0 else 1

# Number of chunks actually processed in this run
chunks_to_run = num_chunks if MAX_CHUNKS is None else min(num_chunks, MAX_CHUNKS)

# Process the dataframe in chunks
for i in range(chunks_to_run):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        # Explicit positional slicing; the last chunk may be shorter
        chunk_df = input_df.iloc[start_idx:end_idx]
    else:
        chunk_df = input_df

    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    # Label should not be known to classifiers
    chunk_without_label = chunk_df.drop(columns=['label'])
    chunk_results = clf.classify_domains(chunk_without_label)

    for result in chunk_results:
        print(result)

    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

2024-05-27 13:01:06.357042: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 13:01:06.357065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 13:01:06.357875: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-27 13:01:06.362070: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-27 13:01:07.656181: E external/local_xla/xla/

CNN model created
CNN model created
===== Processing chunk 1/34 =====


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data.fillna(-1, inplace=True)


                             domain_name  dns_available  dns_nonzero  \
1322                 cocinortemtb.com.do          1.000        0.725   
975                beautysalon.melbourne          0.600        0.125   
3085                          irivkte.me          0.475        0.000   
1263                          dtx.gov.az          1.000        0.700   
1533                      emmavoyages.fr          0.600        0.125   
2508        www.mail.additionaltones.org          0.600        0.200   
3228                 xyhtfmk2i46hibpv.ru          0.475        0.000   
3308      f90j6uci30xv1t10lsz1p2vfob.net          0.475        0.000   
3123                ysjsjaiqqkjdgasy.net          0.475        0.000   
330                         44445344.xyz          0.600        0.125   
2359                       mail.inbox.lv          0.600        0.200   
297                  conecct-ter.web.app          0.600        0.225   
2026                     cdn7.playep.pro          0.600        0

## Optional: Generate preliminary results for training the final aggregation classifier

In [8]:
import pandas as pd
import numpy as np
from pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file and shuffle its rows. The shuffle uses a fixed
# seed (the same random_state=1 used when sampling the test dataset) so that
# re-running the cell yields the same row order and the same results table.
input_df = (
    pd.read_parquet(test_dataset)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Run the pipeline's preliminary-results generation on the shuffled dataset.
# NOTE(review): presumably the results are also written to `output_file` —
# confirm against Pipeline.generate_preliminary_results.
preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")

CNN model created
CNN model created


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data.fillna(-1, inplace=True)


In [9]:
# Display settings for the results tables: up to 1000 rows, up to 50 columns,
# floats rendered with six decimal places
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 50
pd.set_option('display.float_format', '{:.6f}'.format)

# Show the preliminary results table
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,label,phishing_cnn_result,phishing_lgbm_result,malware_cnn_result,malware_xgboost_result,dga_binary_nn_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,total_sum,total_avg,total_prod
0,newtontool.ca,1.0,0.75,1.0,0.833333,1.0,0.625,1.0,0.833333,1.0,0.722222,malware,0.0,0.711339,0.0,0.999432,0.064606,0.711339,0.35567,0.0,0.999432,0.499716,0.0,1.775378,0.355076,0.0
1,mail.cr4tu2.gq,0.6,0.25,1.0,0.458333,1.0,0.625,0.833333,0.208333,1.0,0.666667,benign,0.0,0.026572,0.0,0.000295,3e-06,0.026572,0.013286,0.0,0.000295,0.000148,0.0,0.026869,0.005374,0.0
2,smstz.edu.ba,1.0,0.7,1.0,0.458333,1.0,1.0,0.833333,0.291667,1.0,0.944444,benign,0.0,0.001071,0.0,0.0,0.007067,0.001071,0.000535,0.0,0.0,0.0,0.0,0.008138,0.001628,0.0
3,superiorcareservices.com,1.0,0.725,1.0,0.458333,1.0,0.625,1.0,0.541667,1.0,0.666667,malware,1.0,0.78972,1.0,0.999965,0.048425,1.78972,0.89486,0.78972,1.999965,0.999983,0.999965,3.83811,0.767622,0.038241
4,extraordinary.diamonds,0.9,0.55,0.041667,0.0,1.0,1.0,1.0,0.833333,1.0,1.0,benign,1.0,0.000116,1.0,0.0,0.0,1.000116,0.500058,0.000116,1.0,0.5,0.0,2.000116,0.400023,0.0
5,wimwkqdpnftgcpjm.eu,0.475,0.0,0.041667,0.0,0.625,0.0,0.833333,0.0,1.0,0.0,dga,1.0,0.060304,1.0,0.99411,0.999998,1.060304,0.530152,0.060304,1.99411,0.997055,0.99411,4.054412,0.810882,0.059949
6,hpw.qld.gov.au,1.0,0.7,1.0,0.708333,1.0,1.0,0.875,0.583333,1.0,1.0,benign,0.0,5e-06,0.0,0.0,0.0,5e-06,3e-06,0.0,0.0,0.0,0.0,5e-06,1e-06,0.0
7,sseeez.com,0.475,0.0,0.041667,0.0,0.625,0.0,0.833333,0.0,1.0,0.0,dga,1.0,0.00292,0.0,0.987593,0.896603,1.00292,0.50146,0.00292,0.987593,0.493796,0.0,2.887115,0.577423,0.0
8,kkei-huhssgkj.icu,0.6,0.125,0.041667,0.0,0.625,0.125,1.0,0.291667,1.0,0.0,phishing,1.0,0.989056,1.0,0.305736,0.042583,1.989056,0.994528,0.989056,1.305736,0.652868,0.305736,3.337374,0.667475,0.012877
9,hckslglnppyurop.biz,0.475,0.0,0.041667,0.0,0.625,0.0,0.833333,0.0,1.0,0.0,dga,1.0,0.143953,1.0,0.972229,0.999972,1.143953,0.571976,0.143953,1.972229,0.986114,0.972229,4.116153,0.823231,0.139951


In [10]:
# Domain, ground-truth label and the raw output of each individual classifier
classifier_result_columns = [
    "domain_name",
    "label",
    "phishing_cnn_result",
    "phishing_lgbm_result",
    "malware_cnn_result",
    "malware_xgboost_result",
    "dga_binary_nn_result",
]
preliminary_results_df[classifier_result_columns]

Unnamed: 0,domain_name,label,phishing_cnn_result,phishing_lgbm_result,malware_cnn_result,malware_xgboost_result,dga_binary_nn_result
0,newtontool.ca,malware,0.0,0.711339,0.0,0.999432,0.064606
1,mail.cr4tu2.gq,benign,0.0,0.026572,0.0,0.000295,3e-06
2,smstz.edu.ba,benign,0.0,0.001071,0.0,0.0,0.007067
3,superiorcareservices.com,malware,1.0,0.78972,1.0,0.999965,0.048425
4,extraordinary.diamonds,benign,1.0,0.000116,1.0,0.0,0.0
5,wimwkqdpnftgcpjm.eu,dga,1.0,0.060304,1.0,0.99411,0.999998
6,hpw.qld.gov.au,benign,0.0,5e-06,0.0,0.0,0.0
7,sseeez.com,dga,1.0,0.00292,0.0,0.987593,0.896603
8,kkei-huhssgkj.icu,phishing,1.0,0.989056,1.0,0.305736,0.042583
9,hckslglnppyurop.biz,dga,1.0,0.143953,1.0,0.972229,0.999972


In [11]:
# Domain and ground-truth label next to the `total_avg` column
total_avg_columns = ["domain_name", "label", "total_avg"]
preliminary_results_df[total_avg_columns]

Unnamed: 0,domain_name,label,total_avg
0,newtontool.ca,malware,0.355076
1,mail.cr4tu2.gq,benign,0.005374
2,smstz.edu.ba,benign,0.001628
3,superiorcareservices.com,malware,0.767622
4,extraordinary.diamonds,benign,0.400023
5,wimwkqdpnftgcpjm.eu,dga,0.810882
6,hpw.qld.gov.au,benign,1e-06
7,sseeez.com,dga,0.577423
8,kkei-huhssgkj.icu,phishing,0.667475
9,hckslglnppyurop.biz,dga,0.823231
