# Test of the classification pipeline

In [1]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    """Move the process working directory one level up, at most once.

    The 'DIR_CHANGED' environment variable serves as a marker: once it is
    set, further calls only print a notice and leave the working directory
    untouched, so re-running the cell does not keep climbing the tree.

    The starting point is the directory containing this file when
    ``__file__`` is defined; otherwise (``NameError``) the current working
    directory is used instead.
    """
    # Guard clause: a previous call already performed the switch.
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # No __file__ in this execution context; start from the cwd.
        base_dir = os.getcwd()

    os.chdir(os.path.abspath(os.path.join(base_dir, os.pardir)))
    # Record the switch so subsequent calls become no-ops.
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Switch the working directory to the parent folder before any other cell runs.
# Later cells import `pipeline` and open 'testdata/...' through relative paths
# (presumably resolved from that parent folder - confirm against the repository layout).
change_to_parent_directory()

Current working directory changed to: /home/ihranicky/git/domainradar-clf/classifiers


## Optional: Create testing dataset
Note: If you want to do this, set `create_test_parquet` to **True**.

In [2]:
# Set to True to (re)generate 'testdata/sample.parquet' from the input files below.
create_test_parquet = False

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # Source Parquet files that are merged into the sampling pool
    input_files = [
        'testdata/misp_2402.parquet',
        'testdata/benign_2312.parquet'
    ]

    # Maximum number of rows to select
    n_rows = 1000

    # Read all Parquet files into a single DataFrame
    dataframes = [pd.read_parquet(file) for file in input_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # DataFrame.sample() raises ValueError when asked for more rows than exist
    # (sampling without replacement), so never request more than is available.
    sample_size = min(n_rows, len(combined_df))

    # Randomly select the rows; the fixed random_state keeps the sample reproducible
    selected_rows = combined_df.sample(n=sample_size, random_state=1)

    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/sample.parquet')


## Run classification

In [3]:
# Specify the parquet file with the dataset for classification.
# The path is relative, so it is resolved against the current working directory.
test_dataset = 'testdata/sample.parquet'

# Number of domain names to classify with each run of the pipeline (0 = classify all).
# Any value <= 0 makes the classification cell process the whole dataset as one chunk.
CHUNK_SIZE = 30


In [6]:
import pandas as pd
import numpy as np
from pipeline import Pipeline

# Initialize the classification pipeline
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Determine the number of chunks (ceiling division); a non-positive CHUNK_SIZE
# means the whole dataset is processed as a single chunk.
num_chunks = (len(input_df) + CHUNK_SIZE - 1) // CHUNK_SIZE if CHUNK_SIZE > 0 else 1

# Process the dataframe in chunks
for i in range(num_chunks):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        # Explicit positional slicing: independent of the index stored in the
        # parquet file; the last chunk may be shorter than CHUNK_SIZE.
        chunk_df = input_df.iloc[start_idx:end_idx]
    else:
        chunk_df = input_df

    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    # Label should not be known to classifiers. errors='ignore' lets unlabelled
    # datasets (no 'label' column) pass through instead of raising KeyError.
    chunk_without_label = chunk_df.drop(columns=['label'], errors='ignore')
    chunk_results = clf.classify_domains(chunk_without_label)

    # Print every result returned for this chunk as-is
    for result in chunk_results:
        print(result)

    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

CNN model created
CNN model created
===== Processing chunk 1/34 =====
{'domain': 'mp4moviez.sydney', 'aggregate_probability': 0.0, 'aggregate_description': '...', 'classification_results': [{'classifier': 'Phishing', 'probability': 0.0, 'description': 'aaa.', 'details': {'CNN phishing classifier': 0.0, 'LightGBM phishing classifier': 0.44678387901879246}}, {'classifier': 'Malware', 'probability': 0.0, 'description': 'aaa.', 'details:': {'CNN phishing classifier': 0.0}}]}
{'domain': 'tsunagu.ryukyu', 'aggregate_probability': 0.0, 'aggregate_description': '...', 'classification_results': [{'classifier': 'Phishing', 'probability': 0.0, 'description': 'aaa.', 'details': {'CNN phishing classifier': 0.0, 'LightGBM phishing classifier': 0.024679162345124905}}, {'classifier': 'Malware', 'probability': 0.0, 'description': 'aaa.', 'details:': {'CNN phishing classifier': 0.0}}]}
{'domain': '6036825.global.siteimproveanalytics.io', 'aggregate_probability': 0.0, 'aggregate_description': '...', 'cla