# Optional: create a test Parquet file

In [5]:
import pandas as pd
import numpy as np

# Source Parquet datasets that are pooled together before sampling.
input_files = [
    'testdata/malware_bp.parquet',
    'testdata/misp_2402.parquet',
    'testdata/umbrella_benign_FINISHED.parquet',
]

# Size of the test sample.
n_rows = 20

# Load every dataset, then stack them into one frame; ignore_index=True
# discards the per-file indexes in favour of a single running index.
dataframes = list(map(pd.read_parquet, input_files))
combined_df = pd.concat(dataframes, ignore_index=True)

# Draw the sample; the fixed random_state makes the draw repeatable
# for identical input data.
selected_rows = combined_df.sample(n=n_rows, random_state=1)

# The "label" column is kept in the sample (later cells read it).
# To strip it instead, re-enable the two lines below:
#if 'label' in selected_rows.columns:
#    selected_rows = selected_rows.drop(columns=['label'])

# Persist the sample, then echo it as plain text.
selected_rows.to_parquet('testdata/sample.parquet')
print(selected_rows)





                                              domain_name  \
260277                                     sos.oregon.gov   
215554         authshheagorbgwtmmailssocl.firebaseapp.com   
199363                            oqetydcoqu.log4iit.club   
604467                          r5---sn-vgqsknsk.gvt1.com   
454427                                       wgplayer.com   
605213                                        buypass.com   
121365                                      maxytechs.com   
370528                                   novitasphere.com   
168811                         home-105356.weeblysite.com   
327895                                     trueoffice.com   
118768  bafybeifgwsrtbxk6fywx7jgmebch67gcq5szbfmd6dbnq...   
531382                            hdcontent.homedepot.com   
267407                                        jocooks.com   
561547                                    nlsn.thrtle.com   
130976  bafybeibqnmt6fmmgp56nledl4gze4axmiim4j2volzbfu...   
459505                  

In [6]:
# Show only the label of each sampled row (index -> label).
print(selected_rows.loc[:, "label"])

260277             benign_2310:unknown
215554              misp_2310:phishing
199363              misp_2310:phishing
604467             benign_2310:unknown
454427             benign_2310:unknown
605213             benign_2310:unknown
121365                 malware:unknown
370528             benign_2310:unknown
168811    phishing_since_2402:phishing
327895             benign_2310:unknown
118768                 malware:unknown
531382             benign_2310:unknown
267407             benign_2310:unknown
561547             benign_2310:unknown
130976    phishing_since_2402:phishing
459505             benign_2310:unknown
67756                  malware:unknown
164475    phishing_since_2402:phishing
569688             benign_2310:unknown
108328                 malware:unknown
Name: label, dtype: object


# Make a test run

In [1]:
import pandas as pd
from preprocessor import Preprocessor

# Sample file to push through the preprocessor, one row at a time.
test_dataset = 'testdata/sample.parquet'

pp = Preprocessor()

domain_data = pd.read_parquet(test_dataset)

for index, row in domain_data.iterrows():
    # Identify the row being processed before printing its result.
    domain_name, original_label = row['domain_name'], row['label']
    print(f"{index}: {domain_name} - {original_label}")

    # Wrap the row in a one-row DataFrame and remove the two columns that
    # are not passed on to the preprocessor.
    feature_vector = pd.DataFrame([row]).drop(columns=['domain_name', 'label'])

    # Run the preprocessor on the single-row frame and show what comes back.
    ndf = pp.NDF(feature_vector, 'phishing')
    print(type(ndf))
    print(ndf)
    

260277: sos.oregon.gov - benign_2310:unknown
<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5178, 0.5000, 0.5227, 0.5384, 0.5104, 0.7311, 0.5000, 0.5622,
         0.5000, 0.5611, 0.5938, 0.6713, 0.6608, 0.5014, 0.5014, 0.6554, 0.6225,
         0.6225, 0.5709, 0.5139, 0.5580, 0.6879, 0.5622, 0.5498, 0.5700, 0.6825,
         0.5005, 0.5008, 0.5051, 0.5000, 0.5000, 0.6344, 0.6742, 0.5870, 0.6672,
         0.5000, 0.7311, 0.5000, 0.5000, 0.5250, 0.5589, 0.7311, 0.5498, 0.5482,
         0.5710, 0.6397, 0.5826, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5167,
         0.6792, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5263, 0.5000, 0.5000, 0.5000, 0.5000, 0.5927, 0.5000,
         0.5442, 0.5405, 0.6705, 0.5000, 0.5000, 0.5000, 0.5440, 0.6225, 0.5326,
         0.6225, 0.5000, 0.5000, 0.5156, 0.5416, 0.5277, 0.5654, 0.5000, 0.5000,
      

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5357, 0.5553, 0.5000, 0.5384, 0.5000, 0.7311, 0.5000, 0.5000,
         0.5000, 0.5568, 0.6210, 0.6391, 0.6608, 0.5012, 0.5015, 0.6713, 0.6554,
         0.6514, 0.6055, 0.5139, 0.5848, 0.6802, 0.5622, 0.5498, 0.5606, 0.6869,
         0.5015, 0.5067, 0.5017, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5200, 0.5245, 0.6225, 0.7311, 0.5317,
         0.5283, 0.5567, 0.5416, 0.6225, 0.6225, 0.6608, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5498,
         0.7311, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5237, 0.5000, 0.5000, 0.5826, 0.5000, 0.5927, 0.7311,
         0.6995, 0.5605, 0.6608, 0.5000, 0.5000, 0.5000, 0.5440, 0.5826, 0.5755,
         0.6852, 0.5000, 0.5000, 0.5467, 0.5826, 0.5000, 0.6163, 0.5000, 0.5000,
         0.5467, 0.6608, 0.5000, 0.5000, 0.5234, 0.

<class 'dict'>
{'name': 'dataset_2024-05-17', 'features': tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.7311, 0.5000,
         0.5000, 0.5611, 0.6214, 0.5357, 0.6608, 0.5000, 0.5000, 0.6225, 0.7311,
         0.5622, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000, 0.5000, 0.5000, 0.5000, 0.5100, 0.5022, 0.5000, 0.5000, 0.5000,
         0.5283, 0.5567, 0.5416, 0.6225, 0.6225, 0.6225, 0.5622, 0.5416, 0.5000,
         0.7311, 0.7311, 0.6225, 0.5622, 0.5498, 0.5563, 0.7311, 0.5000, 0.5498,
         0.7311, 0.6225, 0.5563, 0.7311, 0.5000, 0.5622, 0.5026, 0.5000, 0.5071,
         0.5498, 0.5622, 0.5550, 0.5000, 0.5000, 0.5000, 0.5000, 0.5927, 0.7311,
         0.6995, 0.5672, 0.6436, 0.5000, 0.5000, 0.5000, 0.5440, 0.5744, 0.5755,
         0.6682, 0.5000, 0.5000, 0.5312, 0.5498, 0.5277, 0.6408, 0.5000, 0.5000,
         0.5622, 0.5831, 0.5000, 0.5000, 0.5467, 0.