# Test of the classification pipeline

In [11]:
import os

# Function to change to the parent directory
def change_to_parent_directory():
    """Switch the working directory to the parent directory, at most once.

    The 'DIR_CHANGED' environment variable serves as the "already done" flag:
    when it is set to a non-empty value the function only prints a notice.
    Otherwise the parent of this file's directory (or of the current working
    directory when ``__file__`` is not defined) becomes the working directory
    and the flag is set.
    """
    # Guard: a previous call already moved us
    if os.environ.get('DIR_CHANGED'):
        print("Directory has already been changed.")
        return

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined here; fall back to the current directory
        base_dir = os.getcwd()

    os.chdir(os.path.abspath(os.path.join(base_dir, os.pardir)))
    os.environ['DIR_CHANGED'] = '1'
    print(f"Current working directory changed to: {os.getcwd()}")

# Call the function to change the working directory.
# Re-running this cell is harmless: the function checks the DIR_CHANGED
# environment variable and only prints a notice once it is set.
# Presumably needed so the `classifiers` package and `testdata/` paths used
# by later cells resolve from the parent directory - TODO confirm.
change_to_parent_directory()

Directory has already been changed.


## Optional: Create testing dataset
Note: If you want to do this, set `create_test_parquet` to **True**

In [2]:
create_test_parquet = True

if create_test_parquet:
    import pandas as pd
    import numpy as np

    # List of input Parquet files along with their maximum rows and desired labels
    #input_files = [
    #    {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 4000, 'label': 'benign'},
    #    {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 480, 'label': 'phishing'},
    #    {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
    #    {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    #]

    #input_files = [
    #    {'file': 'testdata/2405_clftest_benign_filtered.parquet', 'max_rows': 300, 'label': 'benign'},
    #    {'file': 'testdata/2405_clftest_phishing_filtered.parquet', 'max_rows': 308, 'label': 'phishing'},
    #    {'file': 'testdata/2405_clftest_malware_filtered.parquet', 'max_rows': 292, 'label': 'malware'},
    #    {'file': 'testdata/dga_2310.parquet', 'max_rows': 300, 'label': 'dga'},
    #]

    input_files = [
        {'file': 'testdata/2405_clftest_benign_filtered_HTML.parquet', 'max_rows': 150, 'label': 'benign'},
        {'file': 'testdata/2405_clftest_malware_filtered_HTML.parquet', 'max_rows': 150, 'label': 'malware'},
    ]

    # Number of rows to select in total
    #n_rows = 5072
    n_rows = 210

    # Load every input file the same way; the first file defines the column
    # layout (including 'label') that all later files are aligned to.
    frames = []
    all_columns = None
    for file_info in input_files:
        df = pd.read_parquet(file_info['file'])

        # Limit the number of rows if necessary
        if len(df) > file_info['max_rows']:
            df = df.sample(n=file_info['max_rows'], random_state=1)

        # Overwrite the "label" column with the specified label
        df['label'] = file_info['label']

        if all_columns is None:
            all_columns = df.columns.tolist()
        else:
            # Keep exactly the first file's columns, in its order; columns
            # missing from this file are added and filled with missing values
            df = df.reindex(columns=all_columns)

        frames.append(df)

    # Concatenate once instead of growing the frame inside the loop
    combined_df = pd.concat(frames, ignore_index=True)

    # Randomly select n_rows rows from the combined DataFrame.
    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the request at the number of rows actually available.
    sample_size = min(n_rows, len(combined_df))
    if sample_size < n_rows:
        print(f"Only {len(combined_df)} rows available; selecting {sample_size} instead of {n_rows}.")
    selected_rows = combined_df.sample(n=sample_size, random_state=1)  # random_state for reproducibility

    selected_rows = selected_rows.drop(columns=['label_x', 'label_y'], errors='ignore')

    # Save the selected rows to a new Parquet file
    selected_rows.to_parquet('testdata/ver_html_test.parquet')

    # Show the columns of the data that was actually written
    print(selected_rows.columns)


Index(['domain_name', 'dns_has_dnskey', 'dns_A_count', 'dns_AAAA_count',
       'dns_MX_count', 'dns_NS_count', 'dns_TXT_count', 'dns_SOA_count',
       'dns_CNAME_count', 'dns_zone_level',
       ...
       'html_num_of_form_js', 'html_malicious_form', 'html_most_common',
       'html_num_of_css_internal', 'html_num_of_css_external',
       'html_num_of_anchors_to_content', 'html_num_of_anchors_to_void',
       'html_num_of_blank_spaces', 'html_blocked_keywords_label', 'label'],
      dtype='object', length=265)


## Run classification

In [3]:
# Specify the parquet file with the dataset for classification
# (this is the path the optional dataset-creation cell writes to)
test_dataset = 'testdata/ver_html_test.parquet'

# Number of domain names to classify with each run of the pipeline (0 = classify all)
# With 0 the evaluation loop runs once over the whole dataframe.
CHUNK_SIZE = 30

In [4]:
import pandas as pd
import numpy as np
import shap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from classifiers.pipeline import Pipeline
from classifiers.options import PipelineOptions
#import matplotlib.pyplot as plt

# Initialize the classification pipeline
clf_options = PipelineOptions()
# In the recorded output this prints only the default object repr
print(clf_options)
clf = Pipeline(clf_options)
# In the recorded output this prints only the default object repr
print(clf)
# Read the input parquet file
input_df = pd.read_parquet(test_dataset)
# Plain-text preview of the loaded dataset
print(input_df)
# Function to map labels to 'benign' or 'malign'
def map_label(label):
    """Return the display form of a dataset label.

    'benign' is returned unchanged; any other value is wrapped as
    'malign (<label>)' so the original class stays visible.
    """
    return 'benign' if label == 'benign' else f'malign ({label})'

# Function to convert labels to binary classes
def binary_label(label):
    """Collapse a dataset label to a binary class.

    Returns 'negative' for 'benign' and 'positive' for every other label.
    """
    if label == 'benign':
        return 'negative'
    return 'positive'

# Apply label mapping
input_df['mapped_label'] = input_df['label'].apply(map_label)
input_df['binary_label'] = input_df['label'].apply(binary_label)

# Ensure SHAP JavaScript initialization
#shap.initjs()


def _report_metrics(true_labels, pred_labels, prefix=""):
    """Print and return binary classification metrics for one set of predictions.

    Used for both the per-chunk and the overall report so the two cannot
    drift apart.

    Args:
        true_labels: Ground-truth labels, each 'negative' or 'positive'.
        pred_labels: Predicted labels, same length and vocabulary.
        prefix: Text prepended to every printed metric name (e.g. "Overall ").

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1',
        'false_positives', 'false_negatives', 'fp_ratio' and 'fn_ratio'.
    """
    score_kwargs = dict(pos_label='positive', average='binary', zero_division=0)
    accuracy = accuracy_score(true_labels, pred_labels)
    # zero_division=0 keeps the value scikit-learn would report anyway (0)
    # for a set without positives, but without an UndefinedMetricWarning
    precision = precision_score(true_labels, pred_labels, **score_kwargs)
    recall = recall_score(true_labels, pred_labels, **score_kwargs)
    f1 = f1_score(true_labels, pred_labels, **score_kwargs)

    tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=['negative', 'positive']).ravel()

    # The percentages are relative to the number of *predictions* of each
    # class: FP / (TP + FP) and FN / (TN + FN). These are not the classical
    # false-positive / false-negative rates, which divide by actual counts.
    predicted_positives = tp + fp
    predicted_negatives = tn + fn
    fp_ratio = (fp / predicted_positives) if predicted_positives > 0 else 0
    fn_ratio = (fn / predicted_negatives) if predicted_negatives > 0 else 0

    print(f"{prefix}Accuracy: {accuracy}")
    print(f"{prefix}Precision: {precision}")
    print(f"{prefix}Recall: {recall}")
    print(f"{prefix}F1 Score: {f1}")
    print(f"{prefix}False Positives: {fp} ({fp_ratio * 100:.2f}%)")
    print(f"{prefix}False Negatives: {fn} ({fn_ratio * 100:.2f}%)")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'false_positives': fp,
        'false_negatives': fn,
        'fp_ratio': fp_ratio,
        'fn_ratio': fn_ratio,
    }


# Determine the number of chunks
num_chunks = (len(input_df) + CHUNK_SIZE - 1) // CHUNK_SIZE if CHUNK_SIZE > 0 else 1

# Initialize counters for overall statistics
total_true_labels = []
total_pred_labels = []

# Format string for aligned output
header_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:<10}"
data_format_str = "{:<3} | {:<50} | {:<18} | {:<10} | {:.6f}"

# Process the dataframe in chunks
for i in range(num_chunks):
    if CHUNK_SIZE > 0:
        start_idx = i * CHUNK_SIZE
        # Positional slice: the dataframe index is not a clean 0..n-1 range
        chunk_df = input_df.iloc[start_idx:start_idx + CHUNK_SIZE]
    else:
        chunk_df = input_df

    print(f"===== Processing chunk {i+1}/{num_chunks} =====")

    chunk_without_label = chunk_df.drop(columns=['label', 'mapped_label', 'binary_label']) # Label should not be known to classifiers
    chunk_results = list(clf.classify_domains(chunk_without_label))

    # Results are matched to the chunk rows by position, so the counts must agree
    if len(chunk_results) != len(chunk_df):
        raise ValueError(
            f"Pipeline returned {len(chunk_results)} results for a chunk of {len(chunk_df)} domains"
        )

    # Collect predictions and true labels
    true_labels = chunk_df['binary_label'].values
    pred_labels = [
        'negative' if result['aggregate_probability'] < 0.5 else 'positive'
        for result in chunk_results
    ]

    # Update overall statistics
    total_true_labels.extend(true_labels)
    total_pred_labels.extend(pred_labels)

    # Display header for results
    print(header_format_str.format("Res", "Domain Name", "Actual Label", "Predicted", "Probability"))

    # Display results for each domain
    for result, actual_label, true_label, pred_label in zip(
        chunk_results, chunk_df['mapped_label'], true_labels, pred_labels
    ):
        predicted_label = 'benign' if pred_label == 'negative' else 'malign'
        domain_name = result['domain_name']
        aggregate_probability = result['aggregate_probability']
        status = "OK" if pred_label == true_label else "ER"
        print(data_format_str.format(status, domain_name[:50], actual_label, predicted_label, aggregate_probability))

        # Run debug_domain method for misclassified domains
        #if status == "ER":
            #print(f"Debugging misclassified domain: {domain_name}")
            #ndf_data = clf.pp.df_to_NDF(chunk_df, "phishing")  # Convert to NDF
            #debug_data = clf.clf_phishing_cnn.debug_domain(domain_name, ndf_data, chunk_df, n_top_features=10)

            # Print out the top n feature importances and values for each classifier
            #for classifier, data in debug_data.items():
            #    print(f"\nClassifier: {classifier}")
            #    print(f"Top {len(data['top_features'])} features for domain '{domain_name}':")
            #    for feature_info in data['top_features']:
            #        print(f"Feature: {feature_info['feature']}, Value: {feature_info['value']}, SHAP Value: {feature_info['shap_value']}")

                # Display the force plot for phishing_lgbm
                #if classifier == "phishing_cnn":
                #    base_value, shap_values, domain_row = data['force_plot_data']
                #    shap.force_plot(base_value, shap_values, domain_row)
                #    plt.show()

    # Calculate metrics for the current chunk
    print(f"Chunk {i+1}/{num_chunks} metrics:")
    _report_metrics(true_labels, pred_labels)
    print(f"===== Chunk {i+1}/{num_chunks} completed. =====")

# Calculate overall metrics
print("Overall metrics:")
overall_metrics = _report_metrics(total_true_labels, total_pred_labels, prefix="Overall ")


  from .autonotebook import tqdm as notebook_tqdm
2024-10-22 11:38:04.312165: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 11:38:04.335297: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<classifiers.options.PipelineOptions object at 0x7f8b9e0c2a90>
CNN model created
<classifiers.pipeline.Pipeline object at 0x7f8acad10110>
                            domain_name  dns_has_dnskey  dns_A_count  \
39     drachov.katalog.knihovnatabor.cz             1.0            0   
168                 en.supplypoland.com             0.0            2   
93                     www.outbrain.com             0.0            0   
62   hisinone-studium.oth-regensburg.de             0.0            0   
197                           oil-z.com             0.0            2   
..                                  ...             ...          ...   
142                   thewoksoflife.com             0.0            3   
157                   nickeldrive.autos             0.0            2   
156                   www.nametests.com             0.0            5   
139                           gizmod.ru             0.0            1   
146                        www.ipuss.tv             0.0            1  

## Optional: Generate preliminary results for training the final aggregation classifier

In [5]:
import pandas as pd
import numpy as np
from classifiers.pipeline import Pipeline

# Initialize the classification pipeline
# NOTE: this rebinds `clf` and `input_df`, which the classification cell also defines
clf = Pipeline()

# Read the input parquet file
input_df = pd.read_parquet(test_dataset)

# Shuffle the rows. The seed is fixed (random_state=1, as in the dataset-creation
# cell) so that re-running the notebook yields the same row order and the
# tables displayed below stay comparable between runs.
input_df = input_df.sample(frac=1, random_state=1).reset_index(drop=True)

#preliminary_results_df = clf.generate_preliminary_results(input_df, output_file="test_preliminary_results.parquet")
preliminary_results_df = clf.generate_preliminary_results(input_df, add_final=False)

CNN model created


In [6]:
# Display settings for inspecting the result tables:
# restore the default row limit first, then widen rows/columns and
# print floats with six decimal places.
pd.reset_option('display.max_rows')
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 50
pd.set_option('display.float_format', '{:.6f}'.format)
preliminary_results_df

Unnamed: 0,domain_name,dns_available,dns_nonzero,tls_available,tls_nonzero,ip_available,ip_nonzero,rdap_available,rdap_nonzero,geo_available,geo_nonzero,phishing_cnn_result,phishing_lgbm_result,phishing_xgboost_result,phishing_deepnn_result,phishing_dns_nn_result,phishing_rdap_nn_result,phishing_geo_nn_result,phishing_ip_nn_result,phishing_html_lgbm_result,malware_lgbm_result,malware_xgboost_result,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result,malware_html_lgbm_result,dga_binary_deepnn_result,dga_binary_lgbm_result,phishing_sum,phishing_avg,phishing_prod,malware_sum,malware_avg,malware_prod,dga_binary_sum,dga_binary_avg,dga_binary_prod,total_sum,total_avg,total_prod,label,Unnamed: 44,Unnamed: 45
0,medunigraz.webex.com,0.9,0.5,1.0,0.833333,1.0,0.25,1.0,0.708333,1.0,0.666667,0.000355,0.000659,0.000686,0.00185,0.016548,0.006708,0.276619,0.181992,0.502376,0.000321,0.002408,1.6e-05,0.003224,0.001855,0.211472,0.006459,0.169694,0.0,6e-06,0.987793,0.109755,0.0,0.395448,0.049431,0.0,6e-06,3e-06,0.0,1.383247,0.072802,0.0,benign,,
1,familylifeconnections.com,0.9,0.525,1.0,0.541667,1.0,0.5,1.0,0.5,1.0,0.666667,0.952957,0.002706,0.003238,0.463886,0.064171,0.007974,0.252255,0.184353,0.443108,0.663725,0.791218,0.83097,0.964708,0.016577,0.689959,0.279973,0.31677,0.101099,0.000681,2.374648,0.26385,0.0,4.553899,0.569237,0.000427,0.10178,0.05089,6.9e-05,7.030327,0.370017,0.0,malware,,
2,cmr.earthdata.nasa.gov,0.9,0.55,1.0,0.833333,1.0,0.375,1.0,0.583333,1.0,0.722222,0.0,0.000254,0.000937,1e-05,0.021287,0.000834,0.390085,0.047373,0.306089,0.000127,0.002233,0.0,0.004028,0.002628,0.307506,0.004919,0.067373,0.0,0.0,0.766869,0.085208,0.0,0.388815,0.048602,0.0,0.0,0.0,0.0,1.155684,0.060825,0.0,benign,,
3,haveibeenpwned.com,1.0,0.8,1.0,0.791667,1.0,0.625,1.0,0.75,1.0,0.722222,1e-05,0.005389,0.001309,0.008162,0.01354,0.014646,0.521558,0.489801,0.22272,0.177067,0.074227,0.056235,0.055626,0.027542,0.399883,0.087516,0.171447,0.650179,0.022162,1.277135,0.141904,0.0,1.049542,0.131193,0.0,0.67234,0.33617,0.014409,2.999018,0.157843,0.0,benign,,
4,mail.rub.de,0.65,0.375,1.0,0.833333,1.0,0.625,0.875,0.5,1.0,0.944444,9e-06,0.000484,0.001493,0.000224,0.007845,0.005538,0.160036,0.688248,0.138378,0.000512,0.002364,6.2e-05,0.032958,0.001689,0.399883,0.291825,0.052578,0.005376,1.1e-05,1.002255,0.111362,0.0,0.781872,0.097734,0.0,0.005386,0.002693,0.0,1.789514,0.094185,0.0,benign,,
5,pl.supplypoland.com,0.6,0.25,1.0,0.791667,1.0,0.75,1.0,0.75,1.0,1.0,0.037852,0.979079,0.9125,0.99999,0.137938,0.015571,0.397401,0.535513,0.608367,0.857816,0.849868,0.84311,0.307426,0.236392,0.688216,0.71041,0.680274,0.00021,1e-06,4.624212,0.513801,9e-06,5.173512,0.646689,0.014857,0.000211,0.000105,0.0,9.797934,0.515681,0.0,malware,,
6,zakazky.pardubickykraj.cz,0.6,0.25,1.0,0.833333,1.0,0.625,1.0,0.625,1.0,0.666667,0.000126,0.000646,0.000626,0.000735,0.816929,0.001483,0.004337,0.182926,0.197307,0.000908,0.002626,8.6e-05,0.050268,0.002022,0.060135,0.280062,0.065771,1e-06,2.2e-05,1.205115,0.133902,0.0,0.461877,0.057735,0.0,2.2e-05,1.1e-05,0.0,1.667015,0.087738,0.0,benign,,
7,oroscopo.donnaclick.it,0.6,0.2,1.0,0.75,1.0,0.375,1.0,0.833333,1.0,1.0,0.002029,0.027839,0.002413,0.012733,0.014023,0.002498,0.038165,0.326686,0.074949,0.000388,0.002408,0.0,0.132851,0.002471,0.040997,0.006016,0.04837,0.0,6e-06,0.501335,0.055704,0.0,0.2335,0.029188,0.0,6e-06,3e-06,0.0,0.734841,0.038676,0.0,benign,,
8,fixviol.xyz,0.9,0.55,1.0,0.791667,1.0,0.75,1.0,0.75,1.0,1.0,0.100622,0.888865,0.871684,0.987133,0.143861,0.031188,0.397401,0.532873,0.428519,0.987346,0.976092,0.883472,0.408389,0.473468,0.688216,0.784726,0.297787,0.912645,0.089175,4.382146,0.486905,3.1e-05,5.499496,0.687437,0.026477,1.00182,0.50091,0.081385,10.883462,0.572814,0.0,malware,,
9,interspace.sbs,0.9,0.55,1.0,0.833333,1.0,0.75,1.0,0.75,1.0,1.0,0.070303,0.944586,0.893039,0.606527,0.157411,0.027224,0.397401,0.530926,0.433451,0.886967,0.977278,0.893936,0.441397,0.440355,0.688216,0.809495,0.882866,0.00244,3e-06,4.060868,0.451208,1.4e-05,6.02051,0.752564,0.074079,0.002443,0.001221,0.0,10.08382,0.530727,0.0,malware,,


In [7]:
# Remove the row limit, then show selected classifier scores for the
# benign and malware rows only
pd.set_option('display.max_rows', None)
preliminary_results_df.loc[
    preliminary_results_df["label"].isin(["benign", "malware"]),
    ["domain_name", "label", "phishing_cnn_result", "malware_xgboost_result", "phishing_html_lgbm_result", "malware_html_lgbm_result", "malware_lgbm_result"]
]

Unnamed: 0,domain_name,label,phishing_cnn_result,malware_xgboost_result,phishing_html_lgbm_result,malware_html_lgbm_result,malware_lgbm_result
0,medunigraz.webex.com,benign,0.000355,0.002408,0.502376,0.169694,0.000321
1,familylifeconnections.com,malware,0.952957,0.791218,0.443108,0.31677,0.663725
2,cmr.earthdata.nasa.gov,benign,0.0,0.002233,0.306089,0.067373,0.000127
3,haveibeenpwned.com,benign,1e-05,0.074227,0.22272,0.171447,0.177067
4,mail.rub.de,benign,9e-06,0.002364,0.138378,0.052578,0.000512
5,pl.supplypoland.com,malware,0.037852,0.849868,0.608367,0.680274,0.857816
6,zakazky.pardubickykraj.cz,benign,0.000126,0.002626,0.197307,0.065771,0.000908
7,oroscopo.donnaclick.it,benign,0.002029,0.002408,0.074949,0.04837,0.000388
8,fixviol.xyz,malware,0.100622,0.976092,0.428519,0.297787,0.987346
9,interspace.sbs,malware,0.070303,0.977278,0.433451,0.882866,0.886967


In [16]:
# Phishing and malware scores side by side, for all rows
preliminary_results_df.loc[
    :,
    ["domain_name", "label", "phishing_deepnn_result", "phishing_lgbm_result", "malware_lgbm_result", "malware_xgboost_result"]
]

Unnamed: 0,domain_name,label,phishing_deepnn_result,phishing_lgbm_result,malware_lgbm_result,malware_xgboost_result
0,learncalligraphy.eu,benign,0.728575,0.036375,0.732897,0.694849
1,strimat.ru,benign,0.005329,0.006161,0.201182,0.499941
2,4297lpwm.computer-mod-s.tech,phishing,0.276442,0.105811,0.00085,0.002626
3,cbsvib.ru,benign,0.017742,0.034122,0.144569,0.304841
4,www.mywaymag.ru,benign,0.028642,0.011305,0.016059,0.011198
5,gizmod.ru,benign,0.003697,0.084907,0.093318,0.284888
6,zakazky.pardubickykraj.cz,benign,0.000735,0.000646,0.000908,0.002626
7,karriere.hofer.at,benign,0.029117,0.002096,0.00156,0.005187
8,bits.ashleyblewer.com,benign,0.019198,0.024398,0.000298,0.002233
9,webital.ir,phishing,0.033923,0.062392,0.003159,0.009123


In [11]:
# Selected malware/phishing classifier scores for the benign and phishing rows only
preliminary_results_df.loc[
    preliminary_results_df["label"].isin(["benign", "phishing"]),
    ["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "phishing_lgbm_result", "phishing_xgboost_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,phishing_lgbm_result,phishing_xgboost_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result
0,cancelartupago.appbancolombia.co.hstn.me,phishing,0.002447,0.003497,0.014593,0.090689,0.004229,0.007939,0.37939
1,node.show,benign,1e-06,0.000108,0.000746,0.000601,0.103703,1.9e-05,0.013399
2,servismail6.figomif911.workers.dev,phishing,0.000148,0.000606,0.363261,0.872132,0.605706,0.005307,0.662648
3,telstra-100481.weeblysite.com,phishing,0.000288,0.000966,0.976012,0.997737,0.149615,0.003003,0.399883
4,iti.0731ktv.cn,phishing,0.001427,0.001329,0.009227,0.019488,0.149749,0.008626,0.365328
5,mmo.org.hk,benign,0.0,5.7e-05,0.00023,0.000422,0.053872,0.0,2.4e-05
6,koszalin.praca.gov.pl,benign,3e-06,0.000117,0.000247,0.00041,0.26839,0.000235,0.197541
7,bafybeidckbotq4h5lxa3gem66lqt4dp2eml7thcgwpm4f...,phishing,0.00023,0.000339,0.972171,0.989508,0.374449,0.006523,0.399883
8,aksioma.lviv.ua,benign,0.0,0.00011,0.000194,0.000408,0.182564,0.003534,0.172861
9,nftyjsekyu1.pages.dev,phishing,0.0,7.4e-05,0.842235,0.949956,0.746663,6.3e-05,1.8e-05


In [10]:
# Malware neural-network scores for the benign and malware rows only
preliminary_results_df.loc[
    preliminary_results_df["label"].isin(["benign", "malware"]),
    ["domain_name", "label", "malware_deepnn_result", "malware_dns_nn_result", "malware_rdap_nn_result", "malware_geo_nn_result", "malware_ip_nn_result"]
]


Unnamed: 0,domain_name,label,malware_deepnn_result,malware_dns_nn_result,malware_rdap_nn_result,malware_geo_nn_result,malware_ip_nn_result
1,brandstorm.loreal.com,benign,0.000122,0.184353,0.001344,0.399883,0.548636
2,wytypowany-zwyciezca.com,malware,0.759562,0.775626,0.021642,0.399883,0.212463
3,oopt.fr,malware,0.12285,0.070304,0.024303,0.209001,0.450995
4,16chan.org,malware,0.627538,0.941982,0.04969,0.605018,0.279973
7,www.sprawozdania24.click,malware,0.849272,0.589228,0.398529,0.399883,0.452909
9,patasblancasvet.com,malware,0.794451,0.888655,0.061971,0.689959,0.330984
10,www.ipop.at,benign,2.1e-05,0.032429,0.175987,0.129151,0.00627
13,vurbont.xyz,malware,0.889243,0.398776,0.310197,0.399883,0.462252
14,srebny23.aleksytymon.com.pl,malware,0.000637,0.33502,0.000711,0.399883,0.637277
15,tools.pinpoll.com,benign,8e-06,0.076731,0.047117,0.016066,0.005857


In [11]:
#print(preliminary_results_df['malware_geo_nn_result'].unique())
#for value in preliminary_results_df['malware_geo_nn_result'].unique():
#    print(value)

In [12]:
# DGA binary classifier scores next to the true label, for all rows
preliminary_results_df.loc[
    :,
    ["label", "dga_binary_deepnn_result", "dga_binary_lgbm_result"]
]

Unnamed: 0,label,dga_binary_deepnn_result,dga_binary_lgbm_result
0,dga,0.922759,0.195645
1,benign,0.0,2e-06
2,malware,0.156716,0.007199
3,malware,0.136516,0.001036
4,malware,0.027879,0.00108
5,phishing,1e-06,0.001439
6,phishing,0.482309,0.006404
7,malware,0.0,0.01045
8,phishing,0.0,6e-06
9,malware,0.226647,0.003


In [6]:
# Phishing classifier scores for all rows.
# NOTE(review): the recorded output of this cell is a NameError - it was
# executed while `preliminary_results_df` did not exist in the kernel. The
# cell that calls generate_preliminary_results has to run before this one.
preliminary_results_df[["domain_name", "label", "phishing_cnn_result", "phishing_deepnn_result", "phishing_lgbm_result", "phishing_xgboost_result", "phishing_dns_nn_result", "phishing_rdap_nn_result"]]

NameError: name 'preliminary_results_df' is not defined

In [14]:
# Malware classifier scores for all rows
preliminary_results_df.loc[
    :,
    ["domain_name", "label", "malware_deepnn_result", "malware_lgbm_result", "malware_xgboost_result"]
]

Unnamed: 0,domain_name,label,malware_deepnn_result,malware_lgbm_result,malware_xgboost_result
0,olkano.com,dga,0.001556,0.01397,0.047478
1,brandstorm.loreal.com,benign,0.000122,0.000514,0.002626
2,wytypowany-zwyciezca.com,malware,0.759562,0.383536,0.286979
3,oopt.fr,malware,0.12285,0.277118,0.445704
4,16chan.org,malware,0.627538,0.708378,0.589509
5,down.t0kbnpobket.biz,phishing,0.000658,0.010174,0.005187
6,bolsadetrabajosgt.com,phishing,0.0,6.7e-05,0.002233
7,www.sprawozdania24.click,malware,0.849272,0.677257,0.90705
8,discord.writemall.top,phishing,0.000252,0.000411,0.002364
9,patasblancasvet.com,malware,0.794451,0.881981,0.884218


In [15]:
# NOTE(review): the recorded output of this cell is a KeyError -
# 'badness_probability' is not a column of preliminary_results_df.
# Presumably the column only exists when the results are generated with the
# final aggregation step enabled (the call above passes add_final=False) -
# TODO confirm, then fix the column name or remove this cell.
preliminary_results_df[["domain_name", "label", "badness_probability"]]

KeyError: "['badness_probability'] not in index"

In [None]:
preliminary_results_df