### The `wrapper` notebook, to report the mean and standard deviation of the F1 scores -
#### Calculated over `5` runs with different random seeds (the `Medium` and `Large` configurations below use a single seed)

In [1]:
import os
import sys
import time
from datetime import timedelta

import nbformat
import numpy as np
from nbconvert.preprocessors import ExecutePreprocessor

In [2]:
# Fail fast unless the interpreter is exactly Python 3.11.8 (major, minor and micro all match).
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently, as tested, on Python 3.11.8")

In [3]:
### TO HIDE THE LOGGING FROM SPARK ###
# Emits a <script> plus a toggle link as this cell's output.
# The script looks up every element whose data-mime-type is
# "application/vnd.jupyter.stderr" (presumably the attribute the Jupyter front
# end puts on stderr outputs - confirm for the front end in use) and switches
# its CSS display between 'none' and 'block'.
# `code_show_err` starts as false, so the first call hides the elements; every
# call then flips the flag, so the next call shows them again.

from IPython.display import HTML

# The HTML object is the cell's last expression, so it is rendered as the cell output.
# The toggle is registered for DOMContentLoaded and is also wired to the "here" link.
# NOTE(review): DOMContentLoaded only fires while a page is loading, so in an
# already-loaded session the link may have to be clicked manually - confirm.
HTML('''<script>
var code_show_err = false;
var code_toggle_err = function() {
    var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
    var stderr = Array.from(stderrNodes)
    if (code_show_err){
        stderr.forEach(ele => ele.style.display = 'block');
    } else {
        stderr.forEach(ele => ele.style.display = 'none');
    }
    code_show_err = !code_show_err
}
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')

In [4]:
def parse_output(nb_out, seed):
    """Return the lines of the first cell output that starts with ``SEED=<seed> ``.

    Args:
        nb_out: Indexable whose first element holds a ``"cells"`` list; the
            callers in this notebook pass the value returned by
            ``ExecutePreprocessor.preprocess``.
        seed: Seed whose marker is searched for. The marker includes the
            trailing space, so seed 10 does not match ``SEED=100 ``.

    Returns:
        The matching output text split on ``"\\n"``, or ``[""]`` when no
        output carries the marker.
    """
    marker = f"SEED={seed} "
    # Cells without an "outputs" entry and outputs without a "text" entry
    # both count as empty text, so they can never match the marker.
    matching_texts = (
        output["text"]
        for cell in nb_out[0]["cells"]
        for output in cell.get("outputs", [{"text": ""}])
        if output.get("text", "").startswith(marker)
    )
    # Only the first match is used; fall back to the empty string otherwise.
    return next(matching_texts, "").split("\n")


def print_results(f1_scores):
    f1_mean = round(np.mean(f1_scores), 2)
    f1_std = round(np.std(f1_scores), 2)

    train_perc = int(float(os.environ["EXSTRAQT_TRAIN_PERC"]) * 100)
    validation_perc = int(float(os.environ["EXSTRAQT_VALIDATION_PERC"]) * 100)
    test_perc = int(float(os.environ["EXSTRAQT_TEST_PERC"]) * 100)

    high_illicit = bool(int(os.environ["EXSTRAQT_HIGH_ILLICIT"]))
    high_or_low_illicit = "HI" if high_illicit else "LI"
    file_size = os.environ["EXSTRAQT_FILE_SIZE"]
    
    print("=" * 100)
    print(
        f"[{file_size}-{high_or_low_illicit}-SPLITS={train_perc}/{validation_perc}/{test_perc}]"
        f" {f1_mean} ± {f1_std}"
    )
    print("=" * 100)

In [5]:
# Presumably a process count consumed by the executed notebook - confirm in 2_model.ipynb.
os.environ["EXSTRAQT_NUM_PROCS"] = "6"
filename = "2_model.ipynb"
# Notebook files are UTF-8 JSON, so name the encoding instead of relying on
# the platform's default text encoding.
with open(filename, encoding="utf-8") as ff:
    # NO_CONVERT keeps the notebook in the format version it was saved with.
    nb_in = nbformat.read(ff, nbformat.NO_CONVERT)

In [5]:
# Configuration: "Small" file size, high-illicit (HI), 60/20/20 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/02 11:55:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=78.1 recall=68.56
Elasped time: 3:17:08



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/02 15:13:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=20 f1=78.01 recall=68.78
Elasped time: 3:17:29



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/02 18:30:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=30 f1=78.13 recall=68.67
Elasped time: 3:18:18



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/02 21:48:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=40 f1=78.2 recall=68.67
Elasped time: 3:26:48



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/03 01:15:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=50 f1=78.21 recall=68.89
Elasped time: 3:27:42

[Small-HI-SPLITS=60/20/20] 78.13 ± 0.07


In [6]:
# Configuration: "Small" file size, high-illicit (HI), 64/19/17 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.64"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.19"
os.environ["EXSTRAQT_TEST_PERC"] = "0.17"

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/03 17:12:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=80.78 recall=71.88
Elasped time: 3:18:42



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/03 20:31:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=20 f1=80.69 recall=72.13
Elasped time: 3:19:32



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/03 23:50:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=30 f1=80.81 recall=72.25
Elasped time: 3:21:49



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/04 03:12:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=40 f1=81.1 recall=72.19
Elasped time: 3:22:35



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/04 06:34:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=50 f1=80.75 recall=72.13
Elasped time: 3:22:40

[Small-HI-SPLITS=64/19/17] 80.83 ± 0.14


In [7]:
# Configuration: "Small" file size, low-illicit (LI), 60/20/20 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/07 21:37:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=42.96 recall=28.54
Elasped time: 4:23:22



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 02:00:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=20 f1=42.51 recall=28.22
Elasped time: 4:21:47



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 06:22:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=30 f1=42.0 recall=27.68
Elasped time: 4:27:41



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 10:50:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=40 f1=42.53 recall=28.0
Elasped time: 4:24:36



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 15:14:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=50 f1=42.69 recall=28.11
Elasped time: 4:31:12

[Small-LI-SPLITS=60/20/20] 42.54 ± 0.31


In [8]:
# Configuration: "Small" file size, low-illicit (LI), 64/19/17 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.64"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.19"
os.environ["EXSTRAQT_TEST_PERC"] = "0.17"

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 19:45:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=45.72 recall=30.67
Elasped time: 4:31:01



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 00:17:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=20 f1=45.68 recall=30.67
Elasped time: 4:26:21



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 04:43:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=30 f1=45.62 recall=30.55
Elasped time: 4:26:03



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 09:09:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=40 f1=45.56 recall=30.42
Elasped time: 4:36:29



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 13:46:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=50 f1=45.23 recall=30.42
Elasped time: 4:33:51

[Small-LI-SPLITS=64/19/17] 45.56 ± 0.18


In [6]:
# Configuration: "Medium" file size, high-illicit (HI), 60/20/20 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Medium"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
# Extra settings that the "Small" cells do not set; their exact effect is
# defined in the executed notebook (not visible here). They stay in os.environ
# for any cell run later in the same kernel.
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.10"
os.environ["EXSTRAQT_SCALE_TO_FLOAT_16"] = "0"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

f1_scores = []
# Single seed only, so the reported standard deviation is 0.0.
for seed in [10]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/11 17:41:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=78.5 recall=68.49
Elasped time: 11:38:22

[Medium-HI-SPLITS=60/20/20] 78.5 ± 0.0


In [None]:
# Configuration: "Large" file size, high-illicit (HI), 60/20/20 train/validation/test split.
# The values are passed through environment variables, presumably read by the
# executed notebook via the environment its kernel inherits.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Large"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
# Extra settings that the "Small" cells do not set; their exact effect is
# defined in the executed notebook (not visible here). They stay in os.environ
# for any cell run later in the same kernel.
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.10"
os.environ["EXSTRAQT_SCALE_TO_FLOAT_16"] = "1"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

f1_scores = []
# Single seed only, so the reported standard deviation is 0.0.
for seed in [10]:
    # Monotonic clock: a run lasts hours, and a wall-clock adjustment during
    # that time must not distort the reported duration.
    start = time.monotonic()
    # A fresh executor per seed; the timeout is given in seconds (24 hours).
    executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in), seed)
    # The first line is the summary to print, the second is parsed as the F1
    # score. Fail with a clear message instead of an IndexError further down.
    if len(result) < 2:
        raise RuntimeError(
            f"No output starting with 'SEED={seed} ' and followed by an F1 line "
            "was found in the executed notebook"
        )
    print(result[0])
    delta = round(time.monotonic() - start)
    print(f"Elapsed time: {timedelta(seconds=delta)}")
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)