### The `wrapper` notebook: reports the mean and standard deviation of the F1 scores
#### Calculated over `5` runs with different random seeds

In [1]:
import os
import sys
import time
import shutil
from datetime import timedelta

import nbformat
import numpy as np
from nbconvert.preprocessors import ExecutePreprocessor

In [2]:
# Refuse to run on anything other than the exact interpreter version the
# notebook was tested with (major, minor and micro must all match).
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently, as tested, on Python 3.11.8")

In [3]:
### TO HIDE THE LOGGING FROM SPARK ###
# Embeds a small script in the rendered notebook that shows or hides every output
# node whose `data-mime-type` is `application/vnd.jupyter.stderr`.
# `code_show_err` starts as false and the toggle is registered for
# DOMContentLoaded, so the first invocation hides the stderr nodes; the link
# rendered at the end of the markup calls the same toggle on click.

from IPython.display import HTML

# The HTML object is the cell's last expression so that IPython displays it.
HTML('''<script>
var code_show_err = false;
var code_toggle_err = function() {
    var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
    var stderr = Array.from(stderrNodes)
    if (code_show_err){
        stderr.forEach(ele => ele.style.display = 'block');
    } else {
        stderr.forEach(ele => ele.style.display = 'none');
    }
    code_show_err = !code_show_err
}
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')

In [4]:
def parse_output(nb_out, seed):
    """Extract the result lines printed for ``seed`` from an executed notebook.

    Scans every cell output of the notebook and picks the first one whose
    ``text`` starts with ``"SEED=<seed> "`` (the trailing space keeps seed 1
    from matching the output of seed 10).

    Args:
        nb_out: Indexable whose element 0 is the executed notebook, i.e. a
            mapping with a ``"cells"`` list. Callers in this notebook pass the
            return value of ``ExecutePreprocessor.preprocess``.
        seed: Seed whose result line should be located.

    Returns:
        The matching output text split on newlines. Callers in this notebook
        print element 0 and parse element 1 as the F1 score.

    Raises:
        ValueError: If no cell output starts with the expected prefix.
            Previously this case returned ``[""]`` and only surfaced later as
            an unexplained ``IndexError`` in the caller.
    """
    prefix = f"SEED={seed} "
    for cell in nb_out[0]["cells"]:
        # Cells without outputs (e.g. markdown cells) simply contribute nothing.
        for output in cell.get("outputs", []):
            text = output.get("text", "")
            if text.startswith(prefix):
                return text.split("\n")
    raise ValueError(
        f"No cell output starting with {prefix!r} was found in the executed notebook"
    )


def print_results(f1_scores):
    """Print a banner with the mean ± standard deviation of the F1 scores.

    The label in front of the statistics is built from the environment
    variables ``EXSTRAQT_FILE_SIZE``, ``EXSTRAQT_HIGH_ILLICIT`` and the three
    ``EXSTRAQT_{TRAIN,VALIDATION,TEST}_PERC`` split fractions.

    Args:
        f1_scores: Sequence of F1 scores, one per seed.

    Raises:
        KeyError: If one of the environment variables above is not set.
        ValueError: If a split fraction or the high-illicit flag is not numeric.
    """

    def _env_percent(name):
        # Round to the nearest integer instead of truncating: float("0.29") * 100
        # is 28.999999999999996, which int() would turn into 28.
        return round(float(os.environ[name]) * 100)

    f1_mean = round(np.mean(f1_scores), 2)
    f1_std = round(np.std(f1_scores), 2)

    train_perc = _env_percent("EXSTRAQT_TRAIN_PERC")
    validation_perc = _env_percent("EXSTRAQT_VALIDATION_PERC")
    test_perc = _env_percent("EXSTRAQT_TEST_PERC")

    high_illicit = bool(int(os.environ["EXSTRAQT_HIGH_ILLICIT"]))
    high_or_low_illicit = "HI" if high_illicit else "LI"
    file_size = os.environ["EXSTRAQT_FILE_SIZE"]

    print("=" * 100)
    print(
        f"[{file_size}-{high_or_low_illicit}-SPLITS={train_perc}/{validation_perc}/{test_perc}]"
        f" {f1_mean} ± {f1_std}"
    )
    print("=" * 100)

In [5]:
# Exported through the environment; presumably read by the executed notebooks
# to size their worker pool -- confirm against 2_model.ipynb / training.ipynb.
os.environ["EXSTRAQT_NUM_PROCS"] = "9"

# Notebook files are UTF-8 encoded JSON, so the encoding is passed explicitly
# instead of depending on the platform's default text encoding.
with open("2_model.ipynb", encoding="utf-8") as model_file:
    nb_in_model = nbformat.read(model_file, nbformat.NO_CONVERT)

with open("training.ipynb", encoding="utf-8") as training_file:
    nb_in_training = nbformat.read(training_file, nbformat.NO_CONVERT)

In [6]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=79.32 recall=67.96

SEED=20 f1=78.95 recall=67.8

SEED=30 f1=78.71 recall=67.74

SEED=40 f1=78.79 recall=67.46

SEED=50 f1=78.73 recall=67.74

[Small-HI-SPLITS=60/20/20] 78.9 ± 0.23
CPU times: user 1.78 s, sys: 1.74 s, total: 3.52 s
Wall time: 1h 1min 33s


In [7]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.64"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.19"
os.environ["EXSTRAQT_TEST_PERC"] = "0.17"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/05 17:58:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=81.28 recall=71.01

SEED=20 f1=80.74 recall=70.14

SEED=30 f1=80.97 recall=70.14

SEED=40 f1=80.35 recall=69.27

SEED=50 f1=81.11 recall=70.52

[Small-HI-SPLITS=64/19/17] 80.89 ± 0.32
CPU times: user 11.9 s, sys: 5.79 s, total: 17.7 s
Wall time: 4h 20min 20s


In [8]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/08 03:28:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=42.55 recall=29.95

SEED=20 f1=42.28 recall=29.73

SEED=30 f1=42.59 recall=29.51

SEED=40 f1=43.49 recall=29.95

SEED=50 f1=42.75 recall=29.95

[Small-LI-SPLITS=60/20/20] 42.73 ± 0.41
CPU times: user 12.6 s, sys: 5.56 s, total: 18.1 s
Wall time: 4h 31min 1s


In [6]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Small"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.64"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.19"
os.environ["EXSTRAQT_TEST_PERC"] = "0.17"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=45.3 recall=30.92

SEED=20 f1=45.03 recall=30.8

SEED=30 f1=45.24 recall=30.8

SEED=40 f1=44.93 recall=30.67

SEED=50 f1=45.01 recall=30.92

[Small-LI-SPLITS=64/19/17] 45.1 ± 0.14
CPU times: user 3.38 s, sys: 5.14 s, total: 8.52 s
Wall time: 2h 18min 43s


In [7]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Medium"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.15"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=80.78 recall=73.56

SEED=20 f1=80.67 recall=73.81

SEED=30 f1=80.81 recall=73.88

SEED=40 f1=80.53 recall=73.79

SEED=50 f1=80.75 recall=73.69

[Medium-HI-SPLITS=60/20/20] 80.71 ± 0.1
CPU times: user 2.11 s, sys: 1.47 s, total: 3.58 s
Wall time: 1h 33min 48s


In [7]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Medium"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.61"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.17"
os.environ["EXSTRAQT_TEST_PERC"] = "0.22"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.15"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/08 22:10:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".
                                                                                

SEED=10 f1=80.08 recall=72.55

SEED=20 f1=79.64 recall=72.62

SEED=30 f1=79.83 recall=72.62

SEED=40 f1=79.97 recall=72.34

SEED=50 f1=79.78 recall=72.11

[Medium-HI-SPLITS=61/17/22] 79.86 ± 0.15
CPU times: user 23.9 s, sys: 12.6 s, total: 36.6 s
Wall time: 12h 23min 25s


In [7]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Medium"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.15"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=47.47 recall=33.2

SEED=20 f1=47.61 recall=33.23

SEED=30 f1=47.51 recall=33.28

SEED=40 f1=47.93 recall=33.31

SEED=50 f1=47.81 recall=33.76

[Medium-LI-SPLITS=60/20/20] 47.67 ± 0.18
CPU times: user 2.83 s, sys: 1.96 s, total: 4.79 s
Wall time: 1h 25min 1s


In [6]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Medium"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.61"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.17"
os.environ["EXSTRAQT_TEST_PERC"] = "0.22"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.15"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=45.19 recall=29.99

SEED=20 f1=45.27 recall=30.34

SEED=30 f1=45.3 recall=30.06

SEED=40 f1=45.26 recall=29.99

SEED=50 f1=45.37 recall=30.31

[Medium-LI-SPLITS=61/17/22] 45.28 ± 0.06
CPU times: user 2.75 s, sys: 1.74 s, total: 4.49 s
Wall time: 1h 23min 58s


In [6]:
# From here on `nb_in_training` refers to training_large.ipynb, so every cell
# below runs that notebook instead of training.ipynb.
# Notebook files are UTF-8 encoded JSON, so the encoding is passed explicitly
# instead of depending on the platform's default text encoding.
with open("training_large.ipynb", encoding="utf-8") as training_large_file:
    nb_in_training = nbformat.read(training_large_file, nbformat.NO_CONVERT)

In [None]:
%%time

# Configuration for this run: FILE_SIZE="Large", HIGH_ILLICIT="1", 60/20/20 split,
# with a dimensionality-reduction fraction of 0.075 and the float16-scaling and
# skip-anomaly-detection flags switched on.
os.environ["EXSTRAQT_HIGH_ILLICIT"] = "1"
os.environ["EXSTRAQT_FILE_SIZE"] = "Large"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.075"
os.environ["EXSTRAQT_SCALE_TO_FLOAT_16"] = "1"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

# Run `nb_in_model` once for this configuration (timeout of 24 hours, in seconds).
executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

# NOTE(review): the per-seed training loop, the summary and the ./features/
# cleanup are commented out below, so this cell only executes `nb_in_model` and
# reports no F1 score for Large/HI. Presumably the run was left unfinished --
# confirm, then either restore the block or delete it.
# f1_scores = []
# for seed in [10, 20, 30, 40, 50]:
#     os.environ["EXSTRAQT_SEED"] = str(seed)
#     result = parse_output(executor.preprocess(nb_in_training), seed)
#     print(result[0])
#     print()
#     f1_scores.append(float(result[1]))

# print_results(f1_scores)
# shutil.rmtree("./features/", ignore_errors=True)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/09 21:41:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/09 21:41:29 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in standalone/kubernetes and LOCAL_DIRS in YARN).

In [7]:
%%time

os.environ["EXSTRAQT_HIGH_ILLICIT"] = "0"
os.environ["EXSTRAQT_FILE_SIZE"] = "Large"
os.environ["EXSTRAQT_TRAIN_PERC"] = "0.60"
os.environ["EXSTRAQT_VALIDATION_PERC"] = "0.20"
os.environ["EXSTRAQT_TEST_PERC"] = "0.20"
os.environ["EXSTRAQT_DIM_REDUCTION_PERC"] = "0.075"
os.environ["EXSTRAQT_SCALE_TO_FLOAT_16"] = "1"
os.environ["EXSTRAQT_SKIP_ANOMALY_DETECTION"] = "1"

executor = ExecutePreprocessor(timeout=24*60*60, kernel_name="python3")
_ = executor.preprocess(nb_in_model)

f1_scores = []
for seed in [10, 20, 30, 40, 50]:
    os.environ["EXSTRAQT_SEED"] = str(seed)
    result = parse_output(executor.preprocess(nb_in_training), seed)
    print(result[0])
    print()
    f1_scores.append(float(result[1]))

print_results(f1_scores)
shutil.rmtree("./features/", ignore_errors=True)

SEED=10 f1=38.44 recall=26.08



python3.11(11092) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


SEED=20 f1=38.4 recall=26.0



python3.11(33127) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


SEED=30 f1=38.26 recall=26.09



python3.11(56715) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


SEED=40 f1=38.11 recall=26.18



python3.11(80152) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


SEED=50 f1=38.45 recall=26.3

[Large-LI-SPLITS=60/20/20] 38.33 ± 0.13
CPU times: user 28.6 s, sys: 28.1 s, total: 56.7 s
Wall time: 1d 28min 44s
