In [None]:
import gc
import os
import random
import time
from glob import glob
from sklearn.metrics import f1_score, recall_score

import numpy as np
import xgboost as xgb

import settings as s

# Publish the data-type folder name (the settings postfix with its leading
# dashes stripped) through the process environment; the path-building cell
# reads it back from os.environ.
data_type_folder = s.OUTPUT_POSTFIX.lstrip("-")
os.environ["EXSTRAQT_DATA_TYPE_FOLDER"] = data_type_folder

In [None]:
# Guard: stop early unless the settings module selects the "Large" dataset.
is_large_dataset = s.FILE_SIZE == "Large"
assert is_large_dataset, "Script suitable for `large`dataset"

In [None]:
# Run configuration, overridable through environment variables.
# EXSTRAQT_SEED (default 42) seeds both Python's and NumPy's global RNGs.
SEED = int(os.environ.get("EXSTRAQT_SEED", 42))
print(f"{SEED=}")
random.seed(SEED)
np.random.seed(SEED)
# EXSTRAQT_NUM_PROCS defaults to the machine's CPU count. os.cpu_count() may
# return None when the count cannot be determined, which would make int()
# raise TypeError, so fall back to a single process in that case.
EXSTRAQT_NUM_PROCS = int(os.environ.get("EXSTRAQT_NUM_PROCS", os.cpu_count() or 1))

In [None]:
# Root folder of the feature files for the data type exported in the
# EXSTRAQT_DATA_TYPE_FOLDER environment variable.
location_main = os.path.join("features", os.environ["EXSTRAQT_DATA_TYPE_FOLDER"])

# One "<split>_dm.bin" location per data split, all directly under location_main.
(
    location_train_features_dm,
    location_valid_features_dm,
    location_test_features_dm,
) = (
    os.sep.join((location_main, f"{split}_dm.bin"))
    for split in ("train", "valid", "test")
)

In [None]:
# Probe for a CUDA device through torch. torch is treated as optional: when it
# is not installed the flag stays False and training runs on the CPU.
cuda_available = False
try:
    import torch
    cuda_available = torch.cuda.is_available()
except ImportError:
    pass

# Parameters handed to xgb.train. The device and thread count depend on
# whether CUDA was detected above: 2 threads on "cuda", 9 threads on "cpu".
# NOTE(review): nthread is hard-coded here; EXSTRAQT_NUM_PROCS from the
# configuration cell is not consulted — confirm this is intended.
xgb_args = {
    "seed": SEED,
    "max_depth": 6,
    "scale_pos_weight": 1.5,
    "eta": 0.2,
    "subsample": 1,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "colsample_bynode": 0.5,
    "num_parallel_tree": 15,
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "disable_default_eval_metric": True,
    "nthread": 2 if cuda_available else 9,
    "device": "cuda" if cuda_available else "cpu",
}

In [None]:
# Collect the training shards. The training location may be a directory of
# numerically named "<index>.bin" files; these are ordered by descending index.
# When nothing matches the glob, the location itself is used as a single file.
# The index is parsed from os.path.basename() rather than by splitting on "/",
# so it also works where glob returns paths using a different os.sep.
files = sorted(
    glob(f"{location_train_features_dm}{os.sep}*.bin"),
    reverse=True,
    key=lambda path: int(os.path.basename(path).split(".")[0]),
) or [location_train_features_dm]
# Repeat the whole list 25 times: the training loop runs one boosting round per
# list entry, so every file is visited 25 times.
files *= 25

In [None]:
%%time

# Incremental training: every entry of the file list contributes one boosting
# round on top of the model produced by the previous entry (xgb_model=model).
validation_dm = xgb.DMatrix(location_valid_features_dm)

model = None
all_training_files = files or [location_train_features_dm]
for index, fl in enumerate(all_training_files):
    train_dm = xgb.DMatrix(fl)
    model = xgb.train(
        xgb_args,
        train_dm,
        num_boost_round=1,
        evals=[(validation_dm, "validation")],
        verbose_eval=True,
        # Kept so that model.best_iteration is populated; it is printed below
        # and used again at prediction time.
        early_stopping_rounds=10,
        xgb_model=model,
    )
    print(f"Trained {index + 1} of {len(all_training_files)} | {model.best_iteration=}")
    # Release this shard before the next one is loaded. Collecting at the top
    # of the loop while `train_dm` was still bound kept the previous shard
    # alive during the next load, so two shards were resident at once.
    del train_dm
    gc.collect()
del validation_dm

In [None]:
# Predict on the test matrix and threshold the probabilities at 0.5.
test_dm = xgb.DMatrix(location_test_features_dm)
# iteration_range is half-open, [begin, end): the end bound must be
# best_iteration + 1, otherwise the best boosting round itself is left out.
y_test_predicted = (
    model.predict(test_dm, iteration_range=(0, model.best_iteration + 1)) > 0.5
)

In [None]:
# Score the thresholded test predictions against the labels stored in the test
# matrix; both metrics are scaled to percentages before printing.
y_test_true = test_dm.get_label()
f1_test = f1_score(y_test_true, y_test_predicted) * 100
recall_test = recall_score(y_test_true, y_test_predicted) * 100
print(f"{SEED=}", f"f1={round(f1_test, 2)}", f"recall={round(recall_test, 2)}")
del test_dm, y_test_true
# The unrounded F1 goes on its own line, followed by padding output.
print(f1_test)
print("")   # Extra buffer for parsing nb output
time.sleep(5)
print("\n")