In [None]:
from pathlib import Path
import os

# Root directory that is searched (recursively) for the input .parquet datasets.
base_folder = Path("/home/automl/git/iot-threat-classifier/2025-06-28/Input_Multiclass")

# Sentinel file inside the input root; processing waits until this file exists.
dispatcher_filename = base_folder / "start"

In [None]:
from datetime import datetime
from time import sleep

def now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS,mmm``.

    The part after the comma is the millisecond component of the timestamp,
    zero-padded to three digits.
    """
    current = datetime.now()
    millis = current.microsecond // 1000
    return f"{current:%Y-%m-%d %H:%M:%S},{millis:03d}"

# Block until the dispatcher file appears, re-checking every 300 seconds.
while True:
    if dispatcher_filename.exists():
        break
    print(f'[{now()}] Dispatcher file does not exist; sleeping...')
    sleep(300)

print(f'[{now()}] Dispatcher file EXISTS; starting...')

In [None]:
import json
import numpy as np
import pandas as pd

def load_results(results_filename):
    """Read a results spreadsheet and report its best configuration.

    The row with the largest ``f1_score_abs`` value is selected; its
    ``'Unnamed: 0'`` cell is used as the configuration label.

    Args:
        results_filename: Path of the Excel file, passed to ``pd.read_excel``.

    Returns:
        A ``(config, f1)`` tuple of strings, each left-aligned and padded to at
        least 8 characters. The F1 value is rendered with six decimals. Either
        element is ``'N/A'`` (padded) when the best row cannot be determined,
        e.g. a required column is missing or the value is not a number.

    Raises:
        Whatever ``pd.read_excel`` raises; reading happens outside the
        best-effort ``try`` block.
    """
    summary = pd.read_excel(results_filename)
    try:
        best_idx = summary['f1_score_abs'].idxmax()
        best_row = summary.loc[best_idx, ['Unnamed: 0', 'f1_score_abs']]
        cfg_text = str(best_row['Unnamed: 0'])
        f1_text = f"{float(best_row['f1_score_abs']):.6f}"
    except Exception:
        # Best effort: any failure while picking the best row yields placeholders.
        cfg_text, f1_text = '', 'nan'

    cfg_label = cfg_text or 'N/A'
    f1_label = 'N/A' if f1_text == 'nan' else f1_text
    return f"{cfg_label:<8}", f"{f1_label:<8}"

def parse_exception(e):
    """Condense an exception into a single line suitable for a log message.

    Returns the last non-blank line of ``str(e)``. For multi-line messages that
    end with a newline this is the same line the previous ``split("\\n")[-2]``
    lookup produced, while single-line messages are now returned as-is instead
    of being reported as "unknown error".

    Args:
        e: The exception (or any object) to summarise.

    Returns:
        The last non-blank line of the message, or ``"unknown error"`` when the
        message is empty or cannot be converted to a string.
    """
    try:
        lines = [line for line in str(e).splitlines() if line.strip()]
    except Exception:
        # str(e) itself can fail for exotic exception classes; stay best-effort.
        return "unknown error"
    return lines[-1] if lines else "unknown error"

In [None]:
from tqdm.notebook import tqdm
import papermill as pm

# One-line error summary per failed dataset, keyed by dataset path.
errors = {}

# Notebook that papermill executes once per dataset.
input_notebook = 'evaluator_code.ipynb'

# Parameters handed to the evaluator notebook that do not vary between datasets;
# dataset_path and output_folder are added per file inside the loop.
common_parameters = dict(
    target_column='label',
    handle_object_cols='keep',
    sampling_rate_global=None,
    sampling_rate_sets=0.10,
    sample_sets=['train'],
    min_samples_per_class=1,
    feature_selection_threshold=0.95,
    sample_filtering_quantile=0.10,
    hpo_n_trials=100,
    hpo_timeout=900,
    num_boost_round=300,
    early_stopping_rounds=15,
    n_jobs=-1,
    random_state=42,
    plot_param_importances=False
)

# Find all .parquet files recursively
parquet_files = list(base_folder.rglob("*.parquet"))

# Sort by file size (ascending) so the smallest datasets are processed first
parquet_files_sorted = sorted(parquet_files, key=os.path.getsize)

# Iterate; leave must be the boolean False (the string 'False' is truthy and
# would keep the finished progress bar on screen).
for file in tqdm(parquet_files_sorted, desc='File', leave=False):

    dataset_path = str(file)
    size_mb = f'{(os.path.getsize(dataset_path) / (1024 * 1024)):.3f} MB'

    # Mirror the input tree under Output_Multiclass, one sub-folder per dataset.
    output_folder = Path(dataset_path.replace('/Input_Multiclass/', '/Output_Multiclass/')).parent
    output_folder = os.path.join(str(output_folder), str(file.stem))
    os.makedirs(output_folder, exist_ok=True)

    output_notebook = os.path.join(output_folder, 'xgb_execution.ipynb')
    results_filename = os.path.join(output_folder, 'xgb_summary_table.xlsx')
    # Side files live next to the executed notebook; built with join so that a
    # folder name containing '.ipynb' cannot corrupt the path.
    params_filename = os.path.join(output_folder, 'xgb_execution_params.json')
    errors_filename = os.path.join(output_folder, 'xgb_execution_errors.json')

    try:

        tqdm.write(f'[{now()}] Processing | FILE = {str(file.stem):<64} | FILE_SIZE = {size_mb:<12}')

        # Skip execution when a results table already exists for this dataset.
        if not Path(results_filename).exists():

            parameters = dict(
                dataset_path=dataset_path,
                output_folder=output_folder,
                **common_parameters
            )

            # Persist the exact parameters used for this run.
            with open(params_filename, 'w', encoding='utf-8') as f:
                json.dump(parameters, f, indent=4)

            pm.execute_notebook(input_notebook, output_notebook, parameters=parameters)

        max_config, max_f1_weighted = load_results(results_filename)
        tqdm.write(f'[{now()}] Processed  | FILE = {str(file.stem):<64} | BEST_CONFIG = {max_config} | F1_WEIGHTED = {max_f1_weighted}')

    except Exception as e:
        # Best effort: log, remember and persist the failure, then continue
        # with the next dataset.
        errors[dataset_path] = parse_exception(e)
        tqdm.write(f'[{now()}] ERROR      | FILE = {str(file.stem):<64} | EXCEPTION = {parse_exception(e)}')
        with open(errors_filename, 'w', encoding='utf-8') as f:
            json.dump({"timestamp": now(), "file": str(file), "error": str(e).split('\n')}, f, indent=2)