# Task Settings

In [1]:
import os

# Credentials: prefer a key that is already exported in the environment
# (e.g. `export OPENAI_API_KEY=...` before launching Jupyter) so the secret
# never has to be written into the notebook. `setdefault` only installs the
# placeholder when the variable is missing, instead of overwriting a real key
# with "YOUR_API_KEY" every time this cell runs.
os.environ.setdefault('OPENAI_API_KEY', "YOUR_API_KEY")


from utils import *
from prompts import Prompts, TASK_LABLES, TAGS

# --- Which task to run -------------------------------------------------------
# Key used to index Prompts / TASK_LABLES / TAGS and passed to load_task.
# Values listed by the original note: 'trafficsafe', 'swiss'.
dataset_name = 'trafficsafe'

# --- Engines -----------------------------------------------------------------
# Forward engine: the model whose prompts are being optimised.
test_model = "experimental:gpt-4o-mini"
# Backward engine: also handed to load_task as the evaluation API.
eval_model = "gpt-4o"

# --- Run bookkeeping ---------------------------------------------------------
# Number of independent optimisation runs each worker performs.
iteration = 1
# Free-form tag embedded in the output JSON file names.
date = '0919'

# --- Optimisation schedule ---------------------------------------------------
# Passed to run_training as `steps`; original note: 5-11.
total_steps = 9
# Passed to run_training as `epoch`; original note: 2, 1.
epoch = 1
# Batch size of the training DataLoader; original note: 1-3.
batch_size = 3

# Initialize

In [2]:
# Dataset-specific labels, wrapping tags and prompt templates.
cm_labels = TASK_LABLES[dataset_name]
tags = TAGS[dataset_name]
CAUSAL_SYSTEM = Prompts[dataset_name]['CAUSAL_SYSTEM']
CAUSAL_SYSTEM_CONSTRAINT = Prompts[dataset_name]['CAUSAL_SYSTEM_CONSTRAINT']
SYSTEM = Prompts[dataset_name]['SYSTEM']

# Engines: the evaluation engine doubles as the backward engine; the forward
# (test) engine is created with caching disabled.
llm_api_eval = tg.get_engine(engine_name=eval_model)
llm_api_test = tg.get_engine(engine_name=test_model, cache=False)
tg.set_backward_engine(llm_api_eval, override=True)

# Load the three splits plus the evaluation function, and build the training
# loader on top of the train split.
train_set, val_set, test_set_ori, eval_fn = load_task(
    dataset_name,
    evaluation_api=llm_api_eval,
    prompt_col="organized_prompt",
)
train_loader = tg.tasks.DataLoader(train_set, batch_size=batch_size, shuffle=True)

# Column holding the prompt text depends on the dataset.
if dataset_name == 'swiss':
    col = "organized_prompt"
else:
    col = "prompt"


def _wrap_in_tags(text):
    """Surround one prompt with the dataset's opening and closing tags."""
    return f"{tags[0]}{text}{tags[1]}"


# Tag every prompt in place, in train / val / test order.
for _split in (train_set, val_set, test_set_ori):
    _split.data[col] = _split.data[col].apply(_wrap_in_tags)

print("Train/Val/Test Set Lengths: ", *map(len, (train_set, val_set, test_set_ori)))

Train/Val/Test Set Lengths:  100 100 100


In [3]:
import matplotlib.pyplot as plt
from copy import deepcopy

# Empty module-level list; nothing in this cell fills it.
res = []

# Work on a copy so the test split loaded during initialisation stays pristine.
test_set = deepcopy(test_set_ori)

# Build the prompt variables, the two models and their optimizers.
# `init` is defined outside this notebook (presumably in utils — confirm).
(
    system_prompt,
    causal_prompt,
    model,
    causal_model,
    optimizer,
    optimizer_causal,
) = init(SYSTEM, CAUSAL_SYSTEM, llm_api_test, llm_api_eval, CAUSAL_SYSTEM_CONSTRAINT)

# One independent empty list per tracked series.
results = {
    series: []
    for series in ("test_f1", "prompt", "validation_f1", "system_prompt", "causal_prompt")
}

# Run EGO-Prompt

In [4]:
import time
import copy
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_one_worker(worker_id: int):
    """Run ``iteration`` independent optimisation runs on behalf of one worker.

    Every run re-creates the prompts, models and optimizers with ``init``,
    scores the starting prompts with ``init_eval`` and then optimises them
    with ``run_training``. The run whose *final* validation F1 is highest
    decides which test F1 is reported as the worker's best.

    The function reads module-level state instead of taking it as arguments:
    ``test_set``, ``test_set_ori``, ``val_set``, ``train_loader``, ``col``,
    ``iteration``, ``date``, ``dataset_name``, ``test_model``, ``epoch``,
    ``total_steps``, ``ITERS``, ``cm_labels``, ``eval_fn``, the prompt
    templates and both engines. All of them (``ITERS`` included, which is
    assigned in the launch cell) must exist before the first call.

    Args:
        worker_id: Index of the worker; used in log lines, in the per-sample
            prompt prefix and in the output JSON file name.

    Returns:
        dict with keys
        ``best_test_f1``  - final test F1 of the run with the best final
                            validation F1 (``-inf`` if no run was executed),
        ``best_val_f1``   - that best final validation F1,
        ``val_f1s``       - list with each run's ``results['validation_f1']``,
        ``test_f1s``      - list with each run's ``results['test_f1']``,
        ``worker_id``     - the ``worker_id`` argument.
    """
    # Private copies so concurrent workers never mutate each other's data.
    local_test_set = copy.deepcopy(test_set)
    local_test_set_ori = copy.deepcopy(test_set_ori)
    local_val_set = copy.deepcopy(val_set)
    # The loader used to be shared by reference across worker threads. Its
    # class lives outside this file; if it keeps its iteration cursor on the
    # instance (as TextGrad's DataLoader appears to — confirm), concurrent
    # workers would steal and reset each other's batches. Copy it like the
    # other inputs so every worker walks its own training stream.
    local_train_loader = copy.deepcopy(train_loader)

    val_performance = float('-inf')
    test_performance = float('-inf')
    all_val_f1s = []
    all_test_f1s = []

    # Prefix every test prompt with a timestamp/worker marker so that each
    # worker sends textually distinct prompts (presumably to avoid engine-side
    # response caching — confirm).
    local_test_set.data[col] = local_test_set_ori.data[col].apply(
        lambda x: f"<!-- {time.time()} (w{worker_id}) -->, {x}"
    )

    for cur_iter in range(iteration):
        print(f"[Worker {worker_id}] [Iteration {cur_iter+1}/{iteration}] begin")
        output_json = (
            f"res/{date}_{dataset_name}_{test_model.split('/')[-1].split(':')[-1]}_"
            f"w{worker_id}_it{cur_iter+1}.json"
        )
        initialize_json_file(output_json)

        # Fresh prompts / models / optimizers for every run.
        system_prompt, causal_prompt, model, causal_model, optimizer, optimizer_causal = init(
            SYSTEM, CAUSAL_SYSTEM, llm_api_test, llm_api_eval, CAUSAL_SYSTEM_CONSTRAINT
        )

        # Baseline evaluation of the un-optimised prompts; only `results` is
        # carried forward into training.
        results, test_res, val_res = init_eval(
            local_val_set, local_test_set, eval_fn, model, causal_model,
            system_prompt, causal_prompt, cm_labels, iters=ITERS
        )

        results = run_training(
            local_train_loader, local_val_set, local_test_set, eval_fn,
            model, causal_model, system_prompt, causal_prompt,
            optimizer, optimizer_causal, results, cm_labels,
            output_json=output_json, epoch=epoch, steps=total_steps, iters=ITERS
        )

        all_val_f1s.append(results['validation_f1'])
        all_test_f1s.append(results['test_f1'])

        # Model selection uses the last recorded validation score of the run;
        # the test score is only read, never used to choose.
        cur_val = results['validation_f1'][-1]
        cur_test = results['test_f1'][-1]
        if cur_val > val_performance:
            val_performance = cur_val
            test_performance = cur_test

        print(f"[Worker {worker_id}] [Iteration {cur_iter+1}] "
              f"val_best={val_performance:.4f}, test_at_best={test_performance:.4f}")

    return {
        'best_test_f1': test_performance,
        'best_val_f1': val_performance,
        'val_f1s': all_val_f1s,
        'test_f1s': all_test_f1s,
        'worker_id': worker_id,
    }


# Running the cell below will incur API usage charges. Refer to our paper for a detailed cost breakdown.

In [5]:
# Launch settings. ITERS, total_steps and epoch are module-level values that
# run_one_worker reads when it executes, so they must be set before submitting.
NUM_WORKERS = 3
ITERS = 1
total_steps = 9
epoch = 1

# Best test F1 of every worker that finished, in *completion* order
# (not worker-id order).
EGO_res = []
# worker_id -> exception, for workers that raised.
failed_workers = {}

with ThreadPoolExecutor(max_workers=NUM_WORKERS) as ex:
    future_to_worker = {ex.submit(run_one_worker, i): i for i in range(NUM_WORKERS)}
    for fut in as_completed(future_to_worker):
        submitted_id = future_to_worker[fut]
        try:
            # A distinct name keeps the module-level list `res` from the
            # earlier cell from being replaced by a dict.
            worker_result = fut.result()
        except Exception as exc:
            # Record the failure and keep collecting: the other workers'
            # results are already paid for and should not be discarded.
            failed_workers[submitted_id] = exc
            print(f"[Main] Worker {submitted_id} failed: {exc!r}")
            continue

        EGO_res.append(worker_result['best_test_f1'])

        print(f"[Main] Worker {worker_result['worker_id']} done. "
              f"Best test_f1={worker_result['best_test_f1']:.4f}")

print("EGO_res (best test F1 per worker):", EGO_res)

# Still fail loudly, but only after every surviving result has been reported.
if failed_workers:
    first_failed_id = next(iter(failed_workers))
    raise RuntimeError(
        f"{len(failed_workers)} of {NUM_WORKERS} workers failed "
        f"(worker ids: {sorted(failed_workers)})"
    ) from failed_workers[first_failed_id]

[Worker 0] [Iteration 1/1] begin
[Worker 2] [Iteration 1/1] begin
[Worker 1] [Iteration 1/1] begin


Accuracy: 0.3000: 100%|██████████| 100/100 [00:22<00:00,  4.42it/s]
Accuracy: 0.3000: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]
Accuracy: 0.2500: 100%|██████████| 100/100 [00:28<00:00,  3.51it/s]
Accuracy: 0.2600: 100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


SCG_val_f1: 0.1821413043478261, SCG_test_f1:0.21471328825114078


Accuracy: 0.2987:  75%|███████▌  | 75/100 [00:15<00:02,  9.26it/s]


Epoch 0, Step 0


Accuracy: 0.2700: 100%|██████████| 100/100 [00:22<00:00,  4.40it/s]


SCG_val_f1: 0.2005776843146957, SCG_test_f1:0.17242979242979242


0it [00:00, ?it/s]


Epoch 0, Step 0


Accuracy: 0.2900: 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


SCG_val_f1: 0.23078582364786895, SCG_test_f1:0.212860932171277


0it [00:00, ?it/s]


Epoch 0, Step 0


Accuracy: 0.2800: 100%|██████████| 100/100 [00:15<00:00,  6.48it/s]


[System Validation] F1: 0.2797, Previous F1: 0.2006
[System Validation CM]:
[[ 2  9 12  0]
 [ 3  5 13  1]
 [ 2  4 10  8]
 [ 2  2 15 11]]


Accuracy: 0.2400: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Skip Test
[System Validation] F1: 0.1462, Previous F1: 0.1821
[System Validation CM]:
[[ 1  5 16  0]
 [ 0  3 19  0]
 [ 1  2 20  0]
 [ 1  2 28  0]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:10<00:00,  9.13it/s]


Skip Test
[Causal Validation] F1: 0.2553, Previous F1: 0.2797
[Causal Validation CM]:
[[ 0  5 17  1]
 [ 0  8 11  3]
 [ 0  4 13  7]
 [ 1  5 17  8]]


Accuracy: 0.2800: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]
Accuracy: 0.2700: 100%|██████████| 100/100 [00:17<00:00,  5.60it/s]
Accuracy: 0.3152:  90%|█████████ | 90/100 [00:15<00:01,  7.04it/s]

Skip Test
[System Validation] F1: 0.1790, Previous F1: 0.2308
[System Validation CM]:
[[ 0 17  6  0]
 [ 0 13  9  0]
 [ 1  8 15  0]
 [ 1 10 20  0]]
[Test Result] F1: 0.2461

Epoch 0, Step 1


Accuracy: 0.3200: 100%|██████████| 100/100 [00:19<00:00,  5.12it/s]


[Causal Validation] F1: 0.2501, Previous F1: 0.1821
[Causal Validation CM]:
[[ 5 10  8  0]
 [ 4 13  5  0]
 [ 2  8 14  0]
 [ 5  9 17  0]]


Accuracy: 0.3300: 100%|██████████| 100/100 [00:16<00:00,  6.08it/s]
1it [03:59, 239.55s/it]

[Test Result] F1: 0.2636

Epoch 0, Step 1


Accuracy: 0.2600: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]
1it [04:04, 244.48s/it]

Skip Test
[Causal Validation] F1: 0.2259, Previous F1: 0.2308
[Causal Validation CM]:
[[10 10  1  0]
 [ 8  9  5  0]
 [ 6 12  6  0]
 [ 7  7 16  1]]
Skip Test

Epoch 0, Step 1


Accuracy: 0.2900: 100%|██████████| 100/100 [00:13<00:00,  7.25it/s]


Skip Test
[System Validation] F1: 0.2759, Previous F1: 0.2797
[System Validation CM]:
[[ 1 12  9  1]
 [ 1  6 12  3]
 [ 3  4 11  6]
 [ 0  7 13 11]]


Accuracy: 0.2600: 100%|██████████| 100/100 [00:21<00:00,  4.76it/s]
Accuracy: 0.3000:  57%|█████▋    | 57/100 [00:07<00:01, 28.74it/s]

Skip Test
[System Validation] F1: 0.2063, Previous F1: 0.2308
[System Validation CM]:
[[ 1 17  3  2]
 [ 0 11 11  0]
 [ 0 10 12  2]
 [ 0 13 16  2]]


Accuracy: 0.2800: 100%|██████████| 100/100 [00:24<00:00,  4.01it/s]
2it [07:18, 220.69s/it]          | 1/100 [00:04<07:39,  4.64s/it]

Skip Test
[Causal Validation] F1: 0.2710, Previous F1: 0.2797
[Causal Validation CM]:
[[ 5 18  0  0]
 [ 5 13  3  1]
 [ 2 15  5  2]
 [ 0 17  9  5]]
Skip Test

Epoch 0, Step 2


Accuracy: 0.1900: 100%|██████████| 100/100 [00:55<00:00,  1.79it/s]


Skip Test
[System Validation] F1: 0.1449, Previous F1: 0.2501
[System Validation CM]:
[[ 2 17  4  0]
 [ 5  8  9  0]
 [ 1 14  9  0]
 [ 1 13 17  0]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:18<00:00,  5.27it/s]
2it [07:31, 222.32s/it]

Skip Test
[Causal Validation] F1: 0.2045, Previous F1: 0.2308
[Causal Validation CM]:
[[ 2 10 10  1]
 [ 2  8 12  0]
 [ 1  4 19  0]
 [ 2  7 22  0]]
Skip Test

Epoch 0, Step 2


Accuracy: 0.2800: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]
2it [08:23, 254.00s/it]

Skip Test
[Causal Validation] F1: 0.2373, Previous F1: 0.2501
[Causal Validation CM]:
[[ 8 12  3  0]
 [ 5  9  8  0]
 [ 3 10 11  0]
 [ 0 14 17  0]]
Skip Test

Epoch 0, Step 2


Accuracy: 0.2800: 100%|██████████| 100/100 [00:17<00:00,  5.84it/s]


Skip Test
[System Validation] F1: 0.2678, Previous F1: 0.2797
[System Validation CM]:
[[ 1 15  6  1]
 [ 1 10 10  1]
 [ 1 10  8  5]
 [ 2 13  7  9]]


Accuracy: 0.3200: 100%|██████████| 100/100 [00:08<00:00, 11.50it/s]


[Causal Validation] F1: 0.2970, Previous F1: 0.2797
[Causal Validation CM]:
[[ 1 13  9  0]
 [ 2 11  9  0]
 [ 2  3 11  8]
 [ 1  8 13  9]]


Accuracy: 0.3300: 100%|██████████| 100/100 [00:09<00:00, 10.03it/s]
3it [10:09, 198.15s/it]

[Test Result] F1: 0.2876

Epoch 0, Step 3


Accuracy: 0.2500: 100%|██████████| 100/100 [00:25<00:00,  3.85it/s]


Skip Test
[System Validation] F1: 0.1498, Previous F1: 0.2308
[System Validation CM]:
[[ 0 12 11  0]
 [ 1  6 15  0]
 [ 0  5 19  0]
 [ 1  5 25  0]]


Accuracy: 0.2400: 100%|██████████| 100/100 [00:22<00:00,  4.52it/s]


Skip Test
[System Validation] F1: 0.1814, Previous F1: 0.2501
[System Validation CM]:
[[ 2 18  2  0]
 [ 5 12  5  0]
 [ 2 12 10  0]
 [ 4 13 14  0]]


Accuracy: 0.2400: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
3it [11:18, 224.58s/it]

Skip Test
[Causal Validation] F1: 0.1812, Previous F1: 0.2308
[Causal Validation CM]:
[[ 2 12  9  0]
 [ 2  5 15  0]
 [ 1  7 16  0]
 [ 2  3 25  1]]
Skip Test

Epoch 0, Step 3


Accuracy: 0.2800: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
3it [11:59, 236.65s/it]

Skip Test
[Causal Validation] F1: 0.2257, Previous F1: 0.2501
[Causal Validation CM]:
[[ 6 11  6  0]
 [ 3  9 10  0]
 [ 3  8 13  0]
 [ 4  8 19  0]]
Skip Test

Epoch 0, Step 3


Accuracy: 0.3200: 100%|██████████| 100/100 [00:16<00:00,  6.24it/s]


[System Validation] F1: 0.3058, Previous F1: 0.2970
[System Validation CM]:
[[ 3 11  7  2]
 [ 0 12  7  3]
 [ 1  9  9  5]
 [ 2 10 11  8]]


Accuracy: 0.3400: 100%|██████████| 100/100 [00:13<00:00,  7.67it/s]


[Causal Validation] F1: 0.3178, Previous F1: 0.3058
[Causal Validation CM]:
[[ 2 13  6  2]
 [ 0 11  8  3]
 [ 1  6 12  5]
 [ 0  9 13  9]]


Accuracy: 0.3500: 100%|██████████| 100/100 [00:13<00:00,  7.48it/s]
4it [13:07, 189.98s/it]

[Test Result] F1: 0.2948

Epoch 0, Step 4


Accuracy: 0.2900: 100%|██████████| 100/100 [00:22<00:00,  4.50it/s]


[System Validation] F1: 0.2340, Previous F1: 0.2308
[System Validation CM]:
[[ 2  8 13  0]
 [ 0  8 14  0]
 [ 1  5 17  0]
 [ 2  6 21  2]]


Accuracy: 0.3000: 100%|██████████| 100/100 [00:26<00:00,  3.81it/s]


[System Validation] F1: 0.2507, Previous F1: 0.2501
[System Validation CM]:
[[ 5 13  5  0]
 [ 5  9  8  0]
 [ 4  5 15  0]
 [ 1 12 16  1]]


Accuracy: 0.2700: 100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


Skip Test
[Causal Validation] F1: 0.1927, Previous F1: 0.2340
[Causal Validation CM]:
[[ 0 10 13  0]
 [ 1  9 11  0]
 [ 1  4 17  1]
 [ 1  2 26  1]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
Accuracy: 0.3725:  50%|█████     | 50/100 [00:07<00:01, 25.58it/s]

[Test Result] F1: 0.2060

Epoch 0, Step 4


Accuracy: 0.3000: 100%|██████████| 100/100 [00:19<00:00,  5.22it/s]
Accuracy: 0.3729:  59%|█████▉    | 59/100 [00:07<00:02, 19.68it/s]

[Causal Validation] F1: 0.2844, Previous F1: 0.2507
[Causal Validation CM]:
[[ 5 11  7  0]
 [ 2  9 10  0]
 [ 4  7 12  1]
 [ 3  5 19  4]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:12<00:00,  8.17it/s]


Skip Test
[System Validation] F1: 0.2755, Previous F1: 0.3178
[System Validation CM]:
[[ 1 10 12  0]
 [ 1  8 13  0]
 [ 0  9 12  3]
 [ 1  3 19  8]]


Accuracy: 0.2600: 100%|██████████| 100/100 [00:20<00:00,  4.96it/s]
4it [15:47, 232.99s/it]

[Test Result] F1: 0.2301

Epoch 0, Step 4


Accuracy: 0.3300: 100%|██████████| 100/100 [00:14<00:00,  6.92it/s]


[Causal Validation] F1: 0.3313, Previous F1: 0.3178
[Causal Validation CM]:
[[ 4 16  2  1]
 [ 4 13  5  0]
 [ 2 13  6  3]
 [ 4 12  5 10]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:14<00:00,  6.87it/s]
5it [16:13, 188.53s/it]

[Test Result] F1: 0.2515

Epoch 0, Step 5


Accuracy: 0.2400: 100%|██████████| 100/100 [00:24<00:00,  4.09it/s]


Skip Test
[System Validation] F1: 0.1619, Previous F1: 0.2340
[System Validation CM]:
[[ 0 11 12  0]
 [ 1  1 20  0]
 [ 0  4 20  0]
 [ 0  5 23  3]]


Accuracy: 0.2800: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Skip Test
[System Validation] F1: 0.2539, Previous F1: 0.2844
[System Validation CM]:
[[ 6  8  8  0]
 [ 3  7 12  0]
 [ 4  6 13  0]
 [ 2  7 19  2]]


Accuracy: 0.2900: 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]
Accuracy: 0.2857:  68%|██████▊   | 68/100 [00:11<00:01, 23.10it/s]

Skip Test
[Causal Validation] F1: 0.2136, Previous F1: 0.2340
[Causal Validation CM]:
[[ 1 17  5  0]
 [ 0 13  8  0]
 [ 0  9 14  1]
 [ 1  9 20  1]]
Skip Test

Epoch 0, Step 5


Accuracy: 0.2600: 100%|██████████| 100/100 [00:15<00:00,  6.40it/s]


Skip Test
[System Validation] F1: 0.1923, Previous F1: 0.3313
[System Validation CM]:
[[ 3 18  2  0]
 [ 0 16  6  0]
 [ 3 14  7  0]
 [ 1 19 11  0]]


Accuracy: 0.2400: 100%|██████████| 100/100 [00:14<00:00,  7.04it/s]
5it [19:16, 224.47s/it]

Skip Test
[Causal Validation] F1: 0.1573, Previous F1: 0.2844
[Causal Validation CM]:
[[ 1  9 13  0]
 [ 3  5 14  0]
 [ 1  4 18  1]
 [ 1  2 28  0]]
Skip Test

Epoch 0, Step 5


Accuracy: 0.3200: 100%|██████████| 100/100 [00:12<00:00,  7.92it/s]
6it [19:23, 189.05s/it]

Skip Test
[Causal Validation] F1: 0.3112, Previous F1: 0.3313
[Causal Validation CM]:
[[ 2 17  4  0]
 [ 4 13  5  0]
 [ 2 10  8  4]
 [ 3 12  7  9]]
Skip Test

Epoch 0, Step 6


Accuracy: 0.2700: 100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
Accuracy: 0.3333:  63%|██████▎   | 63/100 [00:08<00:01, 23.34it/s]

[System Validation] F1: 0.2365, Previous F1: 0.2340
[System Validation CM]:
[[ 2  9 10  1]
 [ 0  7 12  1]
 [ 0  6 15  1]
 [ 2  1 25  3]]


Accuracy: 0.3300: 100%|██████████| 100/100 [00:11<00:00,  8.79it/s]


Skip Test
[System Validation] F1: 0.2970, Previous F1: 0.3313
[System Validation CM]:
[[ 3 17  2  1]
 [ 2 17  2  1]
 [ 2 13  8  1]
 [ 2 15  9  5]]


Accuracy: 0.3000: 100%|██████████| 100/100 [00:16<00:00,  6.14it/s]


[Causal Validation] F1: 0.2783, Previous F1: 0.2365
[Causal Validation CM]:
[[ 1 11  8  1]
 [ 3  7 11  0]
 [ 1  4 15  4]
 [ 1  5 15  7]]


Accuracy: 0.3100: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]


[System Validation] F1: 0.3055, Previous F1: 0.2844
[System Validation CM]:
[[ 7  7  8  0]
 [ 2 10 10  0]
 [ 1 13 10  0]
 [ 2 12 13  4]]


Accuracy: 0.3100: 100%|██████████| 100/100 [00:13<00:00,  7.58it/s]
Accuracy: 0.4255:  45%|████▌     | 45/100 [00:12<00:03, 17.05it/s]

Skip Test
[Causal Validation] F1: 0.2777, Previous F1: 0.3313
[Causal Validation CM]:
[[ 1 20  2  0]
 [ 3 16  3  0]
 [ 3 10  7  4]
 [ 1 12 11  7]]
Skip Test

Epoch 0, Step 7


Accuracy: 0.3700: 100%|██████████| 100/100 [00:18<00:00,  5.43it/s]
6it [22:38, 225.66s/it]

[Test Result] F1: 0.3402

Epoch 0, Step 6


Accuracy: 0.3500: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Skip Test
[Causal Validation] F1: 0.2961, Previous F1: 0.3055
[Causal Validation CM]:
[[ 0 10 13  0]
 [ 1 13  8  0]
 [ 1  4 16  3]
 [ 1  5 18  6]]


Accuracy: 0.3400: 100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
6it [23:37, 236.86s/it]

[Test Result] F1: 0.3141

Epoch 0, Step 6


Accuracy: 0.2800: 100%|██████████| 100/100 [00:09<00:00, 10.66it/s]


Skip Test
[System Validation] F1: 0.2679, Previous F1: 0.3313
[System Validation CM]:
[[ 3 15  3  1]
 [ 5 14  3  0]
 [ 4 16  2  2]
 [ 2 16  4  9]]


Accuracy: 0.3200: 100%|██████████| 100/100 [00:10<00:00,  9.50it/s]
8it [25:19, 181.81s/it]

Skip Test
[Causal Validation] F1: 0.2784, Previous F1: 0.3313
[Causal Validation CM]:
[[ 1 19  3  0]
 [ 0 16  6  0]
 [ 3  8  9  4]
 [ 0 14 11  6]]
Skip Test

Epoch 0, Step 8


Accuracy: 0.3000: 100%|██████████| 100/100 [00:19<00:00,  5.15it/s]


[System Validation] F1: 0.2855, Previous F1: 0.2783
[System Validation CM]:
[[ 1 12  8  0]
 [ 0  9 12  0]
 [ 1  6 13  2]
 [ 0  7 14  7]]


Accuracy: 0.3000: 100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


Skip Test
[Causal Validation] F1: 0.2598, Previous F1: 0.2855
[Causal Validation CM]:
[[ 1  8 13  1]
 [ 0  9 11  1]
 [ 0  5 16  1]
 [ 0  2 22  4]]


Accuracy: 0.3600: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]
7it [26:31, 227.96s/it]

[Test Result] F1: 0.3077

Epoch 0, Step 7


Accuracy: 0.3400: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


[System Validation] F1: 0.3224, Previous F1: 0.3055
[System Validation CM]:
[[ 7  6 10  0]
 [ 3  9 10  0]
 [ 1  7 14  1]
 [ 4  6 17  4]]


Accuracy: 0.3200: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


Skip Test
[Causal Validation] F1: 0.2960, Previous F1: 0.3224
[Causal Validation CM]:
[[ 2 13  8  0]
 [ 2  9 11  0]
 [ 2  5 15  1]
 [ 2  3 20  6]]


Accuracy: 0.3400: 100%|██████████| 100/100 [00:16<00:00,  6.02it/s]
Accuracy: 0.3535:  99%|█████████▉| 99/100 [00:21<00:00,  2.25it/s]

Skip Test
[System Validation] F1: 0.3040, Previous F1: 0.3313
[System Validation CM]:
[[ 2 19  2  0]
 [ 2 19  0  1]
 [ 2 13  5  4]
 [ 3 16  4  8]]


Accuracy: 0.3500: 100%|██████████| 100/100 [00:23<00:00,  4.25it/s]
7it [28:05, 247.10s/it]

[Test Result] F1: 0.3051

Epoch 0, Step 7


Accuracy: 0.2800: 100%|██████████| 100/100 [00:11<00:00,  8.45it/s]
9it [28:25, 183.24s/it]

Skip Test
[Causal Validation] F1: 0.2607, Previous F1: 0.3313
[Causal Validation CM]:
[[ 1 18  4  0]
 [ 1 13  6  2]
 [ 2 13  4  5]
 [ 1 11  9 10]]
Skip Test

Epoch 0, Step 9


Accuracy: 0.3400: 100%|██████████| 100/100 [00:20<00:00,  4.88it/s]


[System Validation] F1: 0.3057, Previous F1: 0.2855
[System Validation CM]:
[[ 0 13  9  1]
 [ 1  9 12  0]
 [ 0  5 16  2]
 [ 1  3 16  9]]


Accuracy: 0.3300: 100%|██████████| 100/100 [00:12<00:00,  8.28it/s]


Skip Test
[Causal Validation] F1: 0.2989, Previous F1: 0.3057
[Causal Validation CM]:
[[ 1  6 14  1]
 [ 0  9 11  1]
 [ 0  5 16  2]
 [ 0  4 20  7]]


Accuracy: 0.3500: 100%|██████████| 100/100 [00:09<00:00, 10.53it/s]


[System Validation] F1: 0.3437, Previous F1: 0.3313
[System Validation CM]:
[[ 2 15  6  0]
 [ 4 14  4  0]
 [ 3 10  9  2]
 [ 3 13  5 10]]


Accuracy: 0.3000: 100%|██████████| 100/100 [00:21<00:00,  4.57it/s]
8it [30:24, 229.52s/it]

[Test Result] F1: 0.2905

Epoch 0, Step 8


Accuracy: 0.3400: 100%|██████████| 100/100 [00:14<00:00,  6.98it/s]


Skip Test
[Causal Validation] F1: 0.3164, Previous F1: 0.3437
[Causal Validation CM]:
[[ 2 16  5  0]
 [ 2 16  4  0]
 [ 3 10  8  3]
 [ 1 14  8  8]]


Accuracy: 0.3500: 100%|██████████| 100/100 [00:12<00:00,  7.94it/s]
9it [31:02, 206.96s/it]


[Test Result] F1: 0.3291
[Worker 0] [Iteration 1] val_best=0.3437, test_at_best=0.3291
[Main] Worker 0 done. Best test_f1=0.3291


Accuracy: 0.3000: 100%|██████████| 100/100 [00:14<00:00,  6.82it/s]


Skip Test
[System Validation] F1: 0.2621, Previous F1: 0.3224
[System Validation CM]:
[[ 3 11  9  0]
 [ 3 12  7  0]
 [ 2  8 12  2]
 [ 1 10 17  3]]


Accuracy: 0.3200: 100%|██████████| 100/100 [00:17<00:00,  5.64it/s]
8it [32:09, 246.00s/it]

Skip Test
[Causal Validation] F1: 0.2504, Previous F1: 0.3224
[Causal Validation CM]:
[[ 2 10 11  0]
 [ 0 12  9  0]
 [ 0  6 17  0]
 [ 1  3 24  1]]
Skip Test

Epoch 0, Step 8


Accuracy: 0.3200: 100%|██████████| 100/100 [00:16<00:00,  6.10it/s]


Skip Test
[System Validation] F1: 0.2856, Previous F1: 0.3057
[System Validation CM]:
[[ 2  9 10  1]
 [ 0  9 11  2]
 [ 1  5 16  2]
 [ 0  7 17  5]]


Accuracy: 0.3000: 100%|██████████| 100/100 [00:23<00:00,  4.26it/s]
9it [34:14, 229.87s/it]

Skip Test
[Causal Validation] F1: 0.2643, Previous F1: 0.3057
[Causal Validation CM]:
[[ 1  8 13  0]
 [ 1  9 10  2]
 [ 0  5 15  3]
 [ 0  4 20  5]]
Skip Test

Epoch 0, Step 9


Accuracy: 0.2500: 100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


Skip Test
[System Validation] F1: 0.2288, Previous F1: 0.3224
[System Validation CM]:
[[ 3 12  7  0]
 [ 3  6 12  0]
 [ 3  5 13  1]
 [ 2  7 18  3]]


Accuracy: 0.2300: 100%|██████████| 100/100 [00:20<00:00,  4.94it/s]
9it [36:20, 247.62s/it]

Skip Test
[Causal Validation] F1: 0.2014, Previous F1: 0.3224
[Causal Validation CM]:
[[ 3 10  9  0]
 [ 4  6 12  0]
 [ 4  5 12  3]
 [ 3  6 19  2]]
Skip Test

Epoch 0, Step 9


Accuracy: 0.3700: 100%|██████████| 100/100 [00:15<00:00,  6.45it/s]


[System Validation] F1: 0.3640, Previous F1: 0.3057
[System Validation CM]:
[[ 2 14  6  0]
 [ 0 11  9  0]
 [ 1  7 14  1]
 [ 1  4 15 10]]


Accuracy: 0.3400: 100%|██████████| 100/100 [00:27<00:00,  3.66it/s]


Skip Test
[Causal Validation] F1: 0.2996, Previous F1: 0.3640
[Causal Validation CM]:
[[ 0 10 10  1]
 [ 0 12  7  1]
 [ 0  6 15  2]
 [ 0  2 20  7]]


Accuracy: 0.3400: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
9it [38:20, 255.56s/it]


[Test Result] F1: 0.3126
[Worker 2] [Iteration 1] val_best=0.3640, test_at_best=0.3126
[Main] Worker 2 done. Best test_f1=0.3126


Accuracy: 0.3500: 100%|██████████| 100/100 [00:20<00:00,  4.93it/s]


Skip Test
[System Validation] F1: 0.3059, Previous F1: 0.3224
[System Validation CM]:
[[ 4  8 10  0]
 [ 2 12  8  0]
 [ 1  7 16  0]
 [ 2  8 18  3]]


Accuracy: 0.2400: 100%|██████████| 100/100 [00:21<00:00,  4.72it/s]
9it [40:22, 269.14s/it]

Skip Test
[Causal Validation] F1: 0.2056, Previous F1: 0.3224
[Causal Validation CM]:
[[ 1  8 14  0]
 [ 3  4 15  0]
 [ 1  7 15  1]
 [ 4  7 16  4]]
Skip Test
[Worker 1] [Iteration 1] val_best=0.3224, test_at_best=0.3051
[Main] Worker 1 done. Best test_f1=0.3051
EGO_res (best test F1 per worker): [0.3291249143880723, 0.312577579697298, 0.30512086461513077]



