In [None]:
import os
import sys

# Make the project-local helper modules importable (presumably where haio,
# virtual_human_io, vec_to_img and ensure_dir live — confirm against the repo).
# Guarded so that re-running this cell does not keep appending duplicates.
if "../modules" not in sys.path:
    sys.path.append("../modules")

# Repository root used to build the HAIO working directory and the results paths.
# Set the SEQUENTIAL_CTA_TOP_DIR environment variable to run on another machine;
# when it is unset the original checkout location is used.
top_dir = os.environ.get(
    "SEQUENTIAL_CTA_TOP_DIR",
    "/Users/pictomo/Repositories/SequentialCTA_experiment",
)

In [None]:
# Prepare data: load the CIFAR-10 train/test splits through Keras.
# The arrays are kept exactly as returned (no scaling is applied in this cell);
# later cells divide the images by 255.0 where they are fed to the model and
# hash the raw training images to look up their labels.

from tensorflow.keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Class names offered to annotators. A name's position in this list is the
# integer class id it is converted to before training (intended to line up with
# the ids stored in y_train / y_test).
labels = [
    "Airplane", "Car", "Bird", "Cat", "Deer",
    "Dog", "Frog", "Horse", "Ship", "Truck",
]
# Reverse lookup: class name -> position in `labels`.
label_to_index = dict(zip(labels, range(len(labels))))

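# Normalization is left disabled here: the images are divided by 255.0 at the
# point where they are passed to the model, and the raw (unscaled) training
# arrays are what gets hashed for the ground-truth label lookup.
# x_train, x_test = x_train / 255.0, x_test / 255.0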

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def provide_model(optimizer='adam'):
    """Build and compile a small CNN classifier for 32x32x3 inputs.

    The network is three ReLU 3x3 convolutions (32, 64, 64 filters) with 2x2
    max pooling after the first two, followed by a 64-unit ReLU dense layer
    and a 10-way softmax output.

    Args:
        optimizer: Forwarded unchanged to ``Model.compile``.

    Returns:
        A newly constructed model compiled with sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    image_in = Input(shape=(32, 32, 3))

    # Layers in the order they are applied to the input tensor.
    layer_stack = (
        Conv2D(32, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(10, activation="softmax"),
    )

    tensor = image_in
    for layer in layer_stack:
        tensor = layer(tensor)

    classifier = Model(image_in, tensor)
    classifier.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

In [None]:
# prepare haio
from haio import HAIOClient, OpenAI_IO, Gemini_IO, Bedrock_IO, QuestionTemplate
from virtual_human_io import VirtualHuman_IO

# One IO backend per answer source handed to the HAIO client below.
# VirtualHuman_IO fills the `human_io` slot — presumably a simulated human
# worker; confirm in the virtual_human_io module.
virtual_human_io = VirtualHuman_IO()
gemini_io = Gemini_IO()
openai_io = OpenAI_IO()
# The string argument looks like a Bedrock model id selecting the hosted model.
llama_io = Bedrock_IO("us.meta.llama3-2-90b-instruct-v1:0")
nova_io = Bedrock_IO("us.amazon.nova-lite-v1:0")

# Client used by the labeling helper to ask questions and wait for answers.
# `filepath` points at the notebooks directory under the repository root.
haio_client = HAIOClient(
    filepath=f"{top_dir}/notebooks",
    human_io=virtual_human_io,
    openai_io=openai_io,
    gemini_io=gemini_io,
    llama_io=llama_io,
    nova_io=nova_io,
    # claude_io=claude_io,  # disabled: no claude_io backend is constructed above
)

# Question asked for every image: the image itself, an instruction heading and
# the list of allowed labels, answered through a single-choice select.
question_template: QuestionTemplate = QuestionTemplate(
    title="CIFAR-10 Image Classification",
    description="Classify the image using the CIFAR-10 dataset",
    question=[
        # Image slot; 0 presumably indexes into the data_list given to ask() — confirm.
        dict(tag="img", src=0),
        dict(
            tag="h1",
            value="Choose the label that best describes the image from the options.",
        ),
        dict(tag="p", value=" ".join(labels)),
    ],
    answer=dict(type="select", options=labels),
)

In [None]:
from haio import img_to_url
from vec_to_img import vec_to_img
import numpy as np
import hashlib
import json


def hash_data(src: object) -> str:  # same as haio_hash
    """Return the MD5 hex digest of ``src`` serialised as key-sorted JSON.

    Args:
        src: Any value ``json.dumps`` can serialise (e.g. nested lists of ints).
            Annotated as ``object`` because the builtin ``any`` is a function,
            not a type.

    Returns:
        32-character lowercase hexadecimal digest. Dict key order does not
        affect the result because keys are sorted before hashing.

    Raises:
        TypeError: If ``src`` is not JSON-serialisable.
    """
    canonical_json = json.dumps(src, sort_keys=True)
    return hashlib.md5(canonical_json.encode()).hexdigest()


# Ground-truth lookup: content hash of each raw training image -> its y_train row.
# If two training images hash identically, the later one's label wins.
data_to_label_index = {
    hash_data(image.tolist()): label_row for image, label_row in zip(x_train, y_train)
}


def provide_label_collect(img_list) -> np.ndarray:
    """Look up the stored ground-truth label for each image in ``img_list``.

    Each image is hashed with ``hash_data`` and resolved through
    ``data_to_label_index``, so only images present in that lookup are accepted.

    Args:
        img_list: Iterable of image arrays exposing ``.tolist()``.

    Returns:
        Array stacking the looked-up label entries in input order.

    Raises:
        KeyError: If an image's hash is not in ``data_to_label_index``.
    """
    true_labels = []
    for image in img_list:
        image_key = hash_data(image.tolist())
        true_labels.append(data_to_label_index[image_key])
    return np.array(true_labels)


async def provide_label(img_list: list, execution_config: dict) -> HAIOClient.MethodReturn:
    """Ask HAIO to label every image in ``img_list`` and wait for the answers.

    Each image is converted with ``vec_to_img``, encoded as a PNG URL with
    ``img_to_url`` and submitted as its own question built from
    ``question_template``. All questions are then awaited together.

    Args:
        img_list: Images to label, one question per element, in order.
        execution_config: Passed unchanged to ``haio_client.wait``.

    Returns:
        Whatever ``haio_client.wait`` returns for the submitted questions.
    """
    pending = [
        haio_client.ask(
            question_template=question_template,
            data_list=[img_to_url(img_data=vec_to_img(image), mime_type="image/png")],
        )
        for image in img_list
    ]
    return await haio_client.wait(
        asked_questions=pending, execution_config=execution_config
    )

In [None]:
async def active_learn(
    execution_config,
    retrain=False,
    *,
    data_size=10000,
    initial_size=1000,
    query_size=100,
    n_steps=40,
    epochs=10,
):
    """Run pool-based active learning with labels obtained through HAIO.

    The first ``data_size`` training images form the pool. The first
    ``initial_size`` of them are labeled via ``provide_label`` and used to fit
    an initial model. Each step then labels the ``query_size`` remaining pool
    images the current model is least confident about, adds them to the
    labeled set, fits again and evaluates on the test split.

    Args:
        execution_config: Passed unchanged to ``provide_label``.
        retrain: If True, build a fresh model before each step's fit;
            otherwise keep training the existing model.
        data_size: Number of leading training images used as the pool.
        initial_size: Number of pool images labeled before the first fit.
        query_size: Number of images labeled per step.
        n_steps: Maximum number of steps; the loop stops early if the
            unlabeled pool runs out.
        epochs: Epochs per ``model.fit`` call.

    Returns:
        dict of lists. Index 0 of each list is the initial placeholder; one
        entry is appended per evaluation (initial fit, then each step):
        ``question_count`` is the cumulative number of images labeled,
        ``collect_count`` the cumulative number of answers equal to the
        ground-truth label, ``accuracy`` / ``loss`` the test-set metrics.
        ``human_count`` entries are ``None`` (see NOTE in the code).

    Raises:
        KeyError: If an answer is not one of ``labels``.
    """
    result = {
        "question_count": [0],
        "human_count": [0],
        "collect_count": [0],
        "accuracy": [0.0],
        "loss": [1.0],
    }

    def answers_to_labels(answer_list):
        # Column vector of class ids, i.e. one single-element row per answer.
        return np.array(
            [label_to_index[answer] for answer in answer_list], dtype=int
        ).reshape(-1, 1)

    def record(given_labels, true_labels, loss, acc):
        # Append one evaluation's metrics; plain Python numbers keep it JSON-serialisable.
        matched = int(np.sum(np.ravel(given_labels) == np.ravel(true_labels)))
        result["question_count"].append(result["question_count"][-1] + len(given_labels))
        result["collect_count"].append(result["collect_count"][-1] + matched)
        # NOTE(review): the field of answer_info that reports how many answers
        # came from the human worker is not visible here, so the value is
        # recorded as unknown rather than guessed — fill in once confirmed.
        result["human_count"].append(None)
        result["accuracy"].append(float(acc))
        result["loss"].append(float(loss))

    # Data preparation. A separate local name is required: assigning to
    # `x_train` inside this function would make it a local variable and the
    # read of the global on the right-hand side would raise UnboundLocalError.
    x_pool = x_train[:data_size]

    # The first `initial_size` images are labeled up front; the rest stay unlabeled.
    x_labeled = x_pool[:initial_size]
    x_unlabeled = x_pool[initial_size:]

    model = provide_model()

    # Initial labeling and training on the initial subset only, so that
    # x_labeled and y_labeled always have matching lengths.
    answer_info = await provide_label(x_labeled, execution_config)
    y_labeled = answers_to_labels(answer_info["answer_list"])
    collect_labels = provide_label_collect(x_labeled)
    model.fit(x_labeled / 255.0, y_labeled, epochs=epochs, verbose=1)
    loss, acc = model.evaluate(x_test / 255.0, y_test)
    record(y_labeled, collect_labels, loss, acc)

    # Active learning loop.
    for _ in range(n_steps):
        if len(x_unlabeled) == 0:
            # Nothing left to query.
            break

        # Predict on the unlabeled pool and pick the lowest top-class
        # probabilities, i.e. the samples the model is least confident about.
        predictions = model.predict(x_unlabeled / 255.0)
        confidences = np.max(predictions, axis=1)
        query_indices = np.argsort(confidences)[:query_size]

        # Label the selected samples and add them to the labeled set.
        new_samples = x_unlabeled[query_indices]
        answer_info = await provide_label(new_samples, execution_config=execution_config)
        new_labels = answers_to_labels(answer_info["answer_list"])
        collect_labels = provide_label_collect(new_samples)

        x_labeled = np.concatenate([x_labeled, new_samples])
        y_labeled = np.concatenate([y_labeled, new_labels])

        # Remove the selected samples from the unlabeled pool.
        x_unlabeled = np.delete(x_unlabeled, query_indices, axis=0)

        # Either train from scratch or keep fine-tuning the current model.
        if retrain:
            model = provide_model()
        model.fit(x_labeled / 255.0, y_labeled, epochs=epochs, verbose=1)

        # Evaluate on the test set and record this step.
        loss, acc = model.evaluate(x_test / 255.0, y_test)
        record(new_labels, collect_labels, loss, acc)

    return result

In [None]:
from ensure_dir import ensure_dir
from datetime import datetime
import json

# (execution_config, short name used in the result filename) pairs, run in this order.
execution_config_list = [
    (dict(method="sequential_cta_1", quality_requirement=0.9), "cta1"),
    (dict(method="sequential_gta_1", quality_requirement=0.9), "gta1"),
    (dict(method="sequential_cta_2", quality_requirement=0.9, sample_size=300), "cta2"),
    (dict(method="sequential_gta_2", quality_requirement=0.9, sample_size=300), "gta2"),
    (dict(method="sequential_cta_3", quality_requirement=0.9), "cta3"),
    (dict(method="sequential_gta_3", quality_requirement=0.9), "gta3"),
]

for execution_config, name in execution_config_list:
    ensure_dir(f"{top_dir}/results/expr2")
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    result = active_learn(execution_config, retrain=True)
    with open(f"{top_dir}/results/expr2/{name}_{timestamp}.txt", "w") as f:
        json.dump(result, f)