# 報酬設計の比較実験

このノートブックでは、Continuous ECI-BO-Banditアルゴリズムについて、2つの異なる報酬設計を比較します：

1. **元の報酬設計**: 予測誤差に基づく報酬計算
2. **新しい報酬設計**: GPモデルの勾配の絶対値を使った報酬計算

## 実験設定
- テスト関数: Styblinski-Tang, Rastrigin, Ackley (100次元中先頭5次元が有効。ただし下の実行セルでは `dim = 20` に設定されているため、このノートブックの実行結果は20次元中先頭5次元が有効な設定のものです)
- 20回の独立実行
- 300回の評価
- 収束性能と方向選択の比較

In [1]:
import math
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import torch

# BoTorch / GPyTorch
from botorch import fit_gpytorch_model
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.kernels import RBFKernel, ScaleKernel
from botorch.acquisition import ExpectedImprovement
from botorch.optim import optimize_acqf

# Use float32 as the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.float32)

# Plot settings.
plt.rcParams["figure.dpi"] = 100

# Suppress every warning for the remainder of the session.
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


## テスト関数群（100次元中先頭5次元が有効）

In [2]:
def styblinski_tang_100d(x, noise_std=1e-5):
    """Noisy Styblinski-Tang value computed from the first five coordinates.

    Args:
        x: Tensor, or array-like that is converted to a float32 tensor, whose
            last axis holds the coordinates. Only the first five are used.
        noise_std: Standard deviation of the Gaussian noise added to the value.

    Returns:
        Tensor of function values with the last axis reduced away.
    """
    points = x if torch.is_tensor(x) else torch.tensor(x, dtype=torch.float32)
    active = points[..., :5]
    # Per-coordinate term x^4 - 16 x^2 + 5 x, summed and then halved.
    per_coordinate = active**4 - 16.0 * active**2 + 5.0 * active
    clean = 0.5 * torch.sum(per_coordinate, dim=-1)
    noise = torch.randn_like(clean) * noise_std
    return clean + noise

def rastrigin_100d(x, noise_std=1e-5):
    """Noisy Rastrigin value computed from the first five coordinates.

    Args:
        x: Tensor, or array-like that is converted to a float32 tensor, whose
            last axis holds the coordinates. Only the first five are used.
        noise_std: Standard deviation of the Gaussian noise added to the value.

    Returns:
        Tensor of function values with the last axis reduced away.
    """
    points = x if torch.is_tensor(x) else torch.tensor(x, dtype=torch.float32)
    active = points[..., :5]
    # Per-coordinate term x^2 - 10 cos(2 pi x) + 10.
    per_coordinate = active**2 - 10.0 * torch.cos(2 * math.pi * active) + 10.0
    clean = torch.sum(per_coordinate, dim=-1)
    noise = torch.randn_like(clean) * noise_std
    return clean + noise

def ackley_100d(x, noise_std=1e-5):
    """Noisy Ackley value computed from the first five coordinates.

    Args:
        x: Tensor, or array-like that is converted to a float32 tensor, whose
            last axis holds the coordinates. Only the first five are used.
        noise_std: Standard deviation of the Gaussian noise added to the value.

    Returns:
        Tensor of function values with the last axis reduced away.
    """
    points = x if torch.is_tensor(x) else torch.tensor(x, dtype=torch.float32)
    active = points[..., :5]
    n_active = 5
    # Root-mean-square of the active coordinates drives the radial term.
    rms = torch.sqrt(torch.sum(active**2, dim=-1) / n_active)
    radial_term = -20.0 * torch.exp(-0.2 * rms)
    cosine_term = -torch.exp(torch.mean(torch.cos(2.0 * math.pi * active), dim=-1))
    clean = radial_term + cosine_term + 20.0 + math.e
    noise = torch.randn_like(clean) * noise_std
    return clean + noise

## ベースクラス: ECI_BO_Bandit_Original

In [3]:
class ECI_BO_Bandit_Original:
    """Shared state and helpers for bandit-guided line-search Bayesian optimisation.

    The class stores the evaluated points, fits a GP surrogate to them, tracks
    the incumbent (the evaluated point with the lowest posterior mean) and
    proposes new points by maximising Expected Improvement along a line through
    the incumbent.  ``A`` and ``b`` are linear-bandit statistics; this class
    only initialises them, it never updates them itself.
    """

    def __init__(self, X, objective_function, bounds, n_initial, n_max, dim,
                 algo_base_name="ECI_BO_Bandit_Original", coordinate_ratio=0.8, run_id=1, output_base_dir="output_results"):
        """Store the configuration and create the output directory.

        Args:
            X: Initial design points, one row per point; converted to float32.
            objective_function: Callable applied to a batch of points; it is
                expected to return one value per row.
            bounds: Tensor whose row 0 holds the lower and row 1 the upper
                bound of every dimension.
            n_initial: Number of initial points (used to pad ``eval_history``).
            n_max: Evaluation budget stored for the optimisation loop.
            dim: Dimensionality of the search space.
            algo_base_name: First path component of the output directory.
            coordinate_ratio: Only used to build names/paths in this class.
            run_id: Run index embedded in ``algo_name_for_run``.
            output_base_dir: Root directory for outputs (created if missing).
        """
        self.X = X.float()
        self.dim = dim
        self.num_arms = dim
        # Keep the bandit statistics on the same dtype/device as the data so
        # they can be combined with tensors created from ``self.X.device``
        # regardless of the global default dtype.
        self.A = torch.eye(dim, dtype=self.X.dtype, device=self.X.device)
        self.b = torch.zeros(dim, dtype=self.X.dtype, device=self.X.device)
        self.objective_function = objective_function
        self.bounds = bounds.float().to(self.X.device)
        self.n_initial = n_initial
        self.n_max = n_max
        self.Y = None
        self.best_value = None
        self.best_point = None
        self.model = None
        self.eval_history = []
        self.selected_direction_history = []
        self.theta_history = []
        self.coordinate_ratio = coordinate_ratio
        self.scale_init = 1.0
        self.run_id = run_id

        self.function_name_with_ratio = f"Original_coord_{self.coordinate_ratio:.1f}"
        self.algo_name_for_run = f"{algo_base_name}_{self.function_name_with_ratio}_run{self.run_id}"

        self.output_dir = os.path.join(output_base_dir, algo_base_name, self.function_name_with_ratio)
        os.makedirs(self.output_dir, exist_ok=True)

        self.total_iterations_for_bandit = 0

    def update_model(self):
        """Refit a fresh GP (scaled ARD-RBF kernel) to all data in ``X`` / ``Y``."""
        # NOTE(review): ``dtype`` and ``noise_constraint`` do not look like
        # documented constructor arguments of these kernels and are presumably
        # ignored; confirm whether a likelihood noise constraint was intended.
        kernel = ScaleKernel(
            RBFKernel(ard_num_dims=self.X.shape[-1], dtype=torch.float32),
            dtype=torch.float32, noise_constraint=1e-3
        ).to(self.X)
        self.model = SingleTaskGP(self.X, self.Y, covar_module=kernel)
        mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model)
        fit_gpytorch_model(mll)

    def initialize(self):
        """Evaluate the initial design, fit the GP and set the incumbent."""
        y_val = self.objective_function(self.X)
        self.Y = y_val.unsqueeze(-1).float()
        # Range of the initial observations; falls back to 1.0 when they are all equal.
        y_max, y_min = self.Y.max().item(), self.Y.min().item()
        self.scale_init = (y_max - y_min) if (y_max - y_min) != 0 else 1.0
        self.update_model()
        # The incumbent is chosen by posterior mean, not by the raw observation.
        post_mean = self.model.posterior(self.X).mean.squeeze(-1)
        bi = post_mean.argmin()
        self.best_value = post_mean[bi].item()
        self.best_point = self.X[bi]
        self.eval_history = [self.best_value] * self.n_initial

    def _line_search_bounds(self, direction):
        """Return ``(lb, ub)`` for the step size ``t`` along ``direction``.

        The interval is the intersection, over all dimensions with a
        non-negligible direction component, of the ``t`` ranges that keep
        ``best_point + t * direction`` inside the box.
        """
        active = direction.abs() > 1e-9
        if not active.any():
            # A (numerically) zero direction does not constrain t at all.
            lb, ub = -1.0, 1.0
        else:
            step = direction[active]
            to_lower = (self.bounds[0][active] - self.best_point[active]) / step
            to_upper = (self.bounds[1][active] - self.best_point[active]) / step
            # A negative component swaps the roles of the two box faces.
            lb = torch.minimum(to_lower, to_upper).max().item()
            ub = torch.maximum(to_lower, to_upper).min().item()

        if lb > ub:
            # Empty intersection: fall back to a symmetric interval of 10% of
            # the width of the first dimension.
            lb, ub = -1.0, 1.0
            if self.best_point is not None:
                domain_width = (self.bounds[1, 0] - self.bounds[0, 0]).item()
                lb = -0.1 * domain_width
                ub = 0.1 * domain_width
        return lb, ub

    def propose_new_x(self, direction):
        """Maximise Expected Improvement along a line through the incumbent.

        Args:
            direction: 1-D tensor of length ``dim`` giving the line direction.

        Returns:
            Tuple ``(new_x, alpha_star, acq_value, lb, ub)``: the proposed
            point clamped to the box, the selected step size, the acquisition
            value reported by the optimiser and the step-size interval used.
        """
        ei = ExpectedImprovement(self.model, best_f=self.best_value, maximize=False)

        lb, ub = self._line_search_bounds(direction)
        one_d_bounds = torch.tensor([[lb], [ub]], dtype=torch.float32, device=self.X.device)

        def ei_on_line(t_scalar_tensor):
            # Map every candidate step size to a point on the line, clamp it
            # to the box and evaluate EI with a q-batch axis of size one.
            t_values = t_scalar_tensor.squeeze(-1)
            points_on_line = self.best_point.unsqueeze(0) + t_values.reshape(-1, 1) * direction.unsqueeze(0)
            points_on_line_clamped = torch.clamp(points_on_line, self.bounds[0].unsqueeze(0), self.bounds[1].unsqueeze(0))
            return ei(points_on_line_clamped.unsqueeze(1))

        cand_t, acq_val_t = optimize_acqf(
            ei_on_line,
            bounds=one_d_bounds,
            q=1,
            num_restarts=10,
            raw_samples=100
        )

        alpha_star = cand_t.item()
        new_x = self.best_point + alpha_star * direction
        new_x_clamped = torch.clamp(new_x, self.bounds[0], self.bounds[1])

        return new_x_clamped, alpha_star, acq_val_t.item(), lb, ub

## アルゴリズム1: 元の報酬設計（予測誤差ベース）

In [4]:
class ECI_BO_Bandit_Continuous_Original(ECI_BO_Bandit_Original):
    """Continuous-direction variant whose bandit reward is the GP prediction error.

    Each iteration picks a unit direction from the linear-bandit statistics,
    runs the inherited line search along it, and rewards the direction by how
    far the GP posterior mean was from the observed value at the new point.
    """

    def __init__(self, X, objective_function, bounds, n_initial, n_max, dim,
                 algo_base_name="ECI_BO_Bandit_Continuous_Original", coordinate_ratio=0.8, run_id=1, output_base_dir="output_results"):
        """Forward the configuration to the base class and set this variant's names.

        The arguments are passed unchanged to the base-class constructor; the
        naming attributes and the output directory are then overwritten with
        the ``Continuous_Original`` label.
        """
        super().__init__(X, objective_function, bounds, n_initial, n_max, dim,
                         algo_base_name, coordinate_ratio, run_id, output_base_dir)
        self.function_name_with_ratio = f"Continuous_Original_coord_{self.coordinate_ratio:.1f}"
        self.algo_name_for_run = f"{algo_base_name}_{self.function_name_with_ratio}_run{self.run_id}"
        self.output_dir = os.path.join(output_base_dir, algo_base_name, self.function_name_with_ratio)
        os.makedirs(self.output_dir, exist_ok=True)

        # Bandit hyper-parameters.  They are set here (rather than at the start
        # of ``optimize``) so ``find_optimal_direction_continuous`` can be
        # called on a freshly constructed object and so callers may override
        # them before optimising.
        self.sigma = 1.0
        self.L = 1.0
        self.lambda_reg = 1.0
        self.delta = 0.1
        self.S = 1.0

    @staticmethod
    def _safe_inverse(matrix):
        """Invert ``matrix``, falling back to the pseudo-inverse if inversion fails."""
        try:
            return torch.inverse(matrix)
        except torch.linalg.LinAlgError:
            return torch.linalg.pinv(matrix)

    def find_optimal_direction_continuous(self):
        """Return a unit direction derived from the current bandit statistics.

        The estimate ``theta_hat`` is shrunk to
        ``theta(lam) = (A_t + lam I)^-1 A_t theta_hat`` and ``lam`` is located
        by bisection so that ``(theta - theta_hat)^T A_t (theta - theta_hat)``
        meets ``beta_t**2`` (or ``lam`` reaches the upper bracket of 1000).
        A random unit vector is returned when the result has (near) zero norm.
        """
        # 1. Current regularised design matrix and ridge estimate.
        # NOTE(review): the base class seeds ``self.A`` with the identity, so
        # the effective ridge term is ``lambda_reg + 1`` -- confirm intended.
        eye = torch.eye(self.dim, dtype=self.A.dtype, device=self.A.device)
        A_t = self.lambda_reg * eye + self.A
        theta_hat = self._safe_inverse(A_t) @ self.b

        # 2. Confidence radius beta_t.
        current_round_t = max(self.total_iterations_for_bandit, 1)
        log_term_numerator = 1 + (current_round_t - 1) * self.L**2 / self.lambda_reg
        if log_term_numerator <= 0:
            log_term_numerator = 1e-9
        beta_t = (self.sigma * math.sqrt(
                    self.dim * math.log(log_term_numerator / self.delta))
                  + math.sqrt(self.lambda_reg) * self.S)
        radius_sq = beta_t**2

        # Loop-invariant right-hand side of theta(lam).
        rhs = A_t @ theta_hat

        def shrunk_theta(lam):
            return self._safe_inverse(A_t + lam * eye) @ rhs

        # 3. Bisection on lam.  The constraint value is 0 at lam = 0 and grows
        # monotonically with lam, so an overshoot must LOWER the upper bracket.
        # (The brackets used to move the other way, which always drove lam to
        # one of the endpoints 0 or 1000.)
        low = 0.0
        high = 1000.0
        for _ in range(100):
            lam = max((low + high) / 2.0, 1e-9)
            diff = shrunk_theta(lam) - theta_hat
            val = (diff @ A_t @ diff).item()
            if val > radius_sq:
                high = lam
            else:
                low = lam

        optimal_theta = shrunk_theta((low + high) / 2.0)

        # 4. Normalise; fall back to a random direction for a (near) zero vector.
        norm = torch.norm(optimal_theta)
        if norm < 1e-9:
            direction = torch.randn(self.dim, dtype=self.A.dtype, device=self.A.device)
            return direction / torch.norm(direction)
        return optimal_theta / norm

    def optimize(self):
        """Run the optimisation loop until ``n_max`` evaluations are recorded.

        Returns:
            Tuple ``(best_point, best_value)`` where the incumbent is the
            evaluated point with the lowest GP posterior mean.
        """
        self.initialize()
        n_bo_iter = self.n_initial

        while n_bo_iter < self.n_max:
            self.total_iterations_for_bandit += 1

            # Direction from the continuous bandit search.
            direction = self.find_optimal_direction_continuous()
            self.selected_direction_history.append(direction.clone())

            new_x, _, _, _, _ = self.propose_new_x(direction)

            with torch.no_grad():
                predicted_mean_at_new_x = self.model.posterior(new_x.unsqueeze(0)).mean.squeeze().item()
            actual_y_at_new_x = self.objective_function(new_x.unsqueeze(0)).squeeze().item()

            # Reward: saturating function (in [0, 10)) of the prediction error,
            # scaled by the range of the initial observations.
            prediction_error = abs(predicted_mean_at_new_x - actual_y_at_new_x)
            reward = 10.0 * (1.0 - math.exp(-prediction_error / self.scale_init))

            # Rank-one bandit update with the played direction as the arm.
            x_arm_for_update = direction.view(-1, 1)
            self.A += x_arm_for_update @ x_arm_for_update.t()
            self.b += reward * direction

            # Append the observation and refit the surrogate.
            self.X = torch.cat([self.X, new_x.unsqueeze(0)], 0)
            self.Y = torch.cat([self.Y, torch.tensor([[actual_y_at_new_x]], dtype=torch.float32, device=self.X.device)], 0)
            self.update_model()

            # Re-select the incumbent under the refitted model.
            with torch.no_grad():
                posterior_mean_overall = self.model.posterior(self.X).mean.squeeze(-1)
            current_best_idx = posterior_mean_overall.argmin()
            self.best_value = posterior_mean_overall[current_best_idx].item()
            self.best_point = self.X[current_best_idx]
            self.eval_history.append(self.best_value)
            n_bo_iter += 1

        return self.best_point, self.best_value

## アルゴリズム2: 新しい報酬設計（勾配ベース）

In [5]:
class ECI_BO_Bandit_Continuous_Gradient(ECI_BO_Bandit_Original):
    """Continuous-direction variant whose bandit reward is the GP mean gradient.

    Each iteration picks a unit direction from the linear-bandit statistics,
    runs the inherited line search along it, and adds the element-wise
    absolute gradient of the GP posterior mean at the new point to ``b``.
    """

    def __init__(self, X, objective_function, bounds, n_initial, n_max, dim,
                 algo_base_name="ECI_BO_Bandit_Continuous_Gradient", coordinate_ratio=0.8, run_id=1, output_base_dir="output_results"):
        """Forward the configuration to the base class and set this variant's names.

        The arguments are passed unchanged to the base-class constructor; the
        naming attributes and the output directory are then overwritten with
        the ``Continuous_Gradient`` label.
        """
        super().__init__(X, objective_function, bounds, n_initial, n_max, dim,
                         algo_base_name, coordinate_ratio, run_id, output_base_dir)
        self.function_name_with_ratio = f"Continuous_Gradient_coord_{self.coordinate_ratio:.1f}"
        self.algo_name_for_run = f"{algo_base_name}_{self.function_name_with_ratio}_run{self.run_id}"
        self.output_dir = os.path.join(output_base_dir, algo_base_name, self.function_name_with_ratio)
        os.makedirs(self.output_dir, exist_ok=True)

        # Bandit hyper-parameters.  They are set here (rather than at the start
        # of ``optimize``) so ``find_optimal_direction_continuous`` can be
        # called on a freshly constructed object and so callers may override
        # them before optimising.
        self.sigma = 1.0
        self.L = 1.0
        self.lambda_reg = 1.0
        self.delta = 0.1
        self.S = 1.0

    @staticmethod
    def _safe_inverse(matrix):
        """Invert ``matrix``, falling back to the pseudo-inverse if inversion fails."""
        try:
            return torch.inverse(matrix)
        except torch.linalg.LinAlgError:
            return torch.linalg.pinv(matrix)

    def find_optimal_direction_continuous(self):
        """Return a unit direction derived from the current bandit statistics.

        The estimate ``theta_hat`` is shrunk to
        ``theta(lam) = (A_t + lam I)^-1 A_t theta_hat`` and ``lam`` is located
        by bisection so that ``(theta - theta_hat)^T A_t (theta - theta_hat)``
        meets ``beta_t**2`` (or ``lam`` reaches the upper bracket of 1000).
        A random unit vector is returned when the result has (near) zero norm.
        """
        # 1. Current regularised design matrix and ridge estimate.
        # NOTE(review): the base class seeds ``self.A`` with the identity, so
        # the effective ridge term is ``lambda_reg + 1`` -- confirm intended.
        eye = torch.eye(self.dim, dtype=self.A.dtype, device=self.A.device)
        A_t = self.lambda_reg * eye + self.A
        theta_hat = self._safe_inverse(A_t) @ self.b

        # 2. Confidence radius beta_t.
        current_round_t = max(self.total_iterations_for_bandit, 1)
        log_term_numerator = 1 + (current_round_t - 1) * self.L**2 / self.lambda_reg
        if log_term_numerator <= 0:
            log_term_numerator = 1e-9
        beta_t = (self.sigma * math.sqrt(
                    self.dim * math.log(log_term_numerator / self.delta))
                  + math.sqrt(self.lambda_reg) * self.S)
        radius_sq = beta_t**2

        # Loop-invariant right-hand side of theta(lam).
        rhs = A_t @ theta_hat

        def shrunk_theta(lam):
            return self._safe_inverse(A_t + lam * eye) @ rhs

        # 3. Bisection on lam.  The constraint value is 0 at lam = 0 and grows
        # monotonically with lam, so an overshoot must LOWER the upper bracket.
        # (The brackets used to move the other way, which always drove lam to
        # one of the endpoints 0 or 1000.)
        low = 0.0
        high = 1000.0
        for _ in range(100):
            lam = max((low + high) / 2.0, 1e-9)
            diff = shrunk_theta(lam) - theta_hat
            val = (diff @ A_t @ diff).item()
            if val > radius_sq:
                high = lam
            else:
                low = lam

        optimal_theta = shrunk_theta((low + high) / 2.0)

        # 4. Normalise; fall back to a random direction for a (near) zero vector.
        norm = torch.norm(optimal_theta)
        if norm < 1e-9:
            direction = torch.randn(self.dim, dtype=self.A.dtype, device=self.A.device)
            return direction / torch.norm(direction)
        return optimal_theta / norm

    def optimize(self):
        """Run the optimisation loop until ``n_max`` evaluations are recorded.

        Returns:
            Tuple ``(best_point, best_value)`` where the incumbent is the
            evaluated point with the lowest GP posterior mean.
        """
        self.initialize()
        n_bo_iter = self.n_initial

        while n_bo_iter < self.n_max:
            self.total_iterations_for_bandit += 1

            # Direction from the continuous bandit search.
            direction = self.find_optimal_direction_continuous()
            self.selected_direction_history.append(direction.clone())

            new_x, _, _, _, _ = self.propose_new_x(direction)

            # Evaluate the objective at the proposed point.
            actual_y_at_new_x = self.objective_function(new_x.unsqueeze(0)).squeeze().item()

            # Reward vector: |d mean / d x| of the current GP at the new point.
            # ``torch.autograd.grad`` differentiates w.r.t. the input only, so
            # no gradients are accumulated on the model's own parameters.
            new_x_for_grad = new_x.detach().clone().unsqueeze(0)
            new_x_for_grad.requires_grad_(True)
            mean_at_new_x = self.model.posterior(new_x_for_grad).mean
            (grad_wrt_x,) = torch.autograd.grad(mean_at_new_x.sum(), new_x_for_grad)
            reward_vector = grad_wrt_x.squeeze(0).abs()

            # Bandit update: rank-one term from the played direction for A,
            # and the gradient-magnitude vector added directly to b.
            x_arm_for_update = direction.view(-1, 1)
            self.A += x_arm_for_update @ x_arm_for_update.t()
            self.b += reward_vector

            # Append the observation and refit the surrogate.
            self.X = torch.cat([self.X, new_x.unsqueeze(0)], 0)
            self.Y = torch.cat([self.Y, torch.tensor([[actual_y_at_new_x]], dtype=torch.float32, device=self.X.device)], 0)
            self.update_model()

            # Re-select the incumbent under the refitted model.
            with torch.no_grad():
                posterior_mean_overall = self.model.posterior(self.X).mean.squeeze(-1)
            current_best_idx = posterior_mean_overall.argmin()
            self.best_value = posterior_mean_overall[current_best_idx].item()
            self.best_point = self.X[current_best_idx]
            self.eval_history.append(self.best_value)
            n_bo_iter += 1

        return self.best_point, self.best_value

## 実験実行と比較

In [6]:
def generate_initial_points(n_initial, dim, bounds):
    """Draw ``n_initial`` uniform random points inside the box ``bounds``.

    Args:
        n_initial: Number of points (rows) to draw.
        dim: Number of coordinates (columns) per point.
        bounds: Tensor whose row 0 holds the lower and row 1 the upper bounds.

    Returns:
        Tensor of shape ``(n_initial, dim)``.
    """
    lower, upper = bounds[0], bounds[1]
    unit_samples = torch.rand(n_initial, dim)
    # Stretch the unit-cube samples to the box and shift to its lower corner.
    return unit_samples * (upper - lower) + lower

In [7]:
# Experiment driver: for every test function, run both reward designs n_runs
# times from shared initial designs, then save a per-algorithm direction-usage
# bar chart, a convergence comparison plot and a side-by-side direction plot.
if __name__ == "__main__":
    # (short name, objective callable, value drawn as the reference line in the
    # convergence plot)
    test_funcs = [
        ("StyblinskiTang", styblinski_tang_100d, -195.83),
        ("Rastrigin", rastrigin_100d, 0.0),
        ("Ackley", ackley_100d, 0.0),
    ]
    # NOTE(review): the objectives are named "*_100d" but the search space here
    # has 20 dimensions (they only read the first five) -- confirm intended.
    dim = 20
    bounds = torch.tensor([[-5.0]*dim, [5.0]*dim], dtype=torch.float32)
    n_initial = 5
    n_iter = 300  # passed to the optimisers as n_max
    n_runs = 20

    output_base_dir = "output_results_reward_comparison"
    os.makedirs(output_base_dir, exist_ok=True)

    coordinate_ratio = 0.8

    # One initial design per run index; every test function and algorithm
    # reuses the design belonging to a given run index.
    initial_points_all_runs = [
        generate_initial_points(n_initial, dim, bounds)
        for _ in range(n_runs)
    ]

    algorithms = [
        ("Original_Reward", ECI_BO_Bandit_Continuous_Original),
        ("Gradient_Reward", ECI_BO_Bandit_Continuous_Gradient)
    ]

    for func_name_short, func_eval, global_opt_val in test_funcs:
        print(f"========== テスト関数実行中: {func_name_short} ==========")

        # Results of every algorithm for this test function.
        all_algorithm_results = {}

        for algo_name, algo_class in algorithms:
            print(f"--- {algo_name} アルゴリズム実行中 ---")

            histories_for_this_algo = []
            dim_sums_for_this_algo = []

            # Plain in-place progress display instead of tqdm.
            for run_idx in range(n_runs):
                print(f"\r  実行中: {run_idx + 1}/{n_runs}", end="", flush=True)

                # Clone so the shared initial design is never modified by a run.
                initial_X_for_run = initial_points_all_runs[run_idx].clone().to(dtype=torch.float32)

                optimizer = algo_class(
                    X=initial_X_for_run,
                    objective_function=func_eval,
                    bounds=bounds,
                    n_initial=n_initial,
                    n_max=n_iter,
                    dim=dim,
                    algo_base_name=func_name_short,
                    coordinate_ratio=coordinate_ratio,
                    run_id=run_idx + 1,
                    output_base_dir=output_base_dir
                )

                _, _ = optimizer.optimize()

                histories_for_this_algo.append(optimizer.eval_history)

                # Per-dimension sum of |direction component| over the run.
                if optimizer.selected_direction_history:
                    directions_tensor = torch.stack(optimizer.selected_direction_history, 0)
                    abs_sum_per_dim = directions_tensor.abs().sum(dim=0).cpu().numpy()
                    dim_sums_for_this_algo.append(abs_sum_per_dim)
                else:
                    dim_sums_for_this_algo.append(np.zeros(dim))

            print()  # newline after the in-place progress line

            # Convergence statistics across runs.
            eval_histories_np_array = np.array(histories_for_this_algo)
            mean_convergence = eval_histories_np_array.mean(axis=0)
            std_convergence = eval_histories_np_array.std(axis=0)

            if dim_sums_for_this_algo:
                avg_dim_abs_sum = np.mean(np.stack(dim_sums_for_this_algo, 0), axis=0)
            else:
                avg_dim_abs_sum = np.zeros(dim)

            all_algorithm_results[algo_name] = {
                'mean_hist': mean_convergence,
                'std_hist': std_convergence,
                'avg_dim_abs_sum': avg_dim_abs_sum
            }

            # Save the direction-usage bar chart for this algorithm.
            plot_save_dir = os.path.join(output_base_dir, func_name_short, f"{algo_name}_coord_{coordinate_ratio:.1f}")
            os.makedirs(plot_save_dir, exist_ok=True)

            plt.figure(figsize=(10, 6))
            plt.bar(np.arange(dim), avg_dim_abs_sum, alpha=0.7)
            plt.xlabel("次元インデックス", fontsize=12)
            plt.ylabel("方向成分絶対値の平均和", fontsize=12)
            title_str = (f"{func_name_short} - {algo_name} (coord_ratio={coordinate_ratio:.1f})\n"
                        f"{n_runs}回実行での方向絶対値平均和")
            plt.title(title_str, fontsize=14)
            plt.xticks(np.arange(0, dim, step=max(1, dim//10)), fontsize=10)
            plt.yticks(fontsize=10)
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(os.path.join(plot_save_dir, "average_dimension_abs_sum.png"), dpi=150)
            plt.close()

        # Build the convergence comparison plot (mean +/- one standard deviation).
        plt.figure(figsize=(12, 8))
        iters_plot = np.arange(1, n_iter + 1)

        # One colour per algorithm; indexed by position, so this list must be
        # at least as long as `algorithms`.
        colors = ['blue', 'red']
        for i, (algo_name, results) in enumerate(all_algorithm_results.items()):
            plt.plot(iters_plot, results['mean_hist'], 
                    label=f"{algo_name}", color=colors[i], linewidth=2)
            plt.fill_between(iters_plot,
                           results['mean_hist'] - results['std_hist'],
                           results['mean_hist'] + results['std_hist'],
                           alpha=0.2, color=colors[i])

        plt.axhline(global_opt_val, color='green', linestyle='--', label='大域最適値', linewidth=2)
        plt.xlabel("評価回数", fontsize=14)
        plt.ylabel("発見された最良目的値 (平均 ± 標準偏差)", fontsize=14)
        plt.title(f"{func_name_short}での報酬設計比較\n(coordinate_ratio={coordinate_ratio:.1f})", fontsize=16)
        plt.legend(fontsize=12)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()

        # Save the comparison plot.
        comparison_plot_save_dir = os.path.join(output_base_dir, func_name_short)
        os.makedirs(comparison_plot_save_dir, exist_ok=True)
        plt.savefig(os.path.join(comparison_plot_save_dir, f"{func_name_short}_reward_comparison.png"), dpi=150)
        plt.close()

        # Build the side-by-side direction-usage comparison (one subplot per
        # algorithm; the 1x2 grid assumes exactly two algorithms).
        plt.figure(figsize=(12, 5))
        for i, (algo_name, results) in enumerate(all_algorithm_results.items()):
            plt.subplot(1, 2, i+1)
            plt.bar(np.arange(dim), results['avg_dim_abs_sum'], alpha=0.7, color=colors[i])
            plt.xlabel("次元インデックス", fontsize=10)
            plt.ylabel("方向成分絶対値の平均和", fontsize=10)
            plt.title(f"{algo_name}\n{func_name_short}", fontsize=12)
            plt.xticks(np.arange(0, dim, step=max(1, dim//5)), fontsize=8)
            plt.yticks(fontsize=8)
            plt.grid(axis='y', linestyle='--', alpha=0.5)

        plt.suptitle(f"方向使用比較 - {func_name_short}", fontsize=14)
        plt.tight_layout()
        plt.savefig(os.path.join(comparison_plot_save_dir, f"{func_name_short}_direction_comparison.png"), dpi=150)
        plt.close()

        print(f"========== テスト関数完了: {func_name_short} ==========")

    print("全ての実験が完了しました。")

--- Original_Reward アルゴリズム実行中 ---
  実行中: 20/20
--- Gradient_Reward アルゴリズム実行中 ---
  実行中: 20/20
--- Original_Reward アルゴリズム実行中 ---
  実行中: 20/20
--- Gradient_Reward アルゴリズム実行中 ---
  実行中: 20/20
--- Original_Reward アルゴリズム実行中 ---
  実行中: 20/20
--- Gradient_Reward アルゴリズム実行中 ---
  実行中: 20/20
全ての実験が完了しました。


## 結果の分析

実験完了後、以下の分析を行います：

1. **収束性能の比較**
   - 元の報酬設計 vs 新しい報酬設計の収束速度
   - 最終的な最適化性能の差

2. **方向選択の比較**
   - 各次元の使用頻度の違い
   - 有効次元（0-4）への集中度

3. **報酬設計の影響**
   - 予測誤差ベース vs 勾配ベースの特徴
   - 各テスト関数での効果の違い

実験を実行して結果を確認してください。