In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

In [36]:
import numpy as np
from typing import Callable, List, Tuple
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

class CMAES():
    def __init__(self, arg_names: List[str], ave_vec: List[float], sigma=1.0, max_iter=100, population=None, mu=None, fixed_args=None):
        self.arg_names = arg_names
        self.fixed_args = fixed_args or {}
        self.dim = len(ave_vec)
        self.max_iter = max_iter
        # 個体数と選抜数
        self.population = population if population else int(4 + 3 * np.log(self.dim))
        self.mu = mu if mu else int(np.floor(self.population / 2))
        # 平均値ベクトル
        self.m = np.array(ave_vec, dtype=np.float64)
        # 重み行列の計算(muを定義した後)
        self.weights = self.calc_weights()
        self.mu_eff = 1.0 / (self.weights**2).sum()
        self.sigma = float(sigma)
        self.C = np.identity(self.dim)
        self.c_1 = 2.0 / ((self.dim + 1.3) ** 2 + self.mu_eff)
        self.c_mu = min(
        1 - self.c_1,
        2.0 * (self.mu_eff - 2 + 1/self.mu_eff) / ((self.dim + 2) ** 2 + self.mu_eff)
        )
        self.chi = np.sqrt(self.dim) * (1 - 1 / (4 * self.dim) + 1 / (21 * (self.dim ** 2)))
        self.c_c = (4 + self.mu_eff / self.dim) / (self.dim + 4 + 2 * self.mu_eff / self.dim)
        self.c_sigma = (self.mu_eff + 2) / (self.dim + self.mu_eff + 5)
        self.p_c = np.zeros(self.dim)
        self.p_sigma = np.zeros(self.dim)
        self.loss = float('inf')
        self.best_val = None

        self.history = {
            'best_fitness': [],
            'mean_fitness': [],
            'worst_fitness': [],
            'mean_vector': [],
            'sigma': [],
            'eigenvalues': [],
            'populations': []  # 各世代の全個体
        }

    def sample(self) -> List[float]:
        """多次元正規分布からサンプリングをする"""
        arr = np.random.multivariate_normal(mean=self.m, cov=self.C, size=self.dim)
        arr = arr.tolist()[0]
        return arr

    def calc_weights(self):
        """対数重みを計算する"""
        raw_weights = np.log(self.mu + 0.5) - np.log(np.arange(1, self.mu + 1))
        return raw_weights / raw_weights.sum()

    def matrix_inverse_sqrt(self):
        # 固有値分解
        eigvals, eigvecs = np.linalg.eigh(self.C)

        # 数値安定性のために微小値で下限をつける
        eigvals = np.maximum(eigvals, 1e-20)

        # Λ^{-1/2}
        D_inv_sqrt = np.diag(1.0 / np.sqrt(eigvals))

        # C^{-1/2} = Q Λ^{-1/2} Q^T
        C_inv_sqrt = eigvecs @ D_inv_sqrt @ eigvecs.T
        return C_inv_sqrt

    def compute_d_sigma(self):
        return 1 + self.c_sigma + 2 * max(0, np.sqrt((self.mu_eff - 1) / (self.dim + 1)) - 1)

    def debug(self):
        print(f"weights: {self.weights}")
        print(f"")

    def record_history(self, fitness_values, population):
        self.history['best_fitness'].append(np.min(fitness_values))
        self.history['mean_fitness'].append(np.mean(fitness_values))
        self.history['worst_fitness'].append(np.max(fitness_values))
        self.history['mean_vector'].append(self.m.copy())
        self.history['sigma'].append(self.sigma)
        eigenvals, _ = np.linalg.eigh(self.C)
        self.history['eigenvalues'].append(eigenvals.copy())
        self.history['populations'].append(population.copy())

    def opt(self, f: Callable) -> Tuple[float, List[float]]:
        dim = self.dim
        mu_eff = self.mu_eff

        # 選抜を行うループ
        for gen in range(self.max_iter):
            print(f"{'='*5}{gen+1}世代目{'='*5}")
            # 個体集合を生成
            group: List[List[float]] = []
            for _ in range(self.population):
                group.append(self.sample())

            # 関数に入力する
            scores: List[Tuple[float, List[float]]] = []
            for x in group:
                arg_dict = {name: val for name, val in zip(self.arg_names, x)}
                arg_dict.update(self.fixed_args)
                current_loss = f(**arg_dict)
                scores.append((current_loss, x))

            # 損失で昇順に並べ替える
            scores.sort(key=lambda x: x[0])

            # 暫定出力値の更新
            if self.loss > scores[0][0]:
                # print(f"DEBUG loss: {scores[0][0]}")
                self.loss = scores[0][0]
                self.best_val = scores[0][1]
                print(f"最小値の更新: ")
                print(f"値: {self.loss}")
                print(f"ベクトル: {self.best_val}")

            fitness_values = np.array([i[0] for i in scores])
            population = np.array([i[1] for i in scores])
            # print(f"min(fitness_values): {np.min(fitness_values)}")
            self.record_history(fitness_values, population)

            # self.muの個体を取り出す
            elites = scores[:self.mu]
            elites = np.array([i[1] for i in elites])

            # 平均値ベクトルの更新
            m_old = self.m
            self.m = self.weights @ elites
            # print(f"m: {self.m}")

            # 共分散行列のランクmu更新
            C_mu = np.zeros((dim, dim))
            for i in range(self.mu):
                x = np.array(elites[i])
                y_i = x - m_old
                C_mu = C_mu + self.weights[i] * (np.outer(y_i, y_i) / self.mu)

            # print(f"[DEBUG] C_mu: \n{C_mu}")
            C_mu /= self.sigma ** 2

            # ステップサイズσの更新処理
            y = (self.m - m_old) / self.sigma
            p_sigma = (1 - self.c_sigma) * self.p_sigma
            p_sigma += np.sqrt(1 - (1 - self.c_sigma) ** 2) * mu_eff * (self.matrix_inverse_sqrt() @ y)

            p_sigma_norm = np.linalg.norm(p_sigma)
            self.sigma = self.sigma * np.exp(
                (self.c_sigma / self.compute_d_sigma())
                * (p_sigma_norm / self.chi - 1)
            )
            self.p_sigma = p_sigma

            """
            # ステップサイズが多すぎるときにCの更新を止める
            left = np.sqrt((self.p_sigma ** 2).sum()) / np.sqrt(1 - (1 - self.c_sigma) ** (2 * (gen+1)))
            right = (1.4 + 2 / (self.dim + 1)) * self.chi
            hsigma = 1 if left < right else 0
            d_hsigma = (1 - hsigma) * self.c_c * (2 - self.c_c)
            """

            # 共分散行列のランク1更新
            self.p_c = (1 - self.c_c) * self.p_c + np.sqrt(1 - (1 - self.c_c) ** 2) * np.sqrt(mu_eff) * y
            C_1 = np.outer(self.p_c, self.p_c)

            # 共分散行列の更新
            C_new = (1 - self.c_mu - self.c_1) * self.C + self.c_mu * C_mu + self.c_1 * C_1
            self.C = C_new

        # print(f"[DEBUG] m: {m}")
        return (self.loss, self.best_val)


In [3]:
def load_mnist_data():
  """
  MNISTをロードして前処理
  """
  mnist = fetch_openml('mnist_784', version=1, as_frame=False)
  X, y = mnist.data, mnist.target.astype(int)
  # (0-255) -> (0, 1)
  X = X / 255.0

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=42, stratify=y
  )

  X_train, X_val, y_train, y_val = train_test_split(
      X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
  )

  print(f"訓練データ: {X_train.shape}")
  print(f"検証データ: {X_val.shape}")
  print(f"テストデータ: {X_test.shape}")

  return X_train, X_val, X_test, y_train, y_val, y_test


In [4]:
def create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test, batch_size=32):
  """PyTorchのDataLoaderを作成"""
  # numpy array -> Pytorch Tensor
  X_train_tensor = torch.FloatTensor(X_train)
  X_val_tensor = torch.FloatTensor(X_val)
  X_test_tensor = torch.FloatTensor(X_test)
  y_train_tensor = torch.LongTensor(y_train)
  y_val_tensor = torch.LongTensor(y_val)
  y_test_tensor = torch.LongTensor(y_test)

  train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
  val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
  test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  return train_loader, val_loader, test_loader

In [5]:
class SmallMLP(nn.Module):
  def __init__(self, input_size=784, hidden_size=128, num_classes=10, dropout_rate=0.2):
    super(SmallMLP, self).__init__()
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, num_classes)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    # (batch_size, 28, 28) → (batch_size, 784)
    x = x.view(x.size(0), -1)
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    x = self.fc2(x)

    return x


In [6]:
def train_smallmlp_with_params(learning_rate, hidden_size, dropout_rate,
                     train_loader, val_loader, epochs=10):
    """
    指定されたハイパーパラメータでモデルを訓練
    """
    start_time = time.time()
    # デバイスの設定
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # print(f"Using device: {device}")

    # パラメータの制約
    learning_rate = max(0.0001, min(0.1, learning_rate / 100))
    hidden_size = max(32, min(512, int(hidden_size * 100)))
    dropout_rate = max(0.0, min(0.5, dropout_rate / 100))

    model = SmallMLP(hidden_size=hidden_size, dropout_rate=dropout_rate)
    model = model.to(device)  # モデルをGPUに移動

    # 訓練
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        for data, target in train_loader:
            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss = loss.item()

        training_time = time.time() - start_time
        # print(f'Epoch [{epoch+1}/{epochs}], ' f'Train Loss: {train_loss:.4f}, ' f'Time: {training_time:.2f}')

    # 検証
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in val_loader:
            # データをGPUに移動
            data = data.to(device)
            target = target.to(device)

            output = model(data)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    training_time = time.time() - start_time
    accuracy = correct / total
    print(f"training time: {training_time:.2f}s, accuracy: {accuracy:.4f}, lr: {learning_rate:.4f}, hidden: {hidden_size}, dropout: {dropout_rate:.4f}")
    return -accuracy  # CMA-ESは最小化なので負の値を返す


In [7]:
X_train, X_val, X_test, y_train, y_val, y_test = load_mnist_data()
train_loader, val_loader, test_loader = create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test, batch_size=256)

訓練データ: (44800, 784)
検証データ: (11200, 784)
テストデータ: (14000, 784)


In [8]:
params = ["learning_rate", "hidden_size", "dropout_rate"]
fixed_args = {"train_loader": train_loader, "val_loader": val_loader, "epochs": 10}
init_point = [0.0, 0.0, 0.0]
cmaes = CMAES(arg_names=params, ave_vec=init_point, max_iter=20, fixed_args=fixed_args)

In [9]:
loss, value = cmaes.opt(train_smallmlp_with_params)

=====1世代目=====
training time: 14.06s, accuracy: 0.9696, lr: 0.0118, hidden: 91, dropout: 0.0040
training time: 6.58s, accuracy: 0.9609, lr: 0.0059, hidden: 32, dropout: 0.0149
training time: 6.00s, accuracy: 0.9029, lr: 0.0001, hidden: 32, dropout: 0.0060
training time: 6.53s, accuracy: 0.8992, lr: 0.0001, hidden: 32, dropout: 0.0000
training time: 6.01s, accuracy: 0.9544, lr: 0.0014, hidden: 32, dropout: 0.0000
training time: 6.90s, accuracy: 0.9734, lr: 0.0061, hidden: 186, dropout: 0.0098
training time: 6.19s, accuracy: 0.9704, lr: 0.0039, hidden: 109, dropout: 0.0000
最小値の更新: 
値: -0.9733928571428572
ベクトル: [0.6141138254655145, 1.864712036214139, 0.981877964537629]
=====2世代目=====
training time: 6.58s, accuracy: 0.9119, lr: 0.0001, hidden: 52, dropout: 0.0000
training time: 6.99s, accuracy: 0.9463, lr: 0.0226, hidden: 32, dropout: 0.0000
training time: 6.75s, accuracy: 0.9732, lr: 0.0100, hidden: 272, dropout: 0.0000
training time: 6.83s, accuracy: 0.9710, lr: 0.0090, hidden: 290, drop

In [35]:
result = train_smallmlp_with_params(*value, train_loader, val_loader, epochs=10)
print(result)

training time: 6.77s, accuracy: 0.9757, lr: 0.0039, hidden: 282, dropout: 0.0262
-0.9757142857142858
