In [7]:
# -*- coding: utf-8 -*-
import optuna
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import GlorotNormal #(튜닝) 추가
from tensorflow.keras.activations import elu
from tensorflow.keras.optimizers import Adam
import gym
import numpy as np
import random as rand
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

LOSS_CLIPPING = 0.2

class Agent(object):
    #(1) trial
    def __init__(self, trial):

        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        #(2) 변수설정
        self.node_num = trial.suggest_int("node_num", 16, 128, step=16)
        self.actor_lr = trial.suggest_float("actor_lr", 1e-5, 1e-2, log=True)
        self.critic_lr = trial.suggest_float("critic_lr", 1e-5, 1e-2, log=True)
        self.epochs_cnt = trial.suggest_int("epochs_cnt", 1, 10)
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        self.discount_rate = trial.suggest_float("discount_rate", 0.90, 0.999)
        self.lambda_gae = trial.suggest_float("lambda_gae", 0.90, 0.99)
        self.penalty = -10

        #(3) episode_num = 200
        self.episode_num = 200
        self.moving_avg_size = 20
        self.reward_list = []
        self.count_list = []
        self.moving_avg_list = []

        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.next_states = []
        self.old_probs = []

    def build_actor(self):
        input_states = Input(shape=(self.state_size,), name='input_states')

        x = Dense(self.node_num, activation='swish', kernel_initializer=GlorotNormal())(input_states)  
        out_actions = Dense(self.action_size, activation='softmax', name='output')(x)

        model = tf.keras.models.Model(inputs=input_states, outputs=out_actions)

        model.compile(optimizer=Adam(learning_rate=self.actor_lr, clipnorm=1.0))
        return model

    def build_critic(self):
        input_states = Input(shape=(self.state_size,), name='input_states')

        x = Dense(self.node_num, activation='swish', kernel_initializer=GlorotNormal())(input_states)  
        out_value = Dense(1, activation='linear', name='value')(x)

        model = tf.keras.models.Model(inputs=input_states, outputs=out_value)
        
        model.compile(optimizer=Adam(learning_rate=self.critic_lr, clipnorm=1.0), loss='mean_squared_error')
        return model

    def train(self):
        for episode in range(self.episode_num):

            state, _ = self.env.reset()
            reward_tot = 0
            step_count = 0
            done = False

            while not done:
                action, prob = self.get_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                if done and step_count < 499:
                    reward = self.penalty

                self.store_transition(state, action, reward, done, next_state, prob)
                state = next_state
                reward_tot += reward
                step_count += 1

            self.reward_list.append(reward_tot - self.penalty)
            self.count_list.append(step_count)
            self.moving_avg_list.append(np.mean(self.reward_list[-self.moving_avg_size:]))

            self.update_models()
            self.clear_memory()

        #(4) 전체 성과 반환 (목적 함수용)
        return np.mean(self.reward_list)

    def get_action(self, state):
        state_input = np.reshape(state, [1, self.state_size]).astype(np.float32)

        prob = self.actor(state_input, training=False).numpy()[0]

        action = np.random.choice(self.action_size, p=prob)

        return action, prob

    def store_transition(self, state, action, reward, done, next_state, prob):
        action_onehot = np.zeros(self.action_size)
        action_onehot[action] = 1.0

        self.states.append(state)
        self.actions.append(action_onehot)
        self.rewards.append(reward)
        self.dones.append(done)
        self.next_states.append(next_state)
        self.old_probs.append(prob)

    def update_models(self):
        states = np.array(self.states, dtype=np.float32)
        next_states = np.array(self.next_states, dtype=np.float32)
        rewards = np.array(self.rewards, dtype=np.float32).reshape(-1, 1)
        dones = np.array(self.dones, dtype=np.int32).reshape(-1, 1)
        actions = np.array(self.actions, dtype=np.float32)
        old_probs = np.array(self.old_probs, dtype=np.float32)

        advantages, targets = self.compute_gae(states, next_states, rewards, dones)
        advantages -= np.mean(advantages)
        advantages /= (np.std(advantages) + 1e-8)

        advantages = advantages.astype(np.float32)
        targets = targets.astype(np.float32)

        for _ in range(self.epochs_cnt):
            with tf.GradientTape() as tape:
                probs = self.actor(states, training=True)
                new_probs = tf.reduce_sum(actions * probs, axis=1, keepdims=True)
                old_probs_sum = tf.reduce_sum(actions * old_probs, axis=1, keepdims=True)

                ratio = new_probs / (old_probs_sum + 1e-10)
                clipped_ratio = tf.clip_by_value(ratio, 1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING)
                actor_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))

            gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor.optimizer.apply_gradients(zip(gradients, self.actor.trainable_variables))

            self.critic.train_on_batch(states, targets)

    def compute_gae(self, states, next_states, rewards, dones):

        values = self.critic(states, training=False).numpy()
        next_values = self.critic(next_states, training=False).numpy()

        advantages = np.zeros_like(rewards, dtype=np.float32)
        targets = np.zeros_like(rewards, dtype=np.float32)

        gae = 0.0

        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.discount_rate * next_values[t] * (1 - dones[t]) - values[t]

            gae = delta + self.discount_rate * self.lambda_gae * (1 - dones[t]) * gae

            advantages[t] = gae
            targets[t] = gae + values[t]
        return advantages, targets
		
    def clear_memory(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.next_states = []
        self.old_probs = []

#(5) Optuna 목적 함수
def objective(trial):
    agent = Agent(trial)
    avg_reward = agent.train()
    return avg_reward
    
if __name__ == "__main__":
    #(6) Optuna 최적화 실행
    study = optuna.create_study(direction="maximize")  # reward 최대화
    study.optimize(objective, n_trials=30)

    print("Best trial:")
    print("  Value: ", study.best_value)
    print("  Params: ", study.best_params)

[I 2025-04-03 07:07:36,416] A new study created in memory with name: no-name-1f7d903b-b95e-4ebd-8a90-26c17784d142
[I 2025-04-03 07:08:44,994] Trial 0 finished with value: 218.05 and parameters: {'node_num': 112, 'actor_lr': 0.0038585321692404815, 'critic_lr': 0.00030119862008390195, 'epochs_cnt': 6, 'discount_rate': 0.9773412435247546, 'lambda_gae': 0.9696765285440372}. Best is trial 0 with value: 218.05.
[I 2025-04-03 07:10:26,551] Trial 1 finished with value: 325.53 and parameters: {'node_num': 128, 'actor_lr': 0.0006987296340588978, 'critic_lr': 0.0001181861980175588, 'epochs_cnt': 9, 'discount_rate': 0.9943858638955163, 'lambda_gae': 0.9726290455601527}. Best is trial 1 with value: 325.53.
[I 2025-04-03 07:11:37,078] Trial 2 finished with value: 244.13 and parameters: {'node_num': 128, 'actor_lr': 0.006500115648153573, 'critic_lr': 0.0005152116218151408, 'epochs_cnt': 5, 'discount_rate': 0.9073762307627764, 'lambda_gae': 0.9844586157742938}. Best is trial 1 with value: 325.53.
[I 2

Best trial:
  Value:  -328.445
  Params:  {'node_num': 96, 'actor_lr': 0.005372432964286116, 'critic_lr': 0.0002277168169569523, 'epochs_cnt': 5, 'discount_rate': 0.9745127000993128, 'lambda_gae': 0.970020616032837}
