In [1]:
import gym
import numpy as np
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
from tensorflow.compat.v1.losses import huber_loss
from utils.gpu_memory import limited_gpu_memory

In [2]:
limited_gpu_memory()

memory growth: True


In [3]:
ENV = 'CartPole-v0'

NUM_EPISODES = 500
MAX_STEPS = 200
GAMMA = 0.99


MEMORY_SIZE = 10000
BATCH_SIZE = 32

In [4]:
class QNetwork:
    def __init__(self, num_states, num_actions):
        self.model = Sequential()
        self.model.add(Dense(32, activation='relu', input_dim=num_states))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(16, activation='relu'))
        self.model.add(Dense(num_actions, activation='linear'))
        
        self.model.compile(loss=huber_loss, optimizer=Adam(lr=0.001))

In [5]:
class ReplayMemory():
    def __init__(self, memory_size):
        self.buffer = deque(maxlen=memory_size)
        
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]
    
    def __len__(self):
        return len(self.buffer)

In [6]:
class Brain:
    def __init__(self, num_states, num_actions):
        self.num_states = num_states
        self.num_actions = num_actions
        
        self.main_qn = QNetwork(num_states, num_actions)
        self.target_qn = QNetwork(num_states, num_actions)
        print(self.main_qn.model.summary())
        
        self.memory = ReplayMemory(MEMORY_SIZE)
        
    def replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        
        inputs = np.zeros((BATCH_SIZE, self.num_states))
        targets = np.zeros((BATCH_SIZE, self.num_actions))

        minibatch =self. memory.sample(BATCH_SIZE)

        for i, (state_b, action_b, next_state_b, reward_b) in enumerate(minibatch):
            inputs[i] = state_b

            if not(next_state_b == np.zeros(state_b.shape)).all(axis=1):
                target = reward_b + GAMMA * np.amax(self.target_qn.model.predict(next_state_b)[0])
            else:
                target = reward_b

            targets[i] = self.main_qn.model.predict(state_b)
            targets[i][action_b] = target

        self.main_qn.model.fit(inputs, targets, epochs=1, verbose=0)
        
    def decide_action(self, state, episode):
        epsilon = 0.5 * (1 / episode)
        
        if epsilon <= np.random.uniform(0, 1):
            return np.argmax(self.main_qn.model.predict(state)[0])
        
        return np.random.choice([0, 1])
    
    def update_target_q_network(self):
        self.target_qn.model.set_weights(self.main_qn.model.get_weights())

In [7]:
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)
        
    def update_q_function(self):
        self.brain.replay()
        
    def get_action(self, state, episode):
        return self.brain.decide_action(state, episode)
    
    def memorize(self, state, action, state_next, reward):
        self.brain.memory.push(state, action, state_next, reward)
        
    def get_action(self, state, episode):
        return self.brain.decide_action(state, episode)
        
    def memorize(self, state, action, next_state, reward):
        self.brain.memory.add((state, action, next_state, reward))
        
    def update_target_q_function(self):
        self.brain.update_target_q_network()

In [8]:
class Enviroment:
    def __init__(self):
        self.env = gym.make(ENV)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(self.num_states, self.num_actions)
        
    def run(self):
        success_count = 0
        episode_final = False
        episode_10_list = np.zeros(10)  # 10試行分の立ち続けたstep数を格納し、平均ステップ数を出力に利用

        for episode in range(1, NUM_EPISODES + 1):
            state = self.env.reset()
            state = np.reshape(state, (1, self.num_states))
            
            for step in range(1, MAX_STEPS + 1):

                # 行動決定
                action = self.agent.get_action(state, episode)

                next_state, _, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.num_states])

                if done:
                    if step >= 190:
                        success_count += 1
                        reward = 1
                    else:
                        success_count = 0 # 連続記録をリセット
                        reward = -1

                    next_state = np.zeros(next_state.shape)
                    episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                else:
                    reward = 0
                
                self.agent.memorize(state, action, next_state, reward)
                
                self.agent.update_q_function()
                
                state = next_state

                # 終了時の処理
                if done:
                    print("{} Episode: Finished after {} steps：10試行の平均step数 = {}".format(episode, step, episode_10_list.mean()))
                    if episode % 2 == 0:
                        self.agent.update_target_q_function()
                    break
            
            if episode_final is True:
                self.env.render()
                break

            if success_count >= 5:
                episode_final = True

In [9]:
cartpole_env = Enviroment()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 1,778
Trainable params: 1,778
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# cartpole_env.env.action_space.sample()
print(np.random.uniform(0, 1))

0.039709932306058016


In [None]:
cartpole_env.run()

1 Episode: Finished after 11 steps：10試行の平均step数 = 1.2
2 Episode: Finished after 14 steps：10試行の平均step数 = 2.7
3 Episode: Finished after 9 steps：10試行の平均step数 = 3.7
4 Episode: Finished after 12 steps：10試行の平均step数 = 5.0
5 Episode: Finished after 12 steps：10試行の平均step数 = 6.3
6 Episode: Finished after 9 steps：10試行の平均step数 = 7.3
7 Episode: Finished after 9 steps：10試行の平均step数 = 8.3
8 Episode: Finished after 12 steps：10試行の平均step数 = 9.6
9 Episode: Finished after 10 steps：10試行の平均step数 = 10.7
10 Episode: Finished after 9 steps：10試行の平均step数 = 11.7
11 Episode: Finished after 10 steps：10試行の平均step数 = 11.6
12 Episode: Finished after 16 steps：10試行の平均step数 = 11.8
13 Episode: Finished after 11 steps：10試行の平均step数 = 12.0
14 Episode: Finished after 11 steps：10試行の平均step数 = 11.9
15 Episode: Finished after 14 steps：10試行の平均step数 = 12.1
16 Episode: Finished after 16 steps：10試行の平均step数 = 12.8
17 Episode: Finished after 10 steps：10試行の平均step数 = 12.9
18 Episode: Finished after 11 steps：10試行の平均step数 = 12.8
19 Episode: F