In [1]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
from tensorflow.compat.v1.losses import huber_loss
from utils.gpu_memory import limited_gpu_memory

In [2]:
# Configure GPU memory handling before anything else touches the GPU.
# NOTE(review): the helper comes from utils.gpu_memory (not visible here); the
# recorded cell output "memory growth: True" suggests it enables memory growth —
# confirm against the helper's source.
limited_gpu_memory()
# NOTE(review): this import sits under tensorflow.python.*, which is not a public
# API path, and it lives outside the notebook's import cell — consider moving it.
from tensorflow.python.client import device_lib
# Last expression of the cell: the list of devices TensorFlow can see is shown as
# the cell output (used here to check that the GPU is visible).
device_lib.list_local_devices()

memory growth: True


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 393722072269112159,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 7049546957
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 12511179666425546025
 physical_device_desc: "device: 0, name: GeForce RTX 2060 SUPER, pci bus id: 0000:06:00.0, compute capability: 7.5"]

In [3]:
# --- Training schedule ---
NUM_EPISODES = 200    # maximum number of episodes to run
# Per-episode step cap. The training loop only counts an episode as a success
# when it ends at step >= 190, so this cap must be at least 190; the previous
# value of 50 made success (and therefore any non-zero reward) unreachable.
# 200 matches the CartPole-v0 episode time limit.
MAX_STEPS = 200
GAMMA = 0.99          # discount factor applied to the bootstrapped next-state value
WARMUP = 10           # transitions from the first WARMUP steps of each episode are not stored

# --- Epsilon-greedy exploration: epsilon decays exponentially with total steps ---
E_START = 1.0         # initial exploration rate
E_STOP = 0.01         # asymptotic (minimum) exploration rate
E_DECAY_RATE = 0.001  # decay rate per environment step

# --- Experience replay ---
MEMORY_SIZE = 1000    # maximum number of stored transitions
BATCH_SIZE = 32       # transitions sampled per gradient update

In [4]:
class QNetwork:
    """Fully connected Q-value approximator.

    Builds and compiles a Keras ``Sequential`` model with three ReLU hidden
    layers and a linear output layer producing one value per action. The
    compiled model is exposed as ``self.model``.

    Args:
        state_size: Number of input features (length of one state vector).
        action_size: Number of output units (one Q-value per action).
        hidden_size: Units in each of the three hidden layers (default 16).
        learning_rate: Step size passed to the Adam optimizer (default 0.001).
    """

    def __init__(self, state_size, action_size, hidden_size=16, learning_rate=0.001):
        self.model = Sequential()
        self.model.add(Dense(hidden_size, activation='relu', input_dim=state_size))
        self.model.add(Dense(hidden_size, activation='relu'))
        self.model.add(Dense(hidden_size, activation='relu'))
        # Linear output: Q-values are unbounded regression targets.
        self.model.add(Dense(action_size, activation='linear'))

        # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
        # releases reject.
        self.model.compile(loss=huber_loss, optimizer=Adam(learning_rate=learning_rate))

In [5]:
class Memory():
    """Bounded FIFO store of experiences with uniform random sampling."""

    def __init__(self, memory_size):
        """Create an empty buffer holding at most ``memory_size`` experiences."""
        # A deque with maxlen silently drops the oldest entry once it is full.
        self.buffer = deque(maxlen=memory_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen uniformly.

        Raises whatever ``np.random.choice`` raises when ``batch_size`` exceeds
        the number of stored experiences (sampling is without replacement).
        """
        stored = len(self.buffer)
        # An int population is sampled as if it were np.arange(stored).
        picked = np.random.choice(stored, size=batch_size, replace=False)
        batch = []
        for position in picked:
            batch.append(self.buffer[position])
        return batch

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

In [6]:
# Create the environment and read the problem dimensions from its spaces.
env = gym.make('CartPole-v0')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Two identically shaped networks, built in the same order as before:
# the online network first, then the target network.
main_qn, target_qn = [QNetwork(state_size, action_size) for _ in range(2)]

# Replay buffer shared by the whole training run.
memory = Memory(MEMORY_SIZE)

In [None]:
# An episode counts as a success when it ends at or after SUCCESS_STEPS steps;
# training stops early after SUCCESS_STREAK consecutive successes.
# NOTE: success is only reachable when MAX_STEPS >= SUCCESS_STEPS.
SUCCESS_STEPS = 190
SUCCESS_STREAK = 5

state = env.reset()
state = np.reshape(state, [1, state_size])

total_step = 0      # environment steps taken across all episodes (drives epsilon decay)
success_count = 0   # current run of consecutive successful episodes

for episode in range(1, NUM_EPISODES + 1):
    step = 0

    # Refresh the target network from the online network once per episode.
    target_qn.model.set_weights(main_qn.model.get_weights())

    for _ in range(1, MAX_STEPS + 1):
        # env.render()
        step += 1
        total_step += 1

        # Exponential decay of epsilon from E_START towards E_STOP.
        epsilon = E_STOP + (E_START - E_STOP) * np.exp(-E_DECAY_RATE * total_step)

        # Epsilon-greedy action selection.
        if epsilon > np.random.rand():
            action = env.action_space.sample()
        else:
            action = np.argmax(main_qn.model.predict(state, verbose=0)[0])

        # The environment's own reward is ignored; a custom reward is assigned below.
        next_state, _, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        if done:
            if step >= SUCCESS_STEPS:
                success_count += 1
                reward = 1
            else:
                success_count = 0
                reward = 0

            # An all-zero next state marks the stored transition as terminal.
            next_state = np.zeros(state.shape)
        else:
            reward = 0

        # Skip storing the first WARMUP transitions of every episode.
        if step > WARMUP:
            memory.add((state, action, reward, next_state))

        if not done:
            state = next_state

        if len(memory) >= BATCH_SIZE:
            minibatch = memory.sample(BATCH_SIZE)

            # Stack the (1, state_size) rows into (BATCH_SIZE, state_size) arrays so
            # each network is evaluated once per update instead of once per sample.
            states_b = np.vstack([experience[0] for experience in minibatch])
            actions_b = np.array([experience[1] for experience in minibatch])
            rewards_b = np.array([experience[2] for experience in minibatch])
            next_states_b = np.vstack([experience[3] for experience in minibatch])

            # Terminal transitions were stored with an all-zero next state.
            non_terminal = np.any(next_states_b != 0, axis=1)

            # Bootstrapped target: r + GAMMA * max_a' Q_target(s', a'); just r when terminal.
            max_next_q = np.amax(target_qn.model.predict(next_states_b, verbose=0), axis=1)
            q_targets = rewards_b + GAMMA * max_next_q * non_terminal

            # Start from the online network's current predictions so that only the
            # taken action's output contributes to the loss.
            targets = main_qn.model.predict(states_b, verbose=0)
            targets[np.arange(len(minibatch)), actions_b] = q_targets

            main_qn.model.fit(states_b, targets, epochs=1, verbose=0)

        if done:
            break

    print('エピソード: {}, ステップ数: {}, epsilon: {:.4f}'.format(episode, step, epsilon))

    if success_count >= SUCCESS_STREAK:
        break

    state = env.reset()
    state = np.reshape(state, [1, state_size])

エピソード: 1, ステップ数: 26, epsilon: 0.9746
エピソード: 2, ステップ数: 20, epsilon: 0.9555
エピソード: 3, ステップ数: 11, epsilon: 0.9451
エピソード: 4, ステップ数: 13, epsilon: 0.9331
エピソード: 5, ステップ数: 10, epsilon: 0.9239
エピソード: 6, ステップ数: 44, epsilon: 0.8845
