###### Original

In [6]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 300


# 카트폴 예제에서의 DQN 에이전트
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False

        # 상태와 행동의 크기 정의
        self.state_size = state_size
        self.action_size = action_size

        # DQN 하이퍼파라미터
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000

        # 리플레이 메모리, 최대 크기 2000
        self.memory = deque(maxlen=2000)

        # 모델과 타깃 모델 생성
        self.model = self.build_model()
        self.target_model = self.build_model()

        # 타깃 모델 초기화
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_dqn_trained.h5")

    # 상태가 입력, 큐함수가 출력인 인공신경망 생성
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    # 타깃 모델을 모델의 가중치로 업데이트
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # 입실론 탐욕 정책으로 행동 선택
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    # 샘플 <s, a, r, s'>을 리플레이 메모리에 저장
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # 메모리에서 배치 크기만큼 무작위로 샘플 추출
        mini_batch = random.sample(self.memory, self.batch_size)

        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        actions, rewards, dones = [], [], []

        for i in range(self.batch_size):
            states[i] = mini_batch[i][0]
            actions.append(mini_batch[i][1])
            rewards.append(mini_batch[i][2])
            next_states[i] = mini_batch[i][3]
            dones.append(mini_batch[i][4])

        # 현재 상태에 대한 모델의 큐함수
        # 다음 상태에 대한 타깃 모델의 큐함수
        target = self.model.predict(states)
        target_val = self.target_model.predict(next_states)

        # 벨만 최적 방정식을 이용한 업데이트 타깃
        for i in range(self.batch_size):
            if dones[i]:
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        self.model.fit(states, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # CartPole-v1 환경, 최대 타임스텝 수가 500
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # DQN 에이전트 생성
    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # env 초기화
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            # 현재 상태로 행동을 선택
            action = agent.get_action(state)
            # 선택한 행동으로 환경에서 한 타임스텝 진행
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # 에피소드가 중간에 끝나면 -100 보상
            reward = reward if not done or score == 499 else -100

            # 리플레이 메모리에 샘플 <s, a, r, s'> 저장
            agent.append_sample(state, action, reward, next_state, done)
            # 매 타임스텝마다 학습
            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            score += reward
            state = next_state

            if done:
                # 각 에피소드마다 타깃 모델을 모델의 가중치로 업데이트
                agent.update_target_model()

                score = score if score == 500 else score + 100
                # 에피소드마다 학습 결과 출력
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    agent.model.save_weights("./save_model/cartpole_dqn.h5")
                    sys.exit()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 

episode: 95   score: 500.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 96   score: 338.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 97   score: 295.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 98   score: 331.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 99   score: 421.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 100   score: 223.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 101   score: 306.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 102   score: 286.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 103   score: 439.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 104   score: 500.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 105   score: 340.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 106   score: 345.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 107   score:

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 100  # 300

In [18]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False

        # 상태와 행동의 크기 정의
        self.state_size = state_size
        self.action_size = action_size

        # DQN 하이퍼파라미터
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 100  # 1000

        # 리플레이 메모리, 최대 크기 2000
        self.memory = deque(maxlen=2000)

        # 모델과 타깃 모델 생성
        self.model = self.build_model()
        self.target_model = self.build_model()

        # 타깃 모델 초기화
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
        
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
        
    # 타깃 모델을 모델의 가중치로 업데이트
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # 입실론 탐욕 정책으로 행동 선택
    def get_action(self, state):
        
        return random.randrange(self.action_size)
        '''
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])
        '''

    # 샘플 <s, a, r, s'>을 리플레이 메모리에 저장
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        mini_batch = random.sample(self.memory, self.batch_size)
        
        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        actions, rewards, dones = [], [], []
        
        for i in range(self.batch_size):
            states[i] = mini_batch[i][0]
            actions.append(mini_batch[i][1])
            rewards.append(mini_batch[i][2])
            next_states[i] = mini_batch[i][3]
            dones.append(mini_batch[i][4])
            
        target = self.model.predict(states)
        target_val = self.target_model.predict(next_states)
        
        print('>', type(target), target.shape, target[0])

In [19]:
if __name__ == "__main__":
    # CartPole-v1 환경, 최대 타임스텝 수가 500
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # DQN 에이전트 생성
    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # env 초기화
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        # while not done:
        for _ in range(100):
            if agent.render:
                env.render()

            # 현재 상태로 행동을 선택
            action = agent.get_action(state)
            # 선택한 행동으로 환경에서 한 타임스텝 진행
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # 에피소드가 중간에 끝나면 -100 보상
            reward = reward if not done or score == 499 else -100

            # 리플레이 메모리에 샘플 <s, a, r, s'> 저장
            agent.append_sample(state, action, reward, next_state, done)
            # 매 타임스텝마다 학습
            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            score += reward
            state = next_state

            if done:
                # 각 에피소드마다 타깃 모델을 모델의 가중치로 업데이트
                agent.update_target_model()

                score = score if score == 500 else score + 100
                # 에피소드마다 학습 결과 출력
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    agent.model.save_weights("./save_model/cartpole_dqn.h5")
                    sys.exit()
                
                break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_43 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_44 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_45 (Dense)             (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_46 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_47 (Dense)             (None, 

episode: 7   score: 18.0   memory length: 217   epsilon: 0.8886435861147077
> <class 'numpy.ndarray'> (64, 2) [-0.33629984 -0.32157612]
> <class 'numpy.ndarray'> (64, 2) [-0.06057519 -0.10981572]
> <class 'numpy.ndarray'> (64, 2) [-0.23732817 -0.03435567]
> <class 'numpy.ndarray'> (64, 2) [-0.14014058 -0.15604141]
> <class 'numpy.ndarray'> (64, 2) [-0.50068748 -0.2118908 ]
> <class 'numpy.ndarray'> (64, 2) [-0.06057519 -0.10981572]
> <class 'numpy.ndarray'> (64, 2) [-0.56471229 -0.00119291]
> <class 'numpy.ndarray'> (64, 2) [-0.34793246 -0.40823799]
> <class 'numpy.ndarray'> (64, 2) [-0.01507039 -0.01038669]
> <class 'numpy.ndarray'> (64, 2) [-0.02952508 -0.04796745]
> <class 'numpy.ndarray'> (64, 2) [-0.41655546  0.02535233]
> <class 'numpy.ndarray'> (64, 2) [-0.04984751 -0.01973659]
> <class 'numpy.ndarray'> (64, 2) [-1.029984   -0.06281167]
> <class 'numpy.ndarray'> (64, 2) [-0.30376473 -0.12254754]
> <class 'numpy.ndarray'> (64, 2) [-0.24187744  0.02334921]
> <class 'numpy.ndarray'

episode: 13   score: 16.0   memory length: 364   epsilon: 0.7671042341100781
> <class 'numpy.ndarray'> (64, 2) [-0.00609851 -0.0535203 ]
> <class 'numpy.ndarray'> (64, 2) [-0.55242181 -0.23970857]
> <class 'numpy.ndarray'> (64, 2) [-0.13475233 -0.10250361]
> <class 'numpy.ndarray'> (64, 2) [-0.03375101 -0.01350038]
> <class 'numpy.ndarray'> (64, 2) [-0.30376473 -0.12254754]
> <class 'numpy.ndarray'> (64, 2) [-0.87512898 -0.03757912]
> <class 'numpy.ndarray'> (64, 2) [-0.12970056 -0.1011776 ]
> <class 'numpy.ndarray'> (64, 2) [-0.02965415 -0.01752287]
> <class 'numpy.ndarray'> (64, 2) [-0.27507028 -0.25467476]
> <class 'numpy.ndarray'> (64, 2) [-0.64677876 -0.01864159]
> <class 'numpy.ndarray'> (64, 2) [-0.25182605 -0.23117758]
> <class 'numpy.ndarray'> (64, 2) [-0.54422623 -0.00382726]
> <class 'numpy.ndarray'> (64, 2) [-0.68942142 -0.13771404]
> <class 'numpy.ndarray'> (64, 2) [-0.23180565  0.00048129]
episode: 14   score: 13.0   memory length: 378   epsilon: 0.7564343028582378
> <cla

episode: 19   score: 34.0   memory length: 501   epsilon: 0.6688462043806328
> <class 'numpy.ndarray'> (64, 2) [-0.29613504 -0.29962942]
> <class 'numpy.ndarray'> (64, 2) [-0.26095766 -0.1331102 ]
> <class 'numpy.ndarray'> (64, 2) [-0.08284631 -0.10445548]
> <class 'numpy.ndarray'> (64, 2) [-0.30417693 -0.04581286]
> <class 'numpy.ndarray'> (64, 2) [-0.29613504 -0.29962942]
> <class 'numpy.ndarray'> (64, 2) [-0.38973191 -0.36877704]
> <class 'numpy.ndarray'> (64, 2) [-0.48814064 -0.01169476]
> <class 'numpy.ndarray'> (64, 2) [-0.80982804 -0.07479137]
> <class 'numpy.ndarray'> (64, 2) [-0.42118394 -0.39628464]
> <class 'numpy.ndarray'> (64, 2) [-0.00390231 -0.07089983]
> <class 'numpy.ndarray'> (64, 2) [-0.50068748 -0.2118908 ]
> <class 'numpy.ndarray'> (64, 2) [-0.06608663 -0.11021072]
> <class 'numpy.ndarray'> (64, 2) [-0.33965334 -0.31683379]
> <class 'numpy.ndarray'> (64, 2) [-0.30376473 -0.12254754]
> <class 'numpy.ndarray'> (64, 2) [-0.00443461 -0.08013958]
> <class 'numpy.ndarray

episode: 26   score: 28.0   memory length: 656   epsilon: 0.572765620160788
> <class 'numpy.ndarray'> (64, 2) [-0.19157043 -0.00772656]
> <class 'numpy.ndarray'> (64, 2) [-0.39278501 -0.32363129]
> <class 'numpy.ndarray'> (64, 2) [-0.24394913 -0.08663494]
> <class 'numpy.ndarray'> (64, 2) [-0.26095766 -0.1331102 ]
> <class 'numpy.ndarray'> (64, 2) [-0.65713429 -0.11792669]
> <class 'numpy.ndarray'> (64, 2) [-0.37999603 -0.02769923]
> <class 'numpy.ndarray'> (64, 2) [-0.2198216   0.06609661]
> <class 'numpy.ndarray'> (64, 2) [-0.48814064 -0.01169476]
> <class 'numpy.ndarray'> (64, 2) [-0.38179478 -0.02690203]
> <class 'numpy.ndarray'> (64, 2) [-0.0392509  -0.17923762]
> <class 'numpy.ndarray'> (64, 2) [-0.02946918 -0.0084115 ]
> <class 'numpy.ndarray'> (64, 2) [-0.00694976 -0.03165462]
> <class 'numpy.ndarray'> (64, 2) [-0.29078329 -0.13976462]
> <class 'numpy.ndarray'> (64, 2) [-0.30615115 -0.17143662]
> <class 'numpy.ndarray'> (64, 2) [-0.05262055 -0.10579325]
> <class 'numpy.ndarray'

episode: 29   score: 12.0   memory length: 799   epsilon: 0.4964114134310989
> <class 'numpy.ndarray'> (64, 2) [-0.41260269 -0.01493207]
> <class 'numpy.ndarray'> (64, 2) [-0.14352252 -0.00130854]
> <class 'numpy.ndarray'> (64, 2) [-0.12629208  0.00677864]
> <class 'numpy.ndarray'> (64, 2) [-0.41529483 -0.42496559]
> <class 'numpy.ndarray'> (64, 2) [ 0.00313212 -0.25739598]
> <class 'numpy.ndarray'> (64, 2) [-0.24726301 -0.2606377 ]
> <class 'numpy.ndarray'> (64, 2) [-0.39278501 -0.32363129]
> <class 'numpy.ndarray'> (64, 2) [-0.14423108 -0.00753479]
> <class 'numpy.ndarray'> (64, 2) [-0.18776885 -0.08917111]
> <class 'numpy.ndarray'> (64, 2) [-0.07596254 -0.03009541]
> <class 'numpy.ndarray'> (64, 2) [-0.08163265 -0.01115609]
> <class 'numpy.ndarray'> (64, 2) [-0.33977044  0.00044233]
> <class 'numpy.ndarray'> (64, 2) [-0.41260269 -0.01493207]
> <class 'numpy.ndarray'> (64, 2) [-0.45415598 -0.14167497]
> <class 'numpy.ndarray'> (64, 2) [-0.34894893 -0.05604704]
> <class 'numpy.ndarray

episode: 34   score: 41.0   memory length: 930   epsilon: 0.4354323744883354
> <class 'numpy.ndarray'> (64, 2) [-0.24572664 -0.04678334]
> <class 'numpy.ndarray'> (64, 2) [-0.16878112 -0.12650204]
> <class 'numpy.ndarray'> (64, 2) [-0.10414823 -0.02993218]
> <class 'numpy.ndarray'> (64, 2) [-0.04675858 -0.03154506]
> <class 'numpy.ndarray'> (64, 2) [-0.03535861 -0.08297149]
> <class 'numpy.ndarray'> (64, 2) [-0.29205137 -0.26192528]
> <class 'numpy.ndarray'> (64, 2) [-0.43621287 -0.11539668]
> <class 'numpy.ndarray'> (64, 2) [-0.15530103 -0.11542554]
> <class 'numpy.ndarray'> (64, 2) [-0.47707611  0.11760885]
> <class 'numpy.ndarray'> (64, 2) [-0.68820149 -0.69170612]
> <class 'numpy.ndarray'> (64, 2) [-1.029984   -0.06281167]
episode: 35   score: 10.0   memory length: 941   epsilon: 0.43066649544671043
> <class 'numpy.ndarray'> (64, 2) [-0.39040244 -0.27404371]
> <class 'numpy.ndarray'> (64, 2) [-0.08224269 -0.13308108]
> <class 'numpy.ndarray'> (64, 2) [-0.02046242 -0.0573948 ]
> <cl

> <class 'numpy.ndarray'> (64, 2) [-0.08820908 -0.12091024]
episode: 38   score: 65.0   memory length: 1065   epsilon: 0.3804184978064605
> <class 'numpy.ndarray'> (64, 2) [-0.181007    0.01236243]
> <class 'numpy.ndarray'> (64, 2) [-0.26624212 -0.2293354 ]
> <class 'numpy.ndarray'> (64, 2) [-0.41682783 -0.05466685]
> <class 'numpy.ndarray'> (64, 2) [-0.82471466 -0.10295838]
> <class 'numpy.ndarray'> (64, 2) [-0.27507028 -0.25467476]
> <class 'numpy.ndarray'> (64, 2) [-0.3721875  -0.00336412]
> <class 'numpy.ndarray'> (64, 2) [-0.14014058 -0.15604141]
> <class 'numpy.ndarray'> (64, 2) [-0.11284405 -0.10920191]
> <class 'numpy.ndarray'> (64, 2) [-0.46251208 -0.04162133]
> <class 'numpy.ndarray'> (64, 2) [-0.48296189  0.10237321]
> <class 'numpy.ndarray'> (64, 2) [-0.11284405 -0.10920191]
episode: 39   score: 10.0   memory length: 1076   epsilon: 0.37625475470427916
> <class 'numpy.ndarray'> (64, 2) [-0.66294825 -0.64426363]
> <class 'numpy.ndarray'> (64, 2) [-0.08224269 -0.13308108]
> <

episode: 45   score: 20.0   memory length: 1212   epsilon: 0.32838884448265515
> <class 'numpy.ndarray'> (64, 2) [-0.20388871 -0.20163764]
> <class 'numpy.ndarray'> (64, 2) [-0.53709739 -0.25538096]
> <class 'numpy.ndarray'> (64, 2) [-0.68011385 -0.67527395]
> <class 'numpy.ndarray'> (64, 2) [-0.24028979 -0.1376839 ]
> <class 'numpy.ndarray'> (64, 2) [-0.05479089 -0.09684867]
> <class 'numpy.ndarray'> (64, 2) [-0.08163265 -0.01115609]
> <class 'numpy.ndarray'> (64, 2) [-0.04145055 -0.02130626]
> <class 'numpy.ndarray'> (64, 2) [-0.20019978  0.01084492]
> <class 'numpy.ndarray'> (64, 2) [-0.26655129 -0.14186329]
> <class 'numpy.ndarray'> (64, 2) [-0.01003074 -0.02175818]
> <class 'numpy.ndarray'> (64, 2) [-0.13999197 -0.11738464]
> <class 'numpy.ndarray'> (64, 2) [-0.26825997 -0.11943964]
> <class 'numpy.ndarray'> (64, 2) [-0.26927614 -0.1464549 ]
> <class 'numpy.ndarray'> (64, 2) [-0.42765072 -0.03267232]
episode: 46   score: 13.0   memory length: 1226   epsilon: 0.32382116483926726
> 

episode: 52   score: 38.0   memory length: 1360   epsilon: 0.28319178633180314
> <class 'numpy.ndarray'> (64, 2) [-0.00508411 -0.02113461]
> <class 'numpy.ndarray'> (64, 2) [-0.11104892 -0.10869867]
> <class 'numpy.ndarray'> (64, 2) [-0.28538871  0.00766249]
> <class 'numpy.ndarray'> (64, 2) [-0.66295689 -0.62177205]
> <class 'numpy.ndarray'> (64, 2) [-0.02147486 -0.04764834]
> <class 'numpy.ndarray'> (64, 2) [-0.70686233 -0.08164579]
> <class 'numpy.ndarray'> (64, 2) [-0.07957481 -0.31125087]
> <class 'numpy.ndarray'> (64, 2) [-0.43773019 -0.38782406]
> <class 'numpy.ndarray'> (64, 2) [-0.17854519 -0.34283942]
> <class 'numpy.ndarray'> (64, 2) [-0.28980154 -0.00357166]
> <class 'numpy.ndarray'> (64, 2) [-0.55723846  0.05184141]
> <class 'numpy.ndarray'> (64, 2) [-0.15178704 -0.01914519]
> <class 'numpy.ndarray'> (64, 2) [-0.66356736 -0.66021353]
> <class 'numpy.ndarray'> (64, 2) [-0.33977044  0.00044233]
> <class 'numpy.ndarray'> (64, 2) [-0.43144834 -0.42900583]
> <class 'numpy.ndarr

episode: 60   score: 18.0   memory length: 1500   epsilon: 0.2461778670932771
> <class 'numpy.ndarray'> (64, 2) [-0.44758248 -0.06575736]
> <class 'numpy.ndarray'> (64, 2) [-0.29205137 -0.26192528]
> <class 'numpy.ndarray'> (64, 2) [-0.70628375 -0.09330818]
> <class 'numpy.ndarray'> (64, 2) [-0.37460086  0.04901165]
> <class 'numpy.ndarray'> (64, 2) [-0.48075187 -0.07044291]
> <class 'numpy.ndarray'> (64, 2) [-0.92931378 -0.11235711]
> <class 'numpy.ndarray'> (64, 2) [-0.12469496 -0.03312704]
> <class 'numpy.ndarray'> (64, 2) [-0.14014058 -0.15604141]
> <class 'numpy.ndarray'> (64, 2) [-0.01781795 -0.0075058 ]
> <class 'numpy.ndarray'> (64, 2) [-0.33269987  0.01757725]
> <class 'numpy.ndarray'> (64, 2) [-0.28467312 -0.26621732]
> <class 'numpy.ndarray'> (64, 2) [-0.11580379 -0.12518728]
> <class 'numpy.ndarray'> (64, 2) [-0.45869842 -0.32908583]
> <class 'numpy.ndarray'> (64, 2) [-0.65345132 -0.08119577]
> <class 'numpy.ndarray'> (64, 2) [-0.5209378   0.04804829]
> <class 'numpy.ndarra

episode: 67   score: 16.0   memory length: 1643   epsilon: 0.21336040198243916
> <class 'numpy.ndarray'> (64, 2) [-0.62157995 -0.61053151]
> <class 'numpy.ndarray'> (64, 2) [-0.0659564  -0.10637001]
> <class 'numpy.ndarray'> (64, 2) [-0.28470027  0.06050748]
> <class 'numpy.ndarray'> (64, 2) [-0.10264238 -0.12442006]
> <class 'numpy.ndarray'> (64, 2) [-0.71104866 -0.02038714]
> <class 'numpy.ndarray'> (64, 2) [-0.09805732 -0.01658773]
> <class 'numpy.ndarray'> (64, 2) [-0.08284631 -0.10445548]
> <class 'numpy.ndarray'> (64, 2) [-0.01398017 -0.0190443 ]
> <class 'numpy.ndarray'> (64, 2) [-0.1833806  -0.11698075]
> <class 'numpy.ndarray'> (64, 2) [-0.42757198 -0.01443285]
> <class 'numpy.ndarray'> (64, 2) [-0.5687393  -0.38519233]
> <class 'numpy.ndarray'> (64, 2) [-0.30850077 -0.25652856]
episode: 68   score: 11.0   memory length: 1655   epsilon: 0.21081411211133688
> <class 'numpy.ndarray'> (64, 2) [-0.00071134 -0.07672475]
> <class 'numpy.ndarray'> (64, 2) [-0.40199709 -0.39857629]
> 

episode: 73   score: 42.0   memory length: 1799   epsilon: 0.18252820552270246
> <class 'numpy.ndarray'> (64, 2) [-0.11696903 -0.00785576]
> <class 'numpy.ndarray'> (64, 2) [-0.68146229 -0.20195256]
> <class 'numpy.ndarray'> (64, 2) [ 0.00708109 -0.07665967]
> <class 'numpy.ndarray'> (64, 2) [-0.16729963 -0.15770449]
> <class 'numpy.ndarray'> (64, 2) [-0.19758984 -0.02705829]
> <class 'numpy.ndarray'> (64, 2) [-0.3180359  -0.31770217]
> <class 'numpy.ndarray'> (64, 2) [-0.45882893 -0.05391544]
> <class 'numpy.ndarray'> (64, 2) [-0.53659511 -0.06552409]
> <class 'numpy.ndarray'> (64, 2) [-0.18196252 -0.02863519]
> <class 'numpy.ndarray'> (64, 2) [-0.08409486 -0.05551432]
> <class 'numpy.ndarray'> (64, 2) [-0.2886664 -0.2945157]
> <class 'numpy.ndarray'> (64, 2) [-0.36248547 -0.00752853]
> <class 'numpy.ndarray'> (64, 2) [-0.2540223 -0.2545557]
> <class 'numpy.ndarray'> (64, 2) [-0.82039022 -0.81956339]
> <class 'numpy.ndarray'> (64, 2) [-0.13432778 -0.17566741]
> <class 'numpy.ndarray'>

episode: 80   score: 20.0   memory length: 1941   epsilon: 0.1583540999779007
> <class 'numpy.ndarray'> (64, 2) [-0.65713429 -0.11792669]
> <class 'numpy.ndarray'> (64, 2) [-0.02216223 -0.04219145]
> <class 'numpy.ndarray'> (64, 2) [-0.22972402 -0.21458213]
> <class 'numpy.ndarray'> (64, 2) [-0.01820485  0.00070727]
> <class 'numpy.ndarray'> (64, 2) [-0.22094274 -0.09468174]
> <class 'numpy.ndarray'> (64, 2) [-0.05259816 -0.04613775]
> <class 'numpy.ndarray'> (64, 2) [-0.39406472 -0.35602713]
> <class 'numpy.ndarray'> (64, 2) [-1.00021398 -0.97375286]
> <class 'numpy.ndarray'> (64, 2) [-0.81482482 -0.78286785]
> <class 'numpy.ndarray'> (64, 2) [-0.26461288 -0.15181892]
> <class 'numpy.ndarray'> (64, 2) [-0.31715703 -0.02740407]
> <class 'numpy.ndarray'> (64, 2) [-0.73722821 -0.01987419]
> <class 'numpy.ndarray'> (64, 2) [-0.32977667 -0.02831936]
> <class 'numpy.ndarray'> (64, 2) [-0.25190765 -0.07641135]
> <class 'numpy.ndarray'> (64, 2) [-0.37372574  0.01248026]
> <class 'numpy.ndarra

episode: 86   score: 31.0   memory length: 2000   epsilon: 0.13533526065815754
> <class 'numpy.ndarray'> (64, 2) [-0.02202149 -0.08982024]
> <class 'numpy.ndarray'> (64, 2) [-0.52477944  0.00366613]
> <class 'numpy.ndarray'> (64, 2) [-0.36852986 -0.2366825 ]
> <class 'numpy.ndarray'> (64, 2) [-0.1626502  -0.13906887]
> <class 'numpy.ndarray'> (64, 2) [-0.09283025 -0.0174095 ]
> <class 'numpy.ndarray'> (64, 2) [-0.11121976 -0.11801048]
> <class 'numpy.ndarray'> (64, 2) [-0.68820149 -0.69170612]
> <class 'numpy.ndarray'> (64, 2) [ 0.00108842 -0.03244177]
> <class 'numpy.ndarray'> (64, 2) [-0.47707611  0.11760885]
> <class 'numpy.ndarray'> (64, 2) [-0.34371218  0.10640579]
> <class 'numpy.ndarray'> (64, 2) [-0.52415234 -0.13997352]
> <class 'numpy.ndarray'> (64, 2) [-0.92543823 -0.28599179]
episode: 87   score: 11.0   memory length: 2000   epsilon: 0.13372013995058965
> <class 'numpy.ndarray'> (64, 2) [-0.4107472   0.11816983]
> <class 'numpy.ndarray'> (64, 2) [-0.69610345 -0.04260272]
> 

episode: 93   score: 48.0   memory length: 2000   epsilon: 0.11485529795811826
> <class 'numpy.ndarray'> (64, 2) [-0.50279945 -0.04339236]
> <class 'numpy.ndarray'> (64, 2) [-0.01595274 -0.06423647]
> <class 'numpy.ndarray'> (64, 2) [-0.3962422   0.05717802]
> <class 'numpy.ndarray'> (64, 2) [-0.733136   -0.13307965]
> <class 'numpy.ndarray'> (64, 2) [-0.15017876 -0.01804912]
> <class 'numpy.ndarray'> (64, 2) [-0.1422476  -0.03024583]
> <class 'numpy.ndarray'> (64, 2) [-0.10133602  0.00468636]
> <class 'numpy.ndarray'> (64, 2) [-0.04732723 -0.02899633]
> <class 'numpy.ndarray'> (64, 2) [-0.16843508 -0.1264094 ]
> <class 'numpy.ndarray'> (64, 2) [-0.85049385 -0.87870783]
> <class 'numpy.ndarray'> (64, 2) [-0.01165056 -0.04360518]
> <class 'numpy.ndarray'> (64, 2) [-0.31667697 -0.22045681]
> <class 'numpy.ndarray'> (64, 2) [-0.32543111 -0.023792  ]
> <class 'numpy.ndarray'> (64, 2) [-0.05196225 -0.02683003]
> <class 'numpy.ndarray'> (64, 2) [-0.18455642  0.00090174]
> <class 'numpy.ndarr

episode: 98   score: 15.0   memory length: 2000   epsilon: 0.09974356180769112
> <class 'numpy.ndarray'> (64, 2) [-0.80164564 -0.75399774]
> <class 'numpy.ndarray'> (64, 2) [-0.01417869  0.00235639]
> <class 'numpy.ndarray'> (64, 2) [-0.15566836 -0.16755591]
> <class 'numpy.ndarray'> (64, 2) [-0.19174661 -0.15049939]
> <class 'numpy.ndarray'> (64, 2) [-0.03595595 -0.13637789]
> <class 'numpy.ndarray'> (64, 2) [-0.55344498 -0.06521487]
> <class 'numpy.ndarray'> (64, 2) [-0.00935244 -0.0393817 ]
> <class 'numpy.ndarray'> (64, 2) [-0.19773762 -0.00512096]
> <class 'numpy.ndarray'> (64, 2) [-0.11616556 -0.11351745]
> <class 'numpy.ndarray'> (64, 2) [-0.4232741  -0.07507834]
> <class 'numpy.ndarray'> (64, 2) [-0.33333558  0.01811996]
> <class 'numpy.ndarray'> (64, 2) [-0.42589206 -0.37436759]
> <class 'numpy.ndarray'> (64, 2) [-0.27651113 -0.04581176]
> <class 'numpy.ndarray'> (64, 2) [-0.07233669 -0.21907353]
> <class 'numpy.ndarray'> (64, 2) [-0.3079024   0.09622695]
> <class 'numpy.ndarr