## Game2: 4 $\times$ 4 그리드

## imports

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython

## 예비학습: 시각화 

In [2]:
def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr',alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)  
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    def update(t):
        sc.set_offsets(states[t])
    ani = FuncAnimation(fig,update,frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))

In [3]:
states = [[0,0],[0,1],[1,1],[1,2],[1,3]]

In [4]:
show(states)

## Env 클래스 구현

`-` GridWorld: 강화학습에서 많이 사용되는 기본적인 시뮬레이션 환경

1. **State**: 각 격자 셀이 하나의 상태이며, 에이전트는 이러한 상태 중 하나에 있을 수 있음. 
2. **Action**: 에이전트는 상태에서 다른 상태로 이동하기 위해 상, 하, 좌, 우로 이동하는 행동을 할 수 있음. 
3. **Reward**: 에이전트가 특정 행동을 취할 때 환경에서 보상이 주어짐. 
4. **Terminal State**: 일반적으로 하나 또는 그 이상의 종료 상태가 있으며, 에이전트가 이 상태에 도달하면 에피소드가 종료됨. 

`-` 환경과 에이전트 

- env: $(S_t,A_t) \to (S_{t+1}, R_t)$

`-` 수학기호들 

- `state_space`: ${\cal S}=\{1,\dots,16\}=\{(0,0)\dots,(3,3)\}$
- `action_space`: ${\cal A} = \{0,1,2,3\} = \{\text{right}, \text{up}, \text{left}, \text{down}\}$
- `current_state`: $S_t \in {\cal S}$
- `next_state`: $S_{t+1} \in {\cal S}$
- `action`: $A_t \in {\cal A}$
- `reward`: $R_t \in {\cal R}=\{-1,-10,100\}$

In [5]:
class GridWorld:
    def __init__(self):
        self.reset()
        self._action_to_direction = {
            0: np.array([1, 0]), # x+ 
            1: np.array([0, 1]), # y+
            2: np.array([-1, 0]), # x-  
            3: np.array([0, -1]), # y-
        }
        self.state_space = gym.spaces.MultiDiscrete([4, 4])
        self.action_space = gym.spaces.Discrete(4)
    def reset(self):
        self.agent_action = None
        self.agent_state = np.array([0, 0])
        return self.agent_state
    def step(self,action):
        direction = self._action_to_direction[action]
        self.agent_state = self.agent_state + direction
        # 목표지점에 도달 
        if np.array_equal(np.array([3,3]), self.agent_state):
            reward = 100
            terminated = True
        else:
            reward = -1
            terminated = False
        # 4*4밖에 있을 경우 
        if self.agent_state not in self.state_space:
            reward = -10
            terminated = True
            self.agent_state = self.agent_state - 1/2*direction
        return self.agent_state, reward, terminated

In [6]:
env = GridWorld()
env.reset()
states = []
rewards = [] 
terminations = [] 
for t in range(500):
    action = env.action_space.sample()
    state, reward, terminated = env.step(action)
    states.append(state)
    rewards.append(reward)
    terminations.append(terminated)
    if terminated: 
        break 

In [7]:
[np.array([0,0])]+states

[array([0, 0]),
 array([1, 0]),
 array([1, 1]),
 array([2, 1]),
 array([3, 1]),
 array([3.5, 1. ])]

In [8]:
show([np.array([0,0])]+states)

## Agent1 클래스 구현 + Run

`-` 첫번째 시도

In [9]:
# learn 추가
class Agent1:
    def __init__(self,env):
        self.action_space = env.action_space
        self.state_space = env.state_space
        self.n_experiences = 0
        self.n_episode = 0  
        
        ## episode-wise info 
        self.scores = [] 
        self.playtimes = [] 

        ## time-wise info
        self.current_state = None 
        self.action = None
        self.reward = None        
        self.next_state = None 
        self.socre = 0
        
        ## ReplayBuffer
        self.actions = []
        self.rewards = []
        self.current_states = []
        self.next_states = [] 
        self.terminations = []
        
    def act(self):
        self.action = self.action_space.sample()
        
    def save_experience(self): 
        self.actions.append(self.action)
        self.current_states.append(self.current_state)
        self.next_states.append(self.next_state)
        self.rewards.append(self.reward)
        self.terminations.append(self.terminated)
        self.n_experiences += 1
        self.score += self.reward

    def learn(self):
        pass

In [10]:
env = GridWorld()
agent = Agent1(env)
for _ in range(20):
    ### 1. 본질적인 코드
    agent.current_state = env.reset() 
    agent.terminated = False
    agent.score = 0 
    for t in range(50):
        # step1: agent >> env 
        agent.act()
        env.agent_action = agent.action
        # step2: env << agent 
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: 학습
        agent.learn()
        # step4: state update 
        agent.current_state = agent.next_state 
        # step5: 종료조건 체크
        if agent.terminated: break 
    agent.scores.append(agent.score) 
    agent.playtimes.append(t+1)
    agent.n_episode = agent.n_episode + 1 
    ## 2. 비본질적 코드
    print(
        f'Episode {agent.n_episode}\t'
        f'Score: {agent.scores[-1]}\t'
        f'Playtime: {agent.playtimes[-1]}'
    )

Episode 1	Score: -13	Playtime: 4
Episode 2	Score: -17	Playtime: 8
Episode 3	Score: -10	Playtime: 1
Episode 4	Score: -10	Playtime: 1
Episode 5	Score: -11	Playtime: 2
Episode 6	Score: -25	Playtime: 16
Episode 7	Score: -10	Playtime: 1
Episode 8	Score: -14	Playtime: 5
Episode 9	Score: -11	Playtime: 2
Episode 10	Score: 85	Playtime: 16
Episode 11	Score: -12	Playtime: 3
Episode 12	Score: -10	Playtime: 1
Episode 13	Score: -10	Playtime: 1
Episode 14	Score: -10	Playtime: 1
Episode 15	Score: -10	Playtime: 1
Episode 16	Score: 95	Playtime: 6
Episode 17	Score: -10	Playtime: 1
Episode 18	Score: -10	Playtime: 1
Episode 19	Score: -10	Playtime: 1
Episode 20	Score: -11	Playtime: 2


`-` 어떻게 학습을 할까? 즉 어떻게 "환경의 이해 $\to$ 행동의 결정" 의 과정을 수행할까? 

1. 어떠한 상태에서, 어떠한 행동을 했을때, 어떠한 보상과 어떠한 다음상태를 받았는지 기록하자.
2. 1을 바탕으로 다음행동을 어떻게 할지 판단하자. 

In [11]:
states = [np.array([0,0])]+agent.next_states[-5:-1]
states

[array([0, 0]),
 array([-0.5,  0. ]),
 array([-0.5,  0. ]),
 array([-0.5,  0. ]),
 array([1, 0])]

In [12]:
show(states)

## 환경의 이해 (1차원적 이해)

`-` 무작위로 10000판을 진행하여 보자. 

In [13]:
env = GridWorld()
agent = Agent1(env)
for _ in range(10000):
    ### 1. 본질적인 코드
    agent.current_state = env.reset() 
    agent.terminated = False
    agent.score = 0 
    for t in range(50):
        # step1: agent >> env 
        agent.act()
        env.agent_action = agent.action
        # step2: agent << env
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: 학습
        agent.learn()
        # step4: state update 
        agent.current_state = agent.next_state 
        # step5: 종료조건 체크
        if agent.terminated: break 
    agent.scores.append(agent.score) 
    agent.playtimes.append(t+1)
    agent.n_episode = agent.n_episode + 1 

In [14]:
agent.n_experiences

33576

`-` 데이터관찰

In [15]:
agent.current_states[0], agent.actions[0], agent.rewards[0]

(array([0, 0]), 2, -10)

In [16]:
agent.current_states[1], agent.actions[1], agent.rewards[1]

(array([0, 0]), 3, -10)

In [17]:
agent.current_states[2], agent.actions[2], agent.rewards[2]

(array([0, 0]), 0, -1)

In [18]:
agent.current_states[3], agent.actions[3], agent.rewards[3]

(array([1, 0]), 3, -10)

`-` 환경을 이해하기 위한 기록 (1)

In [19]:
q = np.zeros([4,4,4])
count = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    a = agent.actions[i]
    q[x,y,a] = q[x,y,a] + agent.rewards[i]
    count[x,y,a] = count[x,y,a] + 1 

In [20]:
count[count==0] = 0.1
count

array([[[3.057e+03, 3.033e+03, 3.037e+03, 2.978e+03],
        [1.074e+03, 1.009e+03, 9.530e+02, 1.074e+03],
        [3.890e+02, 3.790e+02, 3.730e+02, 3.750e+02],
        [1.340e+02, 1.240e+02, 1.220e+02, 1.340e+02]],

       [[1.101e+03, 1.047e+03, 1.031e+03, 1.030e+03],
        [7.060e+02, 7.230e+02, 7.020e+02, 7.410e+02],
        [3.740e+02, 3.740e+02, 3.730e+02, 3.730e+02],
        [1.480e+02, 1.610e+02, 1.350e+02, 1.530e+02]],

       [[4.030e+02, 4.040e+02, 4.110e+02, 4.330e+02],
        [3.630e+02, 4.070e+02, 3.780e+02, 4.060e+02],
        [2.250e+02, 2.530e+02, 2.290e+02, 2.740e+02],
        [9.600e+01, 1.130e+02, 8.900e+01, 1.030e+02]],

       [[1.160e+02, 1.620e+02, 1.440e+02, 1.360e+02],
        [1.260e+02, 1.690e+02, 1.700e+02, 1.550e+02],
        [1.060e+02, 9.600e+01, 9.700e+01, 9.500e+01],
        [1.000e-01, 1.000e-01, 1.000e-01, 1.000e-01]]])

In [21]:
q = (q/count).round(2)
q

array([[[ -1.,  -1., -10., -10.],
        [ -1.,  -1., -10.,  -1.],
        [ -1.,  -1., -10.,  -1.],
        [ -1., -10., -10.,  -1.]],

       [[ -1.,  -1.,  -1., -10.],
        [ -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,  -1.,  -1.],
        [ -1., -10.,  -1.,  -1.]],

       [[ -1.,  -1.,  -1., -10.],
        [ -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,  -1.,  -1.],
        [100., -10.,  -1.,  -1.]],

       [[-10.,  -1.,  -1., -10.],
        [-10.,  -1.,  -1.,  -1.],
        [-10., 100.,  -1.,  -1.],
        [  0.,   0.,   0.,   0.]]])

In [22]:
for i in range(4):
    print(f"action = {i}\n"
          f"action-value function =\n {q[:,:,i]}\n"
          )

action = 0
action-value function =
 [[ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1. 100.]
 [-10. -10. -10.   0.]]

action = 1
action-value function =
 [[ -1.  -1.  -1. -10.]
 [ -1.  -1.  -1. -10.]
 [ -1.  -1.  -1. -10.]
 [ -1.  -1. 100.   0.]]

action = 2
action-value function =
 [[-10. -10. -10. -10.]
 [ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.   0.]]

action = 3
action-value function =
 [[-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.   0.]]



`-` 환경을 이해하기 위한 기록 (2) -- 이렇게하면 count를 따로 기록할 필요 없음 

In [23]:
q = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    a = agent.actions[i]
    q_estimated = q[x,y,a] # 풀이한 답
    q_observed = agent.rewards[i] # 실제 답
    diff = q_observed - q_estimated # 실제답과 풀이한값의 차이 = 오차피드백값이라고 하자
    q[x,y,a] = q_estimated + 0.05 * diff ## 새로운답 = 원래답 + 오차피드백 * 학습률

In [24]:
for i in range(4):
    print(
        f"action = {i}\n"
        f"action-value function =\n {q[:,:,i].round(1)}\n"
    )

action = 0
action-value function =
 [[ -1.   -1.   -1.   -1. ]
 [ -1.   -1.   -1.   -1. ]
 [ -1.   -1.   -1.   99.3]
 [-10.  -10.  -10.    0. ]]

action = 1
action-value function =
 [[ -1.   -1.   -1.  -10. ]
 [ -1.   -1.   -1.  -10. ]
 [ -1.   -1.   -1.  -10. ]
 [ -1.   -1.   99.3   0. ]]

action = 2
action-value function =
 [[-10. -10. -10. -10.]
 [ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.  -1.]
 [ -1.  -1.  -1.   0.]]

action = 3
action-value function =
 [[-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.  -1.]
 [-10.  -1.  -1.   0.]]



## 환경의 깊은 이해 (좀 더 고차원적인 이해)

`-` action=1 일때 각 state의 가치 (=기대보상)

In [25]:
q[:,:,1]

array([[-1.        , -1.        , -1.        , -9.9827127 ],
       [-1.        , -1.        , -1.        , -9.9974088 ],
       [-1.        , -1.        , -0.99999769, -9.96960743],
       [-0.99975384, -0.99982809, 99.27311433,  0.        ]])

`-` 분석1

In [26]:
q[3,2,1] 

99.27311432903872

- 상태 (3,2)에서 행동 1을 하게 되면 100의 보상을 얻으므로 기대보상값은 100근처 -> 합리적임

`-` 분석2 

In [27]:
q[3,1,1] 

-0.9998280946892258

- 상태 (3,1)에서 행동1을 하게 되면 -1의 보상을 얻으므로 기대보상값은 -1 근처 -> 합리적일까? 

`-` 비판: 분석2는 합리적인듯 하지만 data를 분석한뒤는 그다지 합리적이지 못함 

`-` 상황상상 

- 빈 종이를 줌 
- 빈 종이에는 0 또는 1을 쓸 수 있음 (action = 0 or 1)
- 0을 쓸 때와 1을 쓸 때의 보상은 다름
- 그런데 무수히 많은 데이터를 분석한 결과 0을 쓰면 0원을 보상으로 주고, 1을 쓰면 10만원을 보상으로 준다는 것을 "알게 되었음"
- 빈 종이의 가치는 5만원인가? 아니면 10만원인가? --> 10만원 아니야? 

`-` 직관: 생각해보니 `q[3,1,1]`에서는 실제보상(-1)과 잠재적보상(100)을 동시에 고려해야하는게 합리적인듯

In [28]:
q[3,1,1] = (-1) + 0.99 * (100) 

In [29]:
q[:,:,1]

array([[-1.        , -1.        , -1.        , -9.9827127 ],
       [-1.        , -1.        , -1.        , -9.9974088 ],
       [-1.        , -1.        , -0.99999769, -9.96960743],
       [-0.99975384, 98.        , 99.27311433,  0.        ]])

- 여기에서 0.99 는 "미래에 받을 보상이 현재에 비해 얼마나 중요한지를 결정하는 가중치" 이다. 
- 1에 가까울수록 미래에 받을 보상을 매우 중시한다는 의미 (즉 빈종이 = 십만원 으로 생각한다는 의미)

`-` 수식화: `q[3,1,1] = (-1) + 0.99 * (100)`를 수식화하면 아래와 같다. 

$$q(s,a) = r(s,a) + 0.99\times \max_{a} q(s',a)$$

좀 더 정확하게는 아래와 같이 볼 수 있다. 

$$q(s,a)= \begin{cases} r(s,a) & \text{terminated}  \\ r(s,a)+ 0.99\times \max_{a} q(s',a) & \text{not terminated} \end{cases} $$

In [30]:
q = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    xx,yy = agent.next_states[i]
    a = agent.actions[i]
    q_estimated = q[x,y,a]
    if agent.terminations[i]:
        q_observed = agent.rewards[i] 
    else:
        q_observed = agent.rewards[i] + 0.99*(q[xx,yy,:].max()) # 이걸 관측했다고 치는거임
    diff = q_observed - q_estimated 
    q[x,y,a] = q_estimated + 0.1 * diff

In [31]:
for i in range(4):
    print(
        f"action = {i}\n"
        f"action-value function =\n {q[:,:,i].round(1)}\n"
    )

action = 0
action-value function =
 [[ 90.2  92.1  94.   96. ]
 [ 92.1  94.1  96.   98. ]
 [ 94.   96.   98.  100. ]
 [-10.  -10.  -10.    0. ]]

action = 1
action-value function =
 [[ 90.2  92.1  94.  -10. ]
 [ 92.1  94.1  96.  -10. ]
 [ 94.   96.   98.  -10. ]
 [ 96.   98.  100.    0. ]]

action = 2
action-value function =
 [[-10.  -10.  -10.  -10. ]
 [ 88.3  90.2  92.1  93.9]
 [ 90.2  92.1  94.   95.9]
 [ 92.1  94.   96.    0. ]]

action = 3
action-value function =
 [[-10.   88.3  90.2  92.1]
 [-10.   90.2  92.1  94. ]
 [-10.   92.1  94.   96. ]
 [-10.   94.   96.    0. ]]



## 행동 전략 수립

`-` 상태 (0,0)에 있다고 가정해보자. 

In [32]:
q[0,0,:]

array([ 90.18834204,  90.18825943, -10.        , -10.        ])

- 행동 0 혹은 1을 하는게 유리함. 

`-` 상태 (2,3)에 있다고 가정해보자. 

In [33]:
q[2,3,:]

array([99.99595162, -9.99993248, 95.88410271, 95.9882404 ])

- 행동 0을 하는게 유리함.

`-` 상태 (3,2)에 있다고 가정해보자. 

In [34]:
q[3,2,:] 

array([-9.99985884, 99.99595162, 95.9635707 , 95.96574116])

- 행동 1을 하는게 유리함.

`-` 각 상태에서 최적은 action은 아래와 같다. 

In [35]:
q[0,0,:].argmax()

0

In [36]:
q[2,3,:].argmax()

0

In [37]:
q[3,2,:].argmax()

1

`-` 전략(=정책)을 정리해보자.  

In [38]:
policy = np.array(['?????']*16).reshape(4,4)
policy

array([['?????', '?????', '?????', '?????'],
       ['?????', '?????', '?????', '?????'],
       ['?????', '?????', '?????', '?????'],
       ['?????', '?????', '?????', '?????']], dtype='<U5')

In [39]:
directions = {0: 'down', 1: 'right', 2:'up', 3:'left'}

In [40]:
for i in range(4):
    for j in range(4):
        policy[i,j] = directions[q[i,j,:].argmax()]
policy

array([['down', 'down', 'down', 'down'],
       ['right', 'right', 'down', 'down'],
       ['right', 'right', 'down', 'down'],
       ['right', 'right', 'right', 'down']], dtype='<U5')

`-` 요약: 값이 큰 쪽으로 이동 

In [41]:
q.max(axis=-1)

array([[90.18834204, 92.10965337, 94.0491205 , 95.97470696],
       [92.109638  , 94.05087516, 96.01217929, 97.98721238],
       [94.04994531, 96.01140927, 97.99325348, 99.99595162],
       [95.99729911, 97.99021607, 99.99595162,  0.        ]])

## Agent2 클래스 구현 + Run

In [42]:
# learn 추가
class Agent2(Agent1):
    def __init__(self,env):
        super().__init__(env)
        self.q = np.zeros([4,4,4])
    def act(self):
        if self.n_experiences < 3000: 
            self.action = self.action_space.sample()
        else: 
            x,y = self.current_state
            self.action = self.q[x,y,:].argmax()
    def learn(self): # make q
        x,y = agent.current_state
        xx,yy = agent.next_state
        a = agent.action
        q_estimated = self.q[x,y,a]
        if agent.terminated:
            q_observed = agent.reward
        else:
            q_observed = agent.reward + 0.99*(q[xx,yy,:].max()) 
        # q_observed 와 q_estimated를 점점 비슷하게 만들어주는 역할
        diff = q_observed - q_estimated
        self.q[x,y,a] = q_estimated + 0.1 * diff 

In [43]:
env = GridWorld()
agent = Agent2(env)
for _ in range(2000):
    ### 1. 본질적인 코드
    agent.current_state = env.reset() 
    agent.terminated = False
    agent.score = 0 
    for t in range(50):
        # step1: agent >> env 
        agent.act()
        env.agent_action = agent.action
        # step2: env << agent 
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        # step3: 데이터저장 및 학습
        agent.learn()        
        # step4: 다음 iteration 준비 + 종료조건체크
        agent.current_state = agent.next_state 
        if agent.terminated: break 
    agent.scores.append(agent.score) 
    agent.playtimes.append(t+1)
    agent.n_episode = agent.n_episode + 1 
    ## 2. 비본질적 코드
    if (agent.n_episode % 100) == 0:
        print(
            f'Episode {agent.n_episode}\t'
            f'Score: {np.mean(agent.scores[-100:]) : .2f}\t'
            f'Playtime: {np.mean(agent.playtimes[-100:]) : .2f}\t'
            f'n_experiences: {agent.n_experiences}'
        )

Episode 100	Score:  0.00	Playtime:  2.99	n_experiences: 0
Episode 200	Score:  0.00	Playtime:  2.84	n_experiences: 0
Episode 300	Score:  0.00	Playtime:  3.14	n_experiences: 0
Episode 400	Score:  0.00	Playtime:  3.42	n_experiences: 0
Episode 500	Score:  0.00	Playtime:  3.17	n_experiences: 0
Episode 600	Score:  0.00	Playtime:  3.28	n_experiences: 0
Episode 700	Score:  0.00	Playtime:  3.34	n_experiences: 0
Episode 800	Score:  0.00	Playtime:  3.43	n_experiences: 0
Episode 900	Score:  0.00	Playtime:  3.16	n_experiences: 0
Episode 1000	Score:  0.00	Playtime:  3.30	n_experiences: 0
Episode 1100	Score:  0.00	Playtime:  3.31	n_experiences: 0
Episode 1200	Score:  0.00	Playtime:  3.38	n_experiences: 0
Episode 1300	Score:  0.00	Playtime:  2.87	n_experiences: 0
Episode 1400	Score:  0.00	Playtime:  3.28	n_experiences: 0
Episode 1500	Score:  0.00	Playtime:  3.54	n_experiences: 0
Episode 1600	Score:  0.00	Playtime:  3.48	n_experiences: 0
Episode 1700	Score:  0.00	Playtime:  3.24	n_experiences: 0
Episod

In [44]:
states = [np.array([0,0])] + agent.next_states[-agent.playtimes[-1]:]
show(states)

## Agent3 클래스 구현 + Run

In [45]:
class Agent3(Agent2):
    def __init__(self,env):
        super().__init__(env)
        self.eps = 1
    def act(self):
        if self.eps > np.random.rand():
            self.action = self.action_space.sample()
        else: 
            x,y = self.current_state
            self.action = self.q[x,y,:].argmax()

In [46]:
env = GridWorld()
agent = Agent3(env)
for _ in range(5000):
    ### 1. 본질적인 코드
    agent.current_state = env.reset() 
    agent.terminated = False
    agent.score = 0 
    for t in range(50):
        # step1: agent >> env 
        agent.act()
        env.agent_action = agent.action
        # step2: env << agent 
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: 데이터저장 및 학습
        agent.learn()        
        # step4: 다음 iteration 준비 + 종료조건체크
        agent.current_state = agent.next_state 
        if agent.terminated: break 
    agent.scores.append(agent.score) 
    agent.playtimes.append(t+1)
    agent.n_episode = agent.n_episode + 1 
    agent.eps = agent.eps * 0.999
    ## 2. 비본질적 코드
    if (agent.n_episode % 1000) == 0:
        print(
            f'Episode {agent.n_episode}\t'
            f'Score: {np.mean(agent.scores[-100:]) : .2f}\t'
            f'Playtime: {np.mean(agent.playtimes[-100:]) : .2f}\t'
            f'n_experiences: {agent.eps}'
        )

Episode 1000	Score:  22.74	Playtime:  6.76	n_experiences: 0.3676954247709635
Episode 2000	Score:  75.07	Playtime:  6.13	n_experiences: 0.1351999253974994
Episode 3000	Score:  92.63	Playtime:  6.17	n_experiences: 0.04971239399803625
Episode 4000	Score:  90.73	Playtime:  5.87	n_experiences: 0.018279019827489446
Episode 5000	Score:  93.93	Playtime:  5.97	n_experiences: 0.006721111959865607


In [47]:
states = [np.array([0,0])] + agent.next_states[-agent.playtimes[-1]:]
show(states)

## 최종 Agent 클래스 구현 + Run

In [48]:
# learn 추가
class Agent(Agent3):
    # agent >> env 
    def __rshift__(self,env):
        self.act()
        env.agent_action = self.action
    # agent << env 
    def __lshift__(self,env): 
        self.next_state, self.reward, self.terminated = env.step(env.agent_action)
        self.save_experience()

In [51]:
env = GridWorld()
agent = Agent(env)
for _ in range(5000):
    ### 1. 본질적인 코드
    agent.current_state = env.reset() 
    agent.terminated = False
    agent.score = 0 
    for t in range(50):
        ## step1 
        agent >> env 
        ## step2 
        agent << env 
        ## step3 
        agent.learn()    
        ## step4 
        agent.current_state = agent.next_state 
        ## step5 
        if agent.terminated: break 
    agent.scores.append(agent.score) 
    agent.playtimes.append(t+1)
    agent.n_episode = agent.n_episode + 1 
    agent.eps = agent.eps*0.999
    ## 2. 비본질적 코드
    if (agent.n_episode % 1000) == 0:
        print(
            f'Episode {agent.n_episode}\t'
            f'Score: {np.mean(agent.scores[-100:]) : .2f}\t'
            f'Playtime: {np.mean(agent.playtimes[-100:]) : .2f}\t'
            f'n_eps: {agent.eps}'
        )

Episode 1000	Score:  16.06	Playtime:  5.74	n_eps: 0.3676954247709635
Episode 2000	Score:  76.00	Playtime:  6.30	n_eps: 0.1351999253974994
Episode 3000	Score:  89.47	Playtime:  6.03	n_eps: 0.04971239399803625
Episode 4000	Score:  94.86	Playtime:  6.14	n_eps: 0.018279019827489446
Episode 5000	Score:  95.00	Playtime:  6.00	n_eps: 0.006721111959865607


In [52]:
states = [np.array([0,0])] + agent.next_states[-agent.playtimes[-1]:]
show(states)