# Chapter 10 주어진 환경과 상호작용하며 학습하는 DQN

## 10.2 카트폴 게임 마스터하기

In [1]:
# 필수 모듈 import
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt

* 에피소드 : 총 플레이할 게임 수
* 엡실론   : 에이전트가 무작위로 행동할 확률. 에이전트가 가능한 모든 행동을 경험하도록 하기 위함.
* 엡실론 감소율 : EPS_START -> EPS_END 까지 점진적으로 감소시키는 값
* 감마     : 에이전트가 현재 보상을 미래 보상보다 얼마나 가치 있게 여기는지에 대한 값

In [2]:
# Hyperparameters

# Training schedule
EPISODES = 50     # number of episodes (games) to play
BATCH_SIZE = 64   # number of transitions sampled per learning step
LR = 1e-3         # optimizer learning rate

# Epsilon-greedy exploration: epsilon decays exponentially from EPS_START
# towards EPS_END as the agent takes more steps.
EPS_START = 0.9   # probability of a random action at the start of training
EPS_END = 0.05    # probability of a random action late in training
EPS_DECAY = 200   # decay scale, in steps; larger means slower decay

# Discount factor applied to the estimated value of the next state
GAMMA = 0.8

In [3]:
# DQN agent
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and replay memory.

    The Q-function is a small fully connected network that maps a state
    vector to one Q-value per action. Training hyperparameters (``LR``,
    ``EPS_START``, ``EPS_END``, ``EPS_DECAY``, ``GAMMA``, ``BATCH_SIZE``)
    are read from the module-level constants of the same names.

    Args:
        state_size: Length of the state vector fed to the network.
        action_size: Number of discrete actions (network outputs).
        hidden_size: Width of the single hidden layer.
        memory_size: Maximum number of transitions kept in replay memory;
            the oldest transitions are dropped first.

    The defaults reproduce the sizes that were previously hard-coded.
    """

    def __init__(self, state_size: int = 4, action_size: int = 2,
                 hidden_size: int = 256, memory_size: int = 10000) -> None:
        self.state_size = state_size
        self.action_size = action_size

        self.model = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size)
        )

        self.optimizer = optim.Adam(self.model.parameters(), LR)
        # Number of calls to act(); drives the epsilon decay schedule.
        self.steps_done = 0

        # Replay memory of past experience:
        # [(state, action, reward, next_state), ...]
        self.memory = deque(maxlen=memory_size)

    def memorize(self, state, action, reward, next_state) -> None:
        """Store one transition in the replay memory.

        Args:
            state: State tensor with a leading batch dimension of 1, so that
                stored states can be concatenated in ``learn``.
            action: Action tensor of shape (1, 1), as returned by ``act``.
            reward: Scalar reward for the transition.
            next_state: Raw (unbatched) next observation, e.g. a sequence or
                array of floats; it is converted to a float tensor with a
                leading batch dimension here.
        """
        # Convert the observation directly instead of wrapping it in a Python
        # list first: building a tensor from a list of arrays is very slow.
        next_state_t = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
        self.memory.append((state,
                            action,
                            torch.tensor([reward], dtype=torch.float32),
                            next_state_t))

    def act(self, state: torch.Tensor) -> torch.Tensor:
        """Choose an action with the epsilon-greedy algorithm.

        Epsilon decays exponentially from ``EPS_START`` to ``EPS_END`` as
        ``steps_done`` grows. With probability ``1 - epsilon`` the action
        with the highest predicted Q-value is chosen, otherwise a uniformly
        random one.

        Args:
            state: State tensor of shape (1, state_size).

        Returns:
            A LongTensor of shape (1, 1) holding the chosen action index.
        """
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                        math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if random.random() > eps_threshold:
            # No gradients are needed for action selection.
            with torch.no_grad():
                return self.model(state).max(1)[1].view(1, 1)
        return torch.LongTensor([[random.randrange(self.action_size)]])

    def learn(self) -> None:
        """Run one optimisation step on a random batch from replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        The network is regressed (MSE) towards the one-step TD target
        ``reward + GAMMA * max_a Q(next_state, a)``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states = zip(*batch)

        states = torch.cat(states)
        actions = torch.cat(actions)
        rewards = torch.cat(rewards)
        next_states = torch.cat(next_states)

        # Q-value of the action that was actually taken in each state.
        current_q = self.model(states).gather(1, actions)

        # The target is treated as a constant, so no graph is built for it.
        # NOTE: the target bootstraps from next_state for every transition;
        # no terminal flag is stored, so episode ends are only signalled
        # through the reward value supplied by the caller.
        with torch.no_grad():
            max_next_q = self.model(next_states).max(1)[0]
        expected_q = rewards + (GAMMA * max_next_q)

        # squeeze(1) removes only the action column, so the shapes still
        # match when the batch holds a single transition.
        loss = F.mse_loss(current_q.squeeze(1), expected_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [4]:
# Training
# Draw every frame while training. Set to False to skip rendering, e.g. on a
# machine without a display.
RENDER = True

env = gym.make('CartPole-v0')

agent = DQNAgent()
score_history = []  # number of steps survived in each episode

# This loop uses the classic gym API: reset() returns the observation and
# step() returns (observation, reward, done, info).
try:
    for e in range(1, EPISODES + 1):
        state = env.reset()
        steps = 0

        while True:
            if RENDER:
                env.render()
            # Convert the raw observation to a (1, state_size) float tensor.
            # Converting it directly avoids the slow list-of-arrays path.
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action.item())

            # Penalise the step that ends the episode.
            if done:
                reward = -1

            agent.memorize(state, action, reward, next_state)
            agent.learn()

            state = next_state
            steps += 1

            if done:
                print(f"에피소드: {e} 점수: {steps}")
                score_history.append(steps)
                break
finally:
    # Always release the environment (and any render window), even if
    # training is interrupted or fails.
    env.close()

plt.plot(score_history)
plt.ylabel('score')
plt.show()

  f"The environment {id} is out of date. You should consider "
  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  "You are calling render method, "


error: ignored