<a href="https://colab.research.google.com/github/merucode/RL/blob/01-Study-Book-%EB%B0%94%EB%8B%A5%EB%B6%80%ED%84%B0_%EB%B0%B0%EC%9A%B0%EB%8A%94_%EA%B0%95%ED%99%94%ED%95%99%EC%8A%B5/ch9_REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Labrary and Hyper Parameters

In [21]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


# Hyperparameters
learning_rate = 0.0002
gamma         = 0.98

# Policy Network

In [22]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x

    def put_data(self, item):
        self.data.append(item)  # r, prob[a]

    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]: # # r, prob[a] 역순으로 불러오기
            # G_t = r + γ*G_t+1
            R = r + gamma * R
            # -G_t * log 𝝅_θ(s_t, a_t)
            loss = -R * torch.log(prob)
            # Policy gradient
            loss.backward()
        self.optimizer.step()
        self.data = []

# Main

In [32]:
def main():
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    print_interval = 100

    for n_epi in range(1501):
        s = env.reset()
        done = False

        while not done: # CartPole-v1 forced to terminates at 500 step.
            prob = pi(torch.from_numpy(s).float())  # 정책 함수에 따른 확률 계산(순전파)
            m = Categorical(prob)                   # a 확률 → catgeorical
            a = m.sample()                          # 확률에 따른 a 선택
            s_prime, r, done, info = env.step(a.item())  # a에 따른 액션 수행
            pi.put_data((r,prob[a]))
            s = s_prime                             # s = s'
            score += r

        pi.train_net()  # epiosde 마무리 후 update 수행

        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
            score = 0.0
    env.close()

In [33]:
if __name__ == '__main__':
    main()

# of episode :100, avg score : 22.99
# of episode :200, avg score : 30.52
# of episode :300, avg score : 33.76
# of episode :400, avg score : 38.64
# of episode :500, avg score : 46.43
# of episode :600, avg score : 50.14
# of episode :700, avg score : 55.06
# of episode :800, avg score : 62.66
# of episode :900, avg score : 71.08
# of episode :1000, avg score : 94.7
# of episode :1100, avg score : 104.95
# of episode :1200, avg score : 158.68
# of episode :1300, avg score : 162.31
# of episode :1400, avg score : 186.91
# of episode :1500, avg score : 200.85
