<a href="https://colab.research.google.com/github/merucode/RL/blob/01-Study-Book-%EB%B0%94%EB%8B%A5%EB%B6%80%ED%84%B0_%EB%B0%B0%EC%9A%B0%EB%8A%94_%EA%B0%95%ED%99%94%ED%95%99%EC%8A%B5/ch9_TD_Actor_Critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Hyperparameters

In [4]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters
lr = 2e-4       # step size handed to the Adam optimizer
gamma = 0.98    # discount factor applied to the bootstrapped next-state value
n_rollout = 10  # maximum environment steps collected between two updates

# Actor-Critic Class

In [15]:
class ActorCritic(nn.Module):
    """One-step TD actor-critic with a hidden layer shared by both heads.

    The network takes a 4-feature observation, passes it through one shared
    256-unit ReLU layer, and exposes two heads: a policy head over 2 actions
    (``pi``) and a scalar state-value head (``v``). Transitions are buffered
    with ``put_data`` and consumed by ``train_net``.

    Args:
        learning_rate: Adam step size. When ``None`` (default) the
            module-level ``lr`` is used.
        discount: Discount factor for the TD target. When ``None`` (default)
            the module-level ``gamma`` is read at training time.
    """

    def __init__(self, learning_rate=None, discount=None):
        super().__init__()

        # Rollout buffer of (s, a, r, s_prime, done) tuples; drained by make_batch().
        self.data = []
        # None means "look up the module-level ``gamma`` when training".
        self._discount = discount

        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(
            self.parameters(),
            lr=lr if learning_rate is None else learning_rate,
        )

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for observation(s) ``x``.

        Args:
            x: Float tensor whose last dimension has 4 features.
            softmax_dim: Dimension to normalise over; use 0 for a single
                observation and 1 for a batch.
        """
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        """Return the state-value estimate (last dimension of size 1) for ``x``."""
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        """Append one ``(s, a, r, s_prime, done)`` transition to the buffer."""
        self.data.append(transition)

    def make_batch(self):
        """Convert the buffered transitions to tensors and clear the buffer.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)``. ``a``, ``r`` and
            ``done_mask`` have one column per row; rewards are divided by 100
            and ``done_mask`` is 0.0 for terminal transitions, 1.0 otherwise.
        """
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for s, a, r, s_prime, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r / 100.0])
            s_prime_lst.append(s_prime)
            # Mask multiplies the bootstrapped value, so terminal states get 0.
            done_lst.append([0.0 if done else 1.0])

        s_batch = torch.tensor(s_lst, dtype=torch.float)
        # gather() needs int64 indices; pin the dtype instead of relying on inference.
        a_batch = torch.tensor(a_lst, dtype=torch.long)
        r_batch = torch.tensor(r_lst, dtype=torch.float)
        s_prime_batch = torch.tensor(s_prime_lst, dtype=torch.float)
        done_batch = torch.tensor(done_lst, dtype=torch.float)

        self.data = []
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        """Run one gradient step on the buffered transitions.

        Does nothing when the buffer is empty (an empty batch cannot be fed
        through the network). Otherwise the buffer is consumed and a single
        Adam step is taken on the mean of policy loss plus value loss.
        """
        if not self.data:
            return

        s, a, r, s_prime, done = self.make_batch()
        discount = gamma if self._discount is None else self._discount

        # The target is treated as a constant, so skip building its graph.
        with torch.no_grad():
            td_target = r + discount * self.v(s_prime) * done  # td_target for v

        v_s = self.v(s)  # single forward pass shared by delta and the value loss
        delta = td_target - v_s

        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1, a)
        # loss = loss_for_pi + loss_for_v
        loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(v_s, td_target)

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()


# Main

In [19]:
def main():
    """Train an ActorCritic agent on CartPole-v1 for 1000 episodes.

    Each episode is played in chunks of at most ``n_rollout`` steps; after
    every chunk the buffered transitions are used for one ``train_net``
    update. The average episode score is printed whenever the episode index
    is a non-zero multiple of 100.

    Both gym step APIs are accepted: the older one where ``reset`` returns an
    observation and ``step`` returns 4 values, and the newer one where
    ``reset`` returns ``(observation, info)`` and ``step`` returns 5 values.
    """
    env = gym.make('CartPole-v1')
    model = ActorCritic()
    print_interval = 100
    score = 0.0
    # Number of episodes accumulated in ``score`` since the last report; the
    # first window covers episodes 0..100, so it is not always print_interval.
    episodes_in_window = 0

    try:
        for n_epi in range(1000):
            done = False
            reset_out = env.reset()
            # Newer gym returns (observation, info); older gym returns the observation.
            s = reset_out[0] if isinstance(reset_out, tuple) else reset_out

            while not done:
                for _ in range(n_rollout):
                    prob = model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()

                    step_out = env.step(a)
                    if len(step_out) == 5:
                        s_prime, r, terminated, truncated, info = step_out
                        # Collapse to one flag so both APIs end episodes the same way.
                        done = terminated or truncated
                    else:
                        s_prime, r, done, info = step_out
                    model.put_data((s, a, r, s_prime, done))

                    s = s_prime
                    score += r

                    if done:
                        break

                model.train_net()

            episodes_in_window += 1
            if n_epi % print_interval == 0 and n_epi != 0:
                print(f"# of episode :{n_epi}, avg score : {score/episodes_in_window:.1f}")
                score = 0.0
                episodes_in_window = 0
    finally:
        # Release the environment even if training raises.
        env.close()

In [20]:
# Start training only when executed as a script/notebook cell, not on import.
if '__main__' == __name__:
    main()

# of episode :100, avg score : 23.4
# of episode :200, avg score : 28.1
# of episode :300, avg score : 51.3
# of episode :400, avg score : 75.7
# of episode :500, avg score : 178.0
# of episode :600, avg score : 208.8
# of episode :700, avg score : 334.4
# of episode :800, avg score : 452.6
# of episode :900, avg score : 472.5
