In [1]:
import numpy as np  
import torch  
import torch.optim as optim
from torch.autograd import Variable


class Agent():
    """Actor-critic agent trained from full-episode rollouts.

    The agent owns an ``ActorCritic`` network (state value + action
    distribution) and a single Adam optimizer over all of its parameters.
    """

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        """Build the network and optimizer for ``env``.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.n`` (a discrete action space).
            use_cnn: accepted for interface compatibility; not used here.
            learning_rate: step size passed to Adam.
            gamma: discount factor used when computing returns.
            buffer_size: accepted for interface compatibility; not used here.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.num_in = self.env.observation_space.shape[0]
        self.num_out = self.env.action_space.n

        self.ac_net = ActorCritic(self.num_in, self.num_out)
        self.ac_optimizer = optim.Adam(self.ac_net.parameters(), lr=learning_rate)

    def update(self, rewards, values, next_value, log_probs, entropy):
        """Take one optimizer step on the combined actor-critic loss.

        Args:
            rewards: per-step rewards of the rollout (sequence of numbers).
            values: per-step critic outputs. Pass tensors that are still
                attached to the autograd graph so the critic is trained;
                plain numbers/arrays are accepted too, but then the critic
                term contributes no gradient.
            next_value: value estimate used to bootstrap the return after the
                last step (tensor, array or number); use 0 for a terminal state.
            log_probs: per-step log-probabilities of the taken actions
                (tensors attached to the autograd graph).
            entropy: summed policy entropy of the rollout (scalar).
        """
        if len(rewards) == 0:
            # Nothing was collected (e.g. max_step == 0): no update to make.
            return

        # The bootstrap value is a fixed target, so reduce it to a plain float.
        if torch.is_tensor(next_value):
            qval = next_value.detach().reshape(-1)[0].item()
        else:
            qval = float(np.asarray(next_value).reshape(-1)[0])

        # Discounted returns, accumulated backwards from the bootstrap value.
        qvals = np.zeros(len(rewards), dtype=np.float32)
        for t in reversed(range(len(rewards))):
            qval = rewards[t] + self.gamma * qval
            qvals[t] = qval
        qvals = torch.from_numpy(qvals)

        # Flatten everything to shape (T,). Mixing (T,) with (T, 1) would
        # silently broadcast the advantage to a (T, T) matrix.
        if all(torch.is_tensor(v) for v in values):
            values = torch.stack([v.reshape(-1)[0] for v in values])
        else:
            values = torch.as_tensor(np.asarray(values, dtype=np.float32)).reshape(-1)
        log_probs = torch.stack(log_probs).reshape(-1)

        advantage = qvals - values
        # The advantage only weights the policy gradient; detach it so the
        # actor term does not push gradients into the critic.
        actor_loss = (-log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        ac_loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        self.ac_optimizer.zero_grad()
        ac_loss.backward()
        self.ac_optimizer.step()

    def get_ac_output(self, state):
        """Run the network on one observation and sample an action.

        Args:
            state: a single observation (array-like); a batch axis is added.

        Returns:
            ``(action, policy_dist, value)`` where ``action`` is sampled from
            ``policy_dist`` with ``np.random.choice`` and the two tensors are
            the raw network outputs (still attached to the autograd graph).
        """
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        value, policy_dist = self.ac_net(state)
        action = np.random.choice(self.num_out, p=policy_dist.detach().numpy().squeeze(0))

        return action, policy_dist, value

    def train(self, max_episode, max_step):
        """Collect one rollout per episode and update the network after each.

        Args:
            max_episode: number of episodes to run.
            max_step: maximum number of environment steps per episode.
        """
        for episode in range(max_episode):
            rewards = []
            values = []
            log_probs = []
            entropy_term = 0
            episode_reward = 0
            terminal = False

            state = self.env.reset()
            for steps in range(max_step):
                action, policy_dist, value = self.get_ac_output(state)
                new_state, reward, done, info = self.env.step(action)

                probs = policy_dist.squeeze(0)
                log_prob = torch.log(probs[action])
                # Shannon entropy of the action distribution: -sum(p * log p).
                entropy = -torch.sum(probs * torch.log(probs))

                rewards.append(reward)
                # Keep the value attached to the graph so the critic gets a gradient.
                values.append(value.reshape(-1)[0])
                log_probs.append(log_prob)
                entropy_term += entropy
                state = new_state
                episode_reward += reward

                if done:
                    # NOTE(review): relies on gym's TimeLimit wrapper reporting
                    # time-outs via info["TimeLimit.truncated"]; without that key
                    # every `done` is treated as a true terminal state.
                    truncated = isinstance(info, dict) and bool(info.get("TimeLimit.truncated", False))
                    terminal = not truncated
                    if episode % 10 == 0:
                        print("episode: " + str(episode) + ": " + str(episode_reward))
                    break

            if terminal:
                # No future reward after a terminal state.
                next_value = 0.0
            else:
                # Rollout was cut short: bootstrap from the critic, without
                # building an autograd graph for the target.
                with torch.no_grad():
                    _, _, next_value = self.get_ac_output(state)
            self.update(rewards, values, next_value, log_probs, entropy_term)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


class ActorCritic(nn.Module):
    """Two independent one-hidden-layer MLPs: a state-value critic and a softmax actor."""

    def __init__(self, num_inputs, num_actions, hidden_size=256, learning_rate=3e-4):
        """Create the critic and actor layers.

        Args:
            num_inputs: size of the observation vector.
            num_actions: number of discrete actions (actor output width).
            hidden_size: width of the hidden layer in each head.
            learning_rate: accepted but not used inside this module.
        """
        super().__init__()
        self.num_actions = num_actions

        # Critic head: observation -> hidden -> scalar value.
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        # Actor head: observation -> hidden -> one score per action.
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state_tensor):
        """Return ``(value, policy_dist)`` for a batch of observations.

        The softmax is taken over dimension 1, so ``state_tensor`` needs a
        leading batch dimension.
        """
        critic_hidden = F.relu(self.critic_linear1(state_tensor))
        actor_hidden = F.relu(self.actor_linear1(state_tensor))

        action_scores = self.actor_linear2(actor_hidden)
        return self.critic_linear2(critic_hidden), F.softmax(action_scores, dim=1)

In [4]:
import gym


# Environment to train on; "CartPole-v0" is the alternative id noted in the original cell.
env_id = 'MountainCar-v0'
env = gym.make(env_id)
agent = Agent(env)
# Run 1500 episodes, each capped at 300 agent steps.
agent.train(max_episode=1500, max_step=300)

episode: 0: -200.0
episode: 10: -200.0
episode: 20: -200.0
episode: 30: -200.0
episode: 40: -200.0
episode: 50: -200.0
episode: 60: -200.0
episode: 70: -200.0
episode: 80: -200.0
episode: 90: -200.0
episode: 100: -200.0
episode: 110: -200.0
episode: 120: -200.0
episode: 130: -200.0
episode: 140: -200.0
episode: 150: -200.0
episode: 160: -200.0
episode: 170: -200.0
episode: 180: -200.0
episode: 190: -200.0
episode: 200: -200.0
episode: 210: -200.0
episode: 220: -200.0
episode: 230: -200.0
episode: 240: -200.0
episode: 250: -200.0
episode: 260: -200.0
episode: 270: -200.0
episode: 280: -200.0
episode: 290: -200.0
episode: 300: -200.0
episode: 310: -200.0
episode: 320: -200.0
episode: 330: -200.0
episode: 340: -200.0
episode: 350: -200.0
episode: 360: -200.0
episode: 370: -200.0
episode: 380: -200.0
episode: 390: -200.0
episode: 400: -200.0
episode: 410: -200.0
episode: 420: -200.0
episode: 430: -200.0
episode: 440: -200.0
episode: 450: -200.0
episode: 460: -200.0
episode: 470: -200.0
epi