In [2]:
import warnings

warnings.filterwarnings("ignore")
import gym
import pybullet_envs
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import torch.multiprocessing as mp
import time
import random
import numpy as np
from collections import deque
from statistics import mean, stdev

In [3]:
ENV = gym.make("HalfCheetahBulletEnv-v0")
OBS_DIM = ENV.observation_space.shape[0]
ACT_DIM = ENV.action_space.shape[0]

ACT_LIMIT = ENV.action_space.high[0]
ENV.close()

GAMMA = 0.99
SEED = 300 #{1,100,200,300,600}

hidden_size = 256                                          
lr          = 3e-4
num_steps   = 32
num_episodes = 2000

## for Mac OS

In [4]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

def plot(episode, rewards):
    clear_output(True)
    plt.figure(figsize=(40,10))
    plt.subplot(131)
    plt.title('episode %s. reward: %s' % (episode, rewards[-1]))
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.plot(rewards)
    plt.show()
    
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")
use_cuda


False

## Multiprocessing env

from multiprocessing_env import SubprocVecEnv

num_envs = 1
env_name = "HalfCheetahBulletEnv-v0"

def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)

## Neural Network

In [5]:

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)
        
class ActorCritic(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=256, seed=100, std=0.0):
        super(ActorCritic, self).__init__()
        self.seed = torch.manual_seed(seed)
        
        self.critic = nn.Sequential(
            nn.Linear(state_size, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 1)
        )

        self.actor = nn.Sequential(
            nn.Linear(state_size, 256),
            nn.ReLU(),
            nn.Linear(256, action_size),
            nn.Tanh()
        )
        
        self.log_std = nn.Parameter(torch.ones(1, action_size) * std)

        self.apply(init_weights)
        
    def forward(self, x):
        self.value = self.critic(x)
        self.mu    = self.actor(x)        
        self.std   = self.log_std.exp().expand_as(self.mu)      #
        self.dist  = Normal(self.mu, self.std)
        return self.dist, self.value
    
    # Actor를 이용해 state를 받아서 action을 예측, 반환
    def get_action(self, x):
        return  # TODO
    
    def learn(self, state_lst, logprob_lst, q_val_lst, entropy, optimizer):
        """

            Computes advantages by subtracting a bseline(V(from critic)) from the estimated Q values
            추가로 해볼 수 있는 것 : advantage normalize
            Training a ActorCritic Agent refers to updating its actor using the given observations/actions
            and the calculated q_values/ advantages that come from the seen rewards

        """
        
        self.log_probs = torch.cat(logprob_lst)
        self.returns   = torch.cat(q_val_lst).detach()
        self.values    = torch.cat(state_lst)
        
        self.advantage = self.returns - self.values
              
        self.actor_loss  = -(self.log_probs * self.advantage.detach()).mean()
        self.critic_loss = self.advantage.pow(2).mean()

        self.loss = self.actor_loss + 0.5 * self.critic_loss - 0.001 * entropy 

        optimizer.zero_grad()
        self.loss.backward()    
        optimizer.step()
 


## N-Step

In [6]:
def n_step_td(reward_lst, V):
    q_val_lst = []
    # TODO: n_step td return
    R = V
    for step in reversed(range(len(reward_lst))):  # TODO:
        R = reward_lst[step] + GAMMA * R
        q_val_lst.insert(0, R)
    return q_val_lst

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

In [7]:


def Worker(num_episodes, n_steps):
    env = gym.make("HalfCheetahBulletEnv-v0")

    agent = ActorCritic(OBS_DIM, ACT_DIM).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=1e-4)  
    
    ####################################################
    episode_rewards = deque(maxlen=100)
    start_time = time.time()
    epi_plot = []
    finish = False
    ####################################################

    # TODO
    
    for episode in range(num_episodes):
        done = False
        state = env.reset()
        epi_reward = 0.
        
        while not done:
            s_lst, a_lst, r_lst = [], [], []
            entropy = 0 # 
            masks = []
            
            # N-step rollout
            for t in range(n_steps):
                # TODO
                # action = agent.get_action # TODO
                # while env takes in/out in numpy, nn.module does in tensor, convert!
                state = torch.FloatTensor(state).unsqueeze(0).to(device)
                dist, value = agent(state) 
                
                action = dist.sample()
                #####################################################
                next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
                epi_reward += reward
                #####################################################
            
                # TODO
                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()
                
                a_lst.append(log_prob)
                s_lst.append(value)
                
                # scalar reward, done to tensor
                r_lst.append(torch.FloatTensor([reward]).unsqueeze(1).to(device))
                #masks.append(torch.FloatTensor([1 - done]).unsqueeze(1).to(device))
                
                if done:
                    break
                    
                state = next_state

            # HINT : if done -> V = 0, else -> V = agent.critic(last state of N-step rollout trajectory)
            V =  0 if done else value # TODO            
            q_val_lst = n_step_td(r_lst, V)
            #q_val_lst = compute_gae(V, r_lst, masks, s_lst)
                                  
            agent.learn(s_lst, a_lst, q_val_lst, entropy, optimizer)
        
        
        ######################################################################

        episode_rewards.append(epi_reward)
        
        
        if episode >= 100:
            mean_100_episode_reward = np.mean(episode_rewards)
            epi_plot.append(mean_100_episode_reward)
            if episode % 10 == 0:
                print("\nEpisode: {}, avg score: {:.1f}".format(episode, mean_100_episode_reward))

            if mean_100_episode_reward >= 500:
                finish = True
                print("Solved (1)!!!, Time : {:.2f}".format(time.time() - start_time))
                np.save("./single.npy", np.array(epi_plot))
                return
            
            
    env.close()
    print("Fail... Retry")
    

In [8]:
def run(num_episodes):
    n_steps = 5  # TODO up to you

    Worker(num_episodes, n_steps)

if __name__ == '__main__':
    run(2000)


Episode: 100, avg score: -1075.8

Episode: 110, avg score: -1024.2

Episode: 120, avg score: -1009.8

Episode: 130, avg score: -961.4

Episode: 140, avg score: -921.0

Episode: 150, avg score: -882.3

Episode: 160, avg score: -857.9

Episode: 170, avg score: -813.7

Episode: 180, avg score: -770.2

Episode: 190, avg score: -735.9

Episode: 200, avg score: -697.1

Episode: 210, avg score: -655.4

Episode: 220, avg score: -610.4

Episode: 230, avg score: -580.4

Episode: 240, avg score: -563.8

Episode: 250, avg score: -548.3

Episode: 260, avg score: -505.1

Episode: 270, avg score: -483.3

Episode: 280, avg score: -469.1

Episode: 290, avg score: -432.9

Episode: 300, avg score: -404.9

Episode: 310, avg score: -381.7

Episode: 320, avg score: -389.2

Episode: 330, avg score: -409.1

Episode: 340, avg score: -424.4

Episode: 350, avg score: -402.0

Episode: 360, avg score: -455.1

Episode: 370, avg score: -471.0

Episode: 380, avg score: -471.3

Episode: 390, avg score: -476.4

Episod