# Test PPO agent in pendulum environment

In [1]:
import numpy as np
import gym
import sys
import torch

from ppo_agent_test import PPO_agent

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
## Explore environment
env = gym.make('Pendulum-v0')
print('### Env Name ######', env.unwrapped.spec.id)

obs_space = env.observation_space

print('###### Observation space ####### \n', obs_space)

state_size = env.observation_space.shape[-1]

print('###### Size of observation space ####### \n', state_size)

act_space = env.action_space

print('###### Action space ####### \n', act_space)

act_size = env.action_space.shape[-1]

print('###### Number of actions ####### \n', act_size)

### Env Name ###### Pendulum-v0
###### Observation space ####### 
 Box(-8.0, 8.0, (3,), float32)
###### Size of observation space ####### 
 3
###### Action space ####### 
 Box(-2.0, 2.0, (1,), float32)
###### Number of actions ####### 
 1


In [3]:
## Define agent
agent = PPO_agent(state_size=state_size, action_size=act_size, seed = 2)

## Function to save in hdf5 file during learning
def save_score(file_name, data):
    # Open data file
    dat_file = h5py.File(file_name, 'a')
    
    # Create datasets for score 
    dat_file.create_dataset('scores', dtype='f', data=data)

In [4]:
def run_ppo(num_iterations = 1000, num_trajs = 900, length_traj = 150):
    scores = []
    for k in range(num_iterations):
        ## Data for trajectories
        trajs_states = []
        trajs_acts = []
        all_rews = []
        rews_t_future = []
        trajs_log_pol = []
        len_trajs = []
        
        ## Run to collect trajs for a maximum of length_traj
        for i in range(num_trajs):
            ## Episodic data. Keeps track of rewards per traj
            ep_rews = []
            state = env.reset()
            done = False

            for t_traj in range(length_traj):
                env.render()
                trajs_states.append(state)

                # Calculate action and log policy and perform a step of th env
                action, log_policy = agent.act(state)
                state, reward, done, info = env.step(action)
                ep_rews.append(reward)

                # Track recent reward, action, and action log policy
                trajs_acts.append(action)
                trajs_log_pol.append(log_policy)

                if done:
                    break

            len_trajs.append(1 + t_traj)
            all_rews.append(ep_rews)

        # Reshape data as tensors
        trajs_states = torch.tensor(trajs_states, dtype=torch.float).to(device)
        trajs_acts = torch.tensor(trajs_acts, dtype=torch.float).to(device)
        rews_t_future = agent.compute_return_fut(all_rews).to(device)
        trajs_log_pol = torch.tensor(trajs_log_pol, dtype=torch.float).to(device)

        # Run step for learning
        agent.step(trajs_states, trajs_acts, trajs_log_pol, rews_t_future, len_trajs)
        
        # Calculate metrics to print
        avg_iter_lens = np.mean(len_trajs)
        avg_iter_retur = np.mean([np.sum(ep_rews) for ep_rews in all_rews])
        scores.append(avg_iter_retur)
        save_score('score.hdf5', scores)
        
        # Print logging statements
        print(flush=True)
        print(f"-------------------- Iteration #{agent.k_step} --------------------", flush=True)
        print(f"Average Episodic Length: {avg_iter_lens}", flush=True)
        print(f"Average Episodic Return: {avg_iter_retur}", flush=True)
        print(f"Timesteps So Far: {agent.t_step}", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)
    return scores

In [5]:
scores = run_ppo()

  trajs_states = torch.tensor(trajs_states, dtype=torch.float).to(device)


NameError: name 'save_score' is not defined