# Test PPO agent in pendulum environment

In [1]:
import numpy as np
import gym
import sys
import torch

from ppo_agent import PPO_agent

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
## Explore environment
env = gym.make('Pendulum-v0')
print('### Env Name ######', env.unwrapped.spec.id)

obs_space = env.observation_space

print('###### Observation space ####### \n', obs_space)

state_size = env.observation_space.shape[-1]

print('###### Size of observation space ####### \n', state_size)

act_space = env.action_space

print('###### Action space ####### \n', act_space)

act_size = env.action_space.shape[-1]

print('###### Number of actions ####### \n', act_size)

### Env Name ###### Pendulum-v0
###### Observation space ####### 
 Box(-8.0, 8.0, (3,), float32)
###### Size of observation space ####### 
 3
###### Action space ####### 
 Box(-2.0, 2.0, (1,), float32)
###### Number of actions ####### 
 1


In [3]:
## Define agent
agent = PPO_agent(state_size=state_size, action_size=act_size, seed = 2)

## Function to save in hdf5 file during learning
def save_score(file_name, data):
    # Open data file
    dat_file = h5py.File(file_name, 'a')
    
    # Create datasets for score 
    dat_file.create_dataset('scores', dtype='f', data=data)

In [4]:
def run_ppo(num_iterations = 1000, t_tot_trajs = 2048, length_traj = 200):
    scores = []
    for k in range(num_iterations):
        ## Data for trajectories
        trajs_states = []
        trajs_acts = []
        all_rews = []
        rews_t_future = []
        trajs_log_pol = []
        len_trajs = []
        
        # Episodic data. Keeps track of rewards per episode, will get cleared at each ep
        ep_rews = []
        t = 0
        i = 0
        
        ## Run to collect trajs for a maximum of length_traj
        while t < t_tot_trajs:
            ## Episodic data. Keeps track of rewards per traj
            print(f'#####{i}th Traj #####')
            i += 1
            ep_rews = []
            state = env.reset()
            done = False

            for t_traj in range(length_traj):
                env.render()
                t += 1
                
                # Track observations in this batch
                trajs_states.append(state)

                # Calculate action and log policy and perform a step of th env
                action, log_policy = agent.act(state)
                state, reward, done, info = env.step(action)
                ep_rews.append(reward)

                # Track recent reward, action, and action log policy
                trajs_acts.append(action)
                trajs_log_pol.append(log_policy)

                if done:
                    break

            len_trajs.append(1 + t_traj)
            all_rews.append(ep_rews)

        # Reshape data as tensors
        trajs_states = torch.tensor(trajs_states, dtype=torch.float)
        trajs_acts = torch.tensor(trajs_acts, dtype=torch.float)
        rews_t_future = agent.compute_return_fut(all_rews)
        trajs_log_pol = torch.tensor(trajs_log_pol, dtype=torch.float)

        # Run step for learning
        agent.step(trajs_states, trajs_acts, trajs_log_pol, rews_t_future, len_trajs)
        
        # Calculate metrics to print
        avg_iter_lens = np.mean(len_trajs)
        avg_iter_retur = np.mean([np.sum(ep_rews) for ep_rews in all_rews])
        scores.append(avg_iter_retur)
        
        # Print logging statements
        print(flush=True)
        print(f"-------------------- Iteration #{agent.k_step} --------------------", flush=True)
        print(f"Average Episodic Length: {avg_iter_lens}", flush=True)
        print(f"Average Episodic Return: {avg_iter_retur}", flush=True)
        print(f"Timesteps So Far: {agent.t_step}", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)
    return scores

In [None]:
scores = run_ppo()

#####0th Traj #####
#####1th Traj #####
#####2th Traj #####
#####3th Traj #####
#####4th Traj #####
#####5th Traj #####
#####6th Traj #####
#####7th Traj #####
#####8th Traj #####
#####9th Traj #####
#####10th Traj #####


  trajs_states = torch.tensor(trajs_states, dtype=torch.float)



-------------------- Iteration #1 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1275.3155662505358
Timesteps So Far: 2200
------------------------------------------------------

#####0th Traj #####
#####1th Traj #####
#####2th Traj #####
#####3th Traj #####
#####4th Traj #####
#####5th Traj #####
#####6th Traj #####
#####7th Traj #####
#####8th Traj #####
#####9th Traj #####
#####10th Traj #####

-------------------- Iteration #2 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1206.714419418078
Timesteps So Far: 4400
------------------------------------------------------

#####0th Traj #####
#####1th Traj #####
#####2th Traj #####
#####3th Traj #####
#####4th Traj #####
#####5th Traj #####
#####6th Traj #####
#####7th Traj #####
#####8th Traj #####
#####9th Traj #####
#####10th Traj #####

-------------------- Iteration #3 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1143.1857152137904
Times

#####10th Traj #####

-------------------- Iteration #20 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1370.1789996189275
Timesteps So Far: 44000
------------------------------------------------------

#####0th Traj #####
#####1th Traj #####
#####2th Traj #####
#####3th Traj #####
#####4th Traj #####
#####5th Traj #####
#####6th Traj #####
#####7th Traj #####
#####8th Traj #####
#####9th Traj #####
#####10th Traj #####

-------------------- Iteration #21 --------------------
Average Episodic Length: 200.0
Average Episodic Return: -1089.4902473162692
Timesteps So Far: 46200
------------------------------------------------------

#####0th Traj #####
#####1th Traj #####
#####2th Traj #####
#####3th Traj #####
#####4th Traj #####
#####5th Traj #####
#####6th Traj #####
#####7th Traj #####
#####8th Traj #####
#####9th Traj #####
#####10th Traj #####

-------------------- Iteration #22 --------------------
Average Episodic Length: 200.0
Average Episodic Return