In [1]:
from unityagents import UnityEnvironment
import numpy as np
import agent
import random

## Preparing the environment for training

In [2]:
path_to_env='Reacher_Linux/Reacher.x86_64'

env = UnityEnvironment(file_name=path_to_env)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:\n', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like:
 [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [3]:
rB = agent.ReplayBuffer(action_size, buffer_size=8, batch_size=4, episode_len=[1,2,3], seed=42)

In [4]:
env_info = env.reset(train_mode=True)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)

# Tests for memory usage
myAgent = agent.Agent(3)
l = myAgent.memory_size

for i in range(20):
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actionsClipped = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actionsClipped)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    
    myAgent.add(states[0], actions[0], rewards[0], next_states[0], dones[0])
    for N, j in enumerate(rB.episode_len):
        l_now = len(myAgent.individual_memory)
        if ( l_now >= j):
            past_states = [myAgent.individual_memory[k].state for k in range(l_now-j,l_now)]
            past_actions = [myAgent.individual_memory[k].action for k in range(l_now-j,l_now)]
            past_rewards = [myAgent.individual_memory[k].reward for k in range(l_now-j,l_now)]
            past_next_states = [myAgent.individual_memory[k].next_state for k in range(l_now-j,l_now)]
            past_dones = [myAgent.individual_memory[k].done for k in range(l_now-j,l_now)]
            rB.add(past_states, past_actions, past_rewards, past_next_states, past_dones, N)
    
    states = next_states                               # roll over states to next time step
    #if np.any(dones):                                  # exit loop if episode finished
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 0.0


In [5]:
l = myAgent.memory_size
[myAgent.individual_memory[i].action[0] for i in range(l-3,l)] 

[-0.7728301518677191, -0.6445358405570221, 1.058744843149161]

In [6]:
rB.memory[0][0].actions

[array([-0.13850457,  0.3455402 , -0.42659857,  0.01879785])]

In [7]:
rB.batch_size = 3
a = rB.sample(1)

In [8]:
a[1]

array([[[-0.13850457,  0.3455402 , -0.42659857,  0.01879785],
        [ 0.88927398, -0.63029372, -1.31911886,  0.08894766]],

       [[-0.36133722,  0.31233581,  0.384222  , -0.53539421],
        [-0.13850457,  0.3455402 , -0.42659857,  0.01879785]],

       [[-2.38297198,  1.06701266, -1.13694576,  0.01531064],
        [-0.77283015, -0.83172079, -0.16306301,  0.70528663]]])

In [9]:
Nsample = 0
stt = a[0][Nsample][0]
act = a[1][Nsample][0]

In [10]:
gamma = 0.8
gammaList = [0.8**i for i in range(len(a[0][Nsample]))]

In [11]:
gammaList

[1.0, 0.8]

In [12]:
rwds = a[2][Nsample]

In [13]:
rwds

array([0., 0.])

In [14]:
rwds*gammaList

array([0., 0.])

In [15]:
def Qcrit(x):
    return 0
QlastEp = Qcrit(a[3][Nsample][-1])
Qest = np.sum(rwds*gammaList) + QlastEp

In [16]:
Qest

0.0