In [4]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('Algorithm.py'))))
import Algorithm as alg

# Problem dimensions shared by every cell below.
n_obs = 1000            # number of synthetic observations / transitions to generate

observation_space = 5   # length of one observation vector
action_space = 10       # number of discrete actions

# Seed every RNG this notebook touches so that Restart & Run All reproduces
# the same random observations, agent picks and buffer contents.
# NOTE(review): Algorithm.py may draw from other RNG sources — confirm.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

### Create some random observations

In [5]:
# Fake environment output: an (n_obs, observation_space) array of random
# values drawn with torch and converted to NumPy.
observations = np.array(torch.rand((n_obs, observation_space)))
observations

array([[0.6981348 , 0.17054302, 0.23344076, 0.6068035 , 0.48673177],
       [0.55503494, 0.8825433 , 0.43170655, 0.32128835, 0.39663136],
       [0.7671813 , 0.71338356, 0.5175633 , 0.50776565, 0.9140857 ],
       ...,
       [0.85870045, 0.62725854, 0.6310296 , 0.8121867 , 0.24251568],
       [0.34314162, 0.7313304 , 0.23605317, 0.76098365, 0.50064695],
       [0.37325394, 0.1548971 , 0.60220337, 0.6551872 , 0.97253746]],
      dtype=float32)

In [6]:
# Instantiate the algorithm under test with the problem dimensions from the
# config cell. Keyword arguments are grouped by role; their meaning is
# defined in Algorithm.py.
DQAlg = alg.DQAlgorithm(
    # --- problem dimensions ---
    observation_space=observation_space,
    action_space=action_space,
    # --- optimisation: loss and optimizer are passed as classes, not instances ---
    loss_function=nn.MSELoss,
    optimizer=optim.Adam,
    learning_rate=0.005,
    batch_size=50,
    # --- Q-learning hyper-parameters ---
    gamma=0.99,
    epsilon=(1, 0.02, 100_000),  # NOTE(review): presumably (start, end, decay steps) — confirm
    C=10_000,                    # NOTE(review): presumably the target-network sync interval — confirm
    # --- optional components, disabled for this test ---
    regularizer=None,
    recurrence=None,
    beta=None,
)

## create_agent()

In [7]:
# Ask the algorithm for a handful of agents and keep the returned handles.
n_agents = 5
agents = [DQAlg.create_agent() for _ in range(n_agents)]

## get_action()

- Input data (observations) comes from the **environment**; in this notebook it is simulated with the random `observations` array.

In [8]:
# Pick one of the created agents uniformly at random.
rand_agent = agents[random.randrange(n_agents)]

# Query an action for every synthetic observation and print it, one per line.
# NOTE(review): the third argument (10000) is presumably the step counter
# used by the exploration schedule — confirm against Algorithm.py.
for obs in observations:
    act = DQAlg.get_action(rand_agent, obs, 10000)
    print(act, '\t')

4 	
4 	
2 	
5 	
0 	
8 	
4 	
7 	
1 	
4 	
8 	
3 	
4 	
0 	
0 	
2 	
7 	
3 	
9 	
4 	
1 	
7 	
7 	
8 	
6 	
1 	
3 	
5 	
8 	
8 	
7 	
4 	
9 	
4 	
1 	
3 	
3 	
7 	
6 	
9 	
5 	
4 	
8 	
4 	
7 	
1 	
8 	
8 	
8 	
7 	
5 	
8 	
9 	
7 	
7 	
6 	
6 	
8 	
6 	
7 	
7 	
7 	
4 	
2 	
0 	
3 	
3 	
9 	
2 	
6 	
9 	
3 	
3 	
1 	
4 	
3 	
1 	
8 	
6 	
7 	
9 	
0 	
2 	
2 	
4 	
6 	
2 	
8 	
2 	
9 	
4 	
9 	
6 	
1 	
5 	
2 	
7 	
4 	
4 	
0 	
5 	
2 	
3 	
0 	
2 	
5 	
9 	
8 	
5 	
7 	
8 	
8 	
9 	
4 	
8 	
6 	
9 	
2 	
1 	
9 	
9 	
1 	
6 	
6 	
9 	
5 	
1 	
5 	
9 	
0 	
4 	
3 	
4 	
8 	
9 	
1 	
8 	
5 	
6 	
1 	
8 	
0 	
3 	
7 	
8 	
9 	
0 	
8 	
9 	
0 	
5 	
7 	
1 	
2 	
2 	
3 	
8 	
5 	
0 	
9 	
8 	
6 	
2 	
9 	
2 	
8 	
8 	
3 	
4 	
3 	
4 	
6 	
1 	
9 	
3 	
8 	
5 	
3 	
9 	
3 	
5 	
0 	
8 	
1 	
5 	
8 	
6 	
7 	
9 	
5 	
4 	
6 	
8 	
6 	
0 	
6 	
6 	
5 	
8 	
6 	
3 	
7 	
9 	
5 	
8 	
4 	
8 	
8 	
3 	
3 	
0 	
6 	
3 	
3 	
7 	
7 	
4 	
7 	
8 	
8 	
1 	
9 	
9 	
2 	
1 	
4 	
6 	
4 	
6 	
9 	
6 	
7 	
8 	
7 	
4 	
2 	
0 	
0 	
6 	
0 	
6 	
6 	
6 	
4 	
4 	
4 	
0 	
4 	
8 	
9 	


## update()

- Input data (transitions) comes from the **buffer**; in this notebook it is simulated with random arrays.

In [9]:
# Synthetic transitions: one row per transition, n_obs rows in total.
states = np.array(torch.rand(n_obs, observation_space))
# torch.randint's upper bound is EXCLUSIVE (unlike random.randint), so pass
# action_space itself to cover every action index 0 .. action_space - 1.
actions = np.array(torch.randint(0, action_space, (n_obs,)))
rewards = np.array(torch.rand(n_obs))
# Episode-termination flags drawn from {0, 1}.
done = np.array(torch.randint(0, 2, (n_obs,)))
next_state = np.array(torch.rand(n_obs, observation_space))

In [10]:
# Sanity check: show the shape of each buffer component, in buffer order.
tuple(arr.shape for arr in (states, actions, rewards, done, next_state))

((1000, 5), (1000,), (1000,), (1000,), (1000, 5))

In [11]:
# Bundle the arrays in the order: states, actions, rewards, done, next_state.
buffer = [
    states,
    actions,
    rewards,
    done,
    next_state,
]

In [12]:
# Pick one of the created agents uniformly at random.
rand_agent = agents[random.randrange(n_agents)]

# Run one update step for that agent on the synthetic buffer.
# NOTE(review): the third argument (1000) is presumably a step/frame
# counter — confirm against Algorithm.py.
DQAlg.update(rand_agent, buffer, 1000)



### AUX - Test Action Pipeline

In [13]:
# Take the first synthetic observation and convert it to a torch tensor.
first_obs = observations[0]
t = torch.tensor(first_obs)
t

tensor([0.6981, 0.1705, 0.2334, 0.6068, 0.4867])

In [14]:
# Grab the entry stored under index/key 0 in the algorithm's own `agents`
# container and display it.
# NOTE(review): presumably this is the first agent made by create_agent() — confirm.
ag = DQAlg.agents[0]
ag

<Agent.DQAgent at 0x7f406c905610>

In [15]:
# Feed the single observation tensor through the agent's `policy` callable and
# display the result (the recorded output is a 10-element tensor with a grad_fn).
# NOTE(review): presumably one Q-value per action — confirm in Agent.py.
q_vals_v = ag.policy(t)
q_vals_v

tensor([ 0.0560, -0.1586, -0.0623, -0.0019,  0.1263,  0.0641,  0.1256, -0.0257,
         0.1581,  0.1397], grad_fn=<AddBackward0>)

In [16]:
# Greedy selection: largest entry of the policy output and its index.
best = q_vals_v.max(dim=0)
act_val, act_idx = best.values, best.indices
act_val, act_idx

(tensor(0.1581, grad_fn=<MaxBackward0>), tensor(8))