In [1]:
colab = False

if colab:
    from google.colab import drive
    drive.mount('/content/gdrive')
    !git clone https://github.com/deepmind/pycolab.git
    !git clone https://github.com/nicoladainese96/RelationalModule.git
    !pip install pycolab

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from RelationalModule import ActorCritic, ControlActorCritic
from RelationalModule import train_agent as train
from RelationalModule import utils
from importlib import reload

In [3]:
# Re-execute RelationalModule.train_agent (imported above as `train`) so that edits made to
# the module on disk are picked up without restarting the kernel.
reload(train)

<module 'RelationalModule.train_agent' from '/home/nicola/Nicola_unipd/MasterThesis/RelationalDeepRL/RelationalModule/train_agent.py'>

In [4]:
GRID_SIZE = 7

# Environment configuration handed to the training routine.
game_params = {
    'grid_size': GRID_SIZE,
    'solution_length': [0],
    'num_forward': [0],     # number of distractors
    'num_backward': [0],    # just set to 0 for now
    'branch_length': 1,     # length of forward distractors
    'max_num_steps': 120,   # also reused as the per-episode step cap when training
}

In [5]:
# Keyword arguments for the relational agent: optimisation settings first,
# then the architecture sizes.
HPs = {
    'action_space': 4,
    'lr': 0.003,
    'gamma': 0.99,
    'TD': True,
    'twin': True,
    'tau': 0.2,
    'n_steps': 40,
    'n_kernels': 96,
    'vocab_size': 117,
    'n_dim': 12,
    'n_features': 64,
    'n_heads': 4,
    'n_attn_modules': 2,
    'n_linears': 4,
    'max_pool': False,
    # NOTE(review): GRID_SIZE + 2 matches the 9x9 states reported in the training log for
    # GRID_SIZE = 7 -- presumably a one-cell border on each side; confirm against the env.
    'linear_size': GRID_SIZE + 2,
}
# GPU only in the Colab configuration; local runs stay on the CPU.
HPs['device'] = 'cuda' if colab else 'cpu'

print('device: ', HPs['device'])

# Relational Agent
agent = ActorCritic.BoxWorldA2C(**HPs)

# The control agent gets the same optimisation settings and linear_size,
# but none of the relational-architecture entries and no explicit device.
control_HPs = {
    'action_space': 4,
    'lr': 0.003,
    'gamma': 0.99,
    'TD': True,
    'twin': True,
    'tau': 0.2,
    'n_steps': 40,
    'linear_size': GRID_SIZE + 2,
}

# Control Agent
control_agent = ControlActorCritic.ControlA2C(**control_HPs)

device:  cpu
Discount factor:  0.99
Learning rate:  0.003
Action space:  4
Temporal Difference learning:  True
Twin networks:  True
Update critic target factor:  0.2
n_steps for TD:  40
Device used:  cpu


Actor architecture: 
 BoxWorldActor(
  (boxnet): BoxWorldNet(
    (net): Sequential(
      (0): ExtractEntities(
        (embed): Embedding(117, 12)
        (net): Sequential(
          (0): Conv2d(12, 48, kernel_size=(2, 2), stride=(1, 1))
          (1): ReLU()
          (2): Conv2d(48, 96, kernel_size=(2, 2), stride=(1, 1))
          (3): ReLU()
        )
      )
      (1): RelationalModule(
        (net): Sequential(
          (0): PositionalEncoding(
            (projection): Linear(in_features=98, out_features=64, bias=True)
          )
          (1): AttentionBlock(
            (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (attn): MultiheadAttention(
              (out_proj): Linear(in_features=64, 

In [6]:
# Random Agent

class RandomAgent():
    """Baseline agent that samples every action uniformly at random.

    It offers ``get_action`` and ``update`` so it can stand in for a learning agent,
    but it holds no parameters and never learns.
    """

    def __init__(self, n_actions):
        """Store the number of discrete actions to sample from."""
        self.n_actions = n_actions

    def get_action(self, state, *args, **kwargs):
        """Return ``(action, log_prob)`` for a uniformly sampled action.

        ``state`` and any extra positional/keyword arguments are accepted and ignored.
        ``action`` is drawn from ``range(n_actions)``; ``log_prob`` is the constant
        ``log(1 / n_actions)``.
        """
        a = np.random.choice(self.n_actions)
        log_prob = np.log(1./self.n_actions) # just because it's the standard output of the other agent
        return a, log_prob

    def update(self, *args, **kwargs):
        """No-op learning step: accepts and ignores any arguments, returns ``None``.

        Keyword arguments are tolerated as well as positional ones (as in ``get_action``)
        so that a keyword-style ``update(...)`` call does not raise ``TypeError``.
        """
        return

rnd_agent = RandomAgent(4)

In [7]:
%%time
# Train the relational agent for 5000 episodes, capping each episode at the environment's
# own step limit. The result is kept whole in `results` and also unpacked into five names
# (presumably the trained agent is included because return_agent=True -- confirm in train_agent).
results = train.train_boxworld(
    agent,
    game_params,
    n_episodes=5000,
    max_steps=game_params['max_num_steps'],
    return_agent=True,
    mask=True,
)
(score,
 asymptotic_score,
 asymptotic_std,
 trained_agent,
 time_profile) = results

  art = np.vstack(np.fromstring(line, dtype=np.uint8) for line in art)


distribution:  tensor([[0.0469, 0.3524, 0.4029, 0.1978]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.1030, 0.2861, 0.2850, 0.3260]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0565, 0.2489, 0.5387, 0.1558]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0718, 0.3318, 0.2542, 0.3421]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0868, 0.1520, 0.5506, 0.2107]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0906, 0.4149, 0.3017, 0.1927]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0637, 0.2144, 0.3684, 0.3534]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.1399, 0.3144, 0.2605, 0.2852]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0675, 0.3322, 0.4236, 0.1767]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0936, 0.3682, 0.4459, 0.0923]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0500, 0.3600, 0.2992, 0.2909]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0692, 0.4174, 0.3606, 0.1528]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.04

V1.shape:  torch.Size([47])
V1:  tensor([[ 0.5859],
        [ 0.2206],
        [ 0.3176],
        [ 0.3007],
        [ 0.2747],
        [ 0.1583],
        [ 0.6236],
        [ 0.4317],
        [-0.7255],
        [-0.3691],
        [ 0.4405],
        [ 0.1400],
        [-0.3781],
        [ 0.4212],
        [ 0.0704],
        [ 0.0202],
        [ 0.1691],
        [ 0.9707],
        [-0.2781],
        [ 0.1818],
        [ 0.3323],
        [-0.4326],
        [-1.0807],
        [-0.0749],
        [-0.7235],
        [-0.0991],
        [ 0.5995],
        [-0.1823],
        [-0.2194],
        [ 0.3198],
        [-0.1203],
        [-0.6519],
        [ 0.2617],
        [ 0.3949],
        [-0.0048],
        [-0.3949],
        [ 0.1856],
        [-0.7357],
        [-0.0497],
        [ 0.1285],
        [-0.7248],
        [ 0.3275],
        [-0.4716],
        [ 0.9068],
        [-0.1003],
        [-0.2856],
        [-0.0735]], grad_fn=<AddmmBackward>)
Updating actor...
V_trg.shape:  torch.Size([47])

distribution:  tensor([[0.0577, 0.5383, 0.2654, 0.1387]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0153, 0.7150, 0.0681, 0.2017]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0191, 0.7043, 0.1336, 0.1430]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0360, 0.6718, 0.1857, 0.1065]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0168, 0.6997, 0.1283, 0.1552]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0595, 0.4326, 0.3744, 0.1334]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0260, 0.5345, 0.1099, 0.3295]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0416, 0.4727, 0.1134, 0.3723]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0399, 0.7494, 0.1220, 0.0886]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0950, 0.3664, 0.1992, 0.3394]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0641, 0.4299, 0.2522, 0.2538]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0490, 0.5777, 0.2180, 0.1553]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.02

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.2570,  0.1100, -0.0914,  0.2078, -0.2322,  0.0607,  0.0584, -0.1715,
         0.4650,  0.1238, -0.2338, -0.1211,  0.7462,  0.1060,  0.1926,  0.0347,
         0.2076,  0.3013, -0.0686,  0.3416, -0.2913, -0.1524,  0.2432,  0.4209,
         0.2129, -0.1509,  0.3038, -0.0232,  0.0351, -0.3811,  0.4732,  0.0496,
         0.2777, -0.0486,  0.3886,  0.2561,  0.5540,  0.0395,  0.0224,  0.3976,
         0.1485,  0.1888, -0.4534,  0.4603,  0.5567,  0.0235, -0.0297,  0.5398,
         0.3063,  0.1360, -0.1075,  0.2464, -0.2688,  0.4652,  0.3323,  0.6300,
         0.2728,  0.1424, -0.2156,  0.2677,  0.1033,  0.4736, -0.1904, -0.0161,
         0.0779, -0.0126,  0.2039, -0.0159,  0.1423,  0.3091,  0.2807, -0.0091,
        -0.1826,  0.1870,  0.4205, -0.0306,  0.6309,  0.3488,  0.1520, -0.0734,
        -0.1420,  0.0310,  0.50

distribution:  tensor([[0.0275, 0.5323, 0.3050, 0.1352]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0177, 0.8844, 0.0726, 0.0253]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0475, 0.6115, 0.1560, 0.1850]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0610, 0.4755, 0.3523, 0.1112]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0521, 0.7322, 0.0642, 0.1515]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0161, 0.8146, 0.1206, 0.0487]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0478, 0.4943, 0.0942, 0.3637]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0224, 0.5352, 0.1757, 0.2668]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0291, 0.7507, 0.0946, 0.1255]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0113, 0.6378, 0.1448, 0.2061]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0249, 0.8705, 0.0717, 0.0329]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0138, 0.8580, 0.0657, 0.0625]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.01

distribution:  tensor([[0.0366, 0.4295, 0.2484, 0.2855]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0099, 0.7427, 0.1618, 0.0856]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0380, 0.6354, 0.1894, 0.1372]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0355, 0.4774, 0.0676, 0.4194]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0930, 0.3316, 0.2812, 0.2942]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0124, 0.9362, 0.0373, 0.0142]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0530, 0.6083, 0.1251, 0.2135]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0281, 0.5910, 0.1763, 0.2046]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0385, 0.2447, 0.2426, 0.4742]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0278, 0.3296, 0.3508, 0.2917]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0382, 0.4456, 0.1271, 0.3892]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0153, 0.8477, 0.0703, 0.0667]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.01

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 5.9489e-01,  4.6012e-01,  6.6426e-01,  6.8205e-01,  2.1336e-01,
         6.0229e-01,  6.3206e-01,  7.6581e-01,  8.4131e-01,  4.5807e-01,
         4.0974e-01,  6.6420e-02,  4.9295e-01,  7.6558e-01,  2.6622e-01,
         4.5730e-01,  4.2124e-01,  7.5740e-01,  2.6394e-01,  3.5273e-01,
         3.5184e-01,  2.1945e-01,  5.0457e-02,  6.3897e-01,  3.4294e-01,
         5.6414e-01,  3.4865e-02,  1.7185e-01,  4.3358e-01,  4.9552e-01,
         3.9015e-01, -8.6056e-03,  4.9933e-01,  2.3513e-01,  5.3634e-01,
         2.1133e-01,  3.0070e-01,  3.3733e-01,  4.2510e-01,  1.8285e-01,
         1.0551e-01,  2.6509e-01,  7.0395e-01,  3.9195e-01,  5.9955e-01,
         3.3030e-01,  5.4225e-01,  3.2606e-01, -2.9198e-01,  2.6824e-01,
        -1.1794e-01,  6.9178e-01,  5.2804e-01,  4.6889e-01,  3.7590e-01,
         5.7392e-02,  1.2026

distribution:  tensor([[0.0305, 0.5661, 0.1596, 0.2438]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0170, 0.7700, 0.1567, 0.0563]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0371, 0.3597, 0.4431, 0.1600]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0075, 0.4584, 0.1412, 0.3930]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0735, 0.2104, 0.4624, 0.2537]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0269, 0.4117, 0.3144, 0.2470]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0207, 0.6285, 0.1105, 0.2403]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0224, 0.4653, 0.2930, 0.2193]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0299, 0.8357, 0.0744, 0.0600]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0133, 0.8945, 0.0545, 0.0377]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0357, 0.3083, 0.1174, 0.5386]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0205, 0.5764, 0.2178, 0.1853]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.05

distribution:  tensor([[0.0181, 0.6857, 0.1898, 0.1064]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0214, 0.7844, 0.1062, 0.0879]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0112, 0.4650, 0.1278, 0.3960]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0350, 0.5127, 0.2122, 0.2400]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0409, 0.3659, 0.2190, 0.3742]], grad_fn=<ExpBackward>)
n_step_rewards.shape:  (121,)
rewards.shape:  (121,)
n_step_rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.2241,  0.2141, -0.2116,  0.3722,  0.3254,  1.0426,  0.3312,  0.2949,
         0.1325,  0.4083,  0.5798,  0.8059,  0.9436,  0.6693,  0.9693,  0.9345,
         0.6457,  0.0142,  0.4662,  0.4120,  0.5218,  0.3693,  0.4911,  0.7913,
         0.5940,  0.4541,  0.9294,  0.3552,  0.4957,  0.6121,  0.2034,  0.4555,
         1.0980,  0.4745,  0.9856,  0.4800,  0.6455,  0.4161,  0.6312, -0.1739,
        -0.0211,  0.4001,  0.7173,  0.8608,  0.9922,  0.4992, -0.0473,  0.0215,
         0.7557,  0.4737,  0.3115,  0.6323,  0.5449,  0.3995,  0.6018,  0.3066,
         0.2421,  0.8900,  0.4333,  0.3125,  0.2034,  0.6323,  0.8370,  0.6373,
         0.5736,  0.1636,  1.0352,  0.3426,  0.4931,  0.1873,  0.1731,  0.1890,
         0.2991,  1.0782,  0.3442,  0.2824,  0.1196,  0.0330,  0.1444,  0.7162,
         0.4506,  0.4321,  0.49

distribution:  tensor([[0.0039, 0.6936, 0.0836, 0.2189]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0277, 0.3862, 0.2203, 0.3658]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0097, 0.9280, 0.0336, 0.0288]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0159, 0.7887, 0.0618, 0.1336]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0030, 0.8495, 0.1101, 0.0374]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0104, 0.8503, 0.0696, 0.0697]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0078, 0.7386, 0.0776, 0.1761]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0217, 0.5641, 0.2090, 0.2053]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0413, 0.5305, 0.0718, 0.3563]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0280, 0.8005, 0.0767, 0.0949]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0075, 0.7973, 0.0922, 0.1031]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0157, 0.4676, 0.2330, 0.2837]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.00

log_probs:  [tensor(-1.5190, grad_fn=<SelectBackward>), tensor(-1.5127, grad_fn=<SelectBackward>), tensor(-0.0748, grad_fn=<SelectBackward>), tensor(-0.2374, grad_fn=<SelectBackward>), tensor(-0.1631, grad_fn=<SelectBackward>), tensor(-0.1622, grad_fn=<SelectBackward>), tensor(-1.7369, grad_fn=<SelectBackward>), tensor(-1.5835, grad_fn=<SelectBackward>), tensor(-1.0320, grad_fn=<SelectBackward>), tensor(-0.2226, grad_fn=<SelectBackward>), tensor(-0.2266, grad_fn=<SelectBackward>), tensor(-1.4569, grad_fn=<SelectBackward>), tensor(-0.0656, grad_fn=<SelectBackward>), tensor(-2.5171, grad_fn=<SelectBackward>), tensor(-0.3555, grad_fn=<SelectBackward>), tensor(-0.4262, grad_fn=<SelectBackward>), tensor(-2.9779, grad_fn=<SelectBackward>), tensor(-2.9714, grad_fn=<SelectBackward>), tensor(-1.0563, grad_fn=<SelectBackward>), tensor(-0.2049, grad_fn=<SelectBackward>), tensor(-0.1053, grad_fn=<SelectBackward>), tensor(-0.8921, grad_fn=<SelectBackward>), tensor(-0.3062, grad_fn=<SelectBackward>)

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.6294,  0.2954,  0.6881,  0.4879,  0.3624,  0.3493,  0.5807,  0.2752,
         0.6929,  0.2941,  0.3132,  0.9823,  0.2140,  0.3253,  0.3342,  0.6804,
        -0.3964,  0.5970,  0.7848,  0.6319,  1.0561,  0.7121,  0.5154,  0.4233,
         0.4176,  0.2586,  0.9142,  0.7955,  0.3017,  0.8682,  1.1710,  0.6997,
         0.4158,  0.5419,  0.7709,  0.9176,  1.0152,  0.4087,  0.8303,  0.2624,
         0.4808,  0.5831,  0.4994,  0.2423,  0.9124,  0.8784,  0.5230,  0.7515,
         0.4496,  0.5449,  0.4970,  0.6810,  0.6290,  0.4851,  0.8371,  0.8721,
         0.6920,  0.4038,  0.3005,  0.4533,  0.2489,  0.7282,  0.2501,  0.5623,
         0.1544,  0.5456,  0.3159,  0.0119,  0.3379,  0.0752,  1.1461,  0.5609,
         1.0726,  0.4783,  1.1571,  0.0993,  0.5408,  0.6234,  0.1158, -0.1740,
         0.5285,  0.5535,  0.52

distribution:  tensor([[0.0086, 0.6765, 0.1725, 0.1424]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0101, 0.6443, 0.2498, 0.0959]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0028, 0.4219, 0.0306, 0.5447]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0103, 0.4773, 0.2700, 0.2424]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0484, 0.1290, 0.0952, 0.7274]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0158, 0.8202, 0.0496, 0.1143]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0173, 0.2714, 0.1020, 0.6093]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0178, 0.3822, 0.0756, 0.5245]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0437, 0.7786, 0.0830, 0.0947]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0345, 0.4950, 0.1490, 0.3215]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0172, 0.7416, 0.0514, 0.1898]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0119, 0.6436, 0.0818, 0.2627]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.01

distribution:  tensor([[0.0417, 0.5725, 0.1130, 0.2728]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0460, 0.3279, 0.0913, 0.5348]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0170, 0.3854, 0.0511, 0.5465]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0194, 0.5878, 0.0770, 0.3158]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0094, 0.7249, 0.0232, 0.2425]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0128, 0.8643, 0.0619, 0.0610]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0237, 0.3697, 0.2389, 0.3676]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0232, 0.8238, 0.0744, 0.0787]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0158, 0.6084, 0.1324, 0.2434]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0118, 0.7916, 0.0367, 0.1599]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0069, 0.7238, 0.0896, 0.1797]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0318, 0.5124, 0.2104, 0.2455]], grad_fn=<ExpBackward>)
n_step_rewards.shape:  (121,

distributions:  tensor([[[0.0086, 0.6765, 0.1725, 0.1424],
         [0.0101, 0.6443, 0.2498, 0.0959],
         [0.0028, 0.4219, 0.0306, 0.5447],
         [0.0103, 0.4773, 0.2700, 0.2424],
         [0.0484, 0.1290, 0.0952, 0.7274],
         [0.0158, 0.8202, 0.0496, 0.1143],
         [0.0173, 0.2714, 0.1020, 0.6093],
         [0.0178, 0.3822, 0.0756, 0.5245],
         [0.0437, 0.7786, 0.0830, 0.0947],
         [0.0345, 0.4950, 0.1490, 0.3215],
         [0.0172, 0.7416, 0.0514, 0.1898],
         [0.0119, 0.6436, 0.0818, 0.2627],
         [0.0167, 0.4493, 0.1931, 0.3409],
         [0.0113, 0.6460, 0.1195, 0.2232],
         [0.0155, 0.5982, 0.1371, 0.2492],
         [0.0324, 0.6122, 0.0507, 0.3047],
         [0.0203, 0.4069, 0.0621, 0.5106],
         [0.0066, 0.7247, 0.0400, 0.2287],
         [0.0215, 0.5645, 0.1674, 0.2466],
         [0.0115, 0.6454, 0.0634, 0.2798],
         [0.0107, 0.6603, 0.1329, 0.1961],
         [0.0055, 0.3804, 0.0131, 0.6010],
         [0.0222, 0.7887, 0.0819, 0.10

Updating actor...
V_trg.shape:  torch.Size([121])
V_trg:  tensor([ 0.7877,  0.6252,  0.6084,  0.4328,  0.6751,  0.3080,  0.6060,  0.6187,
         0.9163,  0.3512,  0.0694,  0.6070,  0.1658,  0.3665,  0.5509,  0.3520,
         0.4976,  0.2073,  0.5805,  0.9061,  0.4036,  0.5312,  0.9205,  0.7904,
         0.7873, -0.2200,  0.5297,  0.4026,  0.5305,  0.3743,  0.8038,  0.2984,
         0.5264,  0.2920,  0.7706,  0.5850,  0.7887,  0.2196,  0.3252,  0.6935,
         0.4145,  0.2629,  0.6057,  0.1738,  0.8565,  0.6073,  0.3433,  0.9763,
         0.5796,  0.4530,  0.6586,  0.2016,  0.3084,  0.5687,  0.6980,  0.8844,
         0.9527,  0.4560,  0.6909,  0.9366,  0.2795,  0.5318,  0.6623,  0.4997,
         0.6522,  0.3971,  0.3030,  0.5393,  0.3894,  0.6299,  0.6985,  0.4324,
         0.6001,  0.3121,  0.7017,  0.6144,  0.5015,  0.8701,  0.3389,  0.2263,
         0.8082,  0.5128,  0.5293,  0.6253,  0.6716,  0.4833,  0.6909,  0.6404,
         0.7825,  0.2521,  0.5218,  0.4531,  0.3452,  1.0266, 

distribution:  tensor([[0.0179, 0.2634, 0.0503, 0.6684]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0252, 0.7109, 0.1388, 0.1251]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0150, 0.3340, 0.0304, 0.6206]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0341, 0.7483, 0.0987, 0.1189]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0055, 0.9153, 0.0366, 0.0426]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0052, 0.9389, 0.0362, 0.0198]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0063, 0.5997, 0.0411, 0.3528]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0310, 0.2321, 0.0836, 0.6533]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0238, 0.2919, 0.1821, 0.5022]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0075, 0.5694, 0.0735, 0.3496]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0075, 0.4982, 0.0590, 0.4353]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0806, 0.1271, 0.0713, 0.7210]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.00

log_probs:  [tensor(-0.3282, grad_fn=<SelectBackward>), tensor(-0.1174, grad_fn=<SelectBackward>), tensor(-1.2529, grad_fn=<SelectBackward>), tensor(-0.2388, grad_fn=<SelectBackward>), tensor(-0.0976, grad_fn=<SelectBackward>), tensor(-2.4095, grad_fn=<SelectBackward>), tensor(-0.6753, grad_fn=<SelectBackward>), tensor(-0.1004, grad_fn=<SelectBackward>), tensor(-1.3499, grad_fn=<SelectBackward>), tensor(-0.2360, grad_fn=<SelectBackward>), tensor(-0.4763, grad_fn=<SelectBackward>), tensor(-0.4703, grad_fn=<SelectBackward>), tensor(-0.2397, grad_fn=<SelectBackward>), tensor(-0.4314, grad_fn=<SelectBackward>), tensor(-0.1294, grad_fn=<SelectBackward>), tensor(-1.9115, grad_fn=<SelectBackward>), tensor(-0.1189, grad_fn=<SelectBackward>), tensor(-0.1345, grad_fn=<SelectBackward>), tensor(-1.2186, grad_fn=<SelectBackward>), tensor(-0.1795, grad_fn=<SelectBackward>), tensor(-0.4088, grad_fn=<SelectBackward>), tensor(-0.3472, grad_fn=<SelectBackward>), tensor(-0.1111, grad_fn=<SelectBackward>)

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.6456,  0.9894,  0.8643,  0.3906,  0.3409,  0.3960,  0.2440,  0.7955,
         0.1503,  0.4107,  0.7521,  0.7888,  0.3913,  0.5567,  0.6945,  0.0902,
         0.3885,  0.8299,  0.2282,  0.1730,  0.2216,  0.2254,  0.7432,  0.4884,
         0.6608,  0.1999,  0.6710,  0.5663,  0.3857,  0.1856,  0.2842,  0.3063,
         0.3049,  0.2977,  0.2938,  0.3533,  0.5253,  0.2707,  0.7913,  0.3209,
         0.3328,  0.3838,  0.1669,  0.7106,  0.2967,  0.5823,  0.6064,  0.6722,
         0.2661,  0.6016,  0.8090,  0.6978,  0.6093,  0.1445,  0.2854,  0.3194,
         0.3875,  0.5281,  0.7819,  0.5341,  0.6321,  0.7113,  0.1994,  0.5939,
         0.1628,  0.5628,  0.5591,  0.2641,  0.4653,  0.7620,  0.5971,  0.3875,
         0.3964, -0.0360,  0.9535,  0.6067,  0.8171,  0.2863,  0.7336,  0.7238,
         1.0215, -0.0067,  0.69

distribution:  tensor([[0.0032, 0.8930, 0.0234, 0.0804]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0121, 0.5968, 0.1828, 0.2083]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0286, 0.5465, 0.0900, 0.3349]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0350, 0.3036, 0.0431, 0.6183]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0072, 0.8994, 0.0483, 0.0451]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0185, 0.7572, 0.0767, 0.1476]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0284, 0.4135, 0.2509, 0.3072]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0114, 0.8791, 0.0269, 0.0826]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0049, 0.7099, 0.0128, 0.2723]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0191, 0.7921, 0.0571, 0.1317]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0019, 0.8435, 0.0678, 0.0868]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0245, 0.7006, 0.0322, 0.2427]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.00

n_step_rewards.shape:  (121,)
rewards.shape:  (121,)
n_step_rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
bootstrap:  [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False Fal

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.3963,  0.4217,  0.5237,  0.4862,  0.6801,  0.6123, -0.0437,  0.6187,
         0.3806,  0.0512,  0.9405,  0.0490,  0.0828,  0.4176,  0.5132,  0.6086,
         0.3722,  0.4692,  0.6691,  0.3618,  0.7327, -0.0430,  0.6480,  0.6231,
         0.0369,  0.3171,  0.3145,  0.8203,  0.7878,  0.9147,  0.4102,  0.3121,
         0.7236,  0.6386,  0.2295,  0.3357,  0.1948,  0.6600,  0.3215, -0.0799,
         0.4675,  0.4517,  0.4442,  0.5164,  0.8158,  0.3297,  0.2554,  0.7167,
         0.1751,  0.3909,  0.6549,  0.3500, -0.0366,  0.3840,  0.4173, -0.0406,
         0.2460,  0.3235,  0.5474,  0.5596,  0.3818,  0.4547,  0.6292,  0.2292,
         0.4692,  0.8067,  0.3307,  0.8355,  0.9818,  0.3875,  0.3668,  0.5064,
         0.6577,  0.5330,  0.7229,  0.5658,  0.2051,  0.5816,  0.4653,  0.5461,
         0.4828,  0.5596,  0.65

distribution:  tensor([[0.0229, 0.6764, 0.0281, 0.2725]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0369, 0.4177, 0.0324, 0.5131]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0079, 0.8398, 0.1017, 0.0506]], grad_fn=<ExpBackward>)
n_step_rewards.shape:  (3,)
rewards.shape:  (3,)
n_step_rewards:  [ 9.801  9.9   10.   ]
rewards:  [ 0.  0. 10.]
bootstrap:  [False False False]
done.shape: (before n_steps) (3,)
done: (before n_steps) [False False  True]
done.shape: (after n_steps) (3,)
Gamma_V.shape:  (3,)
done: (after n_steps) [ True  True  True]
Gamma_V:  [0.970299 0.9801   0.99    ]
old_states.shape:  torch.Size([3, 1, 9, 9])
new_states.shape:  torch.Size([3, 1, 9, 9])
log_probs:  [tensor(-0.3909, grad_fn=<SelectBackward>), tensor(-0.6673, grad_fn=<SelectBackward>), tensor(-2.9840, grad_fn=<SelectBackward>)]
log_probs:  tensor([-0.3909, -0.6673, -2.9840], grad_fn=<StackBackward>)
distributions.shape:  torch.Size([1, 3, 4])
distributions:  tensor([[[0.0229, 0.6764, 0.0281, 0.2

distribution:  tensor([[0.0051, 0.5731, 0.0281, 0.3937]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0047, 0.8337, 0.0103, 0.1514]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0114, 0.4522, 0.0327, 0.5037]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0016, 0.6118, 0.0187, 0.3679]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0059, 0.1498, 0.0277, 0.8167]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0062, 0.5841, 0.0372, 0.3726]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0057, 0.2791, 0.0423, 0.6729]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0145, 0.7838, 0.0365, 0.1651]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0060, 0.6616, 0.0357, 0.2968]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0044, 0.7524, 0.0445, 0.1987]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0061, 0.8141, 0.0336, 0.1463]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0159, 0.2874, 0.0885, 0.6082]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.00

log_probs:  [tensor(-1.3262, grad_fn=<SelectBackward>), tensor(-0.1862, grad_fn=<SelectBackward>), tensor(-0.5745, grad_fn=<SelectBackward>), tensor(-0.5867, grad_fn=<SelectBackward>), tensor(-0.0442, grad_fn=<SelectBackward>), tensor(-0.1032, grad_fn=<SelectBackward>), tensor(-0.2930, grad_fn=<SelectBackward>), tensor(-1.0297, grad_fn=<SelectBackward>), tensor(-1.9208, grad_fn=<SelectBackward>), tensor(-0.4100, grad_fn=<SelectBackward>), tensor(-0.0667, grad_fn=<SelectBackward>), tensor(-1.3261, grad_fn=<SelectBackward>), tensor(-2.2444, grad_fn=<SelectBackward>), tensor(-0.0523, grad_fn=<SelectBackward>), tensor(-0.4185, grad_fn=<SelectBackward>), tensor(-0.1361, grad_fn=<SelectBackward>), tensor(-0.1056, grad_fn=<SelectBackward>), tensor(-0.2281, grad_fn=<SelectBackward>), tensor(-0.2341, grad_fn=<SelectBackward>), tensor(-0.4133, grad_fn=<SelectBackward>), tensor(-0.2621, grad_fn=<SelectBackward>), tensor(-0.2002, grad_fn=<SelectBackward>), tensor(-0.2028, grad_fn=<SelectBackward>)

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.8121,  0.5636,  0.3421,  0.1849,  0.3672,  0.4389,  0.6042,  0.6871,
         0.9670,  0.5574,  0.4064,  0.4459,  0.2782,  0.2599,  0.3906,  0.4997,
         0.6275,  0.2694, -0.0250,  0.6539,  0.6219,  0.0570,  0.4382,  0.5231,
         0.3212,  0.4753,  0.4936,  0.5569,  0.6771,  0.1288,  0.3176,  0.6388,
         0.4963,  0.6764,  0.2673,  0.1373,  0.6395,  0.4485,  0.6238,  0.2953,
         0.4419,  0.6422,  0.2390,  0.3036,  0.5453,  0.5264,  0.8032,  0.5897,
         0.7437,  0.6998,  0.5422,  0.7456,  0.5804,  0.5252,  0.1791,  0.4482,
         0.2097,  0.4118,  0.5088,  0.3922,  0.9517,  0.2820,  0.3715,  0.3355,
         0.5258,  0.7159,  0.2922,  0.4292,  0.5795,  0.1416,  0.6146,  0.2245,
         1.0913,  0.5287,  0.7426,  0.5480,  0.1952,  0.2943,  0.2670,  0.5028,
         0.2598,  0.2435,  0.34

distribution:  tensor([[0.0033, 0.8164, 0.0268, 0.1535]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0016, 0.0925, 0.0064, 0.8995]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0045, 0.5803, 0.0269, 0.3883]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0030, 0.9239, 0.0155, 0.0576]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0185, 0.7168, 0.0496, 0.2151]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0020, 0.0676, 0.0043, 0.9261]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0050, 0.2347, 0.0174, 0.7429]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0042, 0.1650, 0.0518, 0.7790]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0192, 0.7846, 0.0264, 0.1698]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0014, 0.2209, 0.0035, 0.7742]], grad_fn=<ExpBackward>)
distribution:  tensor([[5.5091e-04, 5.4420e-02, 6.1773e-03, 9.3885e-01]],
       grad_fn=<ExpBackward>)
distribution:  tensor([[0.0016, 0.6021, 0.0239, 0.3724]], grad_fn=<ExpBackward>)
distr

log_probs:  [tensor(-0.2028, grad_fn=<SelectBackward>), tensor(-0.1059, grad_fn=<SelectBackward>), tensor(-0.5442, grad_fn=<SelectBackward>), tensor(-0.0791, grad_fn=<SelectBackward>), tensor(-0.3330, grad_fn=<SelectBackward>), tensor(-0.0768, grad_fn=<SelectBackward>), tensor(-0.2971, grad_fn=<SelectBackward>), tensor(-1.8018, grad_fn=<SelectBackward>), tensor(-3.9526, grad_fn=<SelectBackward>), tensor(-0.2559, grad_fn=<SelectBackward>), tensor(-0.0631, grad_fn=<SelectBackward>), tensor(-0.5073, grad_fn=<SelectBackward>), tensor(-0.2551, grad_fn=<SelectBackward>), tensor(-0.8162, grad_fn=<SelectBackward>), tensor(-0.2974, grad_fn=<SelectBackward>), tensor(-2.2648, grad_fn=<SelectBackward>), tensor(-1.0483, grad_fn=<SelectBackward>), tensor(-0.2419, grad_fn=<SelectBackward>), tensor(-0.0298, grad_fn=<SelectBackward>), tensor(-0.5725, grad_fn=<SelectBackward>), tensor(-3.3193, grad_fn=<SelectBackward>), tensor(-0.5242, grad_fn=<SelectBackward>), tensor(-0.5378, grad_fn=<SelectBackward>)

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([0.5000, 0.2771, 0.3944, 0.5946, 0.4264, 0.4868, 0.3649, 0.5346, 0.5763,
        0.8304, 0.2899, 0.7460, 0.2952, 0.2950, 0.5990, 0.7806, 0.3564, 0.2777,
        0.6107, 0.5846, 0.4284, 0.3086, 0.3097, 0.5231, 0.5183, 0.7221, 0.3971,
        0.7014, 0.9777, 0.7623, 0.2563, 0.5925, 0.5499, 0.6113, 0.4268, 0.6667,
        1.0365, 0.5068, 0.3890, 0.5275, 0.3169, 0.6652, 0.7204, 0.6528, 0.8387,
        0.4201, 0.6545, 0.4366, 0.5177, 0.1632, 0.2375, 0.8678, 0.5186, 0.5357,
        0.7564, 0.7287, 0.3964, 0.5921, 0.5756, 0.3483, 0.3151, 0.2019, 0.5097,
        0.9189, 0.4352, 0.5530, 0.3486, 0.6507, 0.9032, 0.1763, 0.6912, 0.5733,
        0.1352, 0.5644, 0.6572, 0.4594, 0.3922, 0.7986, 0.5901, 0.5588, 0.6101,
        0.5522, 0.5604, 0.0293, 0.4523, 0.5801, 0.6463, 0.9080, 0.6175, 1.0564,
        0.8697, 0.6658, 0.4335,

distribution:  tensor([[0.0146, 0.7320, 0.0108, 0.2426]], grad_fn=<ExpBackward>)
n_step_rewards.shape:  (1,)
rewards.shape:  (1,)
n_step_rewards:  [10.]
rewards:  [10.]
bootstrap:  [False]
done.shape: (before n_steps) (1,)
done: (before n_steps) [ True]
done.shape: (after n_steps) (1,)
Gamma_V.shape:  (1,)
done: (after n_steps) [ True]
Gamma_V:  [0.99]
old_states.shape:  torch.Size([1, 1, 9, 9])
new_states.shape:  torch.Size([1, 1, 9, 9])
log_probs:  [tensor(-1.4163, grad_fn=<SelectBackward>)]
log_probs:  tensor([-1.4163], grad_fn=<StackBackward>)
distributions.shape:  torch.Size([1, 1, 4])
distributions:  tensor([[[0.0146, 0.7320, 0.0108, 0.2426]]], grad_fn=<StackBackward>)
Updating critic...
V_trg.shape (after critic):  torch.Size([])
V_trg.shape (after sum):  torch.Size([1])
V_trg.shape (after squeeze):  torch.Size([])
V_trg.shape (after squeeze):  tensor(10.)
V1.shape:  torch.Size([])
V1:  tensor([[1.8007]], grad_fn=<AddmmBackward>)
Updating actor...
V_trg.shape:  torch.Size([1])
V

distribution:  tensor([[1.8122e-04, 1.7409e-02, 5.0551e-04, 9.8190e-01]],
       grad_fn=<ExpBackward>)
distribution:  tensor([[0.0034, 0.1182, 0.0083, 0.8701]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0131, 0.1874, 0.0288, 0.7707]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0101, 0.3975, 0.0444, 0.5480]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0048, 0.4196, 0.0234, 0.5522]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0026, 0.2064, 0.0029, 0.7881]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0011, 0.2890, 0.0038, 0.7061]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0016, 0.1323, 0.0151, 0.8509]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0029, 0.3908, 0.0658, 0.5406]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0196, 0.1509, 0.0434, 0.7861]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0018, 0.5808, 0.0219, 0.3954]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0048, 0.8546, 0.0117, 0.1290]], grad_fn=<ExpBackward>)
distr

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([ 0.5738,  0.6104,  0.5511,  0.6906,  0.8225,  0.5154,  0.8526,  0.7682,
         0.5376,  0.6742,  0.9454,  0.6130,  0.6713,  1.0631,  0.7417,  0.8446,
         0.8679,  0.6169,  1.0472,  0.2958, -0.0696,  0.7297,  0.7050,  0.6661,
         0.9077,  1.2574,  0.5316,  0.4586,  0.7473,  0.2964,  0.4462,  0.7954,
         0.4869,  0.6856,  0.3689,  0.4118,  0.4903,  0.8901,  0.4911,  0.5354,
         1.1028,  0.6805,  0.8600,  0.6682,  0.6146,  0.7621,  0.6380,  0.8922,
         0.8848,  0.8053,  0.8377,  0.5395,  0.5027,  0.5657,  0.9564,  0.8364,
         0.1721,  0.2799,  0.9869,  0.2644,  0.2923,  0.7986,  0.4818,  0.4773,
         0.6395,  0.9427,  0.4739,  0.9172,  0.8055,  0.4096,  0.5598,  0.7220,
         0.5975,  0.4529,  1.1029,  0.8274,  1.0230,  1.0984,  0.7205,  0.4684,
         0.2042,  0.7883,  0.96

distribution:  tensor([[0.0037, 0.3403, 0.0047, 0.6513]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0026, 0.0464, 0.0038, 0.9472]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0014, 0.1026, 0.0095, 0.8865]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0098, 0.3318, 0.0335, 0.6249]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0015, 0.0534, 0.0042, 0.9409]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0192, 0.2662, 0.0579, 0.6567]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0035, 0.2827, 0.0633, 0.6506]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0078, 0.3565, 0.0318, 0.6039]], grad_fn=<ExpBackward>)
distribution:  tensor([[2.6220e-04, 1.5005e-02, 4.9477e-04, 9.8424e-01]],
       grad_fn=<ExpBackward>)
distribution:  tensor([[0.0082, 0.5478, 0.0139, 0.4300]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0100, 0.5620, 0.0433, 0.3847]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0048, 0.1266, 0.0016, 0.8670]], grad_fn=<ExpBackward>)
distr

distribution:  tensor([[0.0092, 0.1052, 0.0039, 0.8817]], grad_fn=<ExpBackward>)
distribution:  tensor([[0.0080, 0.0824, 0.0578, 0.8518]], grad_fn=<ExpBackward>)
n_step_rewards.shape:  (121,)
rewards.shape:  (121,)
n_step_rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
rewards:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
bootstrap:  [False

V_trg.shape (after critic):  torch.Size([121])
V_trg.shape (after sum):  torch.Size([121])
V_trg.shape (after squeeze):  torch.Size([121])
V_trg.shape (after squeeze):  tensor([0.7212, 0.7206, 1.1046, 0.8684, 0.9098, 0.6617, 0.4136, 0.1305, 0.4089,
        0.7743, 0.7112, 0.7051, 0.3978, 0.6012, 0.8845, 0.8841, 0.5931, 0.8899,
        0.6572, 0.5082, 0.6362, 1.2112, 0.6973, 0.7983, 0.5948, 0.8058, 1.0111,
        0.5564, 0.5529, 0.5691, 0.6187, 0.3627, 0.8247, 0.7986, 0.7298, 0.6381,
        0.8043, 0.4672, 0.9362, 1.0432, 0.8461, 0.8355, 0.7464, 0.7489, 0.9878,
        0.5901, 0.5978, 0.7857, 1.1877, 0.8350, 0.6563, 0.8072, 0.8075, 0.5594,
        0.7860, 1.1396, 0.3381, 1.1596, 0.8587, 0.8311, 0.7036, 0.6395, 0.9760,
        1.0463, 0.7925, 0.5898, 0.8675, 0.7353, 0.3403, 0.8559, 0.4525, 0.9463,
        0.6892, 0.8039, 0.4847, 0.6992, 0.4918, 1.1734, 0.9529, 1.2067, 1.0017,
        1.1834, 1.1458, 0.4891, 0.9409, 1.0515, 0.9805, 0.6454, 0.9403, 1.2870,
        0.8040, 0.9034, 0.9217,

KeyboardInterrupt: 

In [None]:
# Plot the moving average of the per-episode scores.
# `score` is expected to be defined by an earlier cell (not visible here);
# it only needs to support len() and slicing.
window = 100  # number of consecutive scores averaged into one plotted point

# Number of complete windows; clamp at 0 so a short run yields an empty curve
# instead of a negative range. The "+ 1" keeps the most recent window
# score[len(score)-window:], which would otherwise be dropped.
n_windows = max(len(score) - window + 1, 0)
average_score = np.array([np.mean(score[i:i+window]) for i in range(n_windows)])

# The x value of each point is the index just past the end of its window,
# so the first point (mean of score[0:window]) sits at x = window.
n_epochs = np.arange(window, window + n_windows)

plt.figure(figsize=(8,6))
plt.plot(n_epochs, average_score, alpha=0.9)
plt.title("Performance", fontsize=16)
plt.xlabel("Number of epochs", fontsize=16)
plt.ylabel("Total reward", fontsize=16)
plt.show()

In [None]:
save = False
keywords = ['relational', 'residual','unboxed_gem',str(len(control_score))+"-episodes","50-steps"] # example

if colab and save:
    %cd ~
    parent_dir = "/content/gdrive/My Drive/Colab Notebooks/"
    save_dir  = "RelationalTrained/"
    %cd "{parent_dir}"
    !mkdir "{save_dir}"
    ID = utils.save_session(save_dir, keywords, game_params, HPs, score)
    torch.save(trained_agent, save_dir+"agent_"+ID)