In [7]:
import numpy as np
import os
import random
import sys
import tensorflow as tf
from tensorflow.contrib.distributions import MultivariateNormalFullCovariance
import vrep
import keras
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import BatchNormalization, Dense, Input
from keras.callbacks import TensorBoard
from keras import backend as K
from common import *



In [None]:
K

## policyLoss(actions, advantage)
Idea:  
If advantage is small, probably need to change direction (positive action to negative and vice versa)
Reference:https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99

In [2]:
class GaussianPolicy:
    """
    self.action(6,) is sampled from self.policy, 
    which is a gaussian distribution with mean as the output from self.model
    self.model takes state (24,) and produces an action mean (6,)
    TODO: Constrain self.model output?
    """
    def __init__(self, cov=None, lr=0.05, epochs=1, batch_size=128, seed=0, model_file=None):
        if not model_file:
            self.model = Sequential([
                #Input(shape=(24,), dtype="float32", name="state_l"),
                Dense(256, input_shape=(24,), dtype="float32", kernel_initializer="random_normal", kernel_regularizer=keras.regularizers.l2(0.01), bias_initializer="zeros", activation="relu", name="hidden_l1"),
                BatchNormalization(),
                Dense(64, kernel_initializer="random_normal",  bias_initializer="zeros", kernel_regularizer=keras.regularizers.l2(0.01), activation="relu", name="hidden_l2"),
                BatchNormalization(),
                Dense(6, kernel_initializer="random_normal", kernel_regularizer=keras.regularizers.l2(0.01), bias_initializer="zeros", name="action_l")
            ])
        else:
            # load an existing model
            self.model = load_model(model_file)
        self.trainable_params = self.model.trainable_weights
        print(self.trainable_params)
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, 24))
        self.action_taken = tf.placeholder(dtype=tf.float32, shape=(None, 6))
        self.advantage = tf.placeholder(dtype=tf.float32, shape=(None, 1))
        self.action_mean = self.model(self.state)
        self.action_cov = tf.constant(np.eye(6).astype("float32")) if cov==None else tf.constant(cov)
        self.action_dist = MultivariateNormalFullCovariance(self.action_mean, self.action_cov)
        self.action_probability = self.action_dist.prob(self.action_taken)
        self.action_sample =self.action_dist.sample()
        #self.loglike = tf.log(self.action_probability)
        self.loglike = -0.5 * tf.matmul(tf.square(self.action_taken - self.action_mean), tf.matrix_inverse(self.action_cov))
        self.loss = -tf.reduce_mean(self.loglike * self.advantage)
        #self.loss = -tf.reduce_mean(self.loglike)
        self.params_updates = tf.train.GradientDescentOptimizer(lr).minimize(self.loss, var_list=self.trainable_params)
        
        self.epochs = epochs
        self.batch_size = batch_size
        self.seed = seed
        self.session = tf.Session()

        for param in self.trainable_params:         
            #print(param.initializer)
            self.session.run(param.initializer)
            self.session.run(tf.global_variables_initializer())
  
    def __del__(self):
        self.session.close()
    
    def sampleAction(self, state):
        return self.session.run(self.action_sample, feed_dict={self.state:state, K.learning_phase():0})

    def train(self, state, action_taken, advantage):
        # Gradient ascent
        #params_updates = []
        #self.session.run(tf.global_variables_initializer())
        #batch_size = self.session.run(tf.shape(self.loglike)[0], feed_dict={self.action_taken:action_taken, self.state:state})
        #print(batch_size)
        for train_step in range(self.epochs):
            np.random.seed(self.seed)
            np.random.shuffle(action_taken)
            np.random.seed(self.seed)
            np.random.shuffle(state)
            np.random.seed(self.seed)
            np.random.shuffle(advantage)
            
            self.session.run(self.params_updates, feed_dict={self.action_taken:action_taken, self.state:state, self.advantage:advantage
                                                            , K.learning_phase():1})
            print("== Epoch: %d =="%(train_step))
            #if train_step % 10 == 0:
            loglike_vec, loss, action_prob = self.session.run([self.loglike, self.loss, self.action_probability], feed_dict={self.action_taken:action_taken, self.state:state, self.advantage:advantage
                                                                                    , K.learning_phase():0})
            print("Log likelihood")
            print(loglike_vec)
            print("Loss")
            print(loss)
            #print("Action taken")
            #print(action_taken)
            print("Action prob")
            print(action_prob)
            print("Advantage")
            #print(advantage)

    def save(self, model_file):
        """
        Save the internal model to location model_file
        """
        self.model.save(model_file)

In [3]:
def getModel():
    """
    Model for transition dynamics, rewards and termination of episodes.
    Inputs: [PrevState(24,), Action(6,)](30,)
    Outputs: [NestState-PrevState(24,), Reward](25,)
    """
    prevState_action_l = Input(shape=(30,), dtype="float32", name="prevState_action_l")
    H_l = Dense(256, kernel_initializer="normal", activation="relu", name="hidden_l1")(prevState_action_l)
    H_l = BatchNormalization()(H_l)
    H_l = Dense(64, kernel_initializer="normal", activation="relu", name="hidden_l2")(H_l)
    H_l = BatchNormalization()(H_l)
    nextState_reward_l = Dense(25, kernel_initializer="normal", name="nextState_l")(H_l)
    #dest_l = Dense(1, kernel_initializer="normal", activation="sigmoid", name="dest_l")(H_l)
    model = Model(inputs=prevState_action_l, outputs=nextState_reward_l)
    model.compile(loss="mse", optimizer="rmsprop")
    return model

def getPolicy():
    return GaussianPolicy()


def getAdvantages(rewards, discount_factor):
    eps = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    running_discounted_advantages = 0
    for i in range(eps - 1, -1, -1):
        running_discounted_advantages = running_discounted_advantages * discount_factor + rewards[i]
        advantages[i] = running_discounted_advantages
    return advantages

In [4]:
class VREPPushTaskEnvironment():
    MAX_JOINT_VELOCITY = 1.0
    INITIAL_JOINT_POSITIONS = [np.pi, 1.5 * np.pi, 1.5 * np.pi, np.pi, np.pi, np.pi]
    INITIAL_CUBOID_POSITION = [0., 0.5, 0.05]
    
    def __init__(self):
        vrep.simxFinish(-1) # just in case, close all opened connections
        self.client_ID=vrep.simxStart('127.0.0.1',19997,True,True,5000,5) # Connect to V-REP
        
         # enable the synchronous mode on the client:
        vrep.simxSynchronous(self.client_ID,True)
        # start the simulation:
        vrep.simxStartSimulation(self.client_ID, vrep.simx_opmode_blocking)

        
    def __del__(self):
        # stop the simulation:
        vrep.simxStopSimulation(self.client_ID, vrep.simx_opmode_blocking)
        # before closing the connection to V-REP, make sure that the last command sent out had time to arrive. You can guarantee this with (for example):
        vrep.simxGetPingTime(self.client_ID)
        # disconnect
        vrep.simxFinish(self.client_ID)
        
        
    def __enter__(self):
        return self
        
    def __exit__(self, exception_type, exception_value, traceback):
        del self
        
    def reset(self, is_first_reset):
        """
        Reset the environment
        Return initial state
        
        """
        
        if not is_first_reset:
            # tear down datastreams
            for i in range(6):
                _, _ = vrep.simxGetObjectFloatParameter(self.client_ID, self.joint_handles[i], 2012, vrep.simx_opmode_discontinue)
                _, _ = vrep.simxGetJointPosition(self.client_ID, self.joint_handles[i],
                        vrep.simx_opmode_discontinue)
            _, _ = vrep.simxGetObjectPosition(self.client_ID, self.gripper_handle, -1, vrep.simx_opmode_discontinue)
            _, _ = vrep.simxGetObjectOrientation(self.client_ID, self.gripper_handle, -1, vrep.simx_opmode_discontinue)
            _, _ = vrep.simxGetObjectPosition(self.client_ID, self.cuboid_handle, -1, vrep.simx_opmode_discontinue)
            _, _ = vrep.simxGetObjectPosition(self.client_ID, self.target_plane_handle, -1, vrep.simx_opmode_discontinue)

            # remove Mico
            vrep.simxRemoveModel(self.client_ID, self.model_base_handle, vrep.simx_opmode_blocking)
        
        # get handles
        _, self.cuboid_handle = vrep.simxGetObjectHandle(self.client_ID, 'Cuboid', vrep.simx_opmode_blocking)
        _, self.target_plane_handle = vrep.simxGetObjectHandle(self.client_ID, 'TargetPlane', vrep.simx_opmode_blocking)
        
        _, self.model_base_handle = vrep.simxLoadModel(self.client_ID, 'models/robots/non-mobile/MicoRobot.ttm', 0, vrep.simx_opmode_blocking)
        self.joint_handles = [-1, -1, -1, -1, -1, -1]
        for i in range(6):
            _, self.joint_handles[i] = vrep.simxGetObjectHandle(self.client_ID, 'Mico_joint' + str(i+1), vrep.simx_opmode_blocking)
        _, self.gripper_handle = vrep.simxGetObjectHandle(self.client_ID, 'MicoHand', vrep.simx_opmode_blocking)

        # initialise mico joint positions, cuboid orientation and cuboid position
        vrep.simxPauseCommunication(self.client_ID, 1)
        for i in range(6):
            vrep.simxSetJointPosition(self.client_ID, self.joint_handles[i], VREPPushTaskEnvironment.INITIAL_JOINT_POSITIONS[i], vrep.simx_opmode_oneshot)
        vrep.simxSetObjectOrientation(self.client_ID, self.cuboid_handle, -1, [0, 0, 0], vrep.simx_opmode_oneshot)
        vrep.simxSetObjectPosition(self.client_ID, self.cuboid_handle, -1, VREPPushTaskEnvironment.INITIAL_CUBOID_POSITION, vrep.simx_opmode_oneshot)
        vrep.simxPauseCommunication(self.client_ID, 0)
        vrep.simxGetPingTime(self.client_ID)

        current_vel = np.array([0, 0, 0, 0, 0, 0], dtype='float')
        joint_angles = np.array([0, 0, 0, 0, 0, 0], dtype='float')

        # set up datastreams
        for i in range(6):
            _, current_vel[i] = vrep.simxGetObjectFloatParameter(self.client_ID, self.joint_handles[i], 2012,
                    vrep.simx_opmode_streaming)
            _, joint_angles[i] = vrep.simxGetJointPosition(self.client_ID, self.joint_handles[i], vrep.simx_opmode_streaming)
        _, gripper_pos = vrep.simxGetObjectPosition(self.client_ID, self.gripper_handle, -1, vrep.simx_opmode_streaming)
        _, gripper_orient = vrep.simxGetObjectOrientation(self.client_ID, self.gripper_handle, -1, vrep.simx_opmode_streaming)
        _, cuboid_pos = vrep.simxGetObjectPosition(self.client_ID, self.cuboid_handle, -1, vrep.simx_opmode_streaming)
        _, target_plane_pos = vrep.simxGetObjectPosition(self.client_ID, self.target_plane_handle, -1, vrep.simx_opmode_streaming)

        # destroy dummy arrays for setting up the datastream
        del current_vel, joint_angles, gripper_pos, gripper_orient, cuboid_pos, target_plane_pos

        # obtain first state
        current_state = getCurrentState(self.client_ID, self.joint_handles, self.gripper_handle, self.cuboid_handle,
                self.target_plane_handle)
        
        self.state = current_state
        
        return current_state
    
    def getRewards(self, state):
        """
            Return the sum of the Euclidean distance between gripper and cuboid and the Euclidean distance between cuboid and targetPlane.
        """
        return -np.sqrt(np.sum(np.square(state[-6:-3]))) + np.sqrt(np.sum(np.square(state[-3:])))

    
    def getCurrentState(self, client_ID, joint_handles, gripper_handle, cuboid_handle, target_plane_handle):
        """
            TODO: Refactor away arguments
            Return the state as an array of shape (24, )
            [current_vel, joint_angles, gripper_pos, gripper_orient, cuboid_gripper_vec, target_plane_cuboid_vec]
             6              6               3           3               3                   3
        """
        current_vel = np.array([0, 0, 0, 0, 0, 0], dtype='float')
        joint_angles = np.array([0, 0, 0, 0, 0, 0], dtype='float')
        # obtain first state
        for i in range(6):
            ret, current_vel[i] = vrep.simxGetObjectFloatParameter(client_ID, joint_handles[i], 2012,
                    vrep.simx_opmode_buffer)
            while ret != vrep.simx_return_ok:
                ret, current_vel[i] = vrep.simxGetObjectFloatParameter(client_ID, joint_handles[i], 2012,
                        vrep.simx_opmode_buffer)
            ret, joint_angles[i] = vrep.simxGetJointPosition(client_ID, joint_handles[i], vrep.simx_opmode_buffer)
            while ret != vrep.simx_return_ok:
                ret, joint_angles[i] = vrep.simxGetJointPosition(client_ID, joint_handles[i], vrep.simx_opmode_buffer)
        ret, gripper_pos = vrep.simxGetObjectPosition(client_ID, gripper_handle, -1, vrep.simx_opmode_buffer)
        while ret != vrep.simx_return_ok:
            ret, gripper_pos = vrep.simxGetObjectPosition(client_ID, gripper_handle, -1, vrep.simx_opmode_buffer)
        ret, gripper_orient = vrep.simxGetObjectOrientation(client_ID, gripper_handle, -1, vrep.simx_opmode_buffer)
        while ret != vrep.simx_return_ok:
            ret, gripper_orient = vrep.simxGetObjectOrientation(client_ID, gripper_handle, -1, vrep.simx_opmode_buffer)
        gripper_pos = np.array(gripper_pos)
        gripper_orient = np.array(gripper_orient)

        ret, cuboid_pos = vrep.simxGetObjectPosition(client_ID, cuboid_handle, -1, vrep.simx_opmode_buffer)
        while ret != vrep.simx_return_ok:
            ret, cuboid_pos = vrep.simxGetObjectPosition(client_ID, cuboid_handle, -1, vrep.simx_opmode_buffer)
        cuboid_pos = np.array(cuboid_pos)

        ret, target_plane_pos = vrep.simxGetObjectPosition(client_ID, target_plane_handle, -1, vrep.simx_opmode_buffer)
        while ret != vrep.simx_return_ok:
            ret, target_plane_pos = vrep.simxGetObjectPosition(client_ID, target_plane_handle, -1, vrep.simx_opmode_buffer)
        target_plane_pos = np.array(target_plane_pos)

        cuboid_gripper_vec = cuboid_pos - gripper_pos
        target_plane_cuboid_vec = target_plane_pos - cuboid_pos

        return np.concatenate([current_vel, joint_angles, gripper_pos, gripper_orient, cuboid_gripper_vec,
            target_plane_cuboid_vec])

    
    def step(self, actions):
        """
        Execute sequences of actions (None, 6) in the environment
        Return sequences of subsequent states and rewards
        """
        next_states = []
        rewards = []
        for i in range(actions.shape[0]):
            current_vel = self.state[:6] + actions[i, :]
            vrep.simxPauseCommunication(self.client_ID, 1)
            for i in range(6):
                vrep.simxSetJointTargetVelocity(self.client_ID, self.joint_handles[i], current_vel[i],
                        vrep.simx_opmode_oneshot)
            vrep.simxPauseCommunication(self.client_ID, 0)
            vrep.simxSynchronousTrigger(self.client_ID)
            vrep.simxSynchronousTrigger(self.client_ID)
            vrep.simxSynchronousTrigger(self.client_ID)
            vrep.simxSynchronousTrigger(self.client_ID)
            vrep.simxSynchronousTrigger(self.client_ID)
            # make sure all commands are exeucted
            vrep.simxGetPingTime(self.client_ID)
            # obtain next state
            next_state = getCurrentState(self.client_ID, self.joint_handles, self.gripper_handle, self.cuboid_handle,
                    self.target_plane_handle)
            next_states.append(next_state)
            rewards.append(self.getRewards(next_state))

        next_states = np.concatenate(next_states)
        rewards = np.array(rewards)
        return next_states, rewards
    
        

In [None]:
a = np.array([])
print(a is not None)

In [18]:
GREEDY_FACTOR = 0.9 # Fraction of times to choose greedy actions TODO: Increase over time

EPS_LENGTH = 200
NUM_EPS = 150

NEW_MODEL = False
MODEL_FILE = "model/PolicyGradient/model01.h5"

NEW_POLICY = True
POLICY_FILE = "model/PolicyGradient/policy02.h5"
SEED = 10

MONITOR_LOG_FILE = "" 

DISCOUNT_FACTOR = 0.9

EPSILON = 0.1

BATCH_SIZE = 128
EPOCHS = 20

random.seed(SEED)

with VREPPushTaskEnvironment() as env:
    model = getModel() if NEW_MODEL else load_model(MODEL_FILE)
    policy = getPolicy() if NEW_POLICY else GaussianPolicy(model_file=POLICY_FILE)
    # obtain first state
    current_state = env.reset(True)
    Xs_mean = None
    Xs_std = None
    ys_mean = None
    ys_std = None
    
    policy_Xs_mean = None
    policy_Xs_std = None
    policy_ys_mean = None
    policy_ys_std = None
    policy_rs_mean = None
    policy_rs_std = None
    for i in range(NUM_EPS):
        print("{}th episode".format(i))
        states = [current_state[np.newaxis, :]]
        actions = []
        rewards = []
        GREEDY_FACTOR = 0.5 if i < 100 else 0.9
        # collect trajectory
        for step in range(EPS_LENGTH):
            # select greedy action according to GREEDY_FACTOR
            if (not NEW_POLICY) or (np.random.rand() < GREEDY_FACTOR and policy_Xs_mean is not None):
                #action = policy.sampleAction(invStandardise(current_state[np.newaxis, :], policy_Xs_mean, policy_Xs_std))
                #action = invStandardise(action, policy_ys_mean, policy_ys_std)
                action = policy.sampleAction(current_state[np.newaxis, :])
            else:
                action = generateRandomVel(MAX_JOINT_VELOCITY)[np.newaxis, :]
            next_state, reward = env.step(action)
           
            actions.append(action)
            states.append(next_state[np.newaxis, :])
            rewards.append(reward)
            # proceed to next state
            current_state = next_state[:]
            
        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        rewards = np.array(rewards)
        advantages = getAdvantages(rewards, DISCOUNT_FACTOR)
        if NEW_MODEL:
            # X = [current_states(,24), actions(,6)]
            # y = [next_states - current_states(,24), rewards]
            X = np.concatenate([states[:-1, :], actions], axis=1)
            y = np.concatenate([states[1:, :] - states[:-1, :], rewards], axis=1)
            # standardise training data
            if Xs_mean is None:
                Xs_mean = np.mean(X, axis=0)
                Xs_std = np.std(X, axis=0)
                ys_mean = np.mean(y, axis=0)
                ys_std = np.std(y, axis=0)
            X = standardise(X, Xs_mean, Xs_std)
            y = standardise(y, ys_mean, ys_std)    
            # add zero-mean gaussian noise
            X += np.random.normal(0, 0.05, X.shape)
            y += np.random.normal(0, 0.05, y.shape)

            # train model
            model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)
        if NEW_POLICY:
            policy_X = states[:-1, :]
            policy_y = actions[:]
            # standardise training data
            """
            if policy_Xs_mean is None:
                policy_Xs_mean = np.mean(policy_X, 0)
                policy_Xs_std = np.std(policy_X, 0)
                policy_ys_mean = np.mean(policy_y, 0)
                policy_ys_std = np.std(policy_y, 0)
                #policy_rs_mean = np.mean(advantages, 0)
                #policy_rs_std = np.std(advantages, 0)
            policy_X = standardise(policy_X, policy_Xs_mean, policy_Xs_std)
            policy_y = standardise(policy_y, policy_ys_mean, policy_ys_std)
            """
            advantages = standardise(advantages, np.mean(advantages), np.std(advantages))

            # train policy
            policy.train(policy_X, policy_y, advantages)
        # Reset the scene
        env.reset(False)
        
    if NEW_MODEL:
        model.save(MODEL_FILE)
    if NEW_POLICY:
        policy.save(POLICY_FILE)

[<tf.Variable 'hidden_l1_11/kernel:0' shape=(24, 256) dtype=float32_ref>, <tf.Variable 'hidden_l1_11/bias:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'batch_normalization_7/gamma:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'batch_normalization_7/beta:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'hidden_l2_11/kernel:0' shape=(256, 64) dtype=float32_ref>, <tf.Variable 'hidden_l2_11/bias:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'batch_normalization_8/gamma:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'batch_normalization_8/beta:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'action_l_5/kernel:0' shape=(64, 6) dtype=float32_ref>, <tf.Variable 'action_l_5/bias:0' shape=(6,) dtype=float32_ref>]
0th episode
== Epoch: 0 ==
Log likelihood
[[ -6.33027077e-01  -1.97710702e-03  -9.12820280e-04  -1.19018387e-02
   -2.30341451e-03  -1.77757849e-03]
 [ -7.82773718e-02  -1.52097018e-02  -3.51491421e-01  -1.96412988e-02
   -3.09578717e-01  -7.14047253e-02]
 [ -5.47033429e-01  -

3th episode
== Epoch: 0 ==
Log likelihood
[[ -1.50495660e-04  -4.23022360e-01  -1.30974531e-01  -2.67477274e-01
   -3.05957973e-01  -6.65186495e-02]
 [ -3.43124479e-01  -1.30678266e-01  -3.91113520e-01  -1.82469022e-02
   -5.04511548e-03  -1.28339007e-02]
 [ -1.98206216e-01  -1.12211946e-02  -2.99180895e-01  -1.95213124e-01
   -2.25535035e-01  -2.17760690e-02]
 ..., 
 [ -1.14028171e-01  -2.87600979e-02  -4.02431279e-01  -4.74252993e-05
   -1.58505049e-02  -6.29887059e-02]
 [ -5.18703982e-02  -1.94632933e-02  -2.52746999e-01  -1.01929568e-01
   -1.39753968e-01  -5.92812411e-02]
 [ -1.80697843e-01  -2.30904624e-01  -8.70618150e-02  -8.19935277e-02
   -4.02981648e-03  -8.43116716e-02]]
Loss
-0.00724819
Action prob
[ 0.00122143  0.00163735  0.00155736  0.00185632  0.00250291  0.00092623
  0.00050621  0.00281833  0.00322584  0.00124615  0.00108931  0.00191659
  0.00159795  0.00166095  0.00162758  0.00133     0.00124676  0.00243821
  0.00125279  0.0011813   0.00103122  0.00093753  0.00146214

6th episode
== Epoch: 0 ==
Log likelihood
[[ -5.27155120e-03  -4.28037882e-01  -1.14686258e-01  -3.19952846e-01
   -3.95744979e-01  -1.92456618e-02]
 [ -9.56076533e-02  -1.29657768e-04  -2.79143639e-03  -1.84620082e-01
   -2.69749284e-01  -3.17330053e-03]
 [ -5.04233360e-01  -9.57325697e-02  -2.33398393e-01  -2.82373396e-04
   -7.52566010e-03  -4.92095910e-02]
 ..., 
 [ -4.28712964e-02  -2.28297576e-01  -2.73725122e-01  -8.94449011e-04
   -2.42123902e-01  -9.50598717e-03]
 [ -1.56355351e-01  -3.09206367e-01  -1.18970042e-02  -1.48361688e-02
   -5.08029275e-02  -3.49392325e-01]
 [ -1.10710740e-01  -1.63330068e-03  -1.09432258e-01  -6.19193278e-02
   -8.49197153e-03  -1.30430430e-01]]
Loss
0.00188604
Action prob
[ 0.0011176   0.00231186  0.0016549   0.00163238  0.00302306  0.00092705
  0.00163084  0.00101911  0.00197793  0.00222261  0.00134897  0.00108197
  0.00177855  0.00175274  0.00084929  0.00152377  0.00230955  0.00095031
  0.00151352  0.00116558  0.00230396  0.00110363  0.00107941 

9th episode
== Epoch: 0 ==
Log likelihood
[[ -6.00259960e-01  -4.52634096e-01  -4.90378216e-02  -1.52093530e-01
   -1.09377911e-03  -5.12631610e-02]
 [ -3.88525754e-01  -1.49171308e-01  -1.31873563e-01  -4.42938507e-01
   -3.99555504e-01  -3.85340393e-01]
 [ -5.80338717e-01  -1.53860211e-01  -1.04308985e-02  -2.32498899e-01
   -7.83169735e-03  -3.09820443e-01]
 ..., 
 [ -7.05734715e-02  -5.76423816e-02  -6.50157928e-02  -7.38709559e-06
   -6.11620490e-03  -1.37909979e-01]
 [ -3.68856796e-04  -1.12715364e-01  -2.47852817e-01  -4.08452600e-02
   -2.71805584e-01  -5.31699993e-02]
 [ -3.80714715e-01  -2.49191970e-02  -2.64729917e-01  -1.37924567e-01
   -6.31700456e-02  -7.44094653e-03]]
Loss
0.00328661
Action prob
[ 0.00109171  0.00060454  0.00110445  0.00106909  0.00121144  0.00104934
  0.00211355  0.00128001  0.00157752  0.00101827  0.00173878  0.00120449
  0.00134565  0.00161239  0.00140606  0.00078115  0.00135974  0.00165742
  0.00114162  0.00181872  0.00110167  0.00140396  0.00254125 

12th episode
== Epoch: 0 ==
Log likelihood
[[ -2.23718147e-04  -1.42538212e-02  -5.37263528e-02  -2.12185588e-02
   -1.09091967e-01  -1.17943145e-01]
 [ -2.50016093e-01  -4.76217508e-01  -8.52967948e-02  -2.84777023e-02
   -2.76316613e-01  -1.08007744e-01]
 [ -3.51430506e-01  -2.68773198e-01  -2.95720133e-03  -2.00234890e-01
   -3.52225080e-02  -3.99917364e-01]
 ..., 
 [ -4.79972363e-01  -5.35405567e-03  -3.02313119e-01  -3.38039994e-01
   -4.15116847e-01  -6.69898018e-02]
 [ -6.77547709e-04  -1.51287613e-03  -2.60620147e-01  -4.31755781e-02
   -2.27115765e-01  -1.06399141e-01]
 [ -3.87238397e-04  -7.63851628e-02  -2.14460909e-01  -3.54887396e-01
   -1.78151876e-01  -2.71531474e-02]]
Loss
-0.00170474
Action prob
[ 0.00293782  0.00118506  0.00114521  0.00122264  0.0017565   0.00099398
  0.00193187  0.00204914  0.00056199  0.00093463  0.00167106  0.0012566
  0.0020011   0.00133866  0.00133774  0.00184537  0.00212454  0.00246786
  0.00187244  0.00150874  0.00272743  0.00157276  0.00110464

15th episode
== Epoch: 0 ==
Log likelihood
[[-0.2827943  -0.01931789 -0.10379293 -0.10056666 -0.47336322 -0.05807465]
 [-0.28526163 -0.24153239 -0.43293014 -0.10216781 -0.17192048 -0.09854978]
 [-0.21639505 -0.4267635  -0.34965813 -0.29233786 -0.00113209 -0.41973811]
 ..., 
 [-0.11712513 -0.0118594  -0.16404356 -0.41779459 -0.61728549 -0.2868512 ]
 [-0.15872827 -0.23679653 -0.15223867 -0.14511691 -0.40761003 -0.01294957]
 [-0.06828479 -0.03989597 -0.10569062 -0.32725555 -0.01778696 -0.00101564]]
Loss
0.00150582
Action prob
[ 0.00142791  0.00106371  0.00073205  0.00145552  0.00238201  0.00146382
  0.00155127  0.00183278  0.00144298  0.0021773   0.00200308  0.00093989
  0.00144871  0.0020216   0.00213252  0.00139884  0.00069921  0.00161538
  0.00110201  0.00138308  0.00142894  0.0014372   0.00384321  0.00172504
  0.00172393  0.00142636  0.00152778  0.00233932  0.00167468  0.00071741
  0.00218165  0.00111566  0.00078706  0.00200196  0.00149282  0.00097947
  0.0015432   0.00189869  0.00184

18th episode
== Epoch: 0 ==
Log likelihood
[[-0.00244876 -0.27014965 -0.15815739 -0.21771938 -0.13692614 -0.04734083]
 [-0.10932175 -0.00238439 -0.14178598 -0.04026662 -0.09568062 -0.0292917 ]
 [-0.19725314 -0.3125684  -0.15007864 -0.04344784 -0.13267101 -0.25571406]
 ..., 
 [-0.54445302 -0.01904266 -0.50939846 -0.10967664 -0.09402507 -0.36704281]
 [-0.55076253 -0.00173089 -0.21819338 -0.46154305 -0.02138245 -0.52041876]
 [-0.04516328 -0.01205064 -0.01229249 -0.30730715 -0.01364887 -0.05136154]]
Loss
-0.00338689
Action prob
[ 0.00175309  0.00265221  0.00135309  0.00215666  0.00061152  0.00082133
  0.0025072   0.0022349   0.00142749  0.0015108   0.00228666  0.00168362
  0.00256121  0.00170639  0.00155401  0.00268407  0.00140815  0.00126639
  0.0009431   0.00078944  0.00229779  0.00166651  0.00088419  0.00269517
  0.00077351  0.0021831   0.00198323  0.0011266   0.00144851  0.00119428
  0.00196728  0.0012124   0.00070372  0.00114107  0.0017138   0.00204769
  0.00161093  0.00104048  0.0024

21th episode
== Epoch: 0 ==
Log likelihood
[[ -2.29215458e-01  -2.03206196e-01  -1.34360307e-04  -4.43971574e-01
   -4.60530102e-01  -2.88208835e-02]
 [ -7.56158009e-02  -3.55396513e-03  -5.42286634e-01  -3.67399633e-01
   -2.32729942e-01  -6.04911661e-03]
 [ -5.35880141e-02  -7.79657951e-03  -2.41633859e-02  -2.13684827e-01
   -1.92324117e-01  -2.34425869e-02]
 ..., 
 [ -1.05981510e-02  -2.92368352e-01  -6.11851476e-02  -1.69562306e-02
   -3.96402210e-01  -9.10898894e-02]
 [ -5.79913966e-02  -2.43305024e-02  -7.08397925e-01  -1.17223002e-01
   -8.81245732e-02  -3.85523409e-01]
 [ -1.76451102e-01  -1.28970027e-01  -2.65826762e-01  -1.13914765e-01
   -1.34377018e-01  -2.87092149e-01]]
Loss
0.000225662
Action prob
[ 0.00102865  0.00118115  0.00240879  0.00133245  0.00163376  0.00310605
  0.00231126  0.00176944  0.00197978  0.00107639  0.00104166  0.00225456
  0.0006634   0.00129129  0.00164483  0.00280132  0.00109969  0.0019352
  0.00115058  0.00126369  0.00119208  0.00182964  0.00160569

24th episode
== Epoch: 0 ==
Log likelihood
[[ -9.48546678e-02  -3.94191891e-01  -1.63385589e-02  -4.53091860e-01
   -3.76453221e-01  -1.60154685e-01]
 [ -4.77048814e-01  -1.37382835e-01  -3.10356259e-01  -3.04101199e-01
   -3.64209935e-02  -1.15475580e-02]
 [ -3.08686793e-01  -1.51087806e-01  -3.39655399e-01  -2.82579362e-01
   -2.50445426e-01  -2.95319315e-02]
 ..., 
 [ -2.37381130e-01  -3.88367509e-04  -1.83336556e-01  -2.29291208e-02
   -1.43010691e-02  -4.73813474e-01]
 [ -6.75370336e-01  -5.11516690e-01  -2.38190014e-02  -1.79761991e-01
   -2.27271006e-01  -3.53767246e-01]
 [ -3.06961880e-07  -9.80281308e-02  -1.31464586e-01  -8.78640935e-02
   -1.22299036e-02  -4.05614913e-01]]
Loss
0.000560873
Action prob
[ 0.00090397  0.00112442  0.00103266  0.00114385  0.00123746  0.00104428
  0.00276749  0.00203651  0.00154452  0.00155208  0.00276083  0.00086855
  0.00166851  0.00100598  0.00192854  0.00096955  0.00213965  0.00143788
  0.00116116  0.00117673  0.00159079  0.00176778  0.0023010

27th episode
== Epoch: 0 ==
Log likelihood
[[ -2.68971175e-02  -2.01778382e-01  -8.46328028e-03  -7.68583640e-02
   -1.80366620e-01  -1.01144239e-03]
 [ -1.26265585e-01  -1.12047695e-01  -7.52012851e-03  -4.28155124e-01
   -2.43384801e-02  -3.81264626e-03]
 [ -1.75902054e-01  -1.58152670e-01  -3.56037438e-01  -1.86335444e-02
   -3.70142572e-02  -2.29403600e-01]
 ..., 
 [ -6.61108717e-02  -2.80590206e-01  -2.14308836e-02  -3.92147228e-02
   -3.00079197e-01  -2.47283340e-01]
 [ -2.13211849e-01  -2.31500827e-02  -6.13347143e-02  -1.75743952e-01
   -6.29722774e-02  -2.30890736e-01]
 [ -2.33263537e-01  -2.96180010e-01  -1.40040254e-04  -2.94686466e-01
   -8.48050714e-02  -2.19095170e-01]]
Loss
-0.00462801
Action prob
[ 0.00245653  0.00199768  0.00152041  0.00113526  0.00085667  0.00123233
  0.00152126  0.00106349  0.00170147  0.00175012  0.00293463  0.00305617
  0.00261513  0.00107127  0.00134551  0.00110765  0.00187692  0.00145542
  0.00132117  0.00160213  0.00150935  0.00172547  0.0027501

30th episode
== Epoch: 0 ==
Log likelihood
[[ -1.32904336e-01  -5.44994533e-01  -6.06011860e-02  -6.63268147e-03
   -1.90934986e-01  -2.79955238e-01]
 [ -2.44062707e-01  -1.53341070e-01  -2.26294428e-01  -9.28126872e-02
   -5.14136612e-01  -3.83133799e-01]
 [ -3.90972376e-01  -1.96795817e-02  -3.36487591e-01  -4.80682706e-04
   -1.06209531e-01  -2.13803500e-01]
 ..., 
 [ -4.81709000e-03  -3.24567524e-03  -2.01560259e-01  -3.74826379e-02
   -1.37352720e-01  -1.39807731e-01]
 [ -8.90943259e-02  -3.52785051e-01  -1.73043251e-01  -2.09692881e-01
   -2.35250983e-02  -6.29169419e-02]
 [ -2.89992422e-01  -1.23597898e-01  -5.20708740e-01  -1.45188533e-02
   -4.40823101e-02  -9.64504783e-04]]
Loss
0.00316765
Action prob
[ 0.00119495  0.00080279  0.0013861   0.00082877  0.00114059  0.00185188
  0.00090393  0.00105856  0.00206269  0.00152067  0.00174203  0.001387
  0.00123389  0.00208449  0.00220257  0.00187822  0.00245684  0.00090979
  0.00120934  0.00180259  0.00176757  0.00213174  0.0010422   

33th episode
== Epoch: 0 ==
Log likelihood
[[ -3.97228599e-01  -3.94286186e-01  -2.55694151e-01  -4.15999532e-01
   -3.75959367e-01  -2.31458828e-01]
 [ -2.37095281e-01  -1.83256835e-01  -1.30815879e-01  -1.85830176e-01
   -3.77612948e-01  -2.57365219e-03]
 [ -6.67871302e-03  -3.47419381e-01  -6.60725176e-01  -8.93902034e-02
   -9.25605819e-02  -1.93777476e-02]
 ..., 
 [ -2.93517504e-02  -2.34568492e-03  -2.88457215e-01  -4.44724739e-01
   -2.87984461e-01  -1.02094769e-01]
 [ -7.29612634e-02  -1.13801360e-01  -2.58644558e-02  -2.01956965e-02
   -1.72806293e-04  -3.40209484e-01]
 [ -7.46282935e-02  -6.58031330e-02  -4.19607572e-02  -5.79254217e-02
   -1.09931827e-02  -2.60620534e-01]]
Loss
-0.000225302
Action prob
[ 0.00050839  0.00131909  0.00119479  0.00040292  0.00118564  0.00079494
  0.0008737   0.00326932  0.00115923  0.00165532  0.00140342  0.00138435
  0.00068329  0.00181123  0.00301811  0.0017878   0.00066979  0.00213237
  0.00083267  0.00094662  0.00114685  0.00233578  0.000737

36th episode
== Epoch: 0 ==
Log likelihood
[[-0.48687732 -0.00915726 -0.2392288  -0.0108025  -0.38963807 -0.46069476]
 [-0.06097033 -0.38719186 -0.34277478 -0.07573365 -0.42724255 -0.12825474]
 [-0.00965869 -0.01921169 -0.06663944 -0.5174163  -0.04912871 -0.00335521]
 ..., 
 [-0.73987585 -0.4800328  -0.06461366 -0.1477873  -0.01233462 -0.39839575]
 [-0.341061   -0.10809378 -0.14949721 -0.2793791  -0.27135059 -0.21852359]
 [-0.65893477 -0.47524861 -0.02153875 -0.26145697 -0.05833233 -0.02560936]]
Loss
0.00103284
Action prob
[ 0.00081687  0.00097235  0.00207241  0.00139062  0.00275789  0.00167238
  0.00088029  0.00048094  0.00147256  0.00178035  0.00124266  0.00141249
  0.0006988   0.00096483  0.00150488  0.00142335  0.00086467  0.00136404
  0.00098249  0.00212241  0.00148971  0.00230491  0.00178641  0.0014604
  0.00204077  0.00151319  0.001432    0.00101852  0.00161818  0.00079733
  0.00100701  0.00169063  0.00129457  0.00214682  0.00117487  0.00066737
  0.00131347  0.00227065  0.002967

39th episode
== Epoch: 0 ==
Log likelihood
[[ -8.41542482e-02  -2.77922928e-01  -9.67503861e-02  -1.88822404e-01
   -2.11087525e-01  -1.01347223e-01]
 [ -7.35133290e-05  -2.32497200e-01  -1.60377398e-01  -2.11210623e-01
   -2.00959638e-01  -3.27273726e-01]
 [ -2.77291145e-03  -4.14314002e-01  -4.39965248e-01  -7.13008195e-02
   -1.32446721e-01  -5.32897189e-03]
 ..., 
 [ -4.22631279e-02  -3.68398689e-02  -6.06938362e-01  -4.82035041e-01
   -3.37965786e-02  -6.17902279e-02]
 [ -5.21475337e-02  -7.73060089e-03  -3.19338739e-01  -1.97232570e-02
   -3.35792229e-02  -3.82576168e-01]
 [ -9.53646451e-02  -6.16025962e-02  -6.70653284e-02  -4.19989713e-02
   -4.83159674e-04  -3.96778256e-01]]
Loss
-0.00935242
Action prob
[ 0.00154348  0.00129918  0.00138818  0.00190429  0.00241898  0.00133862
  0.00096875  0.0011501   0.00167255  0.00094215  0.00191884  0.0015599
  0.00089059  0.00298279  0.00179634  0.00084275  0.00244042  0.00175114
  0.00076565  0.00139257  0.001109    0.00189524  0.0011207 

42th episode
== Epoch: 0 ==
Log likelihood
[[ -1.29825976e-02  -2.25522388e-02  -1.69697210e-01  -2.13930041e-01
   -7.81628788e-02  -4.29637015e-01]
 [ -7.60913864e-02  -1.07090481e-01  -1.79365754e-01  -8.48119110e-02
   -1.41006023e-01  -2.36998703e-02]
 [ -6.50402129e-01  -3.28929236e-05  -3.93534750e-02  -3.44169945e-01
   -7.50481412e-02  -6.89996919e-03]
 ..., 
 [ -6.00781641e-04  -4.83932309e-02  -8.85450304e-01  -3.53931375e-02
   -1.81738231e-02  -4.32663888e-01]
 [ -1.57720402e-01  -3.85468036e-01  -1.76779300e-01  -3.22085589e-01
   -7.98959583e-02  -3.46990734e-01]
 [ -6.08654469e-02  -3.44192572e-02  -1.33908212e-01  -4.57300186e-01
   -9.47632045e-02  -4.53306377e-01]]
Loss
-0.00788298
Action prob
[ 0.00159546  0.00218597  0.00132077  0.00095904  0.00114862  0.0010163
  0.00204824  0.00166235  0.00163617  0.0019308   0.00153742  0.00138474
  0.00083977  0.00080424  0.00139234  0.00158902  0.00135141  0.00155273
  0.00132812  0.00221932  0.00211184  0.00218744  0.00293857

45th episode
== Epoch: 0 ==
Log likelihood
[[ -1.69810295e-01  -1.60093963e-01  -1.74887508e-01  -5.97936625e-04
   -3.37794781e-01  -4.60209310e-01]
 [ -2.05914244e-01  -2.56637245e-01  -3.41815829e-01  -4.72659290e-01
   -3.14323783e-01  -1.74080476e-01]
 [ -2.68660724e-01  -2.86936522e-01  -1.21899033e-02  -1.10377430e-03
   -4.78569716e-01  -6.08640304e-03]
 ..., 
 [ -2.65081853e-01  -2.11800281e-02  -2.77595013e-01  -4.02467735e-02
   -9.88011062e-02  -1.17813386e-02]
 [ -2.56590247e-01  -1.68278906e-03  -3.29327844e-02  -2.68570274e-01
   -1.71986744e-01  -1.47791870e-03]
 [ -3.59626502e-01  -2.14497104e-01  -2.36224130e-01  -3.69249642e-01
   -1.03652738e-02  -2.17061213e-04]]
Loss
-0.00780256
Action prob
[ 0.00109497  0.00068983  0.00140576  0.00146346  0.00104706  0.00148684
  0.00141385  0.00083971  0.0013414   0.00190395  0.00152861  0.00178239
  0.00180725  0.00179938  0.00224949  0.00157313  0.00143216  0.00174023
  0.00225763  0.00124229  0.00091719  0.00073637  0.0033976

48th episode
== Epoch: 0 ==
Log likelihood
[[ -7.89037999e-03  -7.97952339e-03  -7.73757875e-01  -1.47599503e-01
   -2.95854151e-01  -3.63489017e-02]
 [ -3.74518007e-01  -7.11175613e-04  -2.45358407e-01  -8.89801532e-02
   -3.22842449e-02  -1.16705619e-01]
 [ -1.18369549e-04  -4.47353631e-01  -1.97565466e-01  -8.65548477e-02
   -3.11007649e-01  -2.22678438e-01]
 ..., 
 [ -3.68031919e-01  -4.80982959e-01  -2.66543757e-02  -3.30807924e-01
   -3.58538963e-02  -1.99476719e-01]
 [ -7.98394158e-02  -3.04086417e-01  -1.27819572e-02  -2.32530758e-02
   -1.21679805e-01  -1.26970932e-01]
 [ -2.14074120e-01  -4.74029183e-01  -5.69657655e-03  -2.88143039e-01
   -1.76187053e-01  -2.87315130e-01]]
Loss
-0.00628561
Action prob
[ 0.0011328   0.00170842  0.00113751  0.00071133  0.00063766  0.00118549
  0.00114634  0.0010412   0.00187471  0.00072192  0.00055488  0.00277038
  0.00158555  0.00056767  0.00184207  0.00219882  0.00119469  0.00183454
  0.00087127  0.00128237  0.00363107  0.00152459  0.0022293

51th episode
== Epoch: 0 ==
Log likelihood
[[ -6.74737990e-03  -1.57229215e-01  -1.56385586e-01  -1.26910862e-02
   -4.03915495e-02  -9.97899398e-02]
 [ -7.21519232e-01  -6.27484798e-01  -1.70825756e+00  -3.27580065e-01
   -3.95442605e-01  -5.46285436e-02]
 [ -3.37070107e-01  -1.54290140e-01  -1.02190232e+00  -1.14216611e-01
   -1.33524609e+00  -4.45157349e-01]
 ..., 
 [ -2.25362018e-01  -2.82321591e-02  -4.44604665e-01  -6.30157769e-01
   -2.25941967e-02  -1.11656249e-01]
 [ -7.34085217e-04  -7.78682390e-03  -3.46923143e-01  -2.54228041e-02
   -2.19157159e-01  -7.36902729e-02]
 [ -2.91905235e-05  -3.74969505e-02  -3.63944978e-01  -4.31109779e-02
   -2.72792205e-03  -7.21614733e-02]]
Loss
-0.102045
Action prob
[  2.51152366e-03   8.70921285e-05   1.33485999e-04   8.75150494e-04
   9.46018205e-04   1.00049400e-03   9.68494103e-04   2.17298348e-05
   1.68402586e-03   3.05430149e-04   2.78262375e-03   1.26258435e-03
   1.09591521e-03   3.34817101e-04   1.39344460e-03   1.03329855e-03
   1

53th episode
== Epoch: 0 ==
Log likelihood
[[ -2.32390031e-01  -3.85225564e-02  -6.00736178e-02  -2.40893453e-01
   -4.46129829e-01  -2.84037620e-01]
 [ -5.03182830e-03  -2.79735208e-01  -9.24811438e-02  -2.30721176e-01
   -1.40612257e+00  -1.97733045e-01]
 [ -3.03354040e-02  -1.13691324e-02  -6.30684793e-02  -2.21352279e-02
   -2.01947674e-01  -9.33497027e-02]
 ..., 
 [ -5.32518566e-01  -7.74764980e-04  -6.71172261e-01  -1.66098159e-02
   -9.82179940e-02  -1.07900262e-01]
 [ -1.70040801e-02  -1.68314263e-01  -4.16565649e-02  -5.22142172e-01
   -5.80877326e-02  -3.34125012e-01]
 [ -3.50635529e-01  -2.34987717e-02  -1.73662472e+00  -2.25581273e-01
   -1.62204672e-02  -3.28189552e-01]]
Loss
0.00824943
Action prob
[  1.09644933e-03   4.41445562e-04   2.64301011e-03   6.77047763e-04
   2.09010002e-04   2.99794192e-04   1.79154449e-03   1.14347728e-03
   5.38645254e-04   1.51657348e-03   2.09447718e-03   8.07790202e-04
   1.71138893e-03   7.86247372e-04   1.17860676e-03   2.11913930e-03
   

55th episode
== Epoch: 0 ==
Log likelihood
[[ -7.61632696e-02  -1.79560576e-02  -9.42613125e-01  -4.52460080e-01
   -9.52802181e-01  -2.84322514e-03]
 [ -9.66437280e-01  -8.92944634e-02  -1.32433844e+00  -5.45372348e-03
   -1.24045029e-01  -2.82035261e-01]
 [ -1.09617241e-01  -5.91099799e-01  -7.10290968e-01  -1.55889258e-01
   -1.52794912e-01  -1.86877009e-02]
 ..., 
 [ -5.48704445e-01  -3.20552140e-01  -3.53028655e+00  -2.77529061e-02
   -1.67018461e+00  -2.34069094e-01]
 [ -1.43456161e-01  -2.38384604e-01  -2.70139861e+00  -1.11270383e-01
   -1.37583923e+00  -1.04793139e-01]
 [ -5.26368443e-04  -2.63897125e-02  -2.96793747e+00  -4.14606668e-02
   -1.71630073e+00  -2.92980624e-03]]
Loss
-0.0191039
Action prob
[  3.49688053e-04   2.47219170e-04   7.08747597e-04   1.10022760e-04
   6.30332943e-05   4.69765582e-05   1.35896835e-05   8.49629985e-04
   3.81380698e-04   2.98393745e-04   1.67705766e-05   2.65116862e-04
   2.22608505e-04   3.31556403e-05   3.92141490e-04   1.23829186e-05
   

57th episode
== Epoch: 0 ==
Log likelihood
[[ -2.48982042e-01  -4.87016421e-03  -5.43738365e-01  -1.10684417e-01
   -3.67141366e-02  -2.98638165e-01]
 [ -6.01653346e-06  -1.97095826e-01  -6.10999882e-01  -7.70965144e-02
   -1.38248608e-04  -2.45026410e-01]
 [ -6.53123796e-01  -2.83244520e-01  -2.80581445e-01  -2.31171474e-01
   -7.57164121e-01  -1.58163592e-01]
 ..., 
 [ -1.14350624e-01  -7.80715328e-03  -8.53980407e-02  -1.77673846e-02
   -1.04820561e+00  -2.17658584e-04]
 [ -6.34935796e-02  -4.55416411e-01  -1.46919692e+00  -1.74007043e-01
   -6.03313148e-01  -3.48176181e-01]
 [ -1.17797315e-01  -4.01072651e-01  -3.06760162e-01  -9.14295483e-03
   -8.72616351e-01  -4.43559021e-01]]
Loss
-0.121405
Action prob
[  1.16241153e-03   1.30181725e-03   3.79339093e-04   8.13994091e-04
   3.62367748e-04   9.51261085e-04   1.34857232e-03   1.43343746e-03
   1.53573218e-03   3.30104725e-04   6.13671378e-04   1.92781226e-04
   1.51181477e-03   2.00616126e-03   2.03044433e-03   1.87492828e-04
   6

59th episode
== Epoch: 0 ==
Log likelihood
[[ -2.01752806e+00  -2.58084953e-01  -2.02387600e+01  -1.08761156e+00
   -9.24947739e+00  -7.54091740e-01]
 [ -7.10451007e-02  -5.52588254e-02  -3.33836466e-01  -5.77642992e-02
   -1.20664001e-01  -1.35855004e-02]
 [ -9.50874209e-01  -2.80610383e-01  -4.67362672e-01  -4.13784990e-03
   -1.10434353e-01  -3.11891019e-01]
 ..., 
 [ -1.22108653e-01  -4.15782370e-02  -8.17284882e-01  -1.89330086e-01
   -7.94891715e-01  -3.86570871e-01]
 [ -1.14832491e-01  -1.96745336e-01  -1.44433622e+01  -5.90761423e-01
   -7.37364483e+00  -3.38890165e-01]
 [ -7.29677752e-02  -6.04740195e-02  -2.61796975e+00  -2.88112909e-02
   -7.69114733e-01  -9.48441401e-02]]
Loss
-1.68777
Action prob
[  1.02507133e-17   2.10006861e-03   4.81337658e-04   4.73854161e-05
   1.91877334e-04   6.50742848e-04   1.37043215e-04   2.26217944e-05
   1.18974996e-04   5.35878702e-04   7.68370914e-19   6.77695934e-05
   5.24869392e-05   4.42764285e-04   2.97271763e-04   6.98909207e-05
   1.

61th episode
== Epoch: 0 ==
Log likelihood
[[ -2.38556080e-04  -8.10795277e-02  -4.98231459e+00  -9.11339998e-01
   -1.63354003e+00  -1.72709137e-01]
 [ -1.12262266e-02  -1.21539958e-01  -9.62715912e+00  -7.80586898e-02
   -3.52084637e+00  -3.25661674e-02]
 [ -3.13304812e-02  -1.56878456e-01  -7.42290449e+00  -4.38844301e-02
   -7.02312768e-01  -3.01668316e-01]
 ..., 
 [ -5.59639871e-01  -3.22505534e-01  -2.19542813e+00  -2.43155971e-01
   -4.46214020e-01  -4.02563885e-02]
 [ -3.45401049e-01  -2.79062986e-01  -2.90992785e+00  -1.49291689e-02
   -1.08831298e+00  -1.02745630e-01]
 [ -1.28854048e+00  -6.63494289e-01  -7.35199642e+00  -2.39392474e-01
   -6.99316406e+00  -5.80488667e-02]]
Loss
-0.321214
Action prob
[  1.68313409e-06   6.16099349e-09   6.99703037e-07   3.85757448e-06
   8.40564951e-09   2.42219321e-05   6.64418985e-05   5.06476319e-11
   3.00005759e-05   1.12757175e-06   3.47658715e-06   4.65244653e-07
   3.35030741e-08   2.69942257e-06   8.52539160e-05   1.08889444e-05
   1

63th episode
== Epoch: 0 ==
Log likelihood
[[ -7.36078387e-03  -2.97371787e-03  -1.18036232e+01  -3.53896350e-01
   -3.53952289e+00  -1.91501938e-02]
 [ -1.29587075e-04  -9.02524590e-03  -1.31794751e+00  -7.83676744e-01
   -2.19812274e+00  -4.43905085e-01]
 [ -6.77938992e-03  -1.17602549e-01  -1.60134244e+00  -1.86928451e-01
   -1.96742654e+00  -2.17750226e-03]
 ..., 
 [ -1.57674979e-02  -4.44618851e-01  -4.39223719e+00  -9.55742300e-02
   -1.67840207e+00  -6.52334765e-02]
 [ -3.38041377e+00  -1.51325643e-01  -6.09555283e+01  -2.42858195e+00
   -2.10959682e+01  -5.58353543e-01]
 [ -6.49555922e-01  -6.74612224e-02  -1.64390612e+00  -2.85831660e-01
   -3.19343321e-02  -1.64902536e-03]]
Loss
-3.62454
Action prob
[  5.96371008e-10   3.47810492e-05   8.30649515e-05   1.10634564e-05
   7.25285572e-05   5.60892522e-06   8.94467078e-09   1.51151762e-05
   6.41089673e-06   2.63436785e-04   1.60445462e-30   1.45660499e-06
   8.72668519e-04   5.97924110e-04   2.67362429e-05   2.29776870e-05
   1.

65th episode
== Epoch: 0 ==
Log likelihood
[[ -9.11212899e-03  -5.10164583e-03  -8.88493061e-01  -3.83937925e-01
   -1.53293476e-01  -2.62553729e-02]
 [ -4.88869190e-01  -3.10800150e-02  -4.52072525e+00  -5.93788505e-01
   -2.85910666e-01  -7.22167501e-03]
 [ -5.44049690e-05  -3.62250248e-05  -4.17960024e+00  -5.23265414e-02
   -9.00673985e-01  -2.08226424e-02]
 ..., 
 [ -1.94158442e-02  -4.31030780e-01  -1.06653607e+00  -7.48430099e-03
   -8.27416033e-02  -1.79703131e-01]
 [ -7.61450171e-01  -5.18075526e-02  -2.35702634e+00  -4.95067388e-02
   -6.03457212e-01  -8.86321738e-02]
 [ -4.60622728e-01  -2.87066191e-01  -1.42687643e+00  -8.18271413e-02
   -1.32611990e+00  -4.09343001e-03]]
Loss
-0.0962022
Action prob
[  9.30466282e-04   1.07433179e-05   2.32979473e-05   1.80602205e-04
   1.79780491e-05   1.25544306e-04   9.85455034e-12   3.58462530e-05
   4.42246470e-04   7.30306571e-09   3.00248968e-04   3.46736010e-06
   2.57503070e-05   8.69449490e-09   5.07531979e-04   1.34094362e-03
   

67th episode
== Epoch: 0 ==
Log likelihood
[[ -1.00432272e+01  -8.95754158e-01  -2.30245270e+02  -8.45676422e+00
   -9.31555939e+01  -1.04119710e-03]
 [ -1.07757540e+01  -1.04990184e+00  -2.82084259e+02  -3.52685428e+00
   -9.33872223e+01  -4.70957547e-01]
 [ -3.41580343e+00  -7.99259520e-04  -1.46806824e+02  -6.06101990e+00
   -5.23622551e+01  -7.98301220e-01]
 ..., 
 [ -4.51676178e+00  -7.27464557e-02  -1.46593948e+02  -2.25016928e+00
   -4.80442390e+01  -1.05969235e-02]
 [ -6.89576197e+00  -1.61159897e+00  -3.07284088e+02  -8.16165924e+00
   -1.19241829e+02  -1.44277477e+00]
 [ -1.08748446e+01  -1.11361063e+00  -3.39839905e+02  -5.59149599e+00
   -1.25508400e+02  -1.60086250e+00]]
Loss
-11.4186
Action prob
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   1.66338756e-22   0.00000000e+00   8.78432579e-35
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   5.64628810e-24   0.00000000e+00   0.00000000e+00
   0.

69th episode
== Epoch: 0 ==
Log likelihood
[[ -2.48694744e+01  -1.12454283e+00  -7.01692505e+02  -2.12775593e+01
   -2.40028015e+02  -2.36622620e+00]
 [ -5.07897091e+00  -7.99752414e-01  -1.40214523e+02  -4.98580122e+00
   -3.80701714e+01  -7.26350024e-02]
 [ -1.99148865e+01  -1.04930950e-02  -4.50763245e+02  -1.23153315e+01
   -1.82704849e+02  -1.21216461e-01]
 ..., 
 [ -3.13172417e+01  -2.47503281e-01  -8.96184509e+02  -1.67828522e+01
   -3.63041443e+02  -9.03400958e-01]
 [ -1.85434494e+01  -8.36654305e-01  -5.41558167e+02  -1.75675869e+01
   -2.28923386e+02  -9.80510890e-01]
 [ -2.10326920e+01  -4.23085570e-01  -5.45739807e+02  -1.61292782e+01
   -2.00097473e+02  -1.88234317e+00]]
Loss
-10.2304
Action prob
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   2.98853908e-22   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.

71th episode
== Epoch: 0 ==
Log likelihood
[[ -1.84700272e+02  -6.83699703e+00  -5.02599365e+03  -1.26411697e+02
   -1.98342200e+03  -2.74435449e+00]
 [ -2.56441315e+02  -1.05327225e+01  -7.25079736e+03  -1.86194412e+02
   -2.89004321e+03  -8.98664761e+00]
 [ -1.33391685e+01  -1.41438786e-02  -2.07147858e+02  -6.25319195e+00
   -7.01795044e+01  -1.16115379e+00]
 ..., 
 [ -2.26375183e+02  -5.07243872e+00  -5.46303320e+03  -1.52520142e+02
   -2.18823828e+03  -4.89983702e+00]
 [ -2.19038651e+02  -5.48383427e+00  -5.58048975e+03  -1.41038223e+02
   -2.29480640e+03  -3.53013515e+00]
 [ -2.58844635e+02  -9.47690201e+00  -6.53057129e+03  -1.76758255e+02
   -2.55890234e+03  -4.85710669e+00]]
Loss
-308.708
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

75th episode
== Epoch: 0 ==
Log likelihood
[[ -3.27640038e+01  -2.08113998e-01  -7.83051819e+02  -1.95892677e+01
   -3.07287903e+02  -8.23462069e-01]
 [ -5.52202721e+01  -3.90348554e-01  -1.43654456e+03  -3.48001785e+01
   -5.45933411e+02  -9.31450307e-01]
 [ -1.14659721e+02  -4.37703991e+00  -2.99406152e+03  -8.93131104e+01
   -1.21876135e+03  -2.06774378e+00]
 ..., 
 [ -2.23644135e+02  -6.28317308e+00  -6.22480762e+03  -1.57590942e+02
   -2.42394824e+03  -7.93794966e+00]
 [ -2.92104473e+01  -1.87263143e+00  -7.37924011e+02  -2.07811756e+01
   -2.75251770e+02  -2.45224223e-01]
 [ -5.28550606e+01  -1.76703155e-01  -1.17921118e+03  -2.76794357e+01
   -4.65489594e+02  -2.78432202e+00]]
Loss
-188.327
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

81th episode
== Epoch: 0 ==
Log likelihood
[[ -1.35487495e+14  -2.66237379e+12  -3.43134145e+15  -9.10527782e+13
   -1.29829506e+15  -4.03180670e+12]
 [ -3.08227489e+14  -9.77125402e+12  -7.99571567e+15  -2.06845508e+14
   -3.03296739e+15  -1.05430877e+13]
 [ -3.81209889e+14  -1.26978118e+13  -9.91585757e+15  -2.54591954e+14
   -3.76251349e+15  -1.35196983e+13]
 ..., 
 [ -7.83850909e+13  -1.55285193e+12  -1.99428055e+15  -5.50548705e+13
   -7.52914612e+14  -1.87134653e+12]
 [ -2.42882028e+14  -7.38796031e+12  -6.28933429e+15  -1.63939539e+14
   -2.38574642e+15  -8.05271031e+12]
 [ -1.31528743e+14  -3.27794111e+12  -3.37510261e+15  -9.06377099e+13
   -1.27708155e+15  -3.71152152e+12]]
Loss
1.57468e+14
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

87th episode
== Epoch: 0 ==
Log likelihood
[[ -4.50914456e+14  -1.00511326e+13  -1.14199981e+16  -2.83350435e+14
   -4.29820409e+15  -1.48655865e+13]
 [ -6.96518873e+14  -1.74158313e+13  -1.77574403e+16  -4.36964671e+14
   -6.68921346e+15  -2.41099122e+13]
 [ -4.33737305e+14  -1.14078610e+13  -1.11298043e+16  -2.77668461e+14
   -4.18302032e+15  -1.45118640e+13]
 ..., 
 [ -1.04236373e+15  -3.08492317e+13  -2.69597288e+16  -6.60656164e+14
   -1.01554577e+16  -3.79312553e+13]
 [ -5.80102174e+14  -1.37057723e+13  -1.47440773e+16  -3.64119174e+14
   -5.55318226e+15  -1.96775395e+13]
 [ -7.20297422e+14  -1.80635231e+13  -1.83679924e+16  -4.51778181e+14
   -6.92011090e+15  -2.50006544e+13]]
Loss
-2.42175e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

93th episode
== Epoch: 0 ==
Log likelihood
[[ -4.69800719e+13  -4.35666387e+11  -1.15920281e+15  -3.29840150e+13
   -4.31603646e+14  -1.14978377e+12]
 [ -2.22364571e+13  -7.40508877e+10  -5.29170707e+14  -1.53658673e+13
   -1.94493635e+14  -4.01611129e+11]
 [ -2.84770727e+14  -9.57388056e+12  -7.71717147e+15  -2.05282962e+14
   -2.89663896e+15  -1.07961658e+13]
 ..., 
 [ -1.19367937e+14  -3.21903447e+12  -3.15641576e+15  -8.57657741e+13
   -1.17986000e+15  -3.84230805e+12]
 [ -6.40207783e+13  -7.88425277e+11  -1.60204629e+15  -4.49998781e+13
   -5.98112716e+14  -1.69622713e+12]
 [ -1.74201284e+13  -3.84989635e+10  -4.09105634e+14  -1.19524756e+13
   -1.48725574e+14  -2.70159905e+11]]
Loss
5.93509e+12
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

99th episode
== Epoch: 0 ==
Log likelihood
[[ -2.10828083e+14  -7.20878122e+12  -5.79392967e+15  -1.52116987e+14
   -2.15636685e+15  -7.60543760e+12]
 [ -2.51776972e+14  -8.96858902e+12  -6.95522763e+15  -1.82423334e+14
   -2.59170057e+15  -9.33056899e+12]
 [ -8.62387406e+13  -2.18089416e+12  -2.28748294e+15  -6.18924123e+13
   -8.47874460e+14  -2.50442049e+12]
 ..., 
 [ -1.36013404e+13  -7.13631744e+10  -3.27616616e+14  -9.90589852e+12
   -1.18957751e+14  -1.12501735e+11]
 [ -8.40458746e+13  -2.14092284e+12  -2.22987481e+15  -6.03661261e+13
   -8.26154810e+14  -2.42780917e+12]
 [ -1.31423197e+14  -4.03301020e+12  -3.56267940e+15  -9.47018562e+13
   -1.32340062e+15  -4.34640231e+12]]
Loss
2.05203e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

105th episode
== Epoch: 0 ==
Log likelihood
[[ -4.35899712e+13  -6.42377253e+11  -1.15613232e+15  -3.13788528e+13
   -4.29343688e+14  -1.24691376e+12]
 [ -7.91892732e+13  -1.83433626e+12  -2.18307067e+15  -5.73707387e+13
   -8.14410692e+14  -2.70952143e+12]
 [ -1.10830188e+14  -4.01666159e+12  -3.16943327e+15  -8.32928964e+13
   -1.17960579e+15  -4.06742604e+12]
 ..., 
 [ -9.74135241e+13  -3.34391909e+12  -2.76646655e+15  -7.30355414e+13
   -1.02896746e+15  -3.44306981e+12]
 [ -5.36175077e+13  -1.51314589e+12  -1.48439479e+15  -3.99144691e+13
   -5.49770779e+14  -1.61241026e+12]
 [ -1.65092346e+13  -4.79048663e+10  -4.02275495e+14  -1.15532303e+13
   -1.47123685e+14  -2.99635737e+11]]
Loss
-4.13637e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

111th episode
== Epoch: 0 ==
Log likelihood
[[ -6.90697850e+12  -2.17478881e+10  -1.32357932e+14  -3.96274722e+12
   -4.63568361e+13  -5.29503478e+10]
 [ -2.36235428e+13  -5.68009728e+10  -5.42090103e+14  -1.51004633e+13
   -1.95990229e+14  -4.92553306e+11]
 [ -1.09988568e+14  -2.07634760e+12  -2.91854490e+15  -7.49400909e+13
   -1.07067112e+15  -4.22654928e+12]
 ..., 
 [ -1.18791908e+14  -2.34370603e+12  -3.17108522e+15  -8.11179568e+13
   -1.16381011e+15  -4.68956794e+12]
 [ -1.48969347e+13  -4.18076186e+09  -3.24815290e+14  -9.31326539e+12
   -1.16329768e+14  -2.62254412e+11]
 [ -6.45988457e+12  -2.52542833e+10  -1.22726241e+14  -3.70459515e+12
   -4.27979776e+13  -4.50671862e+10]]
Loss
-2.17396e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

117th episode
== Epoch: 0 ==
Log likelihood
[[ -1.71661234e+13  -1.11013659e+11  -4.28379501e+14  -1.17765371e+13
   -1.53457856e+14  -3.88066574e+11]
 [ -1.54842085e+13  -1.65335747e+10  -3.69897817e+14  -1.01628489e+13
   -1.33218662e+14  -3.65250544e+11]
 [ -4.16430686e+12  -9.15421389e+09  -8.41828102e+13  -2.58190606e+12
   -2.93265253e+13  -3.58906716e+10]
 ..., 
 [ -9.58884689e+12  -9.51237600e+07  -2.15120970e+14  -6.13293071e+12
   -7.71294505e+13  -1.71518067e+11]
 [ -4.91090661e+12  -1.38702694e+09  -1.07532551e+14  -3.39917092e+12
   -3.83262120e+13  -1.26174751e+10]
 [ -9.42839693e+12  -1.31167184e+08  -2.12304730e+14  -6.11251598e+12
   -7.61606670e+13  -1.56415197e+11]]
Loss
1.0373e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

123th episode
== Epoch: 0 ==
Log likelihood
[[ -4.17215094e+13  -6.76490052e+11  -1.10406423e+15  -2.80353718e+13
   -3.95424788e+14  -1.59282915e+12]
 [ -3.95212615e+13  -5.67424713e+11  -1.02907403e+15  -2.64408819e+13
   -3.68184494e+14  -1.37506626e+12]
 [ -7.18982978e+13  -1.42510378e+12  -1.95576637e+15  -4.86500089e+13
   -7.02055891e+14  -3.20278757e+12]
 ..., 
 [ -6.52879363e+13  -1.24375623e+12  -1.76715745e+15  -4.40665700e+13
   -6.33669341e+14  -2.81770721e+12]
 [ -2.13836984e+13  -1.75362343e+11  -5.21786819e+14  -1.41759412e+13
   -1.86537929e+14  -5.11913624e+11]
 [ -6.95545123e+13  -1.33325403e+12  -1.87942346e+15  -4.70584929e+13
   -6.74228530e+14  -2.97687869e+12]]
Loss
2.29882e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

129th episode
== Epoch: 0 ==
Log likelihood
[[ -5.96163900e+12  -1.38526423e+10  -1.14448086e+14  -3.31408946e+12
   -3.96485418e+13  -5.68596767e+10]
 [ -4.37636531e+13  -7.41053170e+11  -1.17570113e+15  -2.96624010e+13
   -4.20199367e+14  -1.77795983e+12]
 [ -2.35808385e+13  -2.44035731e+11  -5.92497181e+14  -1.57323698e+13
   -2.11194112e+14  -6.71506039e+11]
 ..., 
 [ -7.83198873e+12  -1.14122202e+09  -1.66222163e+14  -4.98320749e+12
   -5.84545720e+13  -6.85573980e+10]
 [ -6.04492687e+12  -3.53266425e+06  -1.24855471e+14  -3.83595839e+12
   -4.37902660e+13  -2.41980948e+10]
 [ -1.00042696e+13  -9.92878285e+09  -2.19245783e+14  -6.28147146e+12
   -7.77866141e+13  -1.39250696e+11]]
Loss
-1.58959e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

135th episode
== Epoch: 0 ==
Log likelihood
[[ -4.95280277e+13  -6.45309334e+11  -1.43751482e+15  -3.60551473e+13
   -5.22354292e+14  -2.11480071e+12]
 [ -8.96833002e+12  -2.19448832e+08  -2.06067212e+14  -5.77309088e+12
   -7.39853499e+13  -1.59635587e+11]
 [ -4.30729645e+13  -4.82222211e+11  -1.23472619e+15  -3.10834857e+13
   -4.48734995e+14  -1.77138880e+12]
 ..., 
 [ -5.85320114e+13  -8.80033923e+11  -1.72264307e+15  -4.29824347e+13
   -6.26099797e+14  -2.62813123e+12]
 [ -3.59639925e+13  -3.16887958e+11  -1.00820344e+15  -2.56290157e+13
   -3.66238170e+14  -1.34915254e+12]
 [ -1.42805282e+13  -1.51809434e+10  -3.54451370e+14  -9.67036423e+12
   -1.27936716e+14  -3.49280600e+11]]
Loss
1.67042e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

141th episode
== Epoch: 0 ==
Log likelihood
[[ -9.10599672e+13  -2.46206326e+12  -2.74016847e+15  -6.73608284e+13
   -9.86447553e+14  -4.99253667e+12]
 [ -7.17035038e+12  -7.40430592e+09  -1.56221097e+14  -4.58415040e+12
   -5.53145608e+13  -6.25153761e+10]
 [ -5.42005915e+13  -7.86700698e+11  -1.54008924e+15  -3.82056425e+13
   -5.56097568e+14  -2.61600497e+12]
 ..., 
 [ -8.59043288e+13  -1.73467173e+12  -2.54444546e+15  -6.21503998e+13
   -9.18431176e+14  -4.72700682e+12]
 [ -5.52055844e+13  -8.22697656e+11  -1.57245880e+15  -3.90022792e+13
   -5.67761055e+14  -2.67987399e+12]
 [ -4.58959199e+13  -9.94698199e+11  -1.32199295e+15  -3.32613382e+13
   -4.74655157e+14  -2.07881175e+12]]
Loss
1.80037e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

147th episode
== Epoch: 0 ==
Log likelihood
[[ -3.38623830e+12  -2.03594813e+10  -5.84877699e+13  -1.65764819e+12
   -2.02481787e+13  -1.64817644e+10]
 [ -3.16891139e+13  -2.92454007e+11  -8.74876315e+14  -2.15675159e+13
   -3.15942425e+14  -1.32563691e+12]
 [ -9.78419303e+13  -2.72790087e+12  -3.04063659e+15  -7.26366966e+13
   -1.09418553e+15  -5.79974935e+12]
 ..., 
 [ -1.45442524e+13  -1.37270542e+11  -3.71586880e+14  -9.84906885e+12
   -1.33199275e+14  -3.49439492e+11]
 [ -1.37146160e+13  -1.13774215e+11  -3.45649841e+14  -9.21965796e+12
   -1.23988693e+14  -3.12132993e+11]
 [ -2.65132588e+13  -1.84274813e+11  -7.12927661e+14  -1.77109834e+13
   -2.57518807e+14  -1.01314586e+12]]
Loss
-1.11359e+13
Action prob
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [9]:
env = VREPPushTaskEnvironment()
states = env.reset(True)

In [16]:
del model
del policy
del env

In [17]:
del env

NameError: name 'env' is not defined

In [None]:
K.learning_phase()

In [None]:
states

In [None]:
import time
for _ in range(200):
    states = env.step(generateRandomVel(2)[np.newaxis, :])
    print(states)
    #time.sleep(0.05)