In [2]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float32')
from collections import deque
from datetime import datetime
import random
import matplotlib
from environment import Environment
from plots import just_plot
from misc import *
from nets import *
from buffer import ReplayBuffer
import timeit

amplitude=0.4
dolinar_layers=2
number_phases=2
total_episodes = 10**3
buffer_size=500
batch_size=64
ep_guess=0.01
noise_displacement=0.5
lr_actor=0.01
lr_critic=0.001
tau=0.005


exper = np.load("example_buffer/2_sample.npy")
env = Environment(amplitude=amplitude, dolinar_layers = dolinar_layers, number_phases=number_phases)
# buffer = ReplayBuffer(buffer_size=buffer_size)

critic = Critic(nature="primary",valreg=0.01, dolinar_layers = dolinar_layers, number_phases=number_phases)
critic_target = Critic(nature="target", dolinar_layers = dolinar_layers, number_phases=number_phases)
actor = Actor(nature="primary", dolinar_layers = dolinar_layers)
actor_target = Actor(nature="target", dolinar_layers = dolinar_layers)

optimizer_critic = tf.keras.optimizers.Adam(lr=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(lr=lr_actor)

policy_evaluator = PolicyEvaluator(amplitude = amplitude, dolinar_layers=dolinar_layers, number_phases = number_phases)

#
experiences = exper.astype(np.float32)
targeted_experience = actor_target.process_sequence_of_experiences_tf(experiences)
sequences, zeroed_rews = critic_target.process_sequence_tf(targeted_experience)


In [3]:
@tf.function
def give_td_error_Kennedy_guess_tf(critic,sequences,zeroed_rews):
    if critic.nature != "target":
        raise AttributeError("I'm not the target!")

    final_rews = tf.reshape(zeroed_rews[:,-1], (sequences.shape[0],1,1))
    bellman_tds_noguess = critic(sequences)[:,1:-1,:]

    phases = tf.range(critic.number_phases, dtype=np.float32)/critic.number_phases

    unstacked = tf.unstack(tf.convert_to_tensor(sequences))
    phases_concs = {}
    for ph in range(critic.number_phases):
        phases_concs[str(ph)] = []
    stacked = {}

    for episode in unstacked:
        prefinal = episode[:-1]
        for ph in range(critic.number_phases):
            final = tf.expand_dims(tf.stack([tf.unstack(episode[-1])[0], phases[ph]], axis=0), axis=0)
            phases_concs[str(ph)].append(tf.concat([prefinal, final], axis=0))
    #
        for ph in range(critic.number_phases):
            stacked[str(ph)] = tf.stack(phases_concs[str(ph)], axis=0)

    all_preds = tf.concat([critic(stacked[str(ph)]) for ph in range(critic.number_phases)], axis=2)
    maxs = tf.math.reduce_max(all_preds,axis=2)[:,-1]
    bellman_td = tf.concat([tf.reshape(bellman_tds_noguess,(sequences.shape[0],critic.dolinar_layers-1)), tf.reshape(maxs,(sequences.shape[0],1))], axis=1)
    return tf.concat([bellman_td, tf.reshape(zeroed_rews[:,-1], (sequences.shape[0],1))], axis=1)

In [4]:
def give_td_error_Kennedy_guess(critic,sequences,zeroed_rews):
    if critic.nature != "target":
        raise AttributeError("I'm not the target!")

    final_rews = tf.reshape(zeroed_rews[:,-1], (sequences.shape[0],1,1))
    bellman_tds_noguess = critic(sequences)[:,1:-1,:]

    phases = tf.range(critic.number_phases, dtype=np.float32)/critic.number_phases

    unstacked = tf.unstack(tf.convert_to_tensor(sequences))
    phases_concs = {}
    for ph in range(critic.number_phases):
        phases_concs[str(ph)] = []
    stacked = {}

    for episode in unstacked:
        prefinal = episode[:-1]
        for ph in range(critic.number_phases):
            final = tf.expand_dims(tf.stack([tf.unstack(episode[-1])[0], phases[ph]], axis=0), axis=0)
            phases_concs[str(ph)].append(tf.concat([prefinal, final], axis=0))
    #
        for ph in range(critic.number_phases):
            stacked[str(ph)] = tf.stack(phases_concs[str(ph)], axis=0)

    all_preds = tf.concat([critic(stacked[str(ph)]) for ph in range(critic.number_phases)], axis=2)
    maxs = tf.math.reduce_max(all_preds,axis=2)[:,-1]
    bellman_td = tf.concat([tf.reshape(bellman_tds_noguess,(sequences.shape[0],critic.dolinar_layers-1)), tf.reshape(maxs,(sequences.shape[0],1))], axis=1)
    return tf.concat([bellman_td, tf.reshape(zeroed_rews[:,-1], (sequences.shape[0],1))], axis=1)

In [None]:
%timeit give_td_error_Kennedy_guess_tf(critic_target,sequences,zeroed_rews)

In [None]:
%timeit give_td_error_Kennedy_guess(critic_target,sequences,zeroed_rews)

In [5]:
labels_critic = give_td_error_Kennedy_guess_tf(critic_target,sequences,zeroed_rews)

In [6]:
@tf.function
def step_critic_tf(labels_critic, critic):
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        preds_critic = critic(sequences)
        loss_critic = tf.keras.losses.MSE(labels_critic, preds_critic)
        loss_critic = tf.reduce_mean(loss_critic)
        grads = tape.gradient(loss_critic, critic.trainable_variables)
        optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
        return tf.squeeze(loss_critic)
    
def step_critic(labels_critic, critic):
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        preds_critic = critic(sequences)
        loss_critic = tf.keras.losses.MSE(labels_critic, preds_critic)
        loss_critic = tf.reduce_mean(loss_critic)
        grads = tape.gradient(loss_critic, critic.trainable_variables)
        optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
        return tf.squeeze(loss_critic)

In [7]:
%timeit step_critic(tf.expand_dims(labels_critic, axis=2), critic)

37 ms ± 542 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%timeit step_critic_tf(tf.expand_dims(labels_critic, axis=2), critic)

14.5 ms ± 511 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
experiences[:,:-1]

array([[ 0.03453873,  0.        , -0.33991995,  0.        ,  0.        ],
       [-0.25866753,  0.        , -0.19607863,  0.        ,  0.5       ],
       [-0.24178177,  0.        ,  0.09374991,  0.        ,  0.5       ],
       [-0.2672065 ,  0.        ,  0.04338924,  0.        ,  0.        ],
       [-0.1888024 ,  0.        , -0.11296402,  0.        ,  0.5       ],
       [-0.0119067 ,  0.        ,  0.10449888,  0.        ,  0.        ],
       [ 0.0838516 ,  0.        , -0.223578  ,  0.        ,  0.        ],
       [ 0.09396052,  0.        , -0.00805294,  0.        ,  0.5       ],
       [-0.18203147,  0.        , -0.11425584,  0.        ,  0.        ],
       [-0.2001634 ,  0.        ,  0.02064468,  0.        ,  0.        ],
       [-0.27773562,  0.        ,  0.05043116,  0.        ,  0.        ],
       [-0.06698474,  0.        , -0.2857382 ,  0.        ,  0.        ],
       [-0.29122508,  1.        , -0.16833827,  1.        ,  0.5       ],
       [-0.02650449,  0.        , -0.3

In [10]:

@tf.function
def process_sequence_of_experiences_tf(self, experiences):
    self.lstm.stateful=True

    unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
    to_stack = []
    for index in range(2*self.dolinar_layers-1): # I consider from first outcome to last one (but guess)
        if (index==0):
            to_stack.append(unstacked_exp[index])
        if (index%2 == 1):
            to_stack.append(unstacked_exp[index])

            to_stack.append(tf.squeeze(self(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))))
    for index in range(2*self.dolinar_layers-1, 2*self.dolinar_layers+2):
        to_stack.append(unstacked_exp[index])
    self.lstm.stateful=False

In [11]:
@tf.function
def critic_derivative(experiences, actor, critic):
    actions_indexed = [0.]*(actor.dolinar_layers)
 
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        to_stack = []
        actions_wathed_index = []
        for index in range(0,experiences.shape[-1]-3,2): # I consider from first outcome to last one (but guess)
            actions_wathed_index.append(index)
            to_stack.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))
   
        actions_indexed = tf.concat(to_stack,axis=1)
    tape.watch(actions_indexed)

In [12]:

def critic_grad(critic, experiences):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        to_stack = []
        actions_wathed_index = []
        for index in range(0,experiences.shape[-1]-3,2): # I consider from first outcome to last one (but guess)
            actions_wathed_index.append(index)
            to_stack.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        actions_indexed = tf.concat(to_stack,axis=1)
        tape.watch(actions_indexed)


        index_actions=0
        watched_exps=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]
        watched_actions_unstacked = tf.unstack(actions_indexed, axis=1)
        for index in range(0,experiences.shape[-1]-1): 
            if index in actions_wathed_index:
                watched_exps.append(tf.expand_dims(watched_actions_unstacked[index_actions], axis=2))
                index_actions+=1
            else:
                watched_exps.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        qvals = critic(tf.reshape(tf.concat(watched_exps, axis=2), (experiences.shape[0],critic.dolinar_layers+1,2)))

        dq_da = tape.gradient(qvals, actions_indexed)
        return dq_da

In [13]:
%timeit critic_grad(critic, experiences)

31.2 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
@tf.function
def critic_grad_tf(critic, experiences):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        to_stack = []
        actions_wathed_index = []
        for index in range(0,experiences.shape[-1]-3,2): # I consider from first outcome to last one (but guess)
            actions_wathed_index.append(index)
            to_stack.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        actions_indexed = tf.concat(to_stack,axis=1)
        tape.watch(actions_indexed)


        index_actions=0
        watched_exps=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]
        watched_actions_unstacked = tf.unstack(actions_indexed, axis=1)
        for index in range(0,experiences.shape[-1]-1): 
            if index in actions_wathed_index:
                watched_exps.append(tf.expand_dims(watched_actions_unstacked[index_actions], axis=2))
                index_actions+=1
            else:
                
                watched_exps.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        qvals = critic(tf.reshape(tf.concat(watched_exps, axis=2), (experiences.shape[0],critic.dolinar_layers+1,2)))

        dq_da = tape.gradient(qvals, actions_indexed)
        return dq_da

In [15]:
%timeit critic_grad_tf(critic, experiences)

9.09 ms ± 878 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
with tf.GradientTape() as tape:
    tape.watch(actor.trainable_variables)
    pads = np.ones(len(experiences)).astype(np.float32)*actor.pad_value
    news = np.random.rand(experiences.shape[0], experiences.shape[1]+1).astype(np.float32)
    news[:,1:] = experiences
    news[:,0] = pads
    instances_actor = [i for i in range(0,2*actor.dolinar_layers,2)]
    actionss = actor(np.reshape(news[:,instances_actor], (experiences.shape[0],actor.dolinar_layers,1)).astype(np.float32))
    da_dtheta = tape.gradient(actionss, actor.trainable_variables, output_gradients=-dq_da)


NameError: name 'dq_da' is not defined

In [None]:

def actor_grad(actor, dq_da, experiences, optimizer_actor):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]

        to_stack = [] 
        actions_wathed_index = []
        for index in range(1,2*actor.dolinar_layers-2,2):
            states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        actor_thinks = actor(tf.concat(states_to_act, axis=1))
        da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [None]:
%timeit actor_grad(actor, dq_da, experiences, optimizer_actor)

In [None]:
@tf.function
def actor_grad_tf(actor, dq_da, experiences, optimizer_actor):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]

        to_stack = [] 
        actions_wathed_index = []
        for index in range(1,2*actor.dolinar_layers-2,2):
            states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        actor_thinks = actor(tf.concat(states_to_act, axis=1))
        da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [None]:
%timeit actor_grad_tf(actor, dq_da, experiences, optimizer_actor)

In [None]:
%timeit policy_evaluator.greedy_strategy(actor = actor, critic = critic)

In [19]:
dq_da = critic_grad_tf(critic, experiences)

In [20]:
def actor_grad_tf(actor, dq_da, experiences, optimizer_actor):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]

        to_stack = []
        actions_wathed_index = []
        for index in range(1,2*actor.dolinar_layers-2,2):
            states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))
        inps_actor = tf.concat(states_to_act, axis=1)
        actor.lstm.stateful=False
        actor_thinks = actor(inps_actor)
        actor.lstm.stateful=True
        da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return


In [44]:
actor = Actor(nature="primary")
context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
actor(context_outcome_actor)

<tf.Tensor: shape=(1, 1, 1), dtype=float32, numpy=array([[[-0.05643553]]], dtype=float32)>

In [42]:
actor.lstm.stateful

True

In [65]:
actor = Actor(nature="primary")


In [73]:
@tf.function
def fin(actor):
    actor.lstm.stateful=False
    actor(tf.ones((experiences.shape[0],1,1))*actor.pad_value)
    actor(context_outcome_actor)    

In [None]:
context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
actor(context_outcome_actor)

In [75]:
to_stack = []
actions_wathed_index = []
for index in range(1,2*actor.dolinar_layers-2,2):
    states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

In [88]:
vasos={}
finss= []
for index in range(1,2*actor.dolinar_layers-2,2):
    vasos[str(index)] = []
    for k in tf.unstack(unstacked_exp[index]):
        vasos[str(index)].append(actor(tf.reshape(k, (1,1,1))))
    

In [95]:
tf.concat(first, axis=0)

<tf.Tensor: shape=(15, 1, 1), dtype=float32, numpy=
array([[[0.08592076]],

       [[0.0886812 ]],

       [[0.08660756]],

       [[0.09219246]],

       [[0.08950914]],

       [[0.08994727]],

       [[0.10423961]],

       [[0.09411004]],

       [[0.09894168]],

       [[0.08911385]],

       [[0.08988633]],

       [[0.09515337]],

       [[0.10386605]],

       [[0.11112309]],

       [[0.11409836]]], dtype=float32)>

In [21]:

    states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]

    to_stack = []
    actions_wathed_index = []
    for index in range(1,2*actor.dolinar_layers-2,2):
        states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))
    inps_actor = tf.concat(states_to_act, axis=1)
    actor.lstm.stateful=False
    actor_thinks = actor(inps_actor)
    actor.lstm.stateful=True
    da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
    optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))


ValueError: Tensor's shape (1, 15, 500) is not compatible with supplied shape [1, 1, 500]