In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float32')
from collections import deque
from datetime import datetime
import random
import matplotlib
from environment import Environment
from plots import just_plot
from misc import *
from nets import *
from buffer import ReplayBuffer

# --- Experiment configuration -------------------------------------------------
# Notes say where each value is consumed in the visible part of the notebook.
amplitude = 0.4            # passed to Environment and PolicyEvaluator
tau = 0.01                 # not referenced in the visible cells; presumably the target-network soft-update rate -- TODO confirm
lr_critic = 0.0001         # learning rate handed to the critic's Adam optimizer
lr_actor = 0.001           # learning rate handed to the actor's Adam optimizer
noise_displacement = 0.1   # not referenced in the visible cells; presumably exploration noise -- TODO confirm
ep_guess = 0.01            # not referenced in the visible cells; presumably epsilon for the guess step -- TODO confirm
dolinar_layers = 2         # passed to Environment, Critic, Actor and PolicyEvaluator
number_phases = 2          # passed to Environment, Critic and PolicyEvaluator
buffer_size = 5000         # passed to ReplayBuffer
batch_size = 8             # a number of samples, so stored as an int (was the float `8.`)



@tf.function
def step_critic_tf(batched_input,labels_critic, critic, optimizer_critic):
    """Perform one optimizer step on the critic and return its scalar loss.

    Args:
        batched_input: tensor fed directly to ``critic``.
        labels_critic: regression targets; a trailing axis is added
            (``axis=2``) before they are compared with the critic output.
        critic: callable Keras model exposing ``trainable_variables``.
        optimizer_critic: optimizer whose ``apply_gradients`` updates the critic.

    Returns:
        The mean of the element-wise MSE between targets and predictions,
        squeezed to a scalar tensor.
    """
    with tf.GradientTape() as critic_tape:
        critic_tape.watch(critic.trainable_variables)
        predictions = critic(batched_input)
        targets = tf.expand_dims(labels_critic, axis=2)
        mean_loss = tf.reduce_mean(tf.keras.losses.MSE(targets, predictions))

    # The tape is non-persistent, so differentiating once the context has
    # closed yields the same gradients as doing it inside the context.
    gradients = critic_tape.gradient(mean_loss, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(gradients, critic.trainable_variables))
    return tf.squeeze(mean_loss)

@tf.function
def critic_grad_tf(critic, experiences):
    """Differentiate the critic output with respect to the action columns.

    ``experiences`` is a 2-D batch (rows are samples). The even columns
    ``0, 2, ...`` strictly below ``experiences.shape[-1] - 3`` are stacked
    into a watched tensor of shape ``(batch, n_actions, 1)``. The critic input
    is then rebuilt from ``critic.pad_value`` followed by every column except
    the last one, with the watched tensor's slices substituted at the action
    positions, and reshaped to ``(batch, critic.dolinar_layers + 1, 2)``.

    Args:
        critic: callable model exposing ``pad_value`` and ``dolinar_layers``.
        experiences: 2-D array/tensor of experiences.

    Returns:
        Gradient of the critic output with respect to the watched action
        tensor (same shape as that tensor).
    """
    batch = experiences.shape[0]
    width = experiences.shape[-1]
    column_shape = (batch, 1, 1)
    action_columns = list(range(0, width - 3, 2))

    with tf.GradientTape() as tape:
        columns = tf.unstack(tf.convert_to_tensor(experiences), axis=1)

        # Gather the action columns into one tensor so a single watch() call
        # covers all of them.
        actions_indexed = tf.concat(
            [tf.reshape(columns[col], column_shape) for col in action_columns],
            axis=1,
        )
        tape.watch(actions_indexed)

        # Rebuild the critic input, routing action positions through the
        # watched tensor so the gradient path reaches `actions_indexed`.
        watched_slices = iter(tf.unstack(actions_indexed, axis=1))
        pieces = [tf.ones(column_shape) * critic.pad_value]
        for col in range(width - 1):
            if col in action_columns:
                pieces.append(tf.expand_dims(next(watched_slices), axis=2))
            else:
                pieces.append(tf.reshape(columns[col], column_shape))

        critic_input = tf.reshape(
            tf.concat(pieces, axis=2),
            (batch, critic.dolinar_layers + 1, 2),
        )
        qvals = critic(critic_input)

    # Non-persistent tape: taking the gradient after the context closes is
    # equivalent to taking it inside.
    return tape.gradient(qvals, actions_indexed)

@tf.function
def actor_grad_tf(actor, dq_da, experiences, optimizer_actor):
    """Apply one policy-gradient update to the actor.

    The actor's actions are recomputed under a gradient tape: first the
    action obtained by feeding ``actor.pad_value``, then one action per odd
    column ``1, 3, ...`` in ``range(1, 2*actor.dolinar_layers-2, 2)`` of
    ``experiences``. The stacked actions are differentiated with respect to the
    actor's variables using ``-dq_da`` as upstream gradient, and the result is
    applied with ``optimizer_actor``.

    Args:
        actor: callable model exposing ``pad_value``, ``dolinar_layers`` and
            ``trainable_variables``.
        dq_da: upstream gradient, one entry per recomputed action (in
            ``optimization_step`` this is the output of ``critic_grad_tf``).
        experiences: 2-D batch of experiences (rows are samples).
        optimizer_actor: optimizer whose ``apply_gradients`` updates the actor.

    Returns:
        None.
    """
    unstacked_exp = tf.unstack(experiences, axis=1)
    context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)

        # The first action must be produced while the tape is recording.
        # Previously it was computed before the tape was opened, so its
        # dependence on the actor's variables was invisible to the tape: its
        # slice of `dq_da` was silently dropped, and with dolinar_layers == 1
        # nothing at all was recorded.
        finns = [tf.multiply(actor(context_outcome_actor), tf.ones((experiences.shape[0],1,1)))]

        for index in range(1,2*actor.dolinar_layers-2,2):
            # Each sample is fed to the actor on its own, as a (1,1,1) input.
            # NOTE(review): act_v2 feeds the whole batch in one call instead;
            # confirm whether per-sample evaluation is required here.
            per_sample_actions = [
                actor(tf.reshape(k, (1,1,1))) for k in tf.unstack(unstacked_exp[index])
            ]
            finns.append(tf.concat(per_sample_actions, axis=0))
        final_preds = tf.concat(finns, axis=1)

    da_dtheta = tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
    optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return


    # states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]
    #
    # to_stack = []
    # actions_wathed_index = []
    # for index in range(1,2*actor.dolinar_layers-2,2):
    #     states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))
    # inps_actor = tf.concat(states_to_act, axis=1)
    # actor.lstm.stateful=False
    # actor_thinks = actor(inps_actor)
    # actor.lstm.stateful=True
    # da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
    # optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))




@tf.function
def optimization_step(experiences, critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor):
    """Run one critic update followed by one actor update on a batch.

    ``actor.lstm.stateful`` is set to False for the duration of the step and
    set to True again before returning.

    Args:
        experiences: batch of experiences. NOTE(review): the notebook casts it
            to float32 before calling the same helpers by hand; presumably the
            same dtype is expected here -- confirm against the caller.
        critic, critic_target: primary and target critics.
        actor, actor_target: primary and target actors.
        optimizer_critic, optimizer_actor: optimizers for the primary networks.

    Returns:
        The critic loss returned by ``step_critic_tf``.
    """
    actor.lstm.stateful = False

    # Critic update: the target networks produce the critic's inputs and the
    # labels it is regressed onto.
    relabelled = actor_target.process_sequence_of_experiences_tf(experiences)
    critic_inputs, rewards = critic_target.process_sequence_tf(relabelled)
    td_targets = critic_target.give_td_errors_tf(critic_inputs, rewards)
    critic_loss = step_critic_tf(critic_inputs, td_targets, critic, optimizer_critic)

    # Actor update: driven by the gradient of the freshly updated critic.
    actor_grad_tf(actor, critic_grad_tf(critic, experiences), experiences, optimizer_actor)

    actor.lstm.stateful = True
    return critic_loss


# Environment and replay memory.
env = Environment(amplitude=amplitude, dolinar_layers = dolinar_layers, number_phases=number_phases)
buffer = ReplayBuffer(buffer_size=buffer_size)

# Primary networks and their target copies; only the primary critic is given `valreg`.
critic = Critic(nature="primary",valreg=0.01, dolinar_layers = dolinar_layers, number_phases=number_phases)
critic_target = Critic(nature="target", dolinar_layers = dolinar_layers, number_phases=number_phases)
actor = Actor(nature="primary", dolinar_layers = dolinar_layers)
actor_target = Actor(nature="target", dolinar_layers = dolinar_layers)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
optimizer_critic = tf.keras.optimizers.Adam(learning_rate=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(learning_rate=lr_actor)

policy_evaluator = PolicyEvaluator(amplitude = amplitude, dolinar_layers=dolinar_layers, number_phases = number_phases)

# Pre-recorded batch of experiences used by the cells below.
experiences = np.load("tutorials_functions/expe_2L.npy")


In [2]:
# Reproduce by hand the first part of optimization_step so that `dq_da` is
# available as an input for the actor-update variants defined below.
# The LSTM is switched to non-stateful mode and is left that way by this cell.
actor.lstm.stateful=False
# Cast the loaded experiences to float32 (the Keras float type set at the top).
experiences = experiences.astype(np.float32)
targeted_experience = actor_target.process_sequence_of_experiences_tf(experiences)
sequences, zeroed_rews = critic_target.process_sequence_tf(targeted_experience)
# labels_critic is computed here but not used by the later visible cells.
labels_critic = critic_target.give_td_errors_tf( sequences, zeroed_rews)
# Gradient of the critic output w.r.t. the actions; consumed by the act_v* benchmarks.
dq_da = critic_grad_tf(critic, experiences)


In [None]:

@tf.function
def actor_grad_tf_old(actor, dq_da, experiences, optimizer_actor):
    """Per-sample actor update, kept alongside the batched act_v2/act_v3 variants.

    Recomputes the actor's actions under a gradient tape (the pad-value action
    first, then one action per odd column in
    ``range(1, 2*actor.dolinar_layers-2, 2)`` of ``experiences``, evaluated
    sample by sample), differentiates them with respect to the actor's
    variables using ``-dq_da`` as upstream gradient, and applies the result
    with ``optimizer_actor``.

    Args:
        actor: callable model exposing ``pad_value``, ``dolinar_layers`` and
            ``trainable_variables``.
        dq_da: upstream gradient, one entry per recomputed action.
        experiences: 2-D batch of experiences (rows are samples).
        optimizer_actor: optimizer whose ``apply_gradients`` updates the actor.

    Returns:
        None.
    """
    unstacked_exp = tf.unstack(experiences, axis=1)
    context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)

        # Compute the first action inside the tape. It used to be evaluated
        # before the tape was opened, which left it out of the gradient and
        # made the update ignore the first slice of `dq_da`.
        finns = [tf.multiply(actor(context_outcome_actor), tf.ones((experiences.shape[0],1,1)))]

        for index in range(1,2*actor.dolinar_layers-2,2):
            # One actor call per sample, each on a (1,1,1) input.
            per_sample_actions = [
                actor(tf.reshape(k, (1,1,1))) for k in tf.unstack(unstacked_exp[index])
            ]
            finns.append(tf.concat(per_sample_actions, axis=0))
        final_preds = tf.concat(finns, axis=1)

    da_dtheta = tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
    optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [None]:
@tf.function
def act_v2(actor, dq_da, experiences, optimizer_actor):
    """Actor update that evaluates each step on the whole batch at once.

    The actor is called once on a ``(batch, 1, 1)`` tensor filled with
    ``actor.pad_value`` and once per odd column in
    ``range(1, 2*actor.dolinar_layers-2, 2)`` of ``experiences``. The outputs
    are concatenated along axis 1, differentiated with respect to the actor's
    variables with ``-dq_da`` as upstream gradient, and applied with
    ``optimizer_actor``.

    Returns:
        None.
    """
    step_shape = (experiences.shape[0], 1, 1)

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        first_action = actor(tf.ones(step_shape) * actor.pad_value)
        columns = tf.unstack(experiences, axis=1)
        later_actions = [
            actor(tf.reshape(columns[col], step_shape))
            for col in range(1, 2 * actor.dolinar_layers - 2, 2)
        ]
        final_preds = tf.concat([first_action] + later_actions, axis=1)

    # Non-persistent tape: the gradient can be taken after the context closes.
    policy_grads = tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
    optimizer_actor.apply_gradients(zip(policy_grads, actor.trainable_variables))
    return

In [None]:
# Time act_v2 with `experiences` passed as the NumPy array produced in In [2].
# Every timed call also applies an optimizer update to `actor`.
%timeit act_v2(actor, dq_da, experiences, optimizer_actor)

In [None]:
# Same benchmark, but `experiences` is converted to a tensor on every call before being passed in.
%timeit act_v2(actor, dq_da, tf.convert_to_tensor(experiences), optimizer_actor) 

In [12]:
@tf.function
def act_v3(actor, dq_da, experiences, optimizer_actor):
    """Actor update that feeds the whole input sequence to the actor in one call.

    A ``(batch, 1, 1)`` tensor filled with ``actor.pad_value`` and the odd
    columns in ``range(1, 2*actor.dolinar_layers-2, 2)`` of ``experiences``
    are concatenated along axis 1 and passed through the actor once. The output
    is differentiated with respect to the actor's variables with ``-dq_da`` as
    upstream gradient, and the result is applied with ``optimizer_actor``.

    Returns:
        None.
    """
    step_shape = (experiences.shape[0], 1, 1)

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        columns = tf.unstack(experiences, axis=1)
        steps = [tf.ones(step_shape) * actor.pad_value]
        steps.extend(
            tf.reshape(columns[col], step_shape)
            for col in range(1, 2 * actor.dolinar_layers - 2, 2)
        )
        final_preds = actor(tf.concat(steps, axis=1))

    # Non-persistent tape: the gradient can be taken after the context closes.
    policy_grads = tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
    optimizer_actor.apply_gradients(zip(policy_grads, actor.trainable_variables))
    return

In [14]:
# Time act_v3 (single actor call on the full input sequence).
# Every timed call also applies an optimizer update to `actor`.
%timeit act_v3(actor, dq_da, experiences, optimizer_actor)

29.2 ms ± 768 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
