In [2]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float32')
from collections import deque
from datetime import datetime
import random
import matplotlib
from environment import Environment
from plots import just_plot
from misc import *
from nets import *
from buffer import ReplayBuffer

amplitude=0.4
tau = .01
lr_critic = 0.0001
lr_actor=0.001
noise_displacement = .1
ep_guess=0.01
dolinar_layers=2
number_phases=2
buffer_size = 5000
batch_size = 8.



@tf.function
def step_critic_tf(batched_input,labels_critic, critic, optimizer_critic):
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        preds_critic = critic(batched_input)
        loss_critic = tf.keras.losses.MSE(tf.expand_dims(labels_critic, axis=2), preds_critic)
        loss_critic = tf.reduce_mean(loss_critic)
        grads = tape.gradient(loss_critic, critic.trainable_variables)
        optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
        return tf.squeeze(loss_critic)

@tf.function
def critic_grad_tf(critic, experiences):
    with tf.GradientTape() as tape:
        unstacked_exp = tf.unstack(tf.convert_to_tensor(experiences), axis=1)
        to_stack = []
        actions_wathed_index = []
        for index in range(0,experiences.shape[-1]-3,2): # I consider from first outcome to last one (but guess)
            actions_wathed_index.append(index)
            to_stack.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        actions_indexed = tf.concat(to_stack,axis=1)
        tape.watch(actions_indexed)

        index_actions=0
        watched_exps=[tf.ones((experiences.shape[0],1,1))*critic.pad_value]
        watched_actions_unstacked = tf.unstack(actions_indexed, axis=1)
        for index in range(0,experiences.shape[-1]-1):
            if index in actions_wathed_index:
                watched_exps.append(tf.expand_dims(watched_actions_unstacked[index_actions], axis=2))
                index_actions+=1
            else:
                watched_exps.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))

        qvals = critic(tf.reshape(tf.concat(watched_exps, axis=2), (experiences.shape[0],critic.dolinar_layers+1,2)))

        dq_da = tape.gradient(qvals, actions_indexed)
        return dq_da

@tf.function
def actor_grad_tf(actor, dq_da, experiences, optimizer_actor):
    unstacked_exp = tf.unstack(experiences, axis=1)
    actions_per_episode={}
    context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
    finns = [tf.multiply(actor(context_outcome_actor), tf.ones((experiences.shape[0],1,1)))]

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        for index in range(1,2*actor.dolinar_layers-2,2):
            actions_per_episode[str(index)] = []
            for k in tf.unstack(unstacked_exp[index]):
                actions_per_episode[str(index)].append(actor(tf.reshape(k, (1,1,1))))
            finns.append(tf.concat(actions_per_episode[str(index)], axis=0))
        final_preds = tf.concat(finns, axis=1)
        da_dtheta=tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return


    # states_to_act=[tf.ones((experiences.shape[0],1,1))*actor.pad_value]
    #
    # to_stack = []
    # actions_wathed_index = []
    # for index in range(1,2*actor.dolinar_layers-2,2):
    #     states_to_act.append(tf.reshape(unstacked_exp[index],(experiences.shape[0],1,1)))
    # inps_actor = tf.concat(states_to_act, axis=1)
    # actor.lstm.stateful=False
    # actor_thinks = actor(inps_actor)
    # actor.lstm.stateful=True
    # da_dtheta = tape.gradient(actor_thinks, actor.trainable_variables, output_gradients=-dq_da)
    # optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))




@tf.function
def optimization_step(experiences, critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor):
    # actor.lstm.reset_states()
    actor.lstm.stateful=False
    # experiences = experiences.astype(np.float32)
    targeted_experience = actor_target.process_sequence_of_experiences_tf(experiences)
    sequences, zeroed_rews = critic_target.process_sequence_tf(targeted_experience)
    labels_critic = critic_target.give_td_errors_tf( sequences, zeroed_rews)

    loss_critic = step_critic_tf(sequences ,labels_critic, critic, optimizer_critic)

    dq_da = critic_grad_tf(critic, experiences)

    actor_grad_tf(actor, dq_da, experiences, optimizer_actor)

    actor.lstm.stateful=True
    return loss_critic


env = Environment(amplitude=amplitude, dolinar_layers = dolinar_layers, number_phases=number_phases)
buffer = ReplayBuffer(buffer_size=buffer_size)

critic = Critic(nature="primary",valreg=0.01, dolinar_layers = dolinar_layers, number_phases=number_phases)
critic_target = Critic(nature="target", dolinar_layers = dolinar_layers, number_phases=number_phases)
actor = Actor(nature="primary", dolinar_layers = dolinar_layers)
actor_target = Actor(nature="target", dolinar_layers = dolinar_layers)

optimizer_critic = tf.keras.optimizers.Adam(lr=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(lr=lr_actor)

policy_evaluator = PolicyEvaluator(amplitude = amplitude, dolinar_layers=dolinar_layers, number_phases = number_phases)

experiences = np.load("tutorials_functions/expe_2L.npy")


In [4]:
actor.lstm.stateful=False
experiences = experiences.astype(np.float32)
targeted_experience = actor_target.process_sequence_of_experiences_tf(experiences)
sequences, zeroed_rews = critic_target.process_sequence_tf(targeted_experience)
labels_critic = critic_target.give_td_errors_tf( sequences, zeroed_rews)
dq_da = critic_grad_tf(critic, experiences)


In [5]:

@tf.function
def actor_grad_tf_old(actor, dq_da, experiences, optimizer_actor):
    unstacked_exp = tf.unstack(experiences, axis=1)
    actions_per_episode={}
    context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
    finns = [tf.multiply(actor(context_outcome_actor), tf.ones((experiences.shape[0],1,1)))]

    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        for index in range(1,2*actor.dolinar_layers-2,2):
            actions_per_episode[str(index)] = []
            for k in tf.unstack(unstacked_exp[index]):
                actions_per_episode[str(index)].append(actor(tf.reshape(k, (1,1,1))))
            finns.append(tf.concat(actions_per_episode[str(index)], axis=0))
        final_preds = tf.concat(finns, axis=1)
        da_dtheta=tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [32]:
@tf.function
def act_v2(actor, dq_da, experiences, optimizer_actor):
    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        finns = [actor(tf.ones((experiences.shape[0], 1,1))*actor.pad_value)]
        unstacked_exp = tf.unstack(experiences, axis=1)
        for index in range(1,2*actor.dolinar_layers-2,2):
            finns.append(actor(tf.reshape(unstacked_exp[index], (experiences.shape[0], 1,1))))    
        final_preds = tf.concat(finns, axis=1)
        da_dtheta=tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [36]:
%timeit act_v2(actor, dq_da, experiences, optimizer_actor)

33.8 ms ± 2.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
%timeit act_v2(actor, dq_da, tf.convert_to_tensor(experiences), optimizer_actor) 

31.5 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [40]:
@tf.function
def act_v3(actor, dq_da, exper, optimizer_actor):
    experiences = tf.convert_to_tensor(exper)
    with tf.GradientTape() as tape:
        tape.watch(actor.trainable_variables)
        finns = [actor(tf.ones((experiences.shape[0], 1,1))*actor.pad_value)]
        unstacked_exp = tf.unstack(experiences, axis=1)
        for index in tf.range(1,2*actor.dolinar_layers-2,2):
            print(tf.gather(unstacked_exp, index))
            finns.append(actor(tf.reshape(tf.gather(unstacked_exp, index), (experiences.shape[0], 1,1))))    
        final_preds = tf.concat(finns, axis=1)
        da_dtheta=tape.gradient(final_preds, actor.trainable_variables, output_gradients=-dq_da)
        optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return

In [41]:
%timeit act_v3(actor, dq_da, experiences, optimizer_actor)

RuntimeError: in user code:

    <ipython-input-40-de4204a849c6>:9 act_v3  *
        finns.append(actor(tf.reshape(tf.gather(unstacked_exp, index), (experiences.shape[0], 1,1))))
    /home/cooper-cooper/Desktop/deeper/nets.py:233 call  *
        feat = tf.nn.relu(self.l1(feat))
    /home/cooper-cooper/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:955 __call__  **
        self._handle_activity_regularization(inputs, outputs)
    /home/cooper-cooper/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:2215 _handle_activity_regularization
        mean_activity_loss, method='activity_regularizer')
    /home/cooper-cooper/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer_utils.py:557 check_graph_consistency
        bad_example=bad_example, correct_example=correct_example))

    RuntimeError: You are using a layer with `activity_regularizer` in a control flow branch, e.g.:
    
          class TestModel(tf.keras.Model):
    
            def __init__(self):
              super(TestModel, self).__init__(name='test_model')
              self.dense = tf.keras.layers.Dense(2, activity_regularizer='l2')
    
            def call(self, x, training=None):
              if training:
                return self.dense(x)
              else:
                return self.dense(x)
          
    This is currently not supported. Please move your call to the layer with `activity_regularizer` out of the control flow branch, e.g.:
    
          class TestModel(tf.keras.Model):
    
            def __init__(self):
              super(TestModel, self).__init__(name='test_model')
              self.dense = tf.keras.layers.Dense(2, activity_regularizer='l2')
    
            def call(self, x, training=None):
              return self.dense(x)
          
    You can also resolve this by marking your outer model/layer dynamic (eager-only) by passing `dynamic=True` to the layer constructor. Any kind of control flow is supported with dynamic layers. Note that using `dynamic=True` requires you to implement static shape inference in the `compute_output_shape(input_shape)` method.


In [29]:
ff = [actor(tf.ones((experiences.shape[0], 1,1))*actor.pad_value)]
tfexperiences = tf.convert_to_tensor(experiences)
unsta = tf.unstack(tfexperiences, axis=1)
for index in tf.range(1,2*actor.dolinar_layers-2,2):
    ff.append(actor(tf.reshape(unstacked_exp[index], (experiences.shape[0], 1,1))))    

In [None]:
for index in range(1,2*actor.dolinar_layers-2,2):
        for k in tf.unstack(unstacked_exp[index]):
            actions_per_episode[str(index)].append(actor(tf.reshape(k, (1,1,1))))
        finns.append(tf.concat(actions_per_episode[str(index)], axis=0))

<tf.Tensor: shape=(100, 2, 1), dtype=float32, numpy=
array([[[0.00718073],
        [0.00718174]],

       [[0.00717851],
        [0.00718064]],

       [[0.00717866],
        [0.00718212]],

       [[0.00718067],
        [0.00718326]],

       [[0.00718077],
        [0.00718088]],

       [[0.00718001],
        [0.00717943]],

       [[0.00717958],
        [0.00718186]],

       [[0.00718099],
        [0.00717923]],

       [[0.00717997],
        [0.00718219]],

       [[0.00718046],
        [0.00718246]],

       [[0.00718216],
        [0.00718108]],

       [[0.00718047],
        [0.00718226]],

       [[0.00718331],
        [0.00718   ]],

       [[0.00718107],
        [0.00718116]],

       [[0.00718115],
        [0.00718119]],

       [[0.0071806 ],
        [0.00718097]],

       [[0.00718187],
        [0.00718048]],

       [[0.00718053],
        [0.00718116]],

       [[0.00718104],
        [0.00718085]],

       [[0.00718172],
        [0.00718052]],

       [[0.0071807 ],
     

In [None]:
actions_per_episode={}
context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
finns = [tf.multiply(actor(context_outcome_actor), tf.ones((experiences.shape[0],1,1)))]


    for index in range(1,2*actor.dolinar_layers-2,2):
        actions_per_episode[str(index)] = []
        for k in tf.unstack(unstacked_exp[index]):
            actions_per_episode[str(index)].append(actor(tf.reshape(k, (1,1,1))))
        finns.append(tf.concat(actions_per_episode[str(index)], axis=0))
    final_preds = tf.concat(finns, axis=1)

In [1]:

        env.pick_phase()
        experiences=[] #where the current history of the current episode is stored
        context_outcome_actor = np.reshape(np.array([actor.pad_value]),(1,1,1)).astype(np.float32)
        outcomes_so_far = []
        for layer in range(actor.dolinar_layers):
            beta_would_do = np.squeeze(actor(context_outcome_actor))
            beta =  beta_would_do + np.random.uniform(-noise_displacement, noise_displacement)#np.clip(,-2*amplitude,2*amplitude)
            policy_evaluator.recorded_trajectory_tree[str(layer)][str(np.array(outcomes_so_far))].append(beta)
            policy_evaluator.recorded_trajectory_tree_would_do[str(layer)][str(np.array(outcomes_so_far))].append(beta_would_do)

            outcome = env.give_outcome(beta,layer)
            outcomes_so_far.append(int(outcome))
            experiences.append(beta)
            experiences.append(outcome)
            context_outcome_actor = np.reshape(np.array([outcome]),(1,1,1)).astype(np.float32)

        ### ep-gredy guessing of the phase###
        ### ep-gredy guessing of the phase###
        if np.random.random()<ep_guess:
            val = np.random.choice(range(number_phases),1)[0]
            guess_index, guess_input_network = val, val/critic.number_phases
            # print(guess_input_network)
        else:
            guess_index, guess_input_network = critic.give_favourite_guess(experiences) #experiences is the branch of the current tree of actions + outcomes.
        experiences.append(guess_input_network)

        reward = env.give_reward(guess_index)
        experiences.append(reward)
        buffer.add(tuple(experiences))


        rt.append(reward)
        pt.append(policy_evaluator.greedy_strategy(actor = actor, critic = critic))

        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        # if (buffer.count>1):#(episode%100==1):
            # sampled_experiences = buffer.sample(batch_size)
            # np.save(str(dolinar_layers)+"_sample", sampled_experiences)
        if (buffer.count>batch_size):#(episode%100==1):
            sampled_experiences = tf.convert_to_tensor(buffer.sample(batch_size), dtype=np.float32)

            new_loss = optimization_step(sampled_experiences, critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor)
            new_loss = new_loss.numpy()
            critic_target.update_target_parameters(critic)
            actor_target.update_target_parameters(actor)
            # noise_displacement = max(0.1,0.999*noise_displacement)
        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        ###### OPTIMIZATION STEP ######
        avg_train.append(new_loss)
        actor.lstm.reset_states()

         #set again the states to zero, because when actor.lstm.stateful = True, it does not reset state along differnt batches !

        # if episode%(total_episodes/100) == 0: #this is for showing 10 results in total.
        #
        #     template = 'Episode {}, \Rt: {}, \Pt: {}, Train loss: {}\n\n'
        #     print(template.format(episode+1,
        #                         np.sum(rt)/(episode+1),
        #                           pt[-1],
        #                          np.round(np.array(avg_train).mean(),5),
        #                         )
        #           )

    cumre=0
    rrt = []
    for k in rt:
        cumre+=k
        rrt.append(cumre)
    rrt = rrt/np.arange(1,len(rt)+1)

    np.save(directory+"/learning_curves/", rrt)
    np.save(directory+"/learning_curves/", pt)
    policy_evaluator.save_hisory_tree(directory+"/action_trees")

    for model, net_folder in zip([actor, actor_target, critic, critic_target],["actor_primary", "actor_target", "critic_primary", "critic_target"]):
        model.save_weights(directory+"/networks/"+net_folder+"/")
    just_plot(rrt, pt, avg_train, env.helstrom(), policy_evaluator, directory)
    # BigPlot(buffer,rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, directory)
    return


info_run = ""
to_csv=[]

RDPG(amplitude=amplitude, total_episodes=10**2, dolinar_layers=dolinar_layers, noise_displacement=noise_displacement, tau=tau,
        buffer_size=buffer_size, batch_size=batch_size, lr_critic=lr_critic, lr_actor=lr_actor, ep_guess=ep_guess)
