In [7]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float64')
from collections import deque
from datetime import datetime
import random
import matplotlib

from plots import *
from misc import Prob, ps_maxlik, qval, record
from nets import *
from buffer import ReplayBuffer


# ---------------------------------------------------------------------------
# Run configuration and one-off setup: hyper-parameters, networks, optimizers,
# logging directories and the info.txt summary of this run.
# ---------------------------------------------------------------------------
special_name=""            # custom results sub-folder; "" -> auto-numbered "run_<n>"
total_episodes = 10**2     # number of iterations of the training loop
buffer_size=500            # capacity handed to ReplayBuffer
batch_size=64              # number of experiences sampled per optimization step
ep_guess=0.01              # probability of picking the guess uniformly at random
noise_displacement=0.5     # half-width of the uniform noise added to the actor's beta
lr_actor=0.01
lr_critic=0.01
tau = 0.01                 # target-network update coefficient (recorded in info.txt)
repetitions=1              # only recorded in info.txt in this cell
plots=True                 # not read in this cell

os.makedirs("results", exist_ok=True)

amplitude = 0.4            # multiplies the +-1 phase in the Prob(...) calls
buffer = ReplayBuffer(buffer_size=buffer_size)

critic = Critic()
critic_target = Critic()
actor = Actor(input_dim=1)

# Build the actor's weights with an arbitrary input. The Keras default float type is
# set to float64 above, so a float64 input is passed here (a float32 input appears to
# be what triggers the autocast warning printed by this cell).
actor(np.array([[0.]]))

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer_critic = tf.keras.optimizers.Adam(learning_rate=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(learning_rate=lr_actor)


rt = []   # reward obtained at each episode
pt = []   # success probability of the greedy policy at each episode

# Defined at module level so that helper functions / other cells can use them.
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)


if special_name == "":
    # Alternative naming scheme based on the current time:
    # current_run_and_time = "results/{}".format(datetime.now().strftime("%Y%m%d-%H%M"))
    numb = record()
    current_run_and_time ="results/run_" + str(numb)
else:
    current_run_and_time = "results/"+special_name

directory = current_run_and_time
train_log =  current_run_and_time + '/train_l0'
test_log =   current_run_and_time + '/test_l0'

# Make sure the run folder exists before anything is written into it.
os.makedirs(directory, exist_ok=True)

train_summary_writer = tf.summary.create_file_writer(train_log)
test_summary_writer_0 = tf.summary.create_file_writer(test_log)

info_optimizers = "optimizer_critic_guess: {} \nOptimizer_actor_l0: {}\n".format(optimizer_critic.get_config(), optimizer_actor.get_config())
infor_buffer = "Buffer_size: {}\n Batch_size for sampling: {}\n".format(buffer.buffer_size, batch_size)
info_epsilons= "epsilon-guess: {}\nepsilon_displacement_noise: {}".format(ep_guess,noise_displacement)

data = "tau: {}, repetitions per optimization step (would be like epochs): {}".format(tau,repetitions) + "\n \n**** optimizers ***\n"+info_optimizers+"\n\n\n*** BUFFER ***\n"+infor_buffer+"\n\n\n *** NOISE PARAMETERS *** \n"+info_epsilons
# The context manager closes the file; no explicit close() is needed.
with open(directory+"/info.txt", 'w') as f:
    f.write(data)

print("Beggining to train! \n \n")
print(data)
print("starting time: {}".format(datetime.now().strftime("%Y%m%d-%H%M%S")))
print("saving results in " + str(directory))
avg_train = []   # running value of train_loss after each episode
avg_test = []    # running value of test_loss after each episode

history_betas = [] #to put in histogram
history_betas_would_have_done=[] #to put in histogram
histo_preds = {"critic":{}} #here i save the predictions to plot in a "straightforward way"

#######
for episode in tqdm(range(total_episodes)):

    # ------------------------------------------------------------------
    # 1) Run one experiment with exploration noise on beta and an
    #    epsilon-greedy guess, then store it in the replay buffer.
    # ------------------------------------------------------------------
    alice_phase = np.random.choice([-1.,1.],1)[0]
    beta_would_do = actor(np.array([[0.]])).numpy()[0][0]   # action without noise
    beta =  beta_would_do + np.random.uniform(-noise_displacement, noise_displacement)
    proboutcome = Prob(alice_phase*amplitude,beta,0)        # used as the probability of outcome 0.
    outcome = np.random.choice([0.,1.],1,p=[proboutcome, 1-proboutcome])[0]

    history_betas.append(beta)
    history_betas_would_have_done.append(beta_would_do)

    if np.random.random()< ep_guess:
        guess = np.random.choice([-1.,1.],1)[0]
    else:
        sequence = np.array([[ [beta, critic.pad_value], [outcome, -1.]]  ]).astype(np.float32)
        guess = critic.give_favourite_guess(sequence)
    reward = 1. if guess == alice_phase else 0.
    buffer.add(beta, outcome, guess, reward)

    # ------------------------------------------------------------------
    # 2) Critic update: regress the critic onto the targets built from
    #    the target critic.
    # ------------------------------------------------------------------
    experiences = buffer.sample(batch_size)
    sequences, zeroed_rews = process_sequence(experiences)
    labels_critic = give_td_error_Kennedy_guess(critic_target, sequences, zeroed_rews)
    with tf.GradientTape() as tape:
        # Trainable variables are watched automatically; only the forward
        # pass and the loss need to be recorded.
        preds_critic = critic(sequences)
        # NOTE(review): MSE broadcasts silently if labels and predictions differ in
        # shape -- confirm labels_critic.shape == preds_critic.shape.
        loss_critic = tf.reduce_mean(tf.keras.losses.MSE(labels_critic, preds_critic))
    grads = tape.gradient(loss_critic, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
    train_loss(loss_critic)

    # Use the configured `tau` (the value written to info.txt) rather than a literal.
    critic_target.update_target_parameters(critic, tau=tau)

    # ------------------------------------------------------------------
    # 3) Actor update: ascend the critic's value of the actor's action.
    # ------------------------------------------------------------------
    n_sampled = len(experiences)
    with tf.GradientTape() as tape:
        actions = tf.cast(actor(np.zeros((n_sampled, 1))), tf.float32)
        # The first query seen by the critic everywhere else is [beta, pad_value],
        # so the second feature must be the pad value (it used to be 1.).
        pads = tf.ones(shape=(n_sampled, 1)) * float(critic.pad_value)
        qvals = critic(tf.expand_dims(tf.concat([actions, pads], axis=1), axis=1))
        actor_objective = -qvals
    # tape.gradient sums over the elements of a non-scalar target, so this is the
    # gradient of -sum(Q) w.r.t. the actor weights: the same quantity the previous
    # two-tape dq/da * da/dtheta chain produced, with a single actor forward pass.
    actor_grads = tape.gradient(actor_objective, actor.trainable_variables)
    optimizer_actor.apply_gradients(zip(actor_grads, actor.trainable_variables))

    # ------------------------------------------------------------------
    # 4) Book-keeping.
    # ------------------------------------------------------------------
    avg_train.append(train_loss.result().numpy())
    avg_test.append(test_loss.result().numpy())
    rt.append(reward)

    ### calculate success probability if the agent went greedy ###
    # Average over the two equiprobable phases: for every possible outcome add the
    # probability of that outcome under the phase the critic would guess.
    p=0
    for possible_outcome in [0.,1.]:   # separate name: do not clobber the sampled `outcome`
        greedy_guess = critic.give_favourite_guess(pad_single_sequence([beta_would_do, possible_outcome, -1.]))
        p+=Prob(greedy_guess*amplitude, beta_would_do, possible_outcome)
    p/=2
    pt.append(p)
    ################
    
    
# Running average of the rewards: entry k is the mean of the first k+1 rewards.
# (Summing rt[:k] left the k-th reward out, so every average lagged one episode
# behind and the first entry was always 0.)
rt = np.cumsum(rt) / np.arange(1, len(rt) + 1)



To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



  0%|          | 0/100 [00:00<?, ?it/s]

Beggining to train! 
 

tau: 0.01, repetitions per optimization step (would be like epochs): 1
 
**** optimizers ***
optimizer_critic_guess: {'name': 'Adam', 'learning_rate': 0.01, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False} 
Optimizer_actor_l0: {'name': 'Adam', 'learning_rate': 0.01, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}



*** BUFFER ***
Buffer_size: 500
 Batch_size for sampling: 64



 *** NOISE PARAMETERS *** 
epsilon-guess: 0.01
epsilon_displacement_noise: 0.5
starting time: 20200510-200325
saving results in results/run_18


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



100%|██████████| 100/100 [00:14<00:00,  7.03it/s]


In [2]:
def process_sequence(sample_buffer, pad_value = -4., LAYERS=1):
    """Turn a batch of stored experiments into padded sequences plus per-step rewards.

    Each row of ``sample_buffer`` holds one experiment: the first ``2*LAYERS + 1``
    columns are the history (the +1 accounts for the guess, the 2*LAYERS for the
    (beta, outcome) pairs) and the last column is the reward, e.g. for two layers

        [a0, o1, a1, o2, guess, reward]

    The history is laid out as ``LAYERS + 1`` queries of two values each:

        [a0, pad_value], [o1, a1], ..., [o_L, guess]

    Parameters
    ----------
    sample_buffer : np.ndarray, shape (N, >= 2*LAYERS + 1)
        Batch of experiments; the last column is read as the reward.
    pad_value : float
        Value placed in the empty second slot of the first query.
    LAYERS : int
        Number of (beta, outcome) pairs in each experiment.

    Returns
    -------
    padded_data : np.ndarray, shape (N, LAYERS + 1, 2)
    rewards_obtained : np.ndarray of float32, shape (N, LAYERS + 1)
        Zeros everywhere except the last column, which holds the reward.
    """
    n_experiments = sample_buffer.shape[0]
    history = sample_buffer[:, 0:(2*LAYERS + 1)]

    # Honour the caller's pad_value (it used to be overwritten with -4. here).
    padded_data = np.ones((n_experiments, LAYERS + 1, 2)) * pad_value
    padded_data[:, 0, 0] = history[:, 0]
    for k in range(1, LAYERS + 1):
        # Query k is (o_k, a_k) = columns (2k-1, 2k); for k = 1 this is (1, 2).
        padded_data[:, k] = history[:, [2*k - 1, 2*k]]

    rewards_obtained = np.zeros((n_experiments, LAYERS + 1)).astype(np.float32)
    rewards_obtained[:, -1] = sample_buffer[:, -1]

    return padded_data, rewards_obtained

In [3]:
def give_td_error_Kennedy_guess(net,batched_input,sequential_rews_with_zeros):
    """Build the regression targets for the critic from a batch of sequences.

    A datapoint looks like [(beta, pad), (n, guess)] and the wanted target is
    [Max_g Q(beta, n, g), reward].

    The network is evaluated twice: on the batch as given, and on a copy whose
    guess (second value of the second query) has its sign flipped. The maximum
    over both evaluations is added to the first reward column; the remaining
    columns are returned unchanged.

    TO DO: extend this to more layers! The general target would be
        [Max_{a_1} Q(a0, o1, a_1),
         Max_{a_2} Q(a0, o1, a_1, o2, a_2),
         ...,
         Max_g Q(h, guess)]
    where the maxima over continuous actions cannot be taken and would be
    replaced by the target actor's choice.

    Neither input array is modified; copies are used throughout.
    """
    inputs = batched_input.copy()
    targets = sequential_rews_with_zeros.copy()

    q_as_played = net(inputs)
    inputs[:, 1, 1] = -inputs[:, 1, 1]      # evaluate the opposite guess as well
    q_other_guess = net(inputs)

    # NOTE(review): the max below runs over the whole of axis 1 of the concatenation.
    # If `net` returns one value per query, that also includes the first query's own
    # prediction -- confirm this is intended.
    stacked = tf.concat([q_as_played, q_other_guess], axis=1)
    best_q = tf.math.reduce_max(stacked, axis=1).numpy().squeeze()

    targets[:, 0] = targets[:, 0] + best_q
    # NOTE(review): the new axis is inserted at position 1 -- check that the result
    # has the same shape as the critic's predictions it is compared against.
    return np.expand_dims(targets, axis=1)


In [4]:
def pad_single_sequence(seq, pad_value = -4., LAYERS=1):
    """Lay out one experiment history as a batch of size 1 of two-value queries.

    Input:   [a0, o1, a1, ..., o_L, a_L]      (2*LAYERS + 1 values)
    Output:  [[[a0, pad_value], [o1, a1], ..., [o_L, a_L]]]

    Parameters
    ----------
    seq : sequence of numbers
        Flat history; only the first 2*LAYERS + 1 entries are read.
    pad_value : float
        Value placed in the empty second slot of the first query.
    LAYERS : int
        Number of (outcome, action) pairs following the first action.

    Returns
    -------
    np.ndarray, shape (1, LAYERS + 1, 2)
    """
    # Honour the caller's pad_value (it used to be overwritten with -4. here).
    padded_data = np.ones((1, LAYERS + 1, 2)) * pad_value
    padded_data[0, 0, 0] = seq[0]
    for k in range(1, LAYERS + 1):
        # Query k is (o_k, a_k) = seq[2k-1 : 2k+1]; for k = 1 this is seq[1:3].
        padded_data[0, k] = seq[2*k - 1:2*k + 1]
    return padded_data

In [None]:
# Scan beta over [-1.5, 1.5) in steps of 0.01 with outcome 0. and guess -1, and
# collect the two values the critic returns for each padded sequence.
qv1, qv2 = [], []
for beta in np.arange(-1.5, 1.5, .01):
    q1, q2 = critic(pad_single_sequence([beta, 0., -1])).numpy().squeeze()
    qv1.append(q1)
    qv2.append(q2)