In [6]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float64')
from collections import deque
from datetime import datetime
import random
import matplotlib

from plots import *
from misc import Prob, ps_maxlik, qval, record
from nets import *
from buffer import ReplayBuffer


def optimization_step(experiences,critic, critic_target, actor, optimizer_critic, optimizer_actor, train_loss, tau=0.05):
    """Run one DDPG-style update: critic regression, soft target update, actor step.

    Parameters
    ----------
    experiences : array-like
        Batch sampled from the replay buffer. Only ``len(experiences)``,
        ``experiences.shape[0]`` and ``critic.process_sequence(experiences)``
        are used here.
    critic : callable network
        Network being trained; must provide ``process_sequence`` and
        ``trainable_variables``.
    critic_target : network
        Provides the regression targets through ``give_td_error_Kennedy_guess``
        and is moved towards ``critic`` through ``update_target_parameters``.
    actor : callable network
        Maps a column of zeros (one row per experience) to actions.
    optimizer_critic, optimizer_actor : optimizers
        Objects exposing ``apply_gradients``.
    train_loss : callable
        Metric that is updated with the critic loss of this step.
    tau : float, optional
        Value forwarded to ``critic_target.update_target_parameters``.
        Defaults to 0.05, the value that used to be hard-coded here.

    Returns
    -------
    None
    """
    sequences, zeroed_rews = critic.process_sequence(experiences)
    labels_critic = critic_target.give_td_error_Kennedy_guess(sequences, zeroed_rews)

    # --- critic: mean-squared error against the target network's labels ---
    # Only the forward pass is recorded; the gradient is taken after the tape
    # is closed so the backward ops themselves are not traced.
    with tf.GradientTape() as critic_tape:
        preds_critic = critic(sequences)
        loss_critic = tf.reduce_mean(tf.keras.losses.MSE(labels_critic, preds_critic))
    critic_grads = critic_tape.gradient(loss_critic, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(critic_grads, critic.trainable_variables))
    train_loss(loss_critic)

    critic_target.update_target_parameters(critic, tau=tau)

    # --- actor: follow the critic's gradient with respect to the action ---
    # Differentiating -sum(Q) w.r.t. the actor variables in a single tape is the
    # chain rule (dQ/da * da/dtheta) that was previously assembled by hand from
    # two tapes and two identical actor forward passes.
    actor_inputs = np.zeros((len(experiences), 1))
    ones = tf.ones(shape=(experiences.shape[0], 1))
    with tf.GradientTape() as actor_tape:
        actions = tf.cast(actor(actor_inputs), tf.float32)
        qvals = critic(tf.expand_dims(tf.concat([actions, ones], axis=1), axis=1))
        actor_objective = -tf.reduce_sum(qvals)
    actor_grads = actor_tape.gradient(actor_objective, actor.trainable_variables)

    optimizer_actor.apply_gradients(zip(actor_grads, actor.trainable_variables))
    return
    ###### END OF OPTIMIZATION STEP ######


def ddpgKennedy(special_name="",total_episodes = 10**3,buffer_size=500, batch_size=64, ep_guess=0.01,
 noise_displacement=0.5,lr_actor=0.01, lr_critic=0.001, tau=0.005, repetitions=1, plots=True):
    """Train the actor/critic pair episode by episode and return the filled replay buffer.

    Each episode draws a phase in {-1, 1}, takes the actor's displacement plus
    exploration noise, samples a binary outcome from ``Prob``, picks a guess
    (random with probability ``ep_guess``, otherwise the critic's favourite),
    stores the experience and runs one ``optimization_step`` on a sampled batch.

    Parameters
    ----------
    special_name : str
        Name of the run directory under ``results/``. When empty, the directory
        is ``results/run_<n>`` with ``n`` obtained from ``record()``.
    total_episodes : int
        Number of training episodes.
    buffer_size : int
        Forwarded to ``ReplayBuffer``.
    batch_size : int
        Number of experiences requested from the buffer at every episode.
    ep_guess : float
        Probability of replacing the critic's guess by a uniformly random one.
    noise_displacement : float
        Half-width of the uniform exploration noise before decay.
    lr_actor, lr_critic : float
        Learning rates of the two Adam optimizers.
    tau, repetitions :
        Written to ``info.txt`` and printed. NOTE(review): neither value is
        forwarded to ``optimization_step`` from this function, so the logged
        ``tau`` is not necessarily the one used for the target update.
    plots : bool
        Not read anywhere in this function.

    Returns
    -------
    ReplayBuffer
        The buffer holding every ``(beta, outcome, guess, reward)`` that was added.
    """
    os.makedirs("results", exist_ok=True)

    amplitude = 0.4
    buffer = ReplayBuffer(buffer_size=buffer_size)

    critic = Critic()
    critic_target = Critic()
    actor = Actor(input_dim=1)

    actor(np.array([[0.]]).astype(np.float32)) #initialize the network 0, arbitrary inputs.

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer_critic = tf.keras.optimizers.Adam(learning_rate=lr_critic)
    optimizer_actor = tf.keras.optimizers.Adam(learning_rate=lr_actor)

    rt = []  # reward obtained at each episode
    pt = []  # greedy success probability at each episode

    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)

    if special_name == "":
        # current_run_and_time = "results/{}".format(datetime.now().strftime("%Y%m%d-%H%M"))
        numb = record()
        current_run_and_time ="results/run_" + str(numb)
    else:
        current_run_and_time = "results/"+special_name

    directory = current_run_and_time
    train_log =  current_run_and_time + '/train_l0'
    test_log =   current_run_and_time + '/test_l0'

    train_summary_writer = tf.summary.create_file_writer(train_log)
    test_summary_writer_0 = tf.summary.create_file_writer(test_log)

    info_optimizers = "optimizer_critic_guess: {} \nOptimizer_actor_l0: {}\n".format(optimizer_critic.get_config(), optimizer_actor.get_config())
    infor_buffer = "Buffer_size: {}\n Batch_size for sampling: {}\n".format(buffer.buffer_size, batch_size)
    info_epsilons= "epsilon-guess: {}\nepsilon_displacement_noise: {}".format(ep_guess,noise_displacement)

    data = "tau: {}, repetitions per optimization step (would be like epochs): {}".format(tau,repetitions) + "\n \n**** optimizers ***\n"+info_optimizers+"\n\n\n*** BUFFER ***\n"+infor_buffer+"\n\n\n *** NOISE PARAMETERS *** \n"+info_epsilons

    # Do not depend on the summary writers having created the run directory.
    os.makedirs(directory, exist_ok=True)
    with open(directory+"/info.txt", 'w') as f:
        f.write(data)

    print("Beggining to train! \n \n")
    print(data)
    print("starting time: {}".format(datetime.now().strftime("%Y%m%d-%H%M%S")))
    print("saving results in " + str(directory))
    avg_train = []
    avg_test = []

    history_betas = [] #to put in histogram
    history_betas_would_have_done=[] #to put in histogram
    histo_preds = {"critic":{}} #here i save the predictions to plot in a "straightforward way"

    for episode in tqdm(range(total_episodes)):

        alice_phase = np.random.choice([-1.,1.],1)[0]
        beta_would_do = actor(np.array([[0.]])).numpy()[0][0]
        # NOTE(review): max(0.1, ...) makes the added noise always >= 0.1, i.e.
        # strictly positive; confirm this is intended rather than a floor on the
        # noise amplitude.
        beta =  beta_would_do + max(0.1, np.random.uniform(-noise_displacement, noise_displacement)*np.exp(-episode/300))
        proboutcome = Prob(alice_phase*amplitude,beta,0)
        outcome = np.random.choice([0.,1.],1,p=[proboutcome, 1-proboutcome])[0]

        history_betas.append(beta)
        history_betas_would_have_done.append(beta_would_do)

        # epsilon-greedy choice of the guess
        if np.random.random()< ep_guess:
            guess = np.random.choice([-1.,1.],1)[0]
        else:
            sequence = np.array([[ [beta, critic.pad_value], [outcome, -1.]]  ]).astype(np.float32)
            guess = critic.give_favourite_guess(sequence)
        if guess == alice_phase:
            reward = 1.
        else:
            reward = 0.
        buffer.add(beta, outcome, guess, reward)

        ###### OPTIMIZATION STEP ######
        experiences = buffer.sample(batch_size)
        optimization_step(experiences,critic, critic_target, actor, optimizer_critic, optimizer_actor, train_loss)

        avg_train.append(train_loss.result().numpy())
        avg_test.append(test_loss.result().numpy())

        rt.append(reward)

        ########################################################################
        ### calculate success probability if the agent went greedy ###########
        # A separate loop variable keeps the sampled `outcome` of this episode intact.
        p=0
        for possible_outcome in [0.,1.]:
            p+=Prob(critic.give_favourite_guess(critic.pad_single_sequence([beta_would_do, possible_outcome, -1.]))*amplitude, beta_would_do,possible_outcome)
        p/=2
        pt.append(p)
        ################

    # Running mean of the reward: entry k averages the first k+1 rewards.
    # (The previous slice rt[:k] left out the current reward while still
    # dividing by k+1, and recomputed every prefix sum from scratch.)
    rt = np.cumsum(rt)/np.arange(1,len(rt)+1)

    losses = [avg_train, avg_test]

    #BigPlot(buffer,rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, directory)
    return buffer



In [8]:
# Short 100-episode training run; ddpgKennedy returns the replay buffer it filled.
buffer = ddpgKennedy(total_episodes=100)



To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



  0%|          | 0/100 [00:00<?, ?it/s]

Beggining to train! 
 

tau: 0.005, repetitions per optimization step (would be like epochs): 1
 
**** optimizers ***
optimizer_critic_guess: {'name': 'Adam', 'learning_rate': 0.001, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False} 
Optimizer_actor_l0: {'name': 'Adam', 'learning_rate': 0.01, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}



*** BUFFER ***
Buffer_size: 500
 Batch_size for sampling: 64



 *** NOISE PARAMETERS *** 
epsilon-guess: 0.01
epsilon_displacement_noise: 0.5
starting time: 20200511-004230
saving results in results/run_19


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



100%|██████████| 100/100 [00:14<00:00,  6.71it/s]


In [31]:
#if episode%(total_episodes/10) == 0: #this is for showing 10 results in total.
episode = 10

# Prediction store, one sub-dictionary per critic output, keyed by episode.
# (original note: net_0 will be critic_q0, net_1 will be critic_qguess)
histo_preds = {"layer0": {}, "layer1": {}}
for layer in ("layer0", "layer1"):
    histo_preds[layer][str(episode)] = {"episode": episode, "values": {}}

# NOTE(review): `critic` is not defined by any cell above this one in the file;
# it presumably comes from the later `critic = Critic()` cell (In [17]) -- confirm
# that this is the network whose predictions are wanted here.

# One row per stored beta; column 0 carries the betas, the other columns start random.
simp = np.random.randn(len(buffer.betas), 4)
simp[:, 0] = buffer.betas

# First critic output, evaluated on the betas.
processed_first, *_ = critic.process_sequence(simp)
qvals0 = np.squeeze(critic(processed_first).numpy()[:, 0])
histo_preds["layer0"][str(episode)]["values"] = qvals0

# Second critic output for every (outcome, guess) combination, stored under "0".."3".
index = 0
for n1 in (0., 1.):
    for guess in (-1., 1.):
        simp[:, 1] = n1
        simp[:, 2] = guess
        processed_pair, *_ = critic.process_sequence(simp)
        qvals1 = np.squeeze(critic(processed_pair).numpy()[:, 1])
        histo_preds["layer1"][str(episode)]["values"][str(index)] = qvals1
        index += 1


array([0.49618411, 0.49618338, 0.4961832 , 0.49618292, 0.49618164,
       0.49618037, 0.49617909, 0.49617781, 0.49617654, 0.49617526,
       0.49617398, 0.4961727 , 0.49617142, 0.49617014, 0.49616886,
       0.49616758, 0.4961663 , 0.49616502, 0.49616373, 0.49616245,
       0.49616116, 0.49615988, 0.49615859, 0.49615731, 0.49615602,
       0.49615473, 0.49615344, 0.49615215, 0.49615086, 0.49614957,
       0.49614828, 0.49614699, 0.4961457 , 0.49614441, 0.49614311,
       0.49614182, 0.49614052, 0.49613923, 0.49613793, 0.49613663,
       0.49613534, 0.49613404, 0.49613274, 0.49613144, 0.49613014,
       0.49612884, 0.49612754, 0.49612624, 0.49612493, 0.49612363,
       0.49612233, 0.49612102, 0.49611972, 0.49611841, 0.4961171 ,
       0.4961158 , 0.49611449, 0.49611318, 0.49611187, 0.49611056,
       0.49610925, 0.49610794, 0.49610663, 0.49610531, 0.496104  ,
       0.49610269, 0.49610137, 0.49610006, 0.49609874, 0.49609742,
       0.49609611, 0.49609479, 0.49609347, 0.49609215, 0.49609

In [17]:
# Fresh notebook-level Critic. ddpgKennedy builds its own critic internally and
# returns only the replay buffer, so this instance is not the one updated by that call.
critic = Critic()


array([[0.49618411],
       [0.49618338],
       [0.4961832 ],
       [0.49618292],
       [0.49618164],
       [0.49618037],
       [0.49617909],
       [0.49617781],
       [0.49617654],
       [0.49617526],
       [0.49617398],
       [0.4961727 ],
       [0.49617142],
       [0.49617014],
       [0.49616886],
       [0.49616758],
       [0.4961663 ],
       [0.49616502],
       [0.49616373],
       [0.49616245],
       [0.49616116],
       [0.49615988],
       [0.49615859],
       [0.49615731],
       [0.49615602],
       [0.49615473],
       [0.49615344],
       [0.49615215],
       [0.49615086],
       [0.49614957],
       [0.49614828],
       [0.49614699],
       [0.4961457 ],
       [0.49614441],
       [0.49614311],
       [0.49614182],
       [0.49614052],
       [0.49613923],
       [0.49613793],
       [0.49613663],
       [0.49613534],
       [0.49613404],
       [0.49613274],
       [0.49613144],
       [0.49613014],
       [0.49612884],
       [0.49612754],
       [0.496