In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm
tf.keras.backend.set_floatx('float32')
from collections import deque
from datetime import datetime
import random
import matplotlib

from plots import *
from misc import Prob, ps_maxlik, qval, record
from nets import *
from buffer import ReplayBuffer


In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np
# Request eager execution explicitly (TF 2.x already runs eagerly by default,
# so this presumably only matters on a TF 1.x install).
tf.compat.v1.enable_eager_execution()
# Notebook sanity check: evaluates to a boolean that is not stored anywhere.
tf.executing_eagerly()

class Critic(tf.keras.Model):
    """Recurrent Q-network (critic) for the Kennedy-receiver experiments.

    Architecture: ``Masking -> LSTM(250, return_sequences=True) -> Dense(50)
    -> Dense(50) -> Dense(1)`` with ReLU activations and a sigmoid on the
    output, so one value in (0, 1) is produced for every time step.

    Sequences are built by the ``process_sequence*`` helpers; for a single
    layer they read ``[[beta, pad], [outcome, guess]]``.

    Args:
        valreg: strength of the L1 kernel and L2 activity regularisers.
        seed_val: half-width of the uniform kernel/bias initialisers.
        pad_value: value handed to the ``Masking`` layer and stored as
            ``self.pad_value``.
    """
    # input_dim: 1 if layer=0, 3 if layer=2, for the Kennedy receiver

    def __init__(self, valreg=0.01, seed_val=0.1, pad_value=-7.):
        super(Critic, self).__init__()

        def _dense(units):
            # All dense layers share the same initialiser/regulariser recipe.
            return Dense(
                units,
                kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
                bias_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
                kernel_regularizer=tf.keras.regularizers.l1(valreg),
                activity_regularizer=tf.keras.regularizers.l2(valreg),
                dtype=tf.float32)

        # NOTE(review): the sequence helpers below pad with -4. by default,
        # whereas the Masking layer uses ``pad_value`` (-7. by default).
        # Presumably harmless because Masking only skips a step whose features
        # are ALL equal to mask_value -- confirm this mismatch is intended.
        self.pad_value = pad_value
        self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                            input_shape=(2, 2))
        self.lstm = tf.keras.layers.LSTM(250, return_sequences=True)

        # Layers are created in the original order so get_weights() /
        # set_weights() keep the same layout.
        self.l1 = _dense(50)
        self.l2 = _dense(50)
        self.l3 = _dense(1)

    def update_target_parameters(self, primary_net, tau=0.01):
        """Soft-update this (target) network towards ``primary_net``.

        Every weight becomes ``tau * primary + (1 - tau) * target``.

        Args:
            primary_net: network with the same weight layout as ``self``.
            tau: interpolation factor in [0, 1].
        """
        blended = [
            tau * primary + (1 - tau) * target
            for primary, target in zip(primary_net.get_weights(), self.get_weights())
        ]
        self.set_weights(blended)
        return

    def call(self, inputs):
        """Return one sigmoid-squashed value per time step of ``inputs``.

        Args:
            inputs: batch of sequences; the last axis holds the 2 features.

        Returns:
            Tensor whose last axis has size 1 (one value per time step).
        """
        feat = self.mask(inputs)

        feat = self.lstm(feat)
        # tf.nn.dropout is applied on every call (there is no training flag),
        # so repeated calls on the same input are not bit-identical.
        feat = tf.nn.dropout(feat, rate=0.01)

        feat = tf.nn.relu(self.l1(feat))
        feat = tf.nn.dropout(feat, rate=0.01)

        feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.sigmoid(self.l3(feat))
        return feat

    def process_sequence(self, sample_buffer, pad_value=-4., LAYERS=1):
        """Turn a batch of flat experiences into padded RNN sequences.

        Each row of ``sample_buffer`` is laid out as

            [a0, o1, a1, o2, a2, ..., reward]

        (for one layer: ``[beta, outcome, guess, reward]``). The output
        sequence of a row is ``[[a0, pad], [o1, a1], [o2, a2], ...]``, i.e.
        ``LAYERS + 1`` queries of two values each.

        Args:
            sample_buffer: 2-D numpy array, one experiment per row; the last
                column is the reward.
            pad_value: filler for the second entry of the first query.
            LAYERS: number of (outcome, action) pairs per experiment.

        Returns:
            ``(padded_data, rewards_obtained)`` with shapes
            ``(batch, LAYERS + 1, 2)`` and ``(batch, LAYERS + 1)``; the
            rewards are zero everywhere except in the last column.
        """
        batch_size = sample_buffer.shape[0]

        # The caller's pad_value is honoured (it used to be overwritten by a
        # hard-coded -4. inside the function).
        padded_data = np.full((batch_size, LAYERS + 1, 2), pad_value, dtype=float)
        padded_data[:, 0, 0] = sample_buffer[:, 0]
        for k in range(1, LAYERS + 1):
            # (o_k, a_k) sit in columns 2k-1 and 2k; for LAYERS=1 this is [1, 2].
            padded_data[:, k] = sample_buffer[:, 2 * k - 1:2 * k + 1]

        rewards_obtained = np.zeros((batch_size, LAYERS + 1), dtype=np.float32)
        rewards_obtained[:, -1] = sample_buffer[:, -1]
        return padded_data, rewards_obtained

    @tf.function
    def process_sequence_tf(self, sample_buffer, pad_value=-4., LAYERS=1):
        """TensorFlow counterpart of ``process_sequence``.

        Works on the ``sample_buffer`` argument (no module-level state) and
        for any batch size.

        Args:
            sample_buffer: 2-D array/tensor laid out as in
                ``process_sequence``.
            pad_value: filler for the second entry of the first query.
            LAYERS: number of (outcome, action) pairs per experiment
                (a Python int).

        Returns:
            ``(sequences, rewards)`` with shapes ``(batch, LAYERS + 1, 2)``
            and ``(batch, LAYERS + 1, 1)``, both float32.
        """
        sample_buffer = tf.cast(tf.convert_to_tensor(sample_buffer), tf.float32)

        first_action = sample_buffer[:, 0]
        steps = [tf.stack([first_action, pad_value * tf.ones_like(first_action)], axis=-1)]
        for k in range(1, LAYERS + 1):
            steps.append(sample_buffer[:, 2 * k - 1:2 * k + 1])
        # Stack once at the end; re-stacking inside the loop only produced a
        # valid (batch, steps, 2) tensor for LAYERS=1.
        sequences = tf.stack(steps, axis=1)

        rewards = tf.concat(
            [tf.zeros_like(sample_buffer[:, :LAYERS]), sample_buffer[:, -1:]], axis=1)
        rewards = tf.expand_dims(rewards, axis=2)
        return sequences, rewards

    def pad_single_sequence(self, seq, pad_value=-4., LAYERS=1):
        """Pad a single flat experience into a batch of one RNN sequence.

        input:  [a0, o1, a1, o2, a2, ...]
        output: [[[a0, pad], [o1, a1], [...]]]  (shape (1, LAYERS + 1, 2))

        The result can be fed to ``give_favourite_guess`` to obtain the
        greedy guess.

        Args:
            seq: flat sequence with at least ``2 * LAYERS + 1`` entries.
            pad_value: filler for the second entry of the first query.
            LAYERS: number of (outcome, action) pairs.
        """
        padded_data = np.full((1, LAYERS + 1, 2), pad_value, dtype=float)
        padded_data[0, 0, 0] = seq[0]
        for k in range(1, LAYERS + 1):
            padded_data[0, k] = seq[2 * k - 1:2 * k + 1]
        return padded_data

    def give_td_error_Kennedy_guess(self, batched_input, sequential_rews_with_zeros):
        """Build the TD targets of a batch for the single-layer receiver.

        For a datapoint ``[(beta, pad), (n, guess)]`` the targets are
        ``[max_g Q(beta, n, g), reward]``: the first entry bootstraps from
        the best final-step value over both guesses (+guess / -guess), the
        second one is the observed reward.

        TO DO: extend this to more layers, i.e.
        ``[max_{a_1} Q(a0, o1, a_1), max_{a_2} Q(a0, o1, a_1, o2, a_2), ...,
        max_g Q(h, guess)]``, where the max over continuous actions has to be
        replaced by the target actor's choice.

        Args:
            batched_input: numpy array ``(batch, 2, 2)`` from
                ``process_sequence``; it is not modified.
            sequential_rews_with_zeros: numpy array ``(batch, 2)`` of
                zero-padded rewards; it is not modified.

        Returns:
            numpy array of shape ``(batch, 1, 2)``.
            NOTE(review): ``call`` returns ``(batch, 2, 1)``, so callers must
            reshape these labels before an element-wise loss; otherwise the
            two arrays broadcast against each other.
        """
        flipped_input = batched_input.copy()
        flipped_input[:, -1, 1] = -flipped_input[:, -1, 1]
        labels = sequential_rews_with_zeros.copy()

        # Only the final-step outputs are Q(beta, n, guess); the first-step
        # outputs must not take part in the max over guesses.
        q_given = self(batched_input)[:, -1, 0]
        q_flipped = self(flipped_input)[:, -1, 0]
        best_q = tf.maximum(q_given, q_flipped).numpy()

        labels[:, 0] = best_q + labels[:, 0]
        return np.expand_dims(labels, axis=1)

    @tf.function
    def give_td_error_Kennedy_guess_tf(self, batched_input, batched_zeroed_reward):
        """TensorFlow counterpart of ``give_td_error_Kennedy_guess``.

        Args:
            batched_input: tensor ``(batch, 2, 2)`` laid out as
                ``[[beta, pad], [outcome, guess]]``.
            batched_zeroed_reward: tensor ``(batch, 2, 1)`` as returned by
                ``process_sequence_tf``.

        Returns:
            Tensor ``(batch, 2, 1)`` holding
            ``[max_g Q(beta, n, g), reward]`` per sample.
        """
        first_step, last_step = tf.unstack(batched_input, axis=1)
        outcome, guess = tf.unstack(last_step, axis=1)
        flipped_last = tf.stack([outcome, -guess], axis=1)
        # Re-assemble along the TIME axis (axis=1) so the layout matches
        # batched_input; stacking on axis=2 would swap time and features.
        flipped_input = tf.stack([first_step, flipped_last], axis=1)

        q_given = self(batched_input)[:, -1]      # (batch, 1)
        q_flipped = self(flipped_input)[:, -1]    # (batch, 1)
        best_q = tf.maximum(q_given, q_flipped)

        return tf.stack([best_q, batched_zeroed_reward[:, 1]], axis=1)

    def give_favourite_guess(self, sequence):
        """Return the guess with the larger final-step Q-value.

        Args:
            sequence: array-like of shape ``(1, 2, 2)`` laid out as
                ``[[beta, pad], [outcome, guess]]``; it is not modified.

        Returns:
            ``guess`` itself if the critic rates it at least as high as
            ``-guess``, otherwise ``-guess`` (as a Python float).
        """
        given = np.array(sequence, copy=True)
        flipped = given.copy()
        flipped[:, -1, 1] = -flipped[:, -1, 1]

        # Compare only the outputs of the last query: those are the values of
        # the two candidate guesses. (Mapping an argmax over all four outputs
        # through (-1)**index mixed in first-step values and could not tell
        # the two guesses apart.)
        q_given = float(self(given)[0, -1, 0])
        q_flipped = float(self(flipped)[0, -1, 0])

        best = given if q_given >= q_flipped else flipped
        return float(best[0, -1, 1])

# Notebook smoke test: load a saved batch of experiences from the working
# directory and push it through the tf.function helpers of Critic.
experiences= np.load("experiences.npy")
critic = Critic()
# b: padded sequences, rews: zero-padded rewards (see process_sequence_tf).
b, rews = critic.process_sequence_tf(experiences)
# The result is not assigned; in a notebook it is shown as the cell output.
critic.give_td_error_Kennedy_guess_tf(b, rews)

<tf.Tensor: shape=(64, 2, 1), dtype=float32, numpy=
array([[[0.48438182],
        [0.        ]],

       [[0.48429245],
        [1.        ]],

       [[0.48446605],
        [1.        ]],

       [[0.4844515 ],
        [1.        ]],

       [[0.48444903],
        [0.        ]],

       [[0.4843577 ],
        [0.        ]],

       [[0.48448274],
        [1.        ]],

       [[0.4842738 ],
        [0.        ]],

       [[0.48434904],
        [1.        ]],

       [[0.4843511 ],
        [0.        ]],

       [[0.48427483],
        [1.        ]],

       [[0.48421592],
        [0.        ]],

       [[0.4842314 ],
        [1.        ]],

       [[0.48458126],
        [1.        ]],

       [[0.48445517],
        [1.        ]],

       [[0.4843235 ],
        [0.        ]],

       [[0.48431727],
        [1.        ]],

       [[0.4842989 ],
        [1.        ]],

       [[0.48438513],
        [0.        ]],

       [[0.4844734 ],
        [0.        ]],

       [[0.48428857],
      

In [None]:
# IPython magic (not plain Python): executes td_errs.py in the notebook
# namespace; -t asks IPython to report timing information.
%run -t td_errs.py

In [None]:
# Picks entry [1][1] of every sample in `bb`.
# NOTE(review): `bb` is not defined in the visible cells -- presumably a padded
# batch of shape (batch, 2, 2) created by the %run above; confirm.
guesses = np.array(tf.unstack(bb,axis=0))[:,1][:,1]

In [None]:
# Scratch check of the MSE between labels and predictions.
# NOTE(review): `ll` and `preds1` are not defined at module level in the
# visible cells -- presumably left in the namespace by the %run above; confirm.
tf.keras.losses.MSE(ll,preds1)

In [None]:


def optimization_step(experiences, critic, critic_target, actor, optimizer_critic, optimizer_actor, train_loss, tau=0.05):
    """Run one DDPG update of the critic, the target critic and the actor.

    Steps:
      1. Fit ``critic`` to TD labels produced by ``critic_target`` (MSE).
      2. Soft-update ``critic_target`` towards ``critic`` with factor ``tau``.
      3. Move the actor along the deterministic policy gradient
         ``dQ/da * da/dtheta``.

    Args:
        experiences: batch sampled from the replay buffer (2-D numpy array,
            as consumed by ``critic.process_sequence``).
        critic: critic being trained.
        critic_target: slowly-moving copy used to build the labels.
        actor: network producing the displacement; it is fed a zero input.
        optimizer_critic: optimizer applied to the critic variables.
        optimizer_actor: optimizer applied to the actor variables.
        train_loss: Keras metric that accumulates the critic loss.
        tau: soft-update factor for the target critic (default 0.05, the
            value that was previously hard-coded).
    """
    sequences, zeroed_rews = critic.process_sequence(experiences)
    labels_critic = critic_target.give_td_error_Kennedy_guess(sequences, zeroed_rews)

    # ---- critic update ----
    with tf.GradientTape() as tape:
        preds_critic = critic(sequences)
        # The labels arrive as (batch, 1, steps) while the predictions are
        # (batch, steps, 1). Without aligning them, MSE silently broadcasts to
        # (batch, steps, steps) and fits every prediction to every label.
        # The reshape only moves a singleton axis (a no-op if already aligned).
        labels_aligned = tf.reshape(
            tf.cast(labels_critic, preds_critic.dtype), tf.shape(preds_critic))
        loss_critic = tf.reduce_mean(tf.keras.losses.MSE(labels_aligned, preds_critic))
    # Gradients are taken after leaving the tape context so the backward pass
    # itself is not recorded; trainable variables are watched automatically.
    grads = tape.gradient(loss_critic, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
    train_loss(loss_critic)

    critic_target.update_target_parameters(critic, tau=tau)

    # ---- actor update: dQ/da ----
    actor_input = np.expand_dims(np.zeros(len(experiences)), axis=1)
    ones = tf.ones(shape=(experiences.shape[0], 1))
    with tf.GradientTape() as tape:
        actions = tf.cast(actor(actor_input), tf.float32)  # This can be improved i think!! (the conversion... )
        tape.watch(actions)
        # NOTE(review): the second feature of this single-step query is 1.,
        # whereas process_sequence pads the first step with -4. by default --
        # confirm which value the critic is meant to see here.
        qvals = critic(tf.expand_dims(tf.concat([actions, ones], axis=1), axis=1))
    dq_da = tape.gradient(qvals, actions)

    # ---- actor update: chain rule through the actor, ascending Q ----
    with tf.GradientTape() as tape:
        actionss = tf.cast(actor(actor_input), tf.float32)
    da_dtheta = tape.gradient(actionss, actor.trainable_variables, output_gradients=-dq_da)

    optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    return
    ###### END OF OPTIMIZATION STEP ######


def ddpgKennedy(special_name="", total_episodes=10**3, buffer_size=500, batch_size=64, ep_guess=0.01,
                noise_displacement=0.5, lr_actor=0.01, lr_critic=0.001, tau=0.005, repetitions=1, plots=True):
    """Train the single-layer Kennedy receiver with DDPG and plot the results.

    Every episode draws Alice's phase, applies a noisy displacement proposed
    by the actor, samples a detection outcome, picks a guess
    (epsilon-greedy on the critic), stores the transition in the replay
    buffer and performs one ``optimization_step``. Results are written under
    ``results/<run name>`` and finally handed to ``BigPlot``.

    Args:
        special_name: sub-directory of ``results``; when empty, ``record()``
            supplies a run number and ``results/run_<n>`` is used.
        total_episodes: number of episodes.
        buffer_size: capacity of the replay buffer.
        batch_size: number of experiences sampled per optimization step
            (cast to ``int``).
        ep_guess: probability of a uniformly random guess.
        noise_displacement: half-width of the uniform exploration noise
            added to the actor's displacement.
        lr_actor: Adam learning rate of the actor.
        lr_critic: Adam learning rate of the critic.
        tau: NOTE(review): only written to ``info.txt``; it is not passed to
            ``optimization_step`` here -- confirm whether that is intended.
        repetitions: NOTE(review): only written to ``info.txt``.
        plots: NOTE(review): not read by this function; ``BigPlot`` is
            always called.

    Returns:
        None.
    """
    os.makedirs("results", exist_ok=True)

    # Sweeps sometimes pass floats (e.g. 64.); a sample count must be integral.
    batch_size = int(batch_size)

    amplitude = 0.4
    buffer = ReplayBuffer(buffer_size=buffer_size)

    critic = Critic()
    critic_target = Critic()
    actor = Actor(input_dim=1)
    # actor_target = Actor(input_dim=1) THIS IS NOT REQUIRED FOR THE FIRST LAYER ONLY

    actor(np.array([[0.]]).astype(np.float32))  # initialize the network 0, arbitrary inputs.

    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    optimizer_critic = tf.keras.optimizers.Adam(learning_rate=lr_critic)
    optimizer_actor = tf.keras.optimizers.Adam(learning_rate=lr_actor)

    rt = []
    pt = []

    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    # test_loss is never updated in this function; it is only reported.
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)

    if special_name == "":
        # current_run_and_time = "results/{}".format(datetime.now().strftime("%Y%m%d-%H%M"))
        numb = record()
        current_run_and_time = "results/run_" + str(numb)
    else:
        current_run_and_time = "results/" + special_name

    directory = current_run_and_time
    train_log = current_run_and_time + '/train_l0'
    test_log = current_run_and_time + '/test_l0'

    # Make sure the run directory exists before info.txt is written into it.
    os.makedirs(directory, exist_ok=True)

    # The writers are created for their log directories; nothing is written
    # to them in this function.
    train_summary_writer = tf.summary.create_file_writer(train_log)
    test_summary_writer_0 = tf.summary.create_file_writer(test_log)

    info_optimizers = "optimizer_critic_guess: {} \nOptimizer_actor_l0: {}\n".format(optimizer_critic.get_config(), optimizer_actor.get_config())
    infor_buffer = "Buffer_size: {}\n Batch_size for sampling: {}\n".format(buffer.buffer_size, batch_size)
    info_epsilons = "epsilon-guess: {}\nepsilon_displacement_noise: {}".format(ep_guess, noise_displacement)

    data = "tau: {}, repetitions per optimization step (would be like epochs): {}".format(tau, repetitions) + "\n \n**** optimizers ***\n" + info_optimizers + "\n\n\n*** BUFFER ***\n" + infor_buffer + "\n\n\n *** NOISE PARAMETERS *** \n" + info_epsilons
    with open(directory + "/info.txt", 'w') as f:
        f.write(data)  # the with-block closes the file

    print("Beggining to train! \n \n")
    print(data)
    print("starting time: {}".format(datetime.now().strftime("%Y%m%d-%H%M%S")))
    print("saving results in " + str(directory))
    avg_train = []
    avg_test = []

    history_betas = []  # to put in histogram
    history_betas_would_have_done = []  # to put in histogram
    histo_preds = {"layer0": {}, "layer1": {}}  # here i save the predictions to plot in a "straightforward way"

    # Report ten times per run; integer arithmetic keeps this well defined
    # when total_episodes is not a multiple of 10 (or is smaller than 10).
    report_every = max(total_episodes // 10, 1)

    for episode in tqdm(range(total_episodes)):

        # ---- interact with the environment ----
        alice_phase = np.random.choice([-1., 1.], 1)[0]
        beta_would_do = actor(np.array([[0.]])).numpy()[0][0]
        beta = beta_would_do + np.random.uniform(-noise_displacement, noise_displacement)
        proboutcome = Prob(alice_phase * amplitude, beta, 0)
        outcome = np.random.choice([0., 1.], 1, p=[proboutcome, 1 - proboutcome])[0]

        history_betas.append(beta)
        history_betas_would_have_done.append(beta_would_do)

        # ---- epsilon-greedy guess ----
        if np.random.random() < ep_guess:
            guess = np.random.choice([-1., 1.], 1)[0]
        else:
            sequence = np.array([[[beta, critic.pad_value], [outcome, -1.]]]).astype(np.float32)
            guess = critic.give_favourite_guess(sequence)
        if guess == alice_phase:
            reward = 1.
        else:
            reward = 0.
        buffer.add(beta, outcome, guess, reward)

        ###### OPTIMIZATION STEP ######
        experiences = buffer.sample(batch_size)
        optimization_step(experiences, critic, critic_target, actor, optimizer_critic, optimizer_actor, train_loss)

        avg_train.append(train_loss.result().numpy())
        avg_test.append(test_loss.result().numpy())
        rt.append(reward)

        ########################################################################
        ### calculate success probability if the agent went greedy ###########
        p = 0
        # Separate loop names so the episode's own `outcome` is not shadowed.
        for possible_outcome in [0., 1.]:
            greedy_guess = critic.give_favourite_guess(
                critic.pad_single_sequence([beta_would_do, possible_outcome, -1.]))
            p += Prob(greedy_guess * amplitude, beta_would_do, possible_outcome)
        p /= 2
        pt.append(p)
        ################

        if episode % report_every == 0:  # this is for showing 10 results in total.

            # Backslashes are escaped; the printed text is unchanged.
            template = 'Episode {}, \\Rt: {}, \\Pt: {}, Train loss: {}, Test loss: {}\n\n'
            print(template.format(episode + 1,
                                  np.sum(rt) / (episode + 1),
                                  pt[-1],
                                  np.round(train_loss.result().numpy(), 5),
                                  np.round(test_loss.result().numpy(), 5)))

            for layer in ["layer0", "layer1"]:  # net_0 will be critic_q0, net_1 will be critic_qguess
                histo_preds[layer][str(episode)] = {}
                histo_preds[layer][str(episode)]["episode"] = episode
                histo_preds[layer][str(episode)]["values"] = {}

            # Critic predictions on the displacements stored in the buffer;
            # the remaining columns are placeholders overwritten below.
            simp = np.random.randn(len(buffer.betas), 4)
            simp[:, 0] = buffer.betas
            qvals0 = np.squeeze(critic(critic.process_sequence(simp)[0]).numpy()[:, 0])
            histo_preds["layer0"][str(episode)]["values"] = qvals0

            index = 0
            for n1 in [0., 1.]:
                for candidate_guess in [-1., 1.]:
                    simp[:, 1] = n1
                    simp[:, 2] = candidate_guess
                    qvals1 = np.squeeze(critic(critic.process_sequence(simp)[0]).numpy()[:, 1])
                    histo_preds["layer1"][str(episode)]["values"][str(index)] = qvals1
                    index += 1

    # Running average of the reward: cumulative sum INCLUDING the current
    # episode divided by the number of episodes so far (rt[:k] dropped the
    # current reward and made the curve lag by one episode).
    rt = np.cumsum(rt) / np.arange(1, len(rt) + 1)

    losses = [avg_train, avg_test]

    BigPlot(buffer, rt, pt, history_betas, history_betas_would_have_done, histo_preds, losses, directory)
    return


if __name__ == "__main__":
    # Hyper-parameter sweep: one full training run per combination.
    info_run = ""
    to_csv = []
    for tau in [0.001]:
        for lr_critic in [.001]:
            for noise_displacement in [.2]:
                # Batch sizes are sample counts, so they are integers
                # (they used to be the floats 8., 16., 32., 64.).
                for batch_size in [8, 16, 32, 64]:

                    # name_run = datetime.now().strftime("%m-%d-%H-%-M%-S")

                    # ddpgKennedy returns None; the name is kept for later use.
                    name_run = ddpgKennedy(total_episodes=500, noise_displacement=noise_displacement, tau=tau,
                                           buffer_size=10**3, batch_size=batch_size, lr_critic=lr_critic,
                                           lr_actor=0.001, plots=True)

