Here i explore ep-greedy with DQN for the dataset I have created (3d with 2 qubits)

Define the model

In [1]:
from vans_gym.envs import VansEnvsSeq
from vans_gym.solvers import CirqSolverR, Checker
import numpy as np
from tqdm import tqdm

solver = CirqSolverR(n_qubits = 2, observable_name="Ising_",qlr=0.05,qepochs=100)
checker = Checker(solver)

env = VansEnvsSeq(solver,checker=checker, depth_circuit=3)

gates_number = len(solver.alphabet) - solver.n_qubits
current=20

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np
import warnings
from collections import deque
import random
from tqdm import tqdm as tqdm
import os
import pickle
warnings.filterwarnings('ignore')

class Critic_rnn(tf.keras.Model):
    def __init__(self,tau=0.05, seed_val = 0.05, n_actions=6, state_shape=3):
        super(Critic_rnn,self).__init__()

        self.tau = tau
        self.n_actions = n_actions
        self.mask = tf.keras.layers.Masking(mask_value=-1.,input_shape=(state_shape,))
        
        self.lstm = tf.keras.layers.LSTM(250, return_sequences=True, input_shape=(state_shape,))
        self.l1 = Dense(60, kernel_regularizer=tf.keras.regularizers.l1(0.01))
        self.l2 = Dense(60,   kernel_regularizer=tf.keras.regularizers.l1(0.01))
        self.l3 = Dense(n_actions)
    


    def update_target_parameters(self,primary_net, tau=0.01):
        prim_weights = primary_net.get_weights()
        targ_weights = self.get_weights()
        weights = []
        for i in tf.range(len(prim_weights)):
            weights.append(tau * prim_weights[i] + (1 - tau) * targ_weights[i])
        self.set_weights(weights)
        return
    
    @tf.function
    def greedy_act(self, tf_state):
        return tf.argmax(self(tf_state), axis=-1)

    def give_action(self,state, ep=0.01):
        if np.random.random() < ep:
            random_action = np.random.choice(range(self.n_actions))
            return random_action
        else:
            idx = self.greedy_act(tf.expand_dims(np.array(state), axis=0))
            idx =idx.numpy()[0]
            return idx

    def call(self, inputs):
        feat = self.mask(tf.cast(inputs/self.n_actions, tf.float32))
        feat = self.l1(inputs)
        feat = tf.nn.relu(feat)
        feat = self.l2(feat)
        feat = tf.nn.relu(feat)
        feat = self.l3(feat)
        feat = tf.nn.tanh(feat)
        return feat



class ReplayBuffer():
    def __init__(self, buffer_size=10**6, ps=.3):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        self.priorities = []
        self.ps=ps
        
    def add(self, experience, priority=0):
        if not isinstance(experience, tuple):
            raise ValueError("buffer wants tuples!")
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
            self.priorities.append(priority) 
        else:
            self.buffer.popleft()
            self.buffer.append(experience)
            self.priorities[self.count] = priority  #the most recent is self.count

    def size(self):
        return self.count

    def sample(self, batch_size):
        batch = []
        pro=(np.array(self.priorities)**self.ps)/np.sum(np.array(self.priorities)**self.ps)
        if self.count < batch_size:
            indices = np.random.choice(range(self.count), self.count, p=pro)
        else:
            indices = np.random.choice(range(self.count), int(batch_size), p=pro)
        for idx in indices:
            batch.append(self.buffer[idx])
        return batch

    def clear(self):
        self.buffer.clear()
        self.count = 0
        

In [3]:
def rnn_train_step(buffer, net, tnet, optimizer, bs=64):
    
    states, actions, ns, rewards, dones = np.transpose(buffer.sample(bs))
    concst=tf.concat([tf.cast(tf.reshape(tf.stack(states), (bs,1,3)), tf.float32), tf.cast(tf.reshape(tf.stack(ns), (bs,1,3)), tf.float32)], axis=1)
    
    
    with tf.GradientTape() as tape:
        tape.watch(net.trainable_variables)
        
        
        preds = net(concst)
        predst = tnet(concst)

        argsgreedy=tf.argmax(preds[:,1,:], axis=-1)
        qnext = predst.numpy()[range(bs),1,argsgreedy]
        target_q = rewards + (1-dones)*qnext

        qvals_update = tf.reduce_sum(tf.multiply(preds[:,0,:], tf.keras.utils.to_categorical(actions, net.n_actions)), axis=1)

        loss=tf.keras.losses.MeanSquaredError()(tf.stack(target_q),qvals_update)

      
        g= tape.gradient(loss, net.trainable_variables)
    optimizer.apply_gradients(zip(g, net.trainable_variables))
    tnet.update_target_parameters(net, tau=0.1)
    return loss

In [4]:
ep=0.01
ps=0.5
net = Critic_rnn()
tnet = Critic_rnn()

net(tf.expand_dims(np.array([0,0,1]), axis=0))
tnet(tf.expand_dims(np.array([0,0,1]), axis=0))
net.compile(loss="mse")

tnet.update_target_parameters(net, tau=1)


optimizer = tf.keras.optimizers.Adam(lr=10**-2)
with open ("data_testing_algo/energies_3d_2q.pickle", "rb") as dictt:
    energies = pickle.load(dictt)

with open ("data_testing_algo/next_states_3d_2q.pickle", "rb") as dictt:
    next_states = pickle.load(dictt)

buffer = ReplayBuffer(ps=ps)
#add_experiences()

name="runsuc"
current+=1
info="ep_greedy_"+str(ep)+"ps_"+str(ps)
fw = tf.summary.create_file_writer(name+"/"+str(current)+info)
for k in tqdm(range(3*10**4)):
    state = env.reset()
    stuck_count=0
    episode=[]
    done=False
    while not done:
        action = net.give_action(state, ep=ep)
        next_state = next_states[str(np.array(state).astype(np.int64))][str(action)]
        if len(np.where(next_state == -1)[0])==0:
            done = True
            reward = energies[str(np.array(next_state).astype(np.int64))]
            if reward<.9:
                reward=-1.
        else:
            reward=0.
            if stuck_count>5:
                done=True
                reward=-1.
        if str(next_state) == str(state):
            reward=-1. #it get's stucked..
        episode.append((state, action, next_state, reward, done))

        stuck_count+=1
        state=next_state
    for step in episode:
        buffer.add(step, priority=max(10**-10,reward))

    if k>32:
        ll = rnn_train_step(buffer, net, tnet, optimizer)
        with fw.as_default():
             tf.summary.scalar('losss', ll, step=k)

    done = False
    state = env.reset()

    stuck_count=0
    while not done:
        action = net.give_action(state, ep=0)
        next_state = next_states[str(np.array(state).astype(np.int64))][str(action)]
        if len(np.where(next_state == -1)[0])==0:
            done = True
            reward = energies[str(np.array(next_state).astype(np.int64))]
        else:
            reward=0.
            if stuck_count>3:
                done=True
        stuck_count+=1
        state=next_state

    with fw.as_default():
        tf.summary.scalar('greedy energy', tf.convert_to_tensor(reward), step=k)

  1%|          | 353/30000 [00:08<09:48, 50.39it/s] 



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

100%|██████████| 30000/30000 [17:52<00:00, 27.98it/s]


In [None]:
states, actions, ns, rewards, dones = np.transpose(buffer.sample(buffer.count))

In [None]:
net(tf.expand_dims(np.array([0,4,-1]), axis=0))

In [6]:
net(tf.expand_dims(np.array([-1,-1,-1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[0.9993218 , 0.9996736 , 0.9995933 , 0.9926628 , 0.99998266,
        0.99998057]], dtype=float32)>

In [9]:
net.give_action([-1,-1,-1])

5

In [10]:
net.give_action([5,-1,-1])

2

In [15]:
solver.give_circuit([5,2,3])

((0, 0): ───Rz(th_0)───────────────

(0, 1): ───Rx(-0.5π)───Rz(th_1)───,
 ['th_0', 'th_1'])

In [19]:
net(np.array([[5,2,-1]]).astype(np.float32))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[1., 1., 1., 1., 1., 1.]], dtype=float32)>

In [21]:
net(np.array([[-1,-1,-1]]).astype(np.float32))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[1.       , 1.       , 0.9997144, 1.       , 1.       , 1.       ]],
      dtype=float32)>

In [12]:
net.give_action([5,2,5])

5

In [None]:
net.give_action([4,3,-1])

In [16]:
next_states

{'[0 0 0]': {},
 '[0 0 1]': {},
 '[0 0 2]': {},
 '[0 0 3]': {},
 '[0 0 4]': {},
 '[0 0 5]': {},
 '[ 0  0 -1]': {'0': array([ 0, -1, -1]),
  '1': array([ 1, -1, -1]),
  '2': array([ 2, -1, -1]),
  '3': array([ 3, -1, -1]),
  '4': array([ 4, -1, -1]),
  '5': array([ 5, -1, -1])},
 '[0 1 0]': {},
 '[0 1 1]': {},
 '[0 1 2]': {},
 '[0 1 3]': {},
 '[0 1 4]': {},
 '[0 1 5]': {},
 '[ 0  1 -1]': {'0': array([0, 1, 0]),
  '1': array([ 0, -1, -1]),
  '2': array([0, 1, 2]),
  '3': array([0, 3, 1]),
  '4': array([0, 4, 1]),
  '5': array([0, 1, 5])},
 '[0 2 0]': {},
 '[0 2 1]': {},
 '[0 2 2]': {},
 '[0 2 3]': {},
 '[0 2 4]': {},
 '[0 2 5]': {},
 '[ 0  2 -1]': {'0': array([ 2, -1, -1]),
  '1': array([2, 0, 1]),
  '2': array([ 2,  0, -1]),
  '3': array([2, 0, 3]),
  '4': array([2, 0, 4]),
  '5': array([2, 5, 0])},
 '[0 3 0]': {},
 '[0 3 1]': {},
 '[0 3 2]': {},
 '[0 3 3]': {},
 '[0 3 4]': {},
 '[0 3 5]': {},
 '[ 0  3 -1]': {'0': array([0, 3, 0]),
  '1': array([0, 3, 1]),
  '2': array([2, 0, 3]),
  '3'

In [None]:
net(tf.expand_dims(np.array([0,-1,-1]), axis=0))

In [None]:
net(tf.expand_dims(np.array([0,0,-1]), axis=0))

In [None]:
net(tf.expand_dims(np.array([1,2,-1]), axis=0))

OK! So the problem is that this fucking sequential thing collapses, because relates everything with everything... any smart way to put some structure that is not a RNN ? Let's jump to RNN !

In [None]:
energies["[0 0 1]"]

In [None]:
batch = buffer.sample(buffer.count)
states, actions, ns, rewards, dones = np.transpose(batch)

In [None]:
np.max(np.array(rewards))

In [None]:
def learning_step(critic, buffer, optimizer, batch_size=30):
    batch =buffer.sample(batch_size)
    states, actions, next_states, rewards, dones = np.transpose(batch)

    qpreds = critic(tf.stack(states))
    labels = qpreds.numpy()
    for inda, act in enumerate(actions):
        if dones[inda] is False:
            labels[inda,act] = np.max(np.squeeze(critic_target(tf.expand_dims(next_states[inda], axis=0))))
        else:
            labels[inda, act] = rewards[inda]


    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        qpreds = critic(tf.stack(states))

        loss = tf.keras.losses.MSE(labels, qpreds)
        loss = tf.reduce_mean(loss)
        grads = tape.gradient(loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic.trainable_variables))
    critic_target.update_target_parameters(critic)
    return loss.numpy()

In [None]:
net = Critic()
state = tf.random.uniform((1,1,3))
net(state)

In [None]:
import gym
from vans_gym.envs import VansEnvsSeq
from vans_gym.solvers import CirqSolverR, Checker
from vans_gym.models import DQN
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import pickle

solver = CirqSolverR(n_qubits = 2, observable_name="Ising_",qlr=0.05,qepochs=100)
checker = Checker(solver)

env = VansEnvsSeq(solver,checker=checker, depth_circuit=3)

gates_number = len(solver.alphabet) - solver.n_qubits




In [None]:
Model = DQN(env,name="wed",use_per=False, learning_rate=0.1, tau=1, ep=1)

In [None]:
for k in tqdm(range(1000)):
    done = False
    state = env.reset()
    stuck_count=0
    while not done:
        action = Model.give_action(state, ep=1)
        next_state = next_states[str(np.array(state).astype(np.int64))][str(action)]
        if len(np.where(next_state == -1)[0])==0:
            done = True
            reward = energies[str(np.array(next_state).astype(np.int64))]
        else:
            reward=0.
            if stuck_count>10:
                done=True
        Model.replay_buffer.add_experience(action, [state, next_state], reward, done)
        stuck_count+=1
        state=next_state
        
    

In [None]:
for k in tqdm(range(2000)):
    with Model.fw_loss.as_default():
        tf.summary.scalar('loss', Model.learn_step(batch_size=32), step=k)
        
    done = False
    state = env.reset()

    stuck_count=0
    while not done:
        action = Model.give_action(state, ep=0)
        next_state = next_states[str(np.array(state).astype(np.int64))][str(action)]
        if len(np.where(next_state == -1)[0])==0:
            done = True
            reward = energies[str(np.array(next_state).astype(np.int64))]
        else:
            reward=0.
            if stuck_count>5:
                done=True
        stuck_count+=1
        state=next_state

    with Model.fw_greedy.as_default():
        tf.summary.scalar('greedy energy', tf.convert_to_tensor(reward), step=k)
