Here I explore epsilon-greedy exploration with a DQN on the dataset I have created (circuits of depth 3 on 2 qubits).

Define the model

In [1]:
from vans_gym.envs import VansEnvsSeq
from vans_gym.solvers import CirqSolverR, Checker
import numpy as np
from tqdm import tqdm

# Two-qubit solver for the "Ising_" observable; `checker` wraps the same solver.
solver = CirqSolverR(
    n_qubits=2,
    observable_name="Ising_",
    qlr=0.05,
    qepochs=100,
)
checker = Checker(solver)

# Sequential environment over circuits of depth 3.
env = VansEnvsSeq(solver, checker=checker, depth_circuit=3)

# Alphabet size minus the number of qubits.
gates_number = len(solver.alphabet) - solver.n_qubits
# Run counter; it is incremented before a logged run to name its log directory.
current = 10

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np
import warnings
from collections import deque
import random
from tqdm import tqdm as tqdm
import os
import pickle
warnings.filterwarnings('ignore')

class Critic(tf.keras.Model):
    """Feed-forward Q-network producing one value per discrete action.

    The forward pass is Dense(60) -> ReLU -> Dense(60) -> ReLU ->
    Dense(n_actions) -> sigmoid, so every output lies in (0, 1).

    Args:
        tau: Mixing coefficient used by ``update_target_parameters``.
        seed_val: Accepted but not used anywhere in this class.
        n_actions: Number of actions, i.e. the width of the output layer.
    """

    def __init__(self, tau=0.05, seed_val=0.05, n_actions=6):
        super().__init__()

        self.tau = tau
        self.n_actions = n_actions

        self.l1 = Dense(60)
        self.l2 = Dense(60)
        self.l3 = Dense(n_actions)

    def update_target_parameters(self, primary_net):
        """Move this network's weights a fraction ``tau`` towards ``primary_net``.

        Every weight array becomes ``tau * primary + (1 - tau) * current``.
        """
        blended = [
            self.tau * primary + (1 - self.tau) * own
            for primary, own in zip(primary_net.get_weights(), self.get_weights())
        ]
        self.set_weights(blended)

    @tf.function
    def greedy_act(self, tf_state):
        """Return the index of the largest output along the last axis."""
        return tf.argmax(self(tf_state), axis=-1)

    def give_action(self, state, ep=0.01):
        """Choose an action epsilon-greedily.

        With probability ``ep`` a uniformly random action index is returned;
        otherwise the greedy action for ``state`` (a single, unbatched state).
        """
        explore = np.random.random() < ep
        if explore:
            return np.random.choice(range(self.n_actions))
        # The network expects a leading batch axis.
        batched_state = tf.expand_dims(np.array(state), axis=0)
        return self.greedy_act(batched_state).numpy()[0]

    def call(self, inputs):
        """Forward pass: two ReLU hidden layers, then a sigmoid output layer."""
        hidden = tf.nn.relu(self.l1(inputs))
        hidden = tf.nn.relu(self.l2(hidden))
        return tf.nn.sigmoid(self.l3(hidden))



class ReplayBuffer():
    """Fixed-capacity replay memory with priority-proportional sampling.

    Experiences are kept oldest-first. Each experience has a priority, and
    ``sample`` draws index ``i`` with probability ``p_i**ps / sum_j p_j**ps``.
    Once ``buffer_size`` experiences are stored, adding a new one evicts the
    oldest experience together with its priority.

    Args:
        buffer_size: Maximum number of experiences to retain.
    """

    def __init__(self, buffer_size=10**6):
        self.buffer_size = buffer_size
        # Number of experiences currently stored.
        self.count = 0
        # Experiences and their priorities, kept index-aligned.
        self.buffer = deque()
        self.priorities = deque()
        # Exponent applied to the priorities before normalising them.
        self.ps = .8

    def add(self, experience, priority=0):
        """Store ``experience`` with the given priority.

        A constant 0.01 is added to the priority so that zero-priority
        experiences can still be sampled. The stored priority must stay
        positive: a negative value raised to ``ps`` is NaN and would make
        ``sample`` fail.

        Raises:
            ValueError: If ``experience`` is not a tuple.
        """
        if not isinstance(experience, tuple):
            raise ValueError("buffer wants tuples!")
        if self.count >= self.buffer_size:
            # Evict the oldest entry from both deques so they stay aligned.
            self.buffer.popleft()
            self.priorities.popleft()
        else:
            self.count += 1
        self.buffer.append(experience)
        self.priorities.append(priority + 0.01)

    def size(self):
        """Return the number of stored experiences."""
        return self.count

    def sample(self, batch_size):
        """Draw experiences with replacement, weighted by priority.

        Returns ``min(batch_size, size())`` experiences as a list; an empty
        buffer yields an empty list.
        """
        if self.count == 0:
            return []
        weights = np.asarray(self.priorities, dtype=float) ** self.ps
        probabilities = weights / weights.sum()
        n_draws = min(self.count, int(batch_size))
        indices = np.random.choice(self.count, n_draws, p=probabilities)
        return [self.buffer[idx] for idx in indices]

    def clear(self):
        """Remove every experience and its priority."""
        self.buffer.clear()
        self.priorities.clear()
        self.count = 0


In [20]:
# Replay memory that add_experiences() fills through the module-level name `buffer`.
buffer = ReplayBuffer()

In [21]:
# Pre-computed lookup tables used by the rollout code instead of stepping the environment:
#   energies    -- indexed by str(state); the value is used as the terminal reward
#   next_states -- indexed by str(state), then str(action); gives the resulting state
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data_testing_algo/energies_3d_2q.pickle", "rb") as energies_file:
    energies = pickle.load(energies_file)

with open("data_testing_algo/next_states_3d_2q.pickle", "rb") as transitions_file:
    next_states = pickle.load(transitions_file)

In [22]:
def add_experiences():
    """Fill the global ``buffer`` with transitions from 1000 random episodes.

    Actions come from ``net.give_action`` with ``ep=1`` (always random) and
    transitions are read from the ``next_states`` table. An episode ends when
    the next state contains no ``-1`` entry (reward = its stored energy) or
    once the step counter has exceeded 5 (reward 0).
    """
    for _ in tqdm(range(1000)):
        state = env.reset()
        transitions = []
        steps_taken = 0
        finished = False
        while not finished:
            action = net.give_action(state, ep=1)
            state_key = str(np.array(state).astype(np.int64))
            next_state = next_states[state_key][str(action)]
            if np.any(next_state == -1):
                reward = 0.
                if steps_taken > 5:
                    finished = True
            else:
                finished = True
                reward = energies[str(np.array(next_state).astype(np.int64))]
            transitions.append((state, action, next_state, reward, finished))
            steps_taken += 1
            state = next_state
        # NOTE(review): every transition of the episode is stored with the
        # episode's final reward as its priority -- confirm this is intended.
        for transition in transitions:
            buffer.add(transition, priority=reward)

In [23]:
def train_step(buffer, net, optimizer, batch_size=32, gamma=1.0):
    """Run one DQN gradient step on a batch sampled from ``buffer``.

    The regression target is ``reward + gamma * (1 - done) * max_a Q(s', a)``,
    computed with ``net`` itself and treated as a constant, so gradients flow
    only through the prediction ``Q(s, a)``.

    Args:
        buffer: Object whose ``sample(batch_size)`` returns a list of
            ``(state, action, next_state, reward, done)`` tuples.
        net: Q-network; called on a batch of states and exposing
            ``n_actions`` and ``trainable_variables``.
        optimizer: Keras optimizer used to apply the gradients.
        batch_size: Number of transitions requested from the buffer.
        gamma: Discount factor applied to the bootstrap term (1.0 reproduces
            the undiscounted target).

    Returns:
        The scalar mean-squared-error loss tensor.
    """
    batch = buffer.sample(batch_size)
    states, actions, ns, rewards, dones = zip(*batch)

    # Convert explicitly to float32 so the layers do not have to auto-cast.
    state_batch = tf.convert_to_tensor(np.stack(states).astype(np.float32))
    next_batch = tf.convert_to_tensor(np.stack(ns).astype(np.float32))
    actions = np.asarray(actions, dtype=np.int32)
    rewards = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
    not_done = tf.convert_to_tensor(1.0 - np.asarray(dones, dtype=np.float32))

    # The bootstrap target must not be differentiated.
    next_q = tf.math.reduce_max(net(next_batch), axis=-1)
    target_q = tf.stop_gradient(rewards + gamma * not_done * next_q)

    with tf.GradientTape() as tape:
        qpreds = net(state_batch)
        action_mask = tf.one_hot(actions, net.n_actions, dtype=qpreds.dtype)
        # Keep only the value of the action that was actually taken.
        q_taken = tf.reduce_sum(qpreds * action_mask, axis=1)
        loss = tf.reduce_mean(tf.square(target_q - q_taken))

    grads = tape.gradient(loss, net.trainable_variables)
    optimizer.apply_gradients(zip(grads, net.trainable_variables))
    return loss

In [24]:
# Fresh replay buffer and network, then fill the buffer with random episodes.
buffer = ReplayBuffer()
net = Critic()
add_experiences()
# One forward pass on a dummy (1, 1, 3) input so the layer weights exist
# before training starts.
state = tf.random.uniform((1,1,3))
net(state)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=10**-2)

100%|██████████| 1000/1000 [00:00<00:00, 2725.56it/s]


In [25]:
# Each run logs to "<name>/<current><info>"; bumping `current` gives a new directory.
name = "runs"
current += 1
info = "lr10**-2"
fw_loss = tf.summary.create_file_writer(f"{name}/{current}{info}")
fw_greedy = tf.summary.create_file_writer(f"{name}/{current}{info}")
for step in tqdm(range(10**3)):
    # One gradient step, logged under 'loss'.
    loss = train_step(buffer, net, optimizer)
    with fw_loss.as_default():
        tf.summary.scalar('loss', loss, step=step)

    # Greedy (ep=0) rollout through the lookup tables to monitor the policy.
    state = env.reset()
    stuck_count = 0
    done = False
    while not done:
        action = net.give_action(state, ep=0)
        state_key = str(np.array(state).astype(np.int64))
        next_state = next_states[state_key][str(action)]
        if np.any(next_state == -1):
            # Still contains -1 entries: no reward; give up after a few steps.
            reward = 0.
            done = stuck_count > 3
        else:
            reward = energies[str(np.array(next_state).astype(np.int64))]
            done = True
        stuck_count += 1
        state = next_state

    with fw_greedy.as_default():
        tf.summary.scalar('greedy energy', tf.convert_to_tensor(reward), step=step)

  0%|          | 1/1000 [00:00<03:19,  5.01it/s]



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

100%|██████████| 1000/1000 [00:13<00:00, 76.81it/s]


In [26]:
# Network outputs for the state [5, 0, -1] (it still contains a -1 entry, so
# the rollout code above would treat it as non-terminal).
net(tf.expand_dims(np.array([5,0,-1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[0.99951386, 0.9948768 , 0.9503937 , 0.99988604, 0.05652143,
        0.15273295]], dtype=float32)>

In [13]:
# Stored energy for the state key "[5 0 3]".
energies["[5 0 3]"]

array(0.99999976, dtype=float32)

In [15]:
# Dump every (state key, energy) pair of the lookup table.
list(energies.items())

[('[0 1 0]', 0.0),
 ('[0 1 2]', array(-4.0865985e-17, dtype=float32)),
 ('[0 3 1]', array(2.435396e-17, dtype=float32)),
 ('[0 4 1]', -4.2146848e-08),
 ('[0 1 5]', -4.2146848e-08),
 ('[2 0 1]', array(-8.458828e-18, dtype=float32)),
 ('[2 0 3]', array(1.4974907e-17, dtype=float32)),
 ('[2 0 4]', array(-3.9502556e-08, dtype=float32)),
 ('[2 5 0]', array(-6.474444e-08, dtype=float32)),
 ('[0 3 0]', array(-5.2230593e-17, dtype=float32)),
 ('[0 4 3]', array(-5.2390664e-08, dtype=float32)),
 ('[0 3 5]', array(-5.2217576e-08, dtype=float32)),
 ('[0 4 0]', 4.371138e-08),
 ('[0 4 2]', array(0.9999929, dtype=float32)),
 ('[0 4 4]', -8.429369e-08),
 ('[0 4 5]', -8.429368e-08),
 ('[5 0 1]', 4.371138e-08),
 ('[5 0 3]', array(0.99999976, dtype=float32)),
 ('[5 0 4]', -8.429368e-08),
 ('[5 5 0]', -8.429369e-08),
 ('[1 0 1]', 0.0),
 ('[1 2 0]', array(-5.411548e-17, dtype=float32)),
 ('[1 0 3]', array(-1.0179708e-16, dtype=float32)),
 ('[1 0 4]', -4.2146848e-08),
 ('[1 5 0]', -4.2146848e-08),
 ('[1 2 1

In [17]:
# Network outputs for the state [5, -1, -1].
net(tf.expand_dims(np.array([5,-1,-1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[1.0381008, 1.056144 , 1.0013628, 1.0380746, 1.0129645, 1.0406094]],
      dtype=float32)>

In [27]:
# Network outputs for the state [-1, -1, -1].
net(tf.expand_dims(np.array([-1,-1,-1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[1.        , 0.99998057, 0.9999523 , 0.99999154, 0.99999976,
        0.99999964]], dtype=float32)>

In [28]:
# Network outputs for the state [0, -1, -1].
net(tf.expand_dims(np.array([0,-1,-1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[1.        , 0.99965024, 0.99985254, 0.9999442 , 0.9999994 ,
        0.9999975 ]], dtype=float32)>

In [29]:
# Network outputs for the state [0, 0, 1], which has no -1 entries.
net(tf.expand_dims(np.array([0,0,1]), axis=0))

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[0.7090682 , 0.9918829 , 0.31462577, 0.05386559, 0.01243235,
        0.00883244]], dtype=float32)>

OK! So the problem is that this sequential model collapses, because it relates everything with everything... Is there any smart way to add some structure that is not an RNN? Let's jump to an RNN!

In [30]:
# "[0 0 1]" is not a key of the energies table, so this lookup raises KeyError.
energies["[0 0 1]"]

KeyError: '[0 0 1]'

In [None]:
# Draw as many samples as the buffer holds and split the transitions into
# per-field arrays: states, actions, next states, rewards and done flags.
# NOTE(review): np.transpose on a list of mixed tuples relies on NumPy building
# an object array; recent NumPy releases may refuse ragged input -- confirm.
batch = buffer.sample(buffer.count)
states, actions, ns, rewards, dones = np.transpose(batch)

In [None]:
# Largest reward among the sampled transitions.
np.max(np.array(rewards))

In [None]:
def learning_step(critic, buffer, optimizer, batch_size=30):
    """Run one Q-learning update of ``critic`` against a target network.

    For each sampled transition the label of the taken action is ``reward``
    if the transition is terminal and ``reward + max_a Q_target(s', a)``
    otherwise. Labels of all other actions equal the current predictions,
    so they contribute nothing to the loss.

    The target network is read from the module-level name ``critic_target``
    (not a parameter); it is soft-updated towards ``critic`` after the step.

    Args:
        critic: Q-network being trained.
        buffer: Object whose ``sample(batch_size)`` returns a list of
            ``(state, action, next_state, reward, done)`` tuples.
        optimizer: Keras optimizer used to apply the gradients.
        batch_size: Number of transitions requested from the buffer.

    Returns:
        The mean-squared-error loss as a NumPy scalar.
    """
    batch = buffer.sample(batch_size)
    states, actions, next_states, rewards, dones = zip(*batch)

    state_batch = tf.convert_to_tensor(np.stack(states).astype(np.float32))
    next_batch = tf.convert_to_tensor(np.stack(next_states).astype(np.float32))
    actions = np.asarray(actions, dtype=np.int64)
    rewards = np.asarray(rewards, dtype=np.float32)
    # Truthiness, not identity: `x is False` is never true for NumPy booleans.
    dones = np.asarray(dones, dtype=bool)

    # Start from the current predictions and overwrite only the taken action.
    labels = critic(state_batch).numpy().copy()
    next_best = np.max(critic_target(next_batch).numpy(), axis=-1)
    targets = np.where(dones, rewards, rewards + next_best)
    labels[np.arange(len(actions)), actions] = targets

    with tf.GradientTape() as tape:
        qpreds = critic(state_batch)
        loss = tf.reduce_mean(tf.keras.losses.MSE(labels, qpreds))
    grads = tape.gradient(loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic.trainable_variables))
    critic_target.update_target_parameters(critic)
    return loss.numpy()

In [None]:
# Re-create the network and call it once on a random (1, 1, 3) input
# (presumably to build the layers before use).
net = Critic()
state = tf.random.uniform((1,1,3))
net(state)

In [None]:
import gym
from vans_gym.envs import VansEnvsSeq
from vans_gym.solvers import CirqSolverR, Checker
from vans_gym.models import DQN
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import pickle

# Two-qubit solver for the "Ising_" observable; `checker` wraps the same solver.
solver = CirqSolverR(
    n_qubits=2,
    observable_name="Ising_",
    qlr=0.05,
    qepochs=100,
)
checker = Checker(solver)

# Sequential environment over circuits of depth 3.
env = VansEnvsSeq(solver, checker=checker, depth_circuit=3)

# Alphabet size minus the number of qubits.
gates_number = len(solver.alphabet) - solver.n_qubits




In [None]:
# Library DQN agent from vans_gym.models, created with prioritised replay
# switched off (use_per=False). The exact meaning of learning_rate, tau and ep
# is defined by that class, which is not shown here.
Model = DQN(env,name="wed",use_per=False, learning_rate=0.1, tau=1, ep=1)

In [None]:
# Fill the agent's replay buffer with 1000 random (ep=1) episodes whose
# transitions are read from the lookup tables.
for episode in tqdm(range(1000)):
    state = env.reset()
    stuck_count = 0
    done = False
    while not done:
        action = Model.give_action(state, ep=1)
        state_key = str(np.array(state).astype(np.int64))
        next_state = next_states[state_key][str(action)]
        if np.any(next_state == -1):
            # Still contains -1 entries: no reward; stop once the counter exceeds 10.
            reward = 0.
            if stuck_count > 10:
                done = True
        else:
            done = True
            reward = energies[str(np.array(next_state).astype(np.int64))]
        Model.replay_buffer.add_experience(action, [state, next_state], reward, done)
        stuck_count += 1
        state = next_state
        
    

In [None]:
# 2000 learning steps; after each one a greedy rollout is logged.
for step in tqdm(range(2000)):
    # learn_step is called inside the writer context, as in the logging call itself.
    with Model.fw_loss.as_default():
        step_loss = Model.learn_step(batch_size=32)
        tf.summary.scalar('loss', step_loss, step=step)

    # Greedy (ep=0) rollout through the lookup tables.
    state = env.reset()
    stuck_count = 0
    done = False
    while not done:
        action = Model.give_action(state, ep=0)
        state_key = str(np.array(state).astype(np.int64))
        next_state = next_states[state_key][str(action)]
        if np.any(next_state == -1):
            reward = 0.
            done = stuck_count > 5
        else:
            reward = energies[str(np.array(next_state).astype(np.int64))]
            done = True
        stuck_count += 1
        state = next_state

    with Model.fw_greedy.as_default():
        tf.summary.scalar('greedy energy', tf.convert_to_tensor(reward), step=step)
