In [12]:
import gym
import keras
import datetime as dt
import tensorflow as tf
import random
import numpy as np
import math
from tensorflow.keras.layers import Dense
from tqdm import tqdm

STORE_PATH = '/run'
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.0005
GAMMA = 0.95
BATCH_SIZE = 32
TAU = 0.08
RANDOM_REWARD_STD = 1.0
env = gym.make("CartPole-v1")
# env = gym.make("MountainCar-v0")
state_size = 4
num_actions = env.action_space.n



class Memory():
    def __init__(self, max_memory):
        self._max_memory = max_memory
        self._samples = []
    def add_sample(self, sample):
        self._samples.append(sample)
        if len(self._samples) > self._max_memory:
            self._samples.pop(0)
    def sample(self, no_samples):
        if no_samples > len(self._samples):
            return random.sample(self._samples, len(self._samples))
        else:
            return random.sample(self._samples, no_samples)
    @property
    def num_samples(self):
        return len(self._samples)
        
memory = Memory(50000)


class QN(tf.keras.Model):
    def __init__(self):
        super(QN,self).__init__()
        self.l1 = Dense(30, input_shape=(4,), kernel_initializer='random_uniform',
                bias_initializer='random_uniform')
        self.l2 = Dense(30, kernel_initializer='random_uniform',
                bias_initializer='random_uniform')
        self.l3 = Dense(num_actions, kernel_initializer='random_uniform',
                bias_initializer='random_uniform')

    def call(self, input):
        feat = tf.nn.relu(self.l1(input))
        feat = tf.nn.relu(self.l2(feat))
        value = self.l3(feat)
        return value

def choose_action(state, primary_network, eps):
    if random.random() < eps:
        return random.randint(0, num_actions - 1)
    else:
        state = np.expand_dims(np.array(state),axis=0) #otherwise throuhg eerror..
        return np.argmax(primary_network(state))


def train(primary_network, memory, tarket_network):
    if memory.num_samples < BATCH_SIZE*3:
        return 0
    else:
        print("doing!!!")
        batch = memory.sample(BATCH_SIZE)
        states = np.array([val[0] for val in batch])
        actions = np.array([val[1] for val in batch])
        rewards = np.array([val[2] for val in batch])
        next_states = np.array([(np.zeros(state_size)
                                 if val[3] is None else val[3]) for val in batch])

        prim_qt = primary_network(np.expand_dims(states,axis=0)) # Q_t[s,a]
        prim_qtp1 = primary_network(np.expand_dims(next_states,axis=0)) #Q_{t+1}[s,a]

        updates = rewards
        valid_idxs = np.array(next_states).sum(axis=1) != 0
        batch_idxs = np.arange(BATCH_SIZE)

        opt_q_tp1_eachS = np.argmax(np.squeeze(prim_qtp1.numpy()), axis=0)
        q_from_target = target_network(np.expand_dims(next_states), axis=0)

        updates[valid_idx] += GAMMA*np.squeeze(q_from_target.numpy())[valid_idx, opt_q_tp1_eachS[valid_idx]]

        target_q = np.squeeze(prim_qt.numpy())
        target_q[batch_idxs, actions] = updates

        with tf.device("/cpu:0"):
            with tf.GradientTape() as tape:
                tape.watch(primary_network.trainable_variables)
                predicted_q = primary_network(states)
                target_q = np.expand_dims(target_q,axis=0)
                loss = tf.keras.losses.MSE(predicted_q, target_q)
                loss = tf.reduce_mean(loss)
                print(loss)
                grads = tape.gradient(loss, primary_network.trainable_variables)
                optimizer.apply_gradients(zip(grads, primary_network.trainable_variables))

        for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
            t.assign(t*(1-TAU) + e*TAU)

        return loss




primary_network = QN()
target_network = QN()
optimizer = tf.keras.optimizers.Adam(lr=0.01)




num_episodes  = 10**4
eps = MAX_EPSILON
render = False
# train_writer = tf.summary.create_file_writer("summarie/"+f"/DoubleQ_{dt.datetime.now().strftime('%d%m%Y%H%M')}")
steps = 0

for i in tqdm(range(num_episodes)):
    state = env.reset()
    cnt=0
    avg_loss=0
    while True:
        action = choose_action(state, primary_network, eps)
        next_state, reward, done, info = env.step(action)
        reward = np.random.normal(1.0, RANDOM_REWARD_STD)
        if done:
            next_state = None
        memory.add_sample((state, action, reward, next_state))
        loss = train(primary_network, memory, target_network)
        avg_loss += loss
        state = next_state
        steps +=1
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON)*np.exp(- LAMBDA*steps)
        if done:
            avg_loss /= cnt
            # print(f"Episode: {i}, Reward: {cnt}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
            # with train_writer.as_default():
            #     tf.summary.scalar('reward', cnt, step=i)
            #     tf.summary.scalar('avg loss', avg_loss, step=i)
            break
        cnt += 1


  0%|          | 0/10000 [00:00<?, ?it/s]W1030 13:45:00.939733 140192390350656 base_layer.py:1814] Layer qn_8 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



doing!!!





TypeError: expand_dims() missing 1 required positional argument: 'axis'

In [4]:
primary_network = QN()
target_network = QN()
optimizer = tf.keras.optimizers.Adam(lr=0.001)


In [13]:
num_episodes  = 10**4
eps = MAX_EPSILON
render = False
train_writer = tf.summary.create_file_writer("summarie/"+f"/DoubleQ_{dt.datetime.now().strftime('%d%m%Y%H%M')}")
steps = 0

for i in range(num_episodes):
    state = env.reset()
    cnt=0
    avg_loss=0
    while True:
        action = choose_action(state, primary_network, eps)
        next_state, reward, done, info = env.step(action)
        reward = np.random.normal(1.0, RANDOM_REWARD_STD)
        if done:
            next_state = None
        memory.add_sample((state, action, reward, next_state))
        loss = train(primary_network, memory, target_network)
        avg_loss += loss
        state = next_state
        steps +=1
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON)*np.exp(- LAMBDA*steps)
        if done:
            avg_loss /= cnt
            print(f"Episode: {i}, Reward: {cnt}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
            with train_writer.as_default():
                tf.summary.scalar('reward', cnt, step=i)
                tf.summary.scalar('avg loss', avg_loss, step=i)
            break
        cnt += 1

doing!!!


TypeError: expand_dims() missing 1 required positional argument: 'axis'