In [None]:
!pip install -U tensorflow tensorflow_probability

In [2]:
import tensorflow as tf
import tensorflow_probability as tfp

print(tf.__version__)

2.7.0


In [12]:
import random
import gym 
import numpy as np
from collections import deque

In [4]:
class ReplayBuffer:
    """
    Replay Buffer
    Stores gameplay experiences in a bounded FIFO and samples random
    training batches from them.
    """

    def __init__(self, max_size=1000000, batch_size=128):
        """
        :param max_size: maximum number of experiences kept; once the buffer
        is full the oldest experiences are dropped first
        :param batch_size: default number of experiences drawn per sample
        """
        self.batch_size = batch_size
        self.gameplay_experiences = deque(maxlen=max_size)

    def store_gameplay_experience(self, state, next_state, reward, action,
                                  done):
        """
        Records a single step (state transition) of gameplay experience.
        :param state: the current game state
        :param next_state: the game state after taking action
        :param reward: the reward taking action at the current state brings
        :param action: the action taken at the current state
        :param done: a boolean indicating if the game is finished after
        taking the action
        :return: None
        """
        self.gameplay_experiences.append((state, next_state, reward, action,
                                          done))

    def sample_gameplay_batch(self, batch_size=None):
        """
        Samples a batch of gameplay experiences (without replacement) for
        training.
        :param batch_size: number of experiences to draw; defaults to the
        batch size given at construction. It is capped at the number of
        experiences currently stored.
        :return: a tuple of five numpy arrays, in this order:
        (states, next_states, actions, rewards, dones). Note that actions
        come before rewards here, unlike the storage order.
        """
        if batch_size is None:
            batch_size = self.batch_size
        batch_size = min(batch_size, len(self.gameplay_experiences))
        sampled_gameplay_batch = random.sample(
            self.gameplay_experiences, batch_size)

        # Stored tuple layout: (state, next_state, reward, action, done).
        state_batch = [exp[0] for exp in sampled_gameplay_batch]
        next_state_batch = [exp[1] for exp in sampled_gameplay_batch]
        reward_batch = [exp[2] for exp in sampled_gameplay_batch]
        action_batch = [exp[3] for exp in sampled_gameplay_batch]
        done_batch = [exp[4] for exp in sampled_gameplay_batch]
        return np.array(state_batch), np.array(next_state_batch), np.array(
            action_batch), np.array(reward_batch), np.array(done_batch)


In [5]:
class DQN_Model(tf.keras.Model):
  '''Q network: maps a batch of states to one predicted Q value per action.'''

  def __init__(self, action_size=2):
    '''
    action_size: number of units in the linear output layer (one per action).
    '''
    super().__init__()
    # Two ReLU hidden layers of 30 units followed by a linear output head.
    self.d1 = tf.keras.layers.Dense(30, activation='relu')
    self.d2 = tf.keras.layers.Dense(30, activation='relu')
    self.out = tf.keras.layers.Dense(action_size, activation='linear')

  def call(self, input_data):
    '''Converts input_data to a tensor and runs it through d1 -> d2 -> out.'''
    hidden = self.d1(tf.convert_to_tensor(input_data))
    hidden = self.d2(hidden)
    return self.out(hidden)


In [70]:
class DQN_Agent:
  '''
  DQN agent made of an online Q network (trained) and a target Q network
  (used only to compute bootstrap targets, refreshed on demand through
  update_target_network).
  '''

  def __init__(self, gamma=0.95, epsilon=0.05, action_size=2, state_size=4,
               learning_rate=0.005):
    '''
    gamma: discount factor applied to the bootstrapped next-state value
    epsilon: probability of taking a uniformly random action while collecting
    action_size: number of discrete actions (width of the Q network output)
    state_size: length of a single state vector
    learning_rate: learning rate of the Adagrad optimizer
    '''
    self.gamma = gamma
    self.epsilon = epsilon
    self.action_size = action_size
    self.state_size = state_size
    self.q_net = DQN_Model(action_size)
    self.target_q_net = DQN_Model(action_size)
    self.opt = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
    self.loss_fn = tf.keras.losses.MeanSquaredError()

    # Run both networks once so their weights exist, then start the target
    # network as an exact copy of the online network. Without this,
    # update_target_network() fails when called before the first training
    # step and the target starts from unrelated random weights.
    dummy_state = tf.zeros([1, state_size], dtype=tf.float32)
    self.q_net(dummy_state)
    self.target_q_net(dummy_state)
    self.update_target_network()

  def collect_policy(self, state):
    '''
    Epsilon-greedy policy used while collecting experience.
    state: a single state vector, shape=(state_size,)
    returns: the chosen action as a Python int
    '''
    if np.random.random() < self.epsilon:
      return int(np.random.randint(0, self.action_size))
    return self.policy(state)

  def policy(self, state):
    '''
    Greedy policy: the action with the highest predicted Q value.
    state: a single state vector, shape=(state_size,)
    returns: the chosen action as a Python int
    '''
    q_values = self.q_net(tf.reshape(state, [-1, self.state_size]))
    action = tf.math.argmax(q_values, axis=1)
    return int(action.numpy()[0])

  def update_target_network(self):
    """
    Updates the current target_q_net with the q_net which brings all the
    training in the q_net to the target_q_net.
    :return: None
    """
    self.target_q_net.set_weights(self.q_net.get_weights())

  def train_single_step(self, states, rewards, actions, next_states, dones):
    '''
    Trains on the given transitions one at a time, applying one optimizer
    step per transition (each transition is a batch of size one).
    returns: None
    '''
    for state, reward, action, next_state, done in zip(
        states, rewards, actions, next_states, dones):
      self.train(np.array([state]), np.array([reward]), np.array([action]),
                 np.array([next_state]), np.array([done]))

  def train(self, states, rewards, actions, next_states, dones):
    '''
    Performs one gradient step on a batch of transitions.
    states / next_states: shape=(batch_size, state_size)
    rewards, actions, dones: shape=(batch_size,)
    returns: the scalar loss tensor of this step
    '''
    states = tf.cast(states, tf.float32)
    next_states = tf.cast(next_states, tf.float32)
    rewards = tf.cast(rewards, tf.float32)
    actions = tf.cast(actions, tf.int32)
    # 1.0 for transitions that continue, 0.0 for terminal ones.
    not_done = 1.0 - tf.cast(dones, tf.float32)

    # TD target: r + gamma * max_a' Q_target(s', a'). The bootstrap term is
    # dropped for terminal transitions because there is no next state value.
    next_q = self.target_q_net(next_states)      # (batch_size, action_size)
    max_next_q = tf.reduce_max(next_q, axis=1)   # (batch_size,)
    target_q_val = rewards + self.gamma * max_next_q * not_done

    # (row, action) index pairs selecting the Q value of the action taken.
    batch_index = tf.range(tf.shape(actions)[0])
    action_index = tf.stack([batch_index, actions], axis=1)

    with tf.GradientTape() as tape:
      predict_q = self.q_net(states, training=True)  # (batch_size, action_size)
      # Only the Q value of the action actually taken contributes to the
      # loss; the other actions keep the predicted value as their target, so
      # their error is zero. The target is treated as a constant.
      target_q = tf.stop_gradient(
          tf.tensor_scatter_nd_update(predict_q, action_index, target_q_val))
      loss = self.loss_fn(target_q, predict_q)

    grads = tape.gradient(loss, self.q_net.trainable_variables)
    self.opt.apply_gradients(zip(grads, self.q_net.trainable_variables))
    return loss



In [7]:
# CartPole-v1 environment shared by experience collection and evaluation.
env = gym.make('CartPole-v1')

In [71]:
# Agent and replay buffer used by the training loop below.
ai_agent = DQN_Agent()
buffer = ReplayBuffer()

In [72]:
def collect_gameplay_experiences(env, agent, buffer, terminal_reward=-1.0):
    """
    Plays one episode of env with the actions produced by
    agent.collect_policy and stores every transition in buffer.
    :param env: the game environment (reset() returns a state, step()
    returns a (next_state, reward, done, info) tuple)
    :param agent: the DQN agent
    :param buffer: the replay buffer
    :param terminal_reward: reward stored for the transition that ends the
    episode by failure
    :return: None
    """
    state = env.reset()
    done = False
    while not done:
        action = agent.collect_policy(state)
        next_state, reward, done, info = env.step(action)
        # Gym's TimeLimit wrapper flags an episode that was cut off by the
        # step limit with info['TimeLimit.truncated']. Such an episode did
        # not fail, so it is neither penalised nor stored as terminal. When
        # the key is absent every `done` is treated as a real terminal state.
        truncated = isinstance(info, dict) and bool(
            info.get('TimeLimit.truncated', False))
        terminal = bool(done) and not truncated
        if terminal:
            reward = terminal_reward
        buffer.store_gameplay_experience(state, next_state,
                                         reward, action, terminal)
        state = next_state

In [73]:
def evaluate_training_result(env, agent, episodes_to_play=10):
    """
    Evaluates the current DQN agent by letting its greedy policy play a few
    episodes and averaging the total reward per episode. The higher the
    average reward is the better the DQN agent performs.
    :param env: the game environment
    :param agent: the DQN agent
    :param episodes_to_play: number of evaluation episodes (must be > 0)
    :return: average reward across episodes
    :raises ValueError: if episodes_to_play is not positive
    """
    if episodes_to_play <= 0:
        raise ValueError("episodes_to_play must be a positive integer")

    total_reward = 0.0
    for _ in range(episodes_to_play):
        state = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action = agent.policy(state)
            state, reward, done, _info = env.step(action)
            episode_reward += reward
        total_reward += episode_reward
    return total_reward / episodes_to_play


In [76]:

steps = 6000
# Number of training steps between two refreshes of the target network.
target_update_interval = 20


for s in range(steps):

  # Play one episode with the exploration policy, then train on one random
  # batch drawn from everything collected so far.
  collect_gameplay_experiences(env, ai_agent, buffer)
  gameplay_experience_batch = buffer.sample_gameplay_batch()
  # The buffer returns actions before rewards; train() takes rewards first.
  states, next_states, actions, rewards, dones = gameplay_experience_batch
  loss = ai_agent.train(states, rewards, actions, next_states, dones)

  # Periodically copy the online network into the target network; otherwise
  # the bootstrap targets would never reflect anything the agent has learned.
  if (s + 1) % target_update_interval == 0:
    ai_agent.update_target_network()

  avg_reward = evaluate_training_result(env, ai_agent)
  print("step = {}, loss = {}, avg_reward={}".format(s, loss, avg_reward))


step = 0, loss = 0.10644170641899109, avg_reward=9.6
step = 1, loss = 0.10364633798599243, avg_reward=9.3
step = 2, loss = 0.08234547823667526, avg_reward=9.3
step = 3, loss = 0.14284659922122955, avg_reward=9.7
step = 4, loss = 0.10925962030887604, avg_reward=9.3
step = 5, loss = 0.16830024123191833, avg_reward=9.5
step = 6, loss = 0.1353081464767456, avg_reward=8.9
step = 7, loss = 0.16609293222427368, avg_reward=9.6
step = 8, loss = 0.13902956247329712, avg_reward=9.4
step = 9, loss = 0.12115225195884705, avg_reward=9.5
step = 10, loss = 0.11307188868522644, avg_reward=9.1
step = 11, loss = 0.10337039083242416, avg_reward=9.6
step = 12, loss = 0.07457363605499268, avg_reward=9.4
step = 13, loss = 0.07995116710662842, avg_reward=9.1
step = 14, loss = 0.12051858007907867, avg_reward=9.5
step = 15, loss = 0.1015283614397049, avg_reward=9.2
step = 16, loss = 0.08184604346752167, avg_reward=9.0
step = 17, loss = 0.09789541363716125, avg_reward=9.7
step = 18, loss = 0.1083342507481575, av

KeyboardInterrupt: 

In [29]:
# NOTE(review): displays a global `state`; the only top-level assignment
# visible in this notebook (`state = env.reset()`) sits in a later cell, so
# this cell fails on a fresh top-to-bottom run.
state

array([-0.01219752,  0.04356086, -0.01093986, -0.03999828], dtype=float32)

In [30]:
# NOTE(review): the saved output below (prob/action prints and a result of 2)
# does not match the collect_policy defined in this notebook, which prints
# nothing; it was presumably produced by an earlier version — re-run to refresh.
ai_agent.collect_policy(state)

prob: tf.Tensor([[-0.00357938 -0.00206238]], shape=(1, 2), dtype=float32)
action: tf.Tensor([2.], shape=(1,), dtype=float32)


2

In [21]:
def ai_play(ai_agent, env=None):
    """
    Plays one rendered episode with the agent's greedy policy and prints the
    running total reward after every step.
    :param ai_agent: agent exposing policy(state) -> action
    :param env: optional environment to play in; when omitted a fresh
    CartPole-v1 environment is created and closed by this function
    :return: None
    """
    owns_env = env is None
    if owns_env:
        env = gym.make('CartPole-v1')

    try:
        state = env.reset()
        done = False
        step = 0
        total_reward = 0
        while not done:
            env.render()
            action = ai_agent.policy(state)
            state, reward, done, _ = env.step(action)
            step += 1
            total_reward += reward

            print("step: {} action: {}, total_reward: {}, done:{}".format(step, action, total_reward, done))
    finally:
        # Only close an environment created here, and do so even when the
        # episode is interrupted, so the render window is not left open.
        if owns_env:
            env.close()

In [26]:
# Render one episode played by the current agent.
ai_play(ai_agent)

step: 1 action: 1, total_reward: 1.0, done:False
step: 2 action: 0, total_reward: 2.0, done:False
step: 3 action: 1, total_reward: 3.0, done:False
step: 4 action: 0, total_reward: 4.0, done:False
step: 5 action: 0, total_reward: 5.0, done:False
step: 6 action: 1, total_reward: 6.0, done:False
step: 7 action: 0, total_reward: 7.0, done:False
step: 8 action: 1, total_reward: 8.0, done:False
step: 9 action: 0, total_reward: 9.0, done:False
step: 10 action: 1, total_reward: 10.0, done:False
step: 11 action: 0, total_reward: 11.0, done:False
step: 12 action: 1, total_reward: 12.0, done:False
step: 13 action: 0, total_reward: 13.0, done:False
step: 14 action: 1, total_reward: 14.0, done:False
step: 15 action: 0, total_reward: 15.0, done:False
step: 16 action: 1, total_reward: 16.0, done:False
step: 17 action: 0, total_reward: 17.0, done:False
step: 18 action: 1, total_reward: 18.0, done:False
step: 19 action: 0, total_reward: 19.0, done:False
step: 20 action: 1, total_reward: 20.0, done:Fals

In [10]:
# Scratch check: run one fresh state through a model and take the arg-max action.
# NOTE(review): `agentoo7` is not defined in any cell visible in this notebook
# (presumably left over from an earlier experiment), so this raises NameError
# on a fresh kernel.
state = env.reset()
prob = agentoo7.model(tf.reshape(state, [-1, 4]))
action = tf.math.argmax(prob, axis=1)
state, prob, action

(array([ 0.00261727, -0.03078178, -0.01598616,  0.00459414], dtype=float32),
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.49441344, 0.5055866 ]], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>)

In [28]:
# Build a categorical distribution from `prob` and evaluate the
# log-probability of `action` under it.
# NOTE(review): relies on `prob` and `action` from other cells; the saved
# output shows `action` as the plain int 1, so it was presumably reassigned
# elsewhere before this cell ran.
dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
log_prob = dist.log_prob(action)
#loss = -log_prob*reward
prob, action, log_prob

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.49441344, 0.5055866 ]], dtype=float32)>,
 1,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.6820359], dtype=float32)>)

In [29]:
# Show the logits derived for the distribution that was built from `probs`.
dist.logits_parameter()

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.7043832 , -0.68203586]], dtype=float32)>

In [30]:
# Element-wise log of `prob`, for comparison with the logits shown in the
# previous cell (the two saved outputs are identical).
tf.math.log(prob)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.7043832 , -0.68203586]], dtype=float32)>

In [None]:
# action = agentoo7.act(state)
# action

In [None]:
# prob = agentoo7.model(tf.reshape(state, [-1, 4]))
# prob

In [None]:
# dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)

In [None]:
# dist.sample()