##### Copyright 2018 The TensorFlow Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get Started with TensorFlow Probability

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/probability"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/probability/blob/main/tensorflow_probability/g3doc/_index.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/probability/tree/main/tensorflow_probability/g3doc/_index.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

This is a [Google Colaboratory](https://colab.research.google.com/notebooks/welcome.ipynb) notebook file. Python programs are run directly in the browser—a great way to learn and use TensorFlow. To run the Colab notebook:

1. Connect to a Python runtime: At the top-right of the menu bar, select *CONNECT*.
2. Run all the notebook code cells: Select *Runtime* > *Run all*.

For more examples and guides (including details for this program), see [Get Started with TensorFlow](https://www.tensorflow.org/tutorials/).

In [None]:
# Install/upgrade TensorFlow and TensorFlow Probability in the runtime.
# NOTE(review): later cells also `import gym`, which is not installed here —
# confirm it is already available in the target runtime.
!pip install -U tensorflow tensorflow_probability

Import the TensorFlow and TensorFlow Probability modules into your program. [Eager execution](https://www.tensorflow.org/guide/eager) is enabled by default in TensorFlow 2.x (this notebook was run on 2.7.0), so results are displayed as we go:

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp

print(tf.__version__)

2.7.0


In [2]:
import gym 
import numpy as np

# Shared CartPole environment: the training loop steps it, and `model` reads
# `env.action_space.n` to size its output layer.
env = gym.make('CartPole-v1')

In [None]:
# # Pretend to load synthetic data set.
# features = tfp.distributions.Normal(loc=0., scale=1.).sample(int(100e3))
# labels = tfp.distributions.Bernoulli(logits=1.618 * features).sample()

# # Specify model.
# model = tfp.glm.Bernoulli()

# # Fit model given data.
# coeffs, linear_response, is_converged, num_iter = tfp.glm.fit(
#     model_matrix=features[:, tf.newaxis],
#     response=tf.cast(labels, dtype=tf.float32),
#     model=model)

# print(coeffs)  # ==> coeffs is approximately [1.618] (We're golden!)

In [3]:
class model(tf.keras.Model):
  """Policy network: two 30-unit ReLU layers followed by a softmax over actions.

  The width of the output layer is taken from the notebook-level `env`
  (`env.action_space.n`), so `env` must exist before an instance is created.
  """

  def __init__(self):
    super().__init__()
    hidden_units = 30
    self.d1 = tf.keras.layers.Dense(hidden_units, activation='relu')
    self.d2 = tf.keras.layers.Dense(hidden_units, activation='relu')
    # One probability per discrete action of the environment.
    self.out = tf.keras.layers.Dense(env.action_space.n, activation='softmax')

  def call(self, input_data):
    """Return the action probabilities for a batch of observations."""
    hidden = self.d1(tf.convert_to_tensor(input_data))
    return self.out(self.d2(hidden))




In [11]:
class agent:
  """Policy-gradient agent built around the `model` policy network.

  Episodes are collected with `act` (stochastic sampling), the policy is
  updated once per episode with `train`, and `predict` gives the greedy
  action for evaluation.
  """

  def __init__(self):
    self.model = model()
    # Discount factor applied to future rewards when computing returns.
    self.gamma = 0.999
    self.opt = tf.keras.optimizers.Adagrad(learning_rate=0.005)

  def act(self, state):
    '''Sample an action from the current policy.

    state: shape=(4,)
    prob: shape=(1, 2)
    action: shape=(1,)

    Returns the sampled action as a Python int.
    '''
    # Wrap the single observation into a batch of one for the network.
    prob = self.model(np.array([state]))
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    action = dist.sample()
    return int(action.numpy()[0])

  def predict(self, state):
    """Return the greedy (highest-probability) action for `state` as an int."""
    prob = self.model(tf.reshape(state, [-1, 4]))
    action = tf.math.argmax(prob, axis=1)
    return int(action.numpy()[0])

  def a_loss(self, prob, action, reward):
    """Policy-gradient loss for one step: -log pi(action | state) * reward.

    Args:
      prob: action probabilities produced by the model for the state.
      action: the action that was taken.
      reward: the weight for this step (`train` passes the discounted return).
    """
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    log_prob = dist.log_prob(action)
    loss = -log_prob*reward
    return loss

  def _discounted_returns(self, rewards):
    """Return the discounted return G_t for every step, leaving `rewards` untouched.

    G_t = r_t + gamma * G_{t+1}, accumulated from the last step backwards.
    """
    sum_reward = 0
    discnt_rewards = []
    # Iterate over a reversed view so the caller's list is never mutated.
    for r in reversed(rewards):
      sum_reward = r + self.gamma*sum_reward
      discnt_rewards.append(sum_reward)
    discnt_rewards.reverse()
    return discnt_rewards

  def train(self, states, rewards, actions):
    """Apply one gradient step per recorded transition of an episode.

    Args:
      states: observations visited during the episode, in order.
      rewards: per-step rewards, in order. Not modified by this call.
      actions: actions taken at each step, in order.
    """
    discnt_rewards = self._discounted_returns(rewards)

    for state, reward, action in zip(states, discnt_rewards, actions):
      with tf.GradientTape() as tape:
        p = self.model(np.array([state]), training=True)
        loss = self.a_loss(p, action, reward)
      grads = tape.gradient(loss, self.model.trainable_variables)
      self.opt.apply_gradients(zip(grads, self.model.trainable_variables))



In [None]:
# Create the agent instance that the training loop and the evaluation cells use.
agentoo7 = agent()

In [20]:

# Number of training episodes; each iteration plays one full episode and then
# updates the policy once from the recorded trajectory.
steps = 800
for s in range(steps):
  # Per-episode trajectory buffers and counters.
  states, actions, rewards = [], [], []
  total_reward = 0
  step_count = 0

  state = env.reset()
  done = False
  while not done:
    action = agentoo7.act(state)
    next_state, reward, done, _ = env.step(action)

    # Record the transition before advancing to the next observation.
    states.append(state)
    actions.append(action)
    rewards.append(reward)
    total_reward += reward
    step_count += 1
    state = next_state

  # The episode has terminated: train on it and report the result.
  agentoo7.train(states, rewards, actions)
  print("total reward after {} steps is {}, step_count = {}".format(s, total_reward, step_count))


total reward after 0 steps is 15.0, step_count = 15
total reward after 1 steps is 18.0, step_count = 18
total reward after 2 steps is 26.0, step_count = 26
total reward after 3 steps is 19.0, step_count = 19
total reward after 4 steps is 20.0, step_count = 20
total reward after 5 steps is 25.0, step_count = 25
total reward after 6 steps is 34.0, step_count = 34
total reward after 7 steps is 32.0, step_count = 32
total reward after 8 steps is 14.0, step_count = 14
total reward after 9 steps is 15.0, step_count = 15
total reward after 10 steps is 14.0, step_count = 14
total reward after 11 steps is 31.0, step_count = 31
total reward after 12 steps is 38.0, step_count = 38
total reward after 13 steps is 82.0, step_count = 82
total reward after 14 steps is 31.0, step_count = 31
total reward after 15 steps is 13.0, step_count = 13
total reward after 16 steps is 60.0, step_count = 60
total reward after 17 steps is 11.0, step_count = 11
total reward after 18 steps is 43.0, step_count = 43
tot

In [21]:
def ai_play(agent):
    """Play one rendered CartPole-v1 episode with `agent` choosing every action.

    Args:
        agent: object exposing `predict(state)` that returns the action to
            take for an observation.

    A fresh environment is created for the episode and is always closed,
    even if stepping or rendering raises. Progress is printed every step.
    """
    env = gym.make('CartPole-v1')
    try:
        state = env.reset()

        done = False
        step = 0
        total_reward = 0
        while not done:
            env.render()
            # Use the agent that was passed in, not a notebook-level global.
            action = agent.predict(state)
            state, reward, done, _ = env.step(action)
            step += 1
            total_reward += reward

            print("step: {} action: {}, total_reward: {}, done:{}".format(step, action, total_reward, done))
    finally:
        env.close()

In [26]:
# Play one rendered episode with `agentoo7`; every step is printed.
ai_play(agentoo7)

step: 1 action: 1, total_reward: 1.0, done:False
step: 2 action: 0, total_reward: 2.0, done:False
step: 3 action: 1, total_reward: 3.0, done:False
step: 4 action: 0, total_reward: 4.0, done:False
step: 5 action: 0, total_reward: 5.0, done:False
step: 6 action: 1, total_reward: 6.0, done:False
step: 7 action: 0, total_reward: 7.0, done:False
step: 8 action: 1, total_reward: 8.0, done:False
step: 9 action: 0, total_reward: 9.0, done:False
step: 10 action: 1, total_reward: 10.0, done:False
step: 11 action: 0, total_reward: 11.0, done:False
step: 12 action: 1, total_reward: 12.0, done:False
step: 13 action: 0, total_reward: 13.0, done:False
step: 14 action: 1, total_reward: 14.0, done:False
step: 15 action: 0, total_reward: 15.0, done:False
step: 16 action: 1, total_reward: 16.0, done:False
step: 17 action: 0, total_reward: 17.0, done:False
step: 18 action: 1, total_reward: 18.0, done:False
step: 19 action: 0, total_reward: 19.0, done:False
step: 20 action: 1, total_reward: 20.0, done:Fals

In [10]:
# Sanity check: feed one fresh observation through the policy network and
# take the greedy (argmax) action, then display all three values.
state = env.reset()
prob = agentoo7.model(tf.reshape(state, [-1, 4]))
action = tf.math.argmax(prob, axis=1)
state, prob, action

(array([ 0.00261727, -0.03078178, -0.01598616,  0.00459414], dtype=float32),
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.49441344, 0.5055866 ]], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>)

In [28]:
# Rebuild the same Categorical distribution that agent.a_loss constructs and
# inspect the log-probability it assigns to `action`.
# NOTE(review): `prob` and `action` come from whichever cell ran last — the
# execution counts are out of order, so re-run the cells above first.
dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
log_prob = dist.log_prob(action)
# a_loss would scale this by the return: loss = -log_prob*reward
prob, action, log_prob

(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.49441344, 0.5055866 ]], dtype=float32)>,
 1,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.6820359], dtype=float32)>)

In [29]:
# Logits the distribution derives from `probs`; the displayed values equal log(prob).
dist.logits_parameter()

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.7043832 , -0.68203586]], dtype=float32)>

In [30]:
# Element-wise log of the action probabilities, for comparison with the distribution's logits.
tf.math.log(prob)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.7043832 , -0.68203586]], dtype=float32)>

In [None]:
# action = agentoo7.act(state)
# action

In [None]:
# prob = agentoo7.model(tf.reshape(state, [-1, 4]))
# prob

In [None]:
# dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)

In [None]:
# dist.sample()