# TensorFlow Tutorial for Policy Gradients
> Deep RL course - Fall 2024 - Sharif University of Technology - Workshop session <br>
> Author: M Narimani - December 8th, 2024

In [None]:
import tensorflow as tf
import numpy as np

## Basic Tensor Operations

In [None]:
# Creating tensors: two constant rank-1 float tensors of length 3
x = tf.constant([1.0, 2.0, 3.0])
y = tf.constant([4.0, 5.0, 6.0])

# .numpy() converts each tensor to a NumPy array for display
print("x:", x.numpy())
print("y:", y.numpy())
print("\nBasic operations:")
# + and * between the two equal-length tensors act element by element
print("Addition:", (x + y).numpy())
print("Multiplication:", (x * y).numpy())
# reduce_mean with no axis argument averages over all elements of x
print("Mean of x:", tf.reduce_mean(x).numpy())

In [None]:
# Matrix operations: A is a 2x2 matrix, b is a 2x1 column vector
A = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[5.0], [6.0]])

print("Matrix multiplication:")
print("\nA:")
print(A.numpy())
print("\nb:")
print(b.numpy())
# @ is the matrix-product operator, so (2x2) @ (2x1) yields a 2x1 result
print("\nA @ b:")
print((A @ b).numpy())

## Probability Operations

In [None]:
# Logits (raw/non-normalized predictions) to probabilities
logits = tf.constant([2.0, 1.0, 0.5])
# softmax exponentiates and normalizes the logits so the outputs are
# positive and sum to 1; a larger logit gets a larger probability
probs = tf.nn.softmax(logits)
print("Logits:", logits.numpy())
print("Probabilities (softmax):", probs.numpy())

In [None]:
# Log probabilities
# Element-wise natural log of `probs` (the softmax output computed in the
# previous cell); values are <= 0 because every probability is <= 1
log_probs = tf.math.log(probs)
print("Log probabilities:", log_probs.numpy())

## Automatic Differentiation

In [None]:
# Simple gradient example
x = tf.Variable(2.0)
z = tf.Variable(2.0)
# Operations executed inside the tape context are recorded so they can be
# differentiated afterwards
with tf.GradientTape() as tape:
    y = x * x + z
# Only the derivative with respect to x is requested (z is not):
# d(x*x + z)/dx = 2x, which is 4.0 at x = 2
grad = tape.gradient(y, x)
print("dy/dx at x=2:", grad.numpy())

In [None]:
# Multiple variables and operations
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    y = x1 * x2 + tf.square(x1)
# Passing a list of sources returns one gradient per source, in the same order:
# dy/dx1 = x2 + 2*x1 = 7.0 and dy/dx2 = x1 = 2.0
grads = tape.gradient(y, [x1, x2])
print("Gradients for x1 and x2:", [g.numpy() for g in grads])

In [None]:
import matplotlib.pyplot as plt

# Create a grid of points
# 40 samples per axis over [-4, 4]; meshgrid expands them to two 40x40 arrays
x = np.linspace(-4, 4, 40)
y = np.linspace(-4, 4, 40)
X, Y = np.meshgrid(x, y)

# Convert to TensorFlow Variables
# (variables are tracked by the tape automatically)
x_tf = tf.Variable(X)
y_tf = tf.Variable(Y)

with tf.GradientTape() as tape:
    # z is evaluated element-wise, so it has the same 40x40 shape as the grid
    z = tf.sin(x_tf)**2 + tf.cos(y_tf)**2

# z is not a scalar, so the tape differentiates the sum of its elements.
# Each z[i, j] depends only on x_tf[i, j] and y_tf[i, j], so the result is
# the partial derivatives dz/dx and dz/dy at every grid point.
grads = tape.gradient(z, [x_tf, y_tf])

# Visualization
# First figure: 3D surface of z over the grid
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, z.numpy(), cmap='viridis')
ax.set_title('Function Surface')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

# Gradient vector field
# Second figure: one arrow per grid point with components (dz/dx, dz/dy)
plt.figure(figsize=(6, 6))
plt.quiver(X, Y, grads[0].numpy(), grads[1].numpy())
plt.title('Gradient Vector Field')
plt.xlabel('x')
plt.ylabel('y')
plt.grid(True)
plt.tight_layout()
plt.show()

# Policy gradient process
## Policy network

In [None]:
# Policy network: maps a 4-dimensional state to a probability distribution
# over 2 actions.
model = tf.keras.Sequential([
    # Declare the input shape with an explicit Input object; passing
    # `input_shape=` to the first Dense layer is deprecated in recent Keras.
    tf.keras.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation='relu'),
    # softmax makes the two outputs non-negative and sum to 1
    tf.keras.layers.Dense(2, activation='softmax'),
])

## Data Collection Phase

In [None]:
def simulate_cartpole_state():
    """Return a random stand-in for a CartPole observation.

    The four entries play the role of
    [cart_position, cart_velocity, pole_angle, pole_velocity]; each one is
    drawn independently from a standard normal distribution, so no real
    environment dynamics are involved.
    """
    state_dim = 4
    fake_state = np.random.randn(state_dim)
    return fake_state

In [None]:
# Number of simulated steps to collect
n_steps = 3

# Per-step records filled in by the loop below
states = []
actions = []
raw_probs = []

for step in range(n_steps):
    state = simulate_cartpole_state()
    states.append(state)
    
    # Get action probabilities
    # The state is wrapped into a batch of size 1 for the model, and the
    # model's (1, 2) output is flattened to a length-2 vector
    state_input = np.array([state])
    action_probs = model(state_input).numpy().flatten()
    raw_probs.append(action_probs)
    
    # Sample action
    # Draw 0 or 1 at random, weighted by the policy's probabilities
    action = np.random.choice(2, p=action_probs)
    actions.append(action)
    
    print(f"\nStep {step + 1}:")
    print(f"State: {state}")
    print(f"Action probabilities: [left={action_probs[0]:.2f}, right={action_probs[1]:.2f}]")
    print(f"Chosen action: {'left' if action == 0 else 'right'}")

## Reward Processing

In [None]:
def discount_rewards(rewards, gamma=0.99):
    """Return the discounted return-to-go for every timestep.

    Entry ``t`` of the result equals
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A float32 NumPy array shaped like ``rewards``.
    """
    returns = np.zeros_like(rewards, dtype=np.float32)
    future_return = 0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in range(len(rewards) - 1, -1, -1):
        future_return = rewards[t] + gamma * future_return
        returns[t] = future_return
    return returns

In [None]:
# Example rewards: one reward of 1.0 per collected step. Sizing the list from
# `states` keeps the reward vector aligned with the trajectory when n_steps
# changes; a hard-coded length of 3 would make the later
# `log_probs * discounted_rewards` product fail with a shape mismatch.
raw_rewards = [1.0] * len(states)
discounted_rewards = discount_rewards(raw_rewards)

print("Raw rewards:", raw_rewards)
print("Discounted rewards:", discounted_rewards)

## Policy Update

In [None]:
# Turn the collected trajectory (Python lists) into NumPy arrays
states = np.array(states)
actions = np.array(actions)

# Create action indices: pair every timestep with the action chosen at it
idx = np.array(list(enumerate(actions)))
print("Action indices (idx):")
print(idx)
print("\nEach row is (timestep, action_taken)")

In [None]:
# Policy gradient update
# The forward pass and the loss are computed inside the tape so that the loss
# can be differentiated with respect to the policy network's weights.
with tf.GradientTape() as tape:
    # Get action probabilities for all states
    action_probs = model(states)
    
    # Select probabilities of actions that were taken
    # gather_nd returns the elements of action_probs at the (row, column)
    # positions listed in idx, i.e. one probability per timestep
    selected_probs = tf.gather_nd(action_probs, idx)
    print("\nProbabilities of selected actions:", selected_probs.numpy())
    
    # Calculate log probabilities
    log_probs = tf.math.log(selected_probs)
    print("Log probabilities:", log_probs.numpy())
    
    # Calculate loss
    # Each log-probability is weighted by its discounted return; the sign is
    # negated so that minimizing the loss raises the probability of actions
    # followed by high returns
    loss = -tf.reduce_mean(log_probs * discounted_rewards)
    print("\nPolicy gradient loss:", loss.numpy())

# Get gradients
# One gradient tensor per trainable variable (each layer's kernel and bias);
# this cell only inspects them, no optimizer step is applied here
gradients = tape.gradient(loss, model.trainable_variables)
print("\nGradient shapes:", [g.shape for g in gradients])

> **TODO:** Justify "Gradient shapes" by calling `model.summary()`

## REINFORCE with Baseline

In [None]:
# Create a simple value network (the baseline): 4-dimensional state -> one
# scalar value estimate. The network is untrained here; it only illustrates
# how a baseline enters the loss.
baseline = tf.keras.Sequential([
    # Explicit Input object instead of the deprecated `input_shape=` argument
    # on the first Dense layer.
    tf.keras.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1),
])

# Predict values
# The (n_steps, 1) output is flattened to one value per timestep
values = baseline(states)
print("Predicted values:", values.numpy().flatten())

# Calculate advantages
# Converting to NumPy makes the advantages plain constants, so no gradient
# flows into the baseline network through the policy loss below
advantages = discounted_rewards - values.numpy().flatten()
print("Advantages:", advantages)

# Demonstrate policy update with advantages
with tf.GradientTape() as tape:
    action_probs = model(states)
    selected_probs = tf.gather_nd(action_probs, idx)
    log_probs = tf.math.log(selected_probs)

    # Use advantages instead of raw rewards
    loss = -tf.reduce_mean(log_probs * advantages)
    print("\nPolicy gradient loss with advantages:", loss.numpy())

# Get gradients
# The tape was previously opened but never queried; computing the gradients
# completes the demonstration the same way as the plain policy-gradient cell
gradients = tape.gradient(loss, model.trainable_variables)
print("\nGradient shapes:", [g.shape for g in gradients])