In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import gym
from gym.spaces import Discrete, Box
import math

In [None]:
# Create the CartPole environment using the "human" render mode.
env = gym.make("CartPole-v1", render_mode="human")

# Reset to get the initial observation; the second element of the returned
# pair is not used here and is discarded.
obs, _ = env.reset()
env.render()

# The rest of the notebook assumes a Box observation space and a Discrete
# action space, so stop early for any other kind of environment.
assert isinstance(env.observation_space, Box), (
    "This example only works for envs with continuous state spaces."
)
assert isinstance(env.action_space, Discrete), (
    "This example only works for envs with discrete action spaces."
)

# Record the length of the observation vector and the number of discrete actions.
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n

In [None]:
# torch.distributions.Categorical usage

# A three-outcome distribution; these probabilities already sum to 1.
probs = torch.tensor([0.1, 0.3, 0.6])
dist = Categorical(probs=probs)

# Draw a single outcome index at random.
sample = dist.sample()
print("Sampled action:", sample.item())

# Log-probability of outcome 1, i.e. log(0.3); math.log(0.3) = -1.2039728043259361
log_prob = dist.log_prob(torch.tensor(1))
print("Log probability of action 1:", log_prob.item())

# Log-probabilities of every outcome at once, queried with the index tensor [0, 1, 2].
indices = torch.arange(probs.shape[0])
log_probs = dist.log_prob(indices)
print("Log probabilities of all actions:", log_probs)

# Shannon entropy of the distribution, measured in nats.
entropy = dist.entropy()
print("Entropy:", entropy.item())


Sampled action: 2
Log probability of action 1: -1.2039728164672852
Log probabilities of all actions: tensor([-2.3026, -1.2040, -0.5108])
Entropy: 0.897945761680603


In [7]:
# Natural log of 0.3, for comparison with the log-probability that
# Categorical.log_prob reported for action 1 (whose probability is 0.3).
math.log(0.3)

-1.2039728043259361

In [11]:
# Entropy of a discrete distribution computed by hand: H(p) = -sum_i p_i * log(p_i)
p = [0.1, 0.3, 0.6]

# Each p_i * log(p_i) term is <= 0 for 0 < p_i <= 1, so the summed total is
# negated to yield the (non-negative) entropy in nats.
entropy_term = -sum(p_i * math.log(p_i) for p_i in p)

print(entropy_term)

0.8979457248567797


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class PolicyNetwork(nn.Module):
    """Single-layer policy: a linear map from input features to per-action logits."""

    def __init__(self, input_dim, output_dim):
        """Create the linear layer mapping ``input_dim`` features to ``output_dim`` logits."""
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return a ``Categorical`` distribution over actions for the input ``x``."""
        # The layer output is handed over as logits, so no explicit softmax is applied here.
        return torch.distributions.Categorical(logits=self.fc(x))

# Example usage: a policy over 2 actions for a 4-feature state
policy_net = PolicyNetwork(input_dim=4, output_dim=2)
state1 = torch.rand(4)  # random example input

# Query the policy, draw one action, and record that action's log-probability.
dist = policy_net(state1)
action = dist.sample()
log_prob = dist.log_prob(action)


In [None]:
# Fix the NumPy and PyTorch global RNG seeds so this cell's random draws repeat across runs
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Policy network: one linear layer whose outputs parameterize a categorical distribution
class PolicyNetwork(nn.Module):
    """Linear policy over a discrete action set."""

    def __init__(self, input_dim, output_dim):
        """Build the single fully connected layer (``input_dim`` -> ``output_dim``)."""
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Map ``x`` to action logits and wrap them in a ``Categorical`` distribution."""
        action_logits = self.fc(x)
        action_dist = torch.distributions.Categorical(logits=action_logits)
        return action_dist

# Example usage: 4 input features, 2 possible actions
policy_net = PolicyNetwork(input_dim=4, output_dim=2)

# Random example state; requires_grad=True makes autograd track the input as well
state1 = torch.rand(4, requires_grad=True)

# Forward pass -> action distribution, then draw one action from it
dist = policy_net(state1)
action = dist.sample()

# The loss is twice the negative log-probability of the sampled action
log_prob = dist.log_prob(action)
loss = -2 * log_prob

# Backpropagate, then show the gradient stored on each parameter
# (fc.weight first, then fc.bias)
loss.backward()
for param in policy_net.parameters():
    print(param.grad)


tensor([[-0.2992, -0.1901, -0.2784, -0.1818],
        [ 0.2992,  0.1901,  0.2784,  0.1818]])
tensor([-0.3202,  0.3202])


In [None]:
# Seed both NumPy and PyTorch so the random draws in this cell are repeatable
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Minimal policy network: linear layer -> categorical action distribution
class PolicyNetwork(nn.Module):
    """Policy that turns a feature vector into a distribution over discrete actions."""

    def __init__(self, input_dim, output_dim):
        """Register ``fc``, a linear layer from ``input_dim`` inputs to ``output_dim`` outputs."""
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim, bias=True)

    def forward(self, x):
        """Return ``Categorical(logits=fc(x))``."""
        scores = self.fc(x)
        return torch.distributions.Categorical(logits=scores)

# Example usage: 4 input features, 2 possible actions
policy_net = PolicyNetwork(4, 2)

# Random example state; it is reproducible only because the RNG was seeded at the
# top of this cell (this line itself does no seeding)
state1 = torch.rand(4, requires_grad=True)

# Forward pass to get the action distribution, then sample an action from it
dist = policy_net(state1)
action = dist.sample()

# Loss is -2 * log pi(action | state)
log_prob = dist.log_prob(action)
loss = -log_prob * 2

# NOTE: `dist.logits` holds the *normalized* logits (log-probabilities), not the raw
# output of `fc`. Softmax is unaffected by that constant shift, so `probs` are still
# the action probabilities pi(. | state).
logits = dist.logits
probs = torch.softmax(logits, dim=-1)

# Analytic gradient of the loss w.r.t. the raw (pre-softmax) logits z:
#     d(-2 * log softmax(z)[a]) / dz = -2 * (onehot(a) - softmax(z))
grad_manual = -2 * (torch.eye(len(probs))[action] - probs)

print("Manual Gradient:", grad_manual)

# Check the formula against autograd. The raw logits are fc.weight @ state + fc.bias,
# so d(loss)/d(fc.bias) equals d(loss)/d(raw logits). torch.autograd.grad returns the
# gradient directly and does not write to any parameter's .grad attribute.
(grad_auto,) = torch.autograd.grad(loss, policy_net.fc.bias)
assert torch.allclose(grad_manual.detach(), grad_auto, atol=1e-6), \
    "Manual logit gradient does not match autograd!"


Manual Gradient: tensor([-0.3202,  0.3202], grad_fn=<MulBackward0>)


In [12]:
# Pin the NumPy and PyTorch seeds; every random draw in this cell depends on them
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Bias-free policy network: logits are a pure matrix-vector product of weights and input
class PolicyNetwork(nn.Module):
    """Linear policy without a bias term, returning a categorical action distribution."""

    def __init__(self, input_dim, output_dim):
        """Create ``fc``: a bias-free linear layer from ``input_dim`` to ``output_dim``."""
        super(PolicyNetwork, self).__init__()
        # The bias is left out to keep the hand-derived gradients simple.
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim, bias=False)

    def forward(self, x):
        """Return a ``Categorical`` distribution whose logits are ``fc(x)``."""
        return torch.distributions.Categorical(logits=self.fc(x))

# Example usage: 4 input features, 2 possible actions
policy_net = PolicyNetwork(4, 2)

# Random example state; it is reproducible only because the RNG was seeded at the
# top of this cell (this line itself does no seeding)
state1 = torch.rand(4, requires_grad=True)

# Forward pass to get the action distribution, then sample an action from it
dist = policy_net(state1)
action = dist.sample()

# Loss is -2 * log pi(action | state)
log_prob = dist.log_prob(action)
loss = -log_prob * 2

# NOTE: `dist.logits` holds the *normalized* logits (log-probabilities), not the raw
# output of `fc`. Softmax is unaffected by that constant shift, so `probs` are still
# the action probabilities pi(. | state).
logits = dist.logits
probs = torch.softmax(logits, dim=-1)

# Probability of the chosen action
chosen_action_prob = probs[action]

# Analytic gradient of the loss w.r.t. the raw logits z, filled in entry by entry:
#   chosen action a:       d loss / d z_a = -2 * (1 - p_a)
#   every other action i:  d loss / d z_i =  2 * p_i
grad_logit = torch.zeros_like(logits)
grad_logit[action] = -2 * (1 - chosen_action_prob)
for i in range(len(probs)):
    if i != action:
        grad_logit[i] = 2 * probs[i]

# Print the manually computed gradient
print("Manually computed gradient with respect to logits:")
print(grad_logit)

# The raw logits are fc.weight @ state (this network has no bias), so
# d loss / d W[i, j] = grad_logit[i] * state[j]: the outer product of the
# logit gradient with the input state.
grad_fc_weight = grad_logit.view(-1, 1) * state1

# Print the manually computed gradient with respect to the fc weights
print("Manually computed gradient with respect to fc weights:")
print(grad_fc_weight)

# Backpropagate to compute gradients automatically using PyTorch
loss.backward()

print("\nAutomatic gradients computed by PyTorch:")

# Show the gradient autograd stored on each parameter
for name, param in policy_net.named_parameters():
    if param.grad is not None:
        print(f"{name}: {param.grad}")

# Fail loudly if the hand-derived gradient ever disagrees with autograd, instead of
# relying on a visual comparison of the two printouts.
assert torch.allclose(policy_net.fc.weight.grad, grad_fc_weight.detach(), atol=1e-6), \
    "Manual weight gradient does not match autograd!"


Manually computed gradient with respect to logits:
tensor([ 1.3456, -1.3456], grad_fn=<CopySlices>)
Manually computed gradient with respect to fc weights:
tensor([[ 1.2659,  0.1792,  1.2576,  0.7987],
        [-1.2659, -0.1792, -1.2576, -0.7987]], grad_fn=<MulBackward0>)

Automatic gradients computed by PyTorch:
fc.weight: tensor([[ 1.2659,  0.1792,  1.2576,  0.7987],
        [-1.2659, -0.1792, -1.2576, -0.7987]])


In [18]:
# Seed PyTorch's global RNG so the random states and sampled actions drawn
# later in this cell are the same on every run
torch.manual_seed(42)

# A simple bias-free policy network
class PolicyNetwork(nn.Module):
    """One linear layer (no bias) feeding a categorical distribution over actions."""

    def __init__(self, input_dim, output_dim):
        """Create ``fc``, mapping ``input_dim`` features to ``output_dim`` action logits."""
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim, bias=False)

    def forward(self, x):
        """Compute raw scores for ``x`` and return them as a ``Categorical`` distribution."""
        raw_scores = self.fc(x)
        return torch.distributions.Categorical(logits=raw_scores)

# Initialize network
input_dim = 4
output_dim = 3  # Assume 3 possible actions
policy_net = PolicyNetwork(input_dim, output_dim)

# Two random example states; they are reproducible only because the RNG was seeded
# at the top of this cell
state1 = torch.rand(input_dim)
state2 = torch.rand(input_dim)

# Forward pass: one action distribution per state
dist1 = policy_net(state1)
dist2 = policy_net(state2)

# Sample one action from each distribution
action1 = dist1.sample()
action2 = dist2.sample()

# Log-probabilities of the sampled actions
log_prob1 = dist1.log_prob(action1)
log_prob2 = dist2.log_prob(action2)

# Per-sample loss weights, named once so the autograd loss and the hand-derived
# gradient further down cannot drift apart.
weight1, weight2 = 2, 3

# Weighted negative log-likelihood losses. Both are scalars, so the total is simply
# their sum (calling .mean() on a scalar would be a no-op).
loss1 = -log_prob1 * weight1
loss2 = -log_prob2 * weight2
total_loss = loss1 + loss2

# Compute gradients using PyTorch
policy_net.zero_grad()
total_loss.backward()

# Print the gradients of the parameters
print("\n---- Gradients w.r.t. Parameters ----")
for name, param in policy_net.named_parameters():
    if param.grad is not None:
        print(f"{name}: {param.grad}")

# Autograd's gradient for the weight matrix, used as the reference below
grad_fc_weight = policy_net.fc.weight.grad

# ------------------------------------------
# Manual Gradient Computation
# ------------------------------------------

# This is a plain numeric check, so it runs under no_grad: the results carry no
# autograd graph.
with torch.no_grad():
    # Raw logits and action probabilities for each state
    logits1 = policy_net.fc(state1)
    logits2 = policy_net.fc(state2)
    probs1 = torch.softmax(logits1, dim=-1)
    probs2 = torch.softmax(logits2, dim=-1)

    # Gradient of each weighted loss w.r.t. its raw logits z:
    #     d(-w * log softmax(z)[a]) / dz = -w * (onehot(a) - softmax(z))
    grad_logits1 = -weight1 * (torch.eye(output_dim)[action1] - probs1)  # d loss1 / d logits1
    grad_logits2 = -weight2 * (torch.eye(output_dim)[action2] - probs2)  # d loss2 / d logits2

    # logits = W @ state, so d loss / d W = outer(d loss / d logits, state); the two
    # samples share W, so their contributions add up.
    grad_fc_weight_manual = torch.outer(grad_logits1, state1) + torch.outer(grad_logits2, state2)

# ------------------------------------------
# Compare Manual Gradients to PyTorch
# ------------------------------------------
print("\n---- Manual Gradients w.r.t. Parameters ----")
print("Manually Computed fc.weight Gradient:\n", grad_fc_weight_manual)

# Tight float32 tolerance: the previous atol=1e-1 would have accepted clearly wrong gradients
assert torch.allclose(grad_fc_weight, grad_fc_weight_manual, rtol=1e-5, atol=1e-6), "Mismatch in weight gradients!"

print("\n✅ Manual gradients match PyTorch's backward()!")



---- Gradients w.r.t. Parameters ----
fc.weight: tensor([[ 2.0169,  1.3109,  1.0185,  1.2691],
        [ 0.9435,  0.6132,  0.4744,  0.5945],
        [-2.9604, -1.9241, -1.4928, -1.8635]])

---- Gradients w.r.t. Logits ----
Manually Computed Logits Gradient:
 tensor([[ 2.0169,  1.3109,  1.0185,  1.2691],
        [ 0.9435,  0.6132,  0.4744,  0.5945],
        [-2.9604, -1.9241, -1.4928, -1.8635]], grad_fn=<AddBackward0>)

✅ Manual gradients match PyTorch's backward()!
