In [1]:
import torch

# Leaf tensors: requires_grad=True asks autograd to record every operation on them
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)

# Forward pass: z = x * y, hence dz/dx = y and dz/dy = x
z = torch.mul(x, y)

# Backward pass: walks the recorded graph and fills in .grad on both leaves
torch.autograd.backward(z)

# Read the accumulated gradients off the leaves and report them
grad_wrt_x, grad_wrt_y = x.grad, y.grad
print("Gradient of z with respect to x:", grad_wrt_x)  # expected 3.0 (the value of y)
print("Gradient of z with respect to y:", grad_wrt_y)  # expected 2.0 (the value of x)

In [1]:
import torch
from torch.autograd import forward_ad

# Function to differentiate: f(x) = x^2 + 3x, so f'(x) = 2x + 3
def func(x):
    """Return x**2 + 3*x for a number or tensor ``x``."""
    squared = x ** 2
    linear = 3 * x
    return squared + linear

# Point at which the function and its derivative are evaluated
x = torch.tensor(2.0, requires_grad=True)

# Dual tensors are created and consumed inside a dual_level() context
with forward_ad.dual_level():
    # Pair the primal value with a tangent; a seed of 1.0 yields df/dx directly
    seed = torch.tensor(1.0)
    dual_x = forward_ad.make_dual(x, seed)

    # A single forward pass propagates both the value and the tangent
    dual_y = func(dual_x)

    # Split the result back into its primal (value) and tangent (derivative) parts
    unpacked = forward_ad.unpack_dual(dual_y)

    print(f"Function output (primal): {unpacked.primal.item()}")
    print(f"Forward gradient (tangent): {unpacked.tangent.item()}")


Function output (primal): 10.0
Forward gradient (tangent): 7.0


In [1]:
import torch
from torch.autograd import functional as F

def f(x):
    """Apply x**2 + 3*x elementwise to ``x``."""
    squared = x ** 2
    linear = 3 * x
    return squared + linear

# f acts elementwise, so the Jacobian is diagonal with entries 2*x + 3
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
jac = F.jacobian(func=f, inputs=x)
print("Jacobian:\n", jac)


Jacobian:
 tensor([[5., 0., 0.],
        [0., 7., 0.],
        [0., 0., 9.]])


In [2]:
def scalar_fn(x):
    """Return the sum of the squared elements of ``x`` as a scalar tensor."""
    return torch.sum(x ** 2)

# Second derivatives of sum(x_i^2): 2 on the diagonal, 0 elsewhere
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
hess = F.hessian(func=scalar_fn, inputs=x)
print("Hessian:\n", hess)


Hessian:
 tensor([[2., 0., 0.],
        [0., 2., 0.],
        [0., 0., 2.]])


In [3]:
def f(x):
    """Cube ``x`` elementwise."""
    return torch.pow(x, 3)

# Jacobian-vector product J @ v; here J = diag(3 * x^2)
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
v = torch.tensor([1.0, 0.0, -1.0])
_, jvp = F.jvp(func=f, inputs=x, v=v)
print("JVP:\n", jvp)


JVP:
 tensor([  3.,   0., -27.])


In [4]:
def f(x):
    """Apply x**2 + x elementwise to ``x``."""
    squared = x ** 2
    return squared + x

# Vector-Jacobian product v^T @ J; here J = diag(2 * x + 1)
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
v = torch.tensor([1.0, 0.0, -1.0])
_, vjp = F.vjp(func=f, inputs=x, v=v)
print("VJP:\n", vjp)


VJP:
 tensor([ 3.,  0., -7.])


In [5]:
def scalar_fn(x):
    """Return the sum of the cubed elements of ``x`` as a scalar tensor."""
    return torch.sum(x ** 3)

# Hessian-vector product H @ v; here H = diag(6 * x)
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
v = torch.tensor([1.0, 0.0, -1.0])
_, hvp = F.hvp(func=scalar_fn, inputs=x, v=v)
print("HVP:\n", hvp)


HVP:
 tensor([  6.,   0., -18.])


In [6]:
def scalar_fn(x):
    """Return the sum of the cubed elements of ``x`` as a scalar tensor."""
    return torch.sum(x ** 3)

# Vector-Hessian product v^T @ H; here H = diag(6 * x)
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
v = torch.tensor([0.1, 0.2, 0.3])
_, vhp = F.vhp(func=scalar_fn, inputs=x, v=v)
print("VHP:\n", vhp)


VHP:
 tensor([0.6000, 2.4000, 5.4000])


In [7]:
def my_func(x, const, flag=True):
    """Combine ``x`` with ``const``: multiply when ``flag`` is true, otherwise add."""
    if flag:
        return x * const
    return x + const

const = torch.tensor([2.0, 2.0, 2.0])
input_tensor = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# The lambda pins the extra arguments so that only x is differentiated
jac = F.jacobian(
    func=lambda x: my_func(x, const, flag=False),
    inputs=input_tensor,
)
print("Jacobian with lambda wrapper:\n", jac)


Jacobian with lambda wrapper:
 tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [5]:
import torch

# Walkthrough of leaf vs. non-leaf tensors, gradient hooks, retain_grad and detach.
# Statement order matters: the hook and retain_grad() must be set up before backward().

# Create a tensor with requires_grad=True so it will track computations
x = torch.tensor([2.0, 3.0], requires_grad=True)

# x was created directly (not produced by an operation), so it is a leaf tensor
print("Is leaf:", x.is_leaf)  # True

# Perform operations on the tensor
y = x * x + 2  # y = x^2 + 2
z = y.sum()    # z = sum(y)

# z is not a leaf tensor because it's a result of operations
print("z is leaf:", z.is_leaf)  # False

# Register a hook to see gradients
def hook_fn(grad):
    """Print the gradient passed to the hook registered on ``x``."""
    print("Hook called with grad:", grad)

x.register_hook(hook_fn)

# Retain gradient for non-leaf y so that y.grad is populated by backward()
y.retain_grad()

# Perform backpropagation
z.backward()  # Computes dz/dx (and dz/dy, because of retain_grad above)

print("Gradients of x:", x.grad)  # dz/dx = 2x → [4.0, 6.0]
print("Gradients of y:", y.grad)  # dz/dy = 1 for every element of the sum → [1.0, 1.0]

# Detach to stop tracking: returns a new tensor that does not require grad
x_detached = x.detach()
print("Detached x requires_grad:", x_detached.requires_grad)  # False

# In-place detach (rarely used directly); x itself is modified
x.detach_()
print("In-place detached x is leaf:", x.is_leaf)

# Demonstrate register_post_accumulate_grad_hook (requires PyTorch >=1.13+)
def post_hook(grad):
    print("Post accumulate hook called with grad:", grad)

x = torch.tensor([2.0, 3.0], requires_grad=True)
x.retain_grad()
x.register_post_accumulate_grad_hook(post_hook)

y = (x ** 2).sum()
y.backward()


Is leaf: True
z is leaf: False
Hook called with grad: tensor([4., 6.])
Gradients of x: tensor([4., 6.])
Gradients of y: tensor([1., 1.])
Detached x requires_grad: False
In-place detached x is leaf: True
Post accumulate hook called with grad: tensor([2., 3.], requires_grad=True)


In [None]:
# https://medium.com/@piyushkashyap045/understanding-pytorch-autograd-a-complete-guide-for-deep-learning-practitioners-f5dd1f43b417

Autograd provides automatic differentiation for tensor operations.

It's critical for optimization algorithms like gradient descent.

We often have to deal with hundreds or thousands of nested functions.

Autograd builds a directed acyclic graph (DAG) that tracks all operations.

Gradients accumulate by default in PyTorch; to prevent this, clear gradients between backward passes.

Sometimes you want to disable gradient tracking, especially during model evaluation.


Best practices:
1. Clear gradients regularly: once per backward pass during training.
2. Use no_grad for evaluation.
3. Check gradient flow.
4. Memory management: release computation graphs when they are no longer needed by using detach.




In [2]:
import torch
# One training example: input feature x and its target label y
x = torch.tensor(6.7, requires_grad=True)
y = torch.tensor(0.0)

# Trainable parameters of a single neuron, tracked by autograd
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

# Forward pass: affine map followed by a sigmoid activation
z = torch.add(torch.mul(w, x), b)
y_pred = z.sigmoid()

# Loss: y_pred is penalised when y == 0 and (1 - y_pred) when y == 1
penalty_if_negative = y_pred * (1 - y)
penalty_if_positive = (1 - y_pred) * y
loss = penalty_if_negative + penalty_if_positive

# Backward pass: populates .grad on x, w and b
torch.autograd.backward(loss)

# Report the parameter gradients
print(f"dL/dw: {w.grad}")
print(f"dL/db: {b.grad}")

dL/dw: 0.008227287791669369
dL/db: 0.0012279534712433815
