In [5]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [6]:
# Configuration
BATCH_SIZE=32

In [7]:
train_data = datasets.FashionMNIST(
    root = "data",
    train = True,
    download=True,
    transform=ToTensor(),
)


test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

In [8]:
# Wrap both splits in batched loaders; each iteration yields (images, labels).
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)

# Peek at a single test batch to confirm the tensor shapes and the label dtype.
for X, y in test_dataloader:
    print(f"Shape of X[N,C,H,W]: {X.shape}")
    print(f"Shape of y:{y.shape} {y.dtype} ")
    break

Shape of X[N,C,H,W]: torch.Size([32, 1, 28, 28])
Sahep of y:torch.Size([32]) torch.int64 


In [9]:
# Pick the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [10]:
class NeuralNetwork(nn.Module):
    """Multi-layer perceptron that maps 28x28 inputs to 10 raw class scores.

    The input is flattened to 784 features, passed through two 512-unit
    ReLU layers (``op1``) and then through a dropout-regularised head
    (``opt2``). The attribute names are part of the ``state_dict`` keys.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        # Trunk: 784 -> 512 -> 512 with a ReLU after each projection.
        trunk = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
        ]
        self.op1 = nn.Sequential(*trunk)

        # Head: dropout (default probability) ahead of each remaining projection.
        head = [
            nn.Dropout(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 10),
        ]
        self.opt2 = nn.Sequential(*head)

    def forward(self, x):
        """Return the unnormalised class scores (logits) for ``x``."""
        features = self.op1(self.flatten(x))
        return self.opt2(features)
    
model = NeuralNetwork().to(device)
print(model)



NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (op1): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
  )
  (opt2): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)


In [11]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [12]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch in ``dataloader``.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches whose
            ``.dataset`` supports ``len()``.
        model: ``nn.Module`` being trained; it is switched to training mode.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer that owns ``model``'s parameters.

    Every 100 batches the current batch loss and the number of samples
    processed so far are printed.
    """
    size = len(dataloader.dataset)

    # Send each batch to wherever the model's weights live, so the function
    # does not depend on a notebook-level `device` variable being up to date.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    model.train()
    seen = 0  # running sample count; stays correct for a short final batch
    for batch, (X, y) in enumerate(dataloader):
        if target is not None:
            X, y = X.to(target), y.to(target)

        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f} [{seen:>5d}|{size:>5d}]")

In [13]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [14]:
epochs = 15
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.307558 [   32|60000]
loss: 0.931989 [ 3232|60000]
loss: 0.558470 [ 6432|60000]
loss: 0.840598 [ 9632|60000]
loss: 0.471746 [12832|60000]
loss: 0.795171 [16032|60000]
loss: 0.436261 [19232|60000]
loss: 0.275171 [22432|60000]
loss: 0.699787 [25632|60000]
loss: 0.450464 [28832|60000]
loss: 0.640673 [32032|60000]
loss: 0.462956 [35232|60000]
loss: 0.343907 [38432|60000]
loss: 0.608807 [41632|60000]
loss: 0.812741 [44832|60000]
loss: 0.443883 [48032|60000]
loss: 0.506224 [51232|60000]
loss: 0.598588 [54432|60000]
loss: 0.578737 [57632|60000]
Test Error: 
 Accuracy: 85.1%, Avg loss: 0.421679 

Epoch 2
-------------------------------
loss: 0.368852 [   32|60000]
loss: 0.418959 [ 3232|60000]
loss: 0.243965 [ 6432|60000]
loss: 0.397855 [ 9632|60000]
loss: 0.285599 [12832|60000]
loss: 0.584710 [16032|60000]
loss: 0.311290 [19232|60000]
loss: 0.314421 [22432|60000]
loss: 0.412773 [25632|60000]
loss: 0.295507 [28832|60000]
loss: 0.566209 [32032|60000

loss: 0.296635 [48032|60000]
loss: 0.203942 [51232|60000]
loss: 0.279104 [54432|60000]
loss: 0.304270 [57632|60000]
Test Error: 
 Accuracy: 87.3%, Avg loss: 0.377663 

Epoch 14
-------------------------------
loss: 0.225928 [   32|60000]
loss: 0.272136 [ 3232|60000]
loss: 0.178136 [ 6432|60000]
loss: 0.252544 [ 9632|60000]
loss: 0.253056 [12832|60000]
loss: 0.363465 [16032|60000]
loss: 0.186752 [19232|60000]
loss: 0.109273 [22432|60000]
loss: 0.500203 [25632|60000]
loss: 0.217911 [28832|60000]
loss: 0.304896 [32032|60000]
loss: 0.154525 [35232|60000]
loss: 0.278711 [38432|60000]
loss: 0.352111 [41632|60000]
loss: 0.434249 [44832|60000]
loss: 0.281522 [48032|60000]
loss: 0.205436 [51232|60000]
loss: 0.238117 [54432|60000]
loss: 0.388413 [57632|60000]
Test Error: 
 Accuracy: 87.1%, Avg loss: 0.393863 

Epoch 15
-------------------------------
loss: 0.145764 [   32|60000]
loss: 0.234216 [ 3232|60000]
loss: 0.130685 [ 6432|60000]
loss: 0.390163 [ 9632|60000]
loss: 0.350711 [12832|60000]
lo

In [15]:
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth


In [16]:
# Rebuild the architecture, then restore the weights written to model.pth.
model = NeuralNetwork().to(device)
# map_location remaps tensors saved on another device onto the current one;
# weights_only=True restricts unpickling to tensors and plain containers.
model.load_state_dict(torch.load("model.pth", map_location=device, weights_only=True))

<All keys matched successfully>

In [17]:
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Classify the first test sample and compare against its ground-truth label.
model.eval()
x, y = test_data[0]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
# pred has one row per input; take the highest-scoring class of the first row.
predicted = classes[pred[0].argmax(0)]
actual = classes[y]
print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"


# Tensor

In [None]:
import torch
import numpy as np

In [None]:
data = [[1,2], [3,4]]
x_data = torch.tensor(data)

In [None]:
np_array = np.array(data)
x_np = torch.from_numpy(np_array)

In [None]:
x_ones = torch.ones_like(x_data)
print(f"Ones tensor: {x_ones}")

In [None]:
x_rand = torch.rand_like(x_data, dtype=torch.float)
x_rand

In [None]:
shape = (2,3)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)
print(rand_tensor, ones_tensor, zeros_tensor)

In [None]:
tensor = torch.rand(3,4)

In [None]:
tensor.shape, tensor.dtype, tensor.device

In [None]:
def to_cuda(tensor):
    """Return ``tensor`` on the CUDA device when available, else on the CPU."""
    destination = "cuda" if torch.cuda.is_available() else "cpu"
    return tensor.to(destination)

In [None]:
to_cuda(tensor)

In [None]:
tensor = torch.ones(4,4)
print(f"first row: {tensor[0]}")
print(f"first column: {tensor[:, 0]}")
print(f"last column: {tensor[..., -1]}")

In [None]:
tensor[:,1]=0
tensor

In [None]:
t1 = torch.cat([tensor, tensor, tensor], dim=1)
t1

In [None]:
# matrix multiplication 
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)
y3 = torch.rand_like(y1)
y4 = torch.matmul(tensor, tensor.T, out=y3)
print(y1)
print(y2)
print(y3)
print(y4)

In [None]:
# element wise
z1 = tensor * tensor
z2 = tensor.mul(tensor)

z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

In [None]:
agg = tensor.sum()
agg_item = agg.item()
print(agg_item, type(agg_item))

In [None]:
# inplace operation
print(tensor.add_(5))

In [None]:
# work with numpy
t  = torch.ones(5)
n = t.numpy()
print(n)

In [None]:
t.add_(1)

In [None]:
n

In [None]:
n = np.ones(5)
t = torch.from_numpy(n)
t

In [None]:
np.add(n, 1, out=n)
print(t)
print(n)

# Datasets and DataLoaders

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

In [None]:
train_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

In [None]:
labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}

In [None]:
figure = plt.figure(figsize=(8,8))
cols, rows = 3,3
for i in range(1, cols * rows + 1):
    sample_index = torch.randint(len(train_data), size=(1,)).item()
    img, label = train_data[sample_index]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()

In [None]:
# Custom dataset 
import os
import pandas as pd
from torchvision.io import read_image


In [None]:
class CustomImageDataset(Dataset):
    """Map-style dataset pairing image files on disk with labels from a CSV.

    The annotations file is parsed with ``pandas.read_csv``. For every row,
    the first column is taken as the image file name (joined onto
    ``img_dir``) and the second column as the label.

    Args:
        annotations_file: Anything ``pandas.read_csv`` accepts as input.
        img_dir: Directory the image file names are relative to.
        transform: Optional callable applied to the loaded image.
        target_transform: Optional callable applied to the label.
    """

    def __init__(self,
                 annotations_file,
                 img_dir,
                 transform=None,
                 target_transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of annotated samples (rows in the CSV)."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load and return the ``(image, label)`` pair for row ``idx``."""
        img_path = os.path.join(self.img_dir,
                                self.img_labels.iloc[idx, 0])
        # Read the file from disk; the transforms below operate on this value.
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_data, 
                              batch_size=64, 
                              shuffle=True)
test_dataloader = DataLoader(test_data,
                             batch_size=64,
                             shuffle=True)

In [None]:
#Display image and label by DataLoader and in-built dataset

train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

In [None]:
img = train_features[0].squeeze()
label = train_labels[0]
print(img.size(), label.shape)

In [None]:
plt.imshow(img, cmap="gray")
plt.show()
print(f"label: {label}")

# Data preprocessing

In [None]:
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

In [None]:
ds = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
    target_transform=Lambda(lambda y:torch.zeros(10,dtype=torch.float).scatter_(0,
                                                                                torch.tensor(y),
                                                                                value=1)
                           )
)

# Build neural network

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [None]:
# Pick the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
class Network(nn.Module):
    """GELU-activated MLP that maps flattened 28x28 inputs to 10 logits.

    The stack keeps the attribute name ``linear_relu_stack`` even though the
    activations are GELU; the name is part of the ``state_dict`` keys.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # A BatchNorm1d(512) after the first projection is deliberately left out.
        layers = [
            nn.Linear(28 * 28, 512),
            nn.GELU(),
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Linear(256, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the unnormalised class scores."""
        return self.linear_relu_stack(self.flatten(x))

In [None]:
model = Network().to(device)
print(model)

In [None]:
X = torch.rand(1,28,28,device=device)
logits = model(X)
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

In [None]:
input_image = torch.rand(3,28,28)
print(input_image.size())

In [None]:
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

In [None]:
layer1 = nn.Linear(in_features=28*28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

In [None]:
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"After ReLU: {hidden1}")

In [None]:
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)
input_image = torch.rand(3,28,28)
logits = seq_modules(input_image)

In [None]:
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [None]:
# Parameters

print(f"Model structure: {model}")


for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values: {param[:2]}")

# Automatic differentiation with `torch.autograd`

In [None]:
import torch

In [None]:
x = torch.ones(5)
y = torch.zeros(3)
x,y

In [None]:
w = torch.randn(5,3,requires_grad=True)
b = torch.rand(3, requires_grad=True)
z = torch.matmul(x,w)+b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z,y)

In [None]:
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

In [None]:
loss.backward()
print(w.grad,'\n',b.grad)

In [None]:
# Disabling the gradient tracking
# By default the result of an op on tensors that require grad is tracked too.
z = torch.matmul(x,w)+b
print(z.requires_grad)

# Inside no_grad() the same computation produces an untracked tensor.
with torch.no_grad():
    z = torch.matmul(x, w) + b
    print(z.requires_grad)


# or we can use detach method
z = torch.matmul(x,w)+b
z_det = z.detach()
print(z_det.requires_grad)

In [None]:
inp = torch.eye(4, 5, requires_grad=True)
inp

In [None]:
inp+1

In [None]:
(inp+1).pow(2)

In [None]:
(inp+1).pow(2).t()

In [None]:
(inp+1).pow(2).t().shape

In [None]:
out = (inp+1).pow(2).t()

In [None]:
out.backward(torch.ones_like(out), retain_graph=True)

In [None]:
print(f"First call\n{inp.grad}")

In [None]:
out.backward(torch.ones_like(out), retain_graph=True)

In [None]:
print(f"\nSecond call\n{inp.grad}")

In [None]:
inp.grad.zero_()
out.backward(torch.ones_like(out), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

# Parameters optimization

In [4]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


# Pick the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device}")



training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

class NeuralNetwork(nn.Module):
    """Three-layer ReLU MLP mapping flattened 28x28 inputs to 10 logits."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # 784 -> 512 -> 512 -> 10, with ReLU between the projections.
        layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the unnormalised class scores."""
        return self.linear_relu_stack(self.flatten(x))

model_ = NeuralNetwork()
model = torch.compile(model_, backend="aot_eager").to(device)
print(model_)
print(model)

Using cuda
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
OptimizedModule(
  (_orig_mod): NeuralNetwork(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear_relu_stack): Sequential(
      (0): Linear(in_features=784, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=10, bias=True)
    )
  )
)


In [5]:
learning_rate = 1e-3
batch_size = 64
epochs = 5

In [6]:
# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()

In [7]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [8]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch in ``dataloader``.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches whose
            ``.dataset`` supports ``len()``.
        model: ``nn.Module`` being trained; it is switched to training mode.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer that owns ``model``'s parameters.

    Every 100 batches the current batch loss and the number of samples
    processed so far are printed.
    """
    size = len(dataloader.dataset)

    # Send each batch to wherever the model's weights live, so the function
    # does not depend on a notebook-level `device` variable being up to date.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None

    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()

    # Count samples from the batches actually seen, so the progress figure
    # does not depend on a separately maintained batch-size variable.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        if target is not None:
            X, y = X.to(target), y.to(target)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X,y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [9]:
import torch
torch._dynamo.config.verbose=True



In [10]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.315061  [   64/60000]
loss: 2.298668  [ 6464/60000]
loss: 2.273940  [12864/60000]
loss: 2.256106  [19264/60000]
loss: 2.250988  [25664/60000]
loss: 2.225804  [32064/60000]
loss: 2.235341  [38464/60000]
loss: 2.211386  [44864/60000]
loss: 2.197287  [51264/60000]
loss: 2.159976  [57664/60000]
Test Error: 
 Accuracy: 38.6%, Avg loss: 2.159655 

Epoch 2
-------------------------------
loss: 2.171071  [   64/60000]
loss: 2.161341  [ 6464/60000]
loss: 2.102017  [12864/60000]
loss: 2.114500  [19264/60000]
loss: 2.069915  [25664/60000]
loss: 2.013686  [32064/60000]
loss: 2.043946  [38464/60000]
loss: 1.972373  [44864/60000]
loss: 1.972247  [51264/60000]
loss: 1.904315  [57664/60000]
Test Error: 
 Accuracy: 59.5%, Avg loss: 1.900067 

Epoch 3
-------------------------------
loss: 1.924569  [   64/60000]
loss: 1.900396  [ 6464/60000]
loss: 1.783905  [12864/60000]
loss: 1.828985  [19264/60000]
loss: 1.708629  [25664/60000]
loss: 1.662768  [32064/600

# Save and load the model

In [None]:
import torch
import torchvision.models as models

In [None]:
model = models.vgg16(weights="IMAGENET1K_V1")
torch.save(model.state_dict(), "model_weights.pth")

In [None]:
# Create an untrained VGG16, then restore the weights saved above.
model = models.vgg16()
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
model.load_state_dict(torch.load("./model_weights.pth", weights_only=True))
model.eval()

In [None]:
# save entire model
torch.save(model, "model.younameit")

In [None]:
# NOTE(security): loading a whole pickled model executes arbitrary pickle
# code, so only do this with files you created or trust. weights_only=False
# is spelled out because newer PyTorch releases default to weights_only=True,
# which rejects fully pickled modules.
model = torch.load("model.younameit", weights_only=False)

In [None]:
model

In [None]:
torch.__version__

# Torch.compile 


```
Signature:
torch.compile(
    model: Optional[Callable] = None,
    *,
    fullgraph: bool = False,
    dynamic: Optional[bool] = None,
    backend: Union[str, Callable] = 'inductor',
    mode: Optional[str] = None,
    options: Optional[Dict[str, Union[str, int, bool]]] = None,
    disable: bool = False,
) -> Callable
Docstring:
Optimizes given model/function using TorchDynamo and specified backend.

Concretely, for every frame executed within the compiled region, we will attempt
to compile it and cache the compiled result on the code object for future
use.  A single frame may be compiled multiple times if previous compiled
results are not applicable for subsequent calls (this is called a "guard
failure"); you can use TORCH_LOGS=guards to debug these situations.
Multiple compiled results can be associated with a frame up to
``torch._dynamo.config.cache_size_limit``, which defaults to 64; at which
point we will fall back to eager.  Note that compile caches are per
*code object*, not frame; if you dynamically create multiple copies of a
function, they will all share the same code cache.

Args:
   model (Callable): Module/function to optimize
   fullgraph (bool): If False (default), torch.compile attempts to discover compileable regions
    in the function that it will optimize. If True, then we require that the entire function be
    capturable into a single graph. If this is not possible (that is, if there are graph breaks),
    then this will raise an error.
   dynamic (bool or None): Use dynamic shape tracing.  When this is True, we will up-front attempt
    to generate a kernel that is as dynamic as possible to avoid recompilations when
    sizes change.  This may not always work as some operations/optimizations will
    force specialization; use TORCH_LOGS=dynamic to debug overspecialization.
    When this is False, we will NEVER generate dynamic kernels, we will always specialize.
    By default (None), we automatically detect if dynamism has occurred and compile a more
    dynamic kernel upon recompile.
   backend (str or Callable): backend to be used

    - "inductor" is the default backend, which is a good balance between performance and overhead

    - Non experimental in-tree backends can be seen with `torch._dynamo.list_backends()`

    - Experimental or debug in-tree backends can be seen with `torch._dynamo.list_backends(None)`

    - To register an out-of-tree custom backend: https://pytorch.org/docs/main/compile/custom-backends.html
   mode (str): Can be either "default", "reduce-overhead", "max-autotune" or "max-autotune-no-cudagraphs"

    - "default" is the default mode, which is a good balance between performance and overhead

    - "reduce-overhead" is a mode that reduces the overhead of python with CUDA graphs,
      useful for small batches.  Reduction of overhead can come at the cost of more memory
      usage, as we will cache the workspace memory required for the invocation so that we
      do not have to reallocate it on subsequent runs.  Reduction of overhead is not guaranteed
      to work; today, we only reduce overhead for CUDA only graphs which do not mutate inputs.
      There are other circumstances where CUDA graphs are not applicable; use TORCH_LOGS=perf_hints
      to debug.

    - "max-autotune" is a mode that leverages Triton based matrix multiplications and convolutions
      It enables CUDA graphs by default.

    - "max-autotune-no-cudagraphs" is a mode similar to "max-autotune" but without CUDA graphs

    - To see the exact configs that each mode sets you can call `torch._inductor.list_mode_options()`

   options (dict): A dictionary of options to pass to the backend. Some notable ones to try out are

    - `epilogue_fusion` which fuses pointwise ops into templates. Requires `max_autotune` to also be set

    - `max_autotune` which will profile to pick the best matmul configuration

    - `fallback_random` which is useful when debugging accuracy issues

    - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores

    - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs

    - `trace.enabled` which is the most useful debugging flag to turn on

    - `trace.graph_diagram` which will show you a picture of your graph after fusion

    - For inductor you can see the full list of configs that it supports by calling `torch._inductor.list_options()`
   disable (bool): Turn torch.compile() into a no-op for testing

Example::

    @torch.compile(options={"triton.cudagraphs": True}, fullgraph=True)
    def foo(x):
        return torch.sin(x) + torch.cos(x)
File:      ~/workstation/AI/utils/Anaconda3/envs/torch/lib/python3.11/site-packages/torch/__init__.py
Type:      function
```

In [1]:
import torch

In [2]:
torch._dynamo.list_backends()

['cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm']

In [3]:
torch._dynamo.list_backends(None)

['aot_eager',
 'aot_eager_decomp_partition',
 'aot_eager_default_partitioner',
 'aot_torchxla_trace_once',
 'aot_torchxla_trivial',
 'aot_ts',
 'cudagraphs',
 'dynamo_accuracy_minifier_backend',
 'dynamo_minifier_backend',
 'eager',
 'eager_debug',
 'inductor',
 'non_leaf_compile_error_TESTING_ONLY',
 'onnxrt',
 'openxla',
 'openxla_eval',
 'pre_dispatch_eager',
 'relu_accuracy_error_TESTING_ONLY',
 'relu_compile_error_TESTING_ONLY',
 'relu_runtime_error_TESTING_ONLY',
 'torchxla_trace_once',
 'torchxla_trivial',
 'ts',
 'tvm']