# Learn PyTorch

In [1]:
pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


## Stage 1 : 🔥 Stage 1: PyTorch Tensors (Your First Building Block)

In [2]:
import torch

In [3]:
torch.tensor([1,2,3])

tensor([1, 2, 3])

In [4]:
torch.rand(2,3)

tensor([[0.6859, 0.0032, 0.4774],
        [0.5443, 0.4900, 0.6425]])

In [5]:
torch.zeros(3,3)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [6]:
torch.ones(3,4)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [7]:
torch.linspace(0,5,steps=5)

tensor([0.0000, 1.2500, 2.5000, 3.7500, 5.0000])

In [8]:
x = torch.tensor([1., 2., 3.])
y = torch.tensor([4., 5., 6.])
x+y

tensor([5., 7., 9.])

In [9]:
# matrix multiplication
x@y

tensor(32.)

In [10]:
x.reshape(3,1)

tensor([[1.],
        [2.],
        [3.]])

In [11]:
x.view(3,1)

tensor([[1.],
        [2.],
        [3.]])

In [12]:
# Add a tensor of ones on 3 by 3 tensor
tensor_3_by_3 = torch.tensor([
    [1,2,3],
    [4,5,6],
    [7,8,9]
])
ones = torch.ones(3,3)
addition= tensor_3_by_3 + ones
addition

tensor([[ 2.,  3.,  4.],
        [ 5.,  6.,  7.],
        [ 8.,  9., 10.]])

In [13]:
# Multiply by 2
multiplication_by_2 = addition *2
multiplication_by_2

tensor([[ 4.,  6.,  8.],
        [10., 12., 14.],
        [16., 18., 20.]])

In [14]:
reshape_1_9 = multiplication_by_2.reshape(1,9)
reshape_1_9

tensor([[ 4.,  6.,  8., 10., 12., 14., 16., 18., 20.]])

## Stage 2 : Autograde Basics

In [15]:
x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x+5
y

tensor(15., grad_fn=<AddBackward0>)

In [16]:
y.backward()

In [17]:
x.grad

tensor(7.)

In [18]:
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()   # y = x1^2 + x2^2 + x3^2
y.backward()

print(x.grad)  # tensor([2., 4., 6.])

tensor([2., 4., 6.])


In [19]:
# Stopping gradient
x = torch.tensor(3.0, requires_grad=True)
y = x ** 2

with torch.no_grad():
    z = y * 2   # no gradient tracking here

print(z.requires_grad)  # False

False


In [20]:
# Clear old gradient
w = torch.tensor(2.0, requires_grad=True)

for i in range(3):
    y = w * 3
    y.backward()
    print(w.grad)  # gradients accumulate!

    w.grad.zero_()  # reset gradients

tensor(3.)
tensor(3.)
tensor(3.)


In [21]:
x = torch.tensor(4.0, requires_grad=True)
y = 3*x**3 + 2*x**2 + 7
y.backward()
print(x.grad)

tensor(160.)


## 🔥 Stage 3: Neural Networks with nn.Module

In [22]:
import torch
import torch.nn as nn

# Define model
class LinearModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)  # input size=1, output size=1

    def forward(self, x):
        return self.linear(x)

# Create model
model = LinearModel()
print(model)

LinearModel(
  (linear): Linear(in_features=1, out_features=1, bias=True)
)


In [23]:
x = torch.tensor([[2.0]])   # input (batch_size=1, features=1)
y_pred = model(x)           # forward pass
print(y_pred)

tensor([[2.4968]], grad_fn=<AddmmBackward0>)


In [24]:
# True data
X = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
Y = torch.tensor([[3.0], [5.0], [7.0], [9.0]])

# Model
model = LinearModel()

# Loss function (Mean Squared Error)
criterion = nn.MSELoss()

# Optimizer (Stochastic Gradient Descent)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [25]:
for epoch in range(500):
    # Forward pass
    Y_pred = model(X)
    loss = criterion(Y_pred, Y)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 50 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

Epoch 50, Loss: 0.1091
Epoch 100, Loss: 0.0809
Epoch 150, Loss: 0.0599
Epoch 200, Loss: 0.0444
Epoch 250, Loss: 0.0329
Epoch 300, Loss: 0.0244
Epoch 350, Loss: 0.0181
Epoch 400, Loss: 0.0134
Epoch 450, Loss: 0.0099
Epoch 500, Loss: 0.0073


In [26]:
x_test = torch.tensor([[5.0]])
y_test = model(x_test)
print("Prediction for x=5:", y_test.item())

Prediction for x=5: 11.146489143371582


## 🔥 Stage 4: Loss Functions & Optimizers in PyTorch

In [30]:
num_epochs = 100
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X)
    loss = criterion(outputs, Y)

    # Backward pass
    optimizer.zero_grad()  # reset old gradients
    loss.backward()        # compute new gradients
    optimizer.step()       # update weights

    # print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

grad vs zero_grad?

In [31]:
import torch
import torch.nn as nn

# Data: [hours studied], label [pass/fail]
X = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
Y = torch.tensor([[0.0], [0.0], [1.0], [1.0]])

# Model: simple linear classifier
model = nn.Sequential(
    nn.Linear(1, 1),
    nn.Sigmoid()   # squash output between 0 and 1
)

# Loss & Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Training loop
for epoch in range(200):
    outputs = model(X)
    loss = criterion(outputs, Y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 50 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Test
print("Prediction for 1.5 hours:", model(torch.tensor([[1.5]])).item())

Epoch 50, Loss: 0.5442
Epoch 100, Loss: 0.4602
Epoch 150, Loss: 0.3996
Epoch 200, Loss: 0.3546
Prediction for 1.5 hours: 0.3603784739971161


## 🎯 Stage 5: Training a Full Model (Putting it All Together)

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Fake dataset (100 samples, 10 features each, 2 classes)
X = torch.randn(100, 10)
y = torch.randint(0, 2, (100,))

# Wrap in DataLoader
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Model (simple feedforward)
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 32)
        self.fc2 = nn.Linear(32, 2)  # 2 classes

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
for epoch in range(5):
    for batch_X, batch_y in loader:
        # Forward
        preds = model(batch_X)
        loss = criterion(preds, batch_y)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.7087
Epoch 2, Loss: 0.5915
Epoch 3, Loss: 0.6551
Epoch 4, Loss: 0.4581
Epoch 5, Loss: 0.5864


## Stage 6: Training on a Real Dataset (MNIST).

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# 1. Load Data (with transforms to tensors)
transform = transforms.Compose([transforms.ToTensor()])
train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
test_data = datasets.MNIST(root="data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000)

# 2. Define Model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)  # 10 digits (0–9)

    def forward(self, x):
        x = x.view(-1, 28*28)   # Flatten image
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = Net()

# 3. Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 4. Training Loop
for epoch in range(3):  # train for 3 epochs
    for batch_X, batch_y in train_loader:
        # Forward
        preds = model(batch_X)
        loss = criterion(preds, batch_y)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# 5. Testing Loop (evaluation)
correct, total = 0, 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)
        predicted = preds.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.91M/9.91M [00:30<00:00, 322kB/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.9k/28.9k [00:00<00:00, 131kB/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.65M/1.65M [00:03<00:00, 519kB/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.54k/4.54k [00:00<00:00, 3.86MB/s]


Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw

Epoch 1, Loss: 0.8522
Epoch 2, Loss: 0.4705
Epoch 3, Loss: 0.2051
Test Accuracy: 90.15%


## Stage 7: Convolutional Neural Networks (CNNs)

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# 1. Data
transform = transforms.Compose([transforms.ToTensor()])
train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
test_data = datasets.MNIST(root="data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000)

# 2. CNN Model
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)  # 1 input channel (grayscale), 32 filters
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3) # 64 filters
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 max pooling
        self.dropout = nn.Dropout(0.25)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 12 * 12, 128)  # after conv+pool, image size reduces
        self.fc2 = nn.Linear(128, 10)            # 10 classes (digits)

    def forward(self, x):
        x = torch.relu(self.conv1(x))   # [batch, 32, 26, 26]
        x = self.pool(torch.relu(self.conv2(x)))  # [batch, 64, 12, 12]
        x = self.dropout(x)
        x = x.view(-1, 64 * 12 * 12)    # Flatten
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = CNN()

# 3. Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam usually faster than SGD

# 4. Training Loop
for epoch in range(3):
    for batch_X, batch_y in train_loader:
        preds = model(batch_X)
        loss = criterion(preds, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# 5. Testing
correct, total = 0, 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)
        predicted = preds.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Epoch 1, Loss: 0.0419
Epoch 2, Loss: 0.0100
Epoch 3, Loss: 0.0307
Test Accuracy: 98.71%


## Stage 8: Training on GPU (CUDA)

In [61]:
import torch

print(torch.cuda.is_available())   # True if CUDA is installed
print(torch.cuda.device_count())   # Number of GPUs
# print(torch.cuda.get_device_name(0))  # GPU name

False
0


In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
model = CNN().to(device)

for epoch in range(3):
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)  # move data

        preds = model(batch_X)
        loss = criterion(preds, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 2.3073
Epoch 2, Loss: 2.3036
Epoch 3, Loss: 2.3054


In [38]:
correct, total = 0, 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        preds = model(batch_X)
        predicted = preds.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 9.67%


## Stage 9: Saving & Loading Models in PyTorch

In [39]:
torch.save(model, "model.pth")

In [43]:
torch.save(model.state_dict(), "model_state.pth") # Recommended

In [41]:
model = torch.load("model.pth")
model.eval()  # set to evaluation mode

  model = torch.load("model.pth")


CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=9216, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [44]:
# Recommended
model = Net()  # re-initialize the same architecture
model.load_state_dict(torch.load("model_state.pth"))
model.eval()   # evaluation mode (turns off dropout, etc.)

  model.load_state_dict(torch.load("model_state.pth"))


Net(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
)

In [46]:
# Save optimizer state
torch.save({
    "epoch": epoch,
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict(),
    "loss": loss,
}, "checkpoint.pth")

In [48]:
# Load optimizer state
checkpoint = torch.load("checkpoint.pth")
model.load_state_dict(checkpoint["model_state"])
optimizer.load_state_dict(checkpoint["optimizer_state"])
epoch = checkpoint["epoch"]
loss = checkpoint["loss"]

  checkpoint = torch.load("checkpoint.pth")


## 🔹 Stage 10: PyTorch Lightning & Accelerators

In [52]:
pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.5.5-py3-none-any.whl (832 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m832.4/832.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m:01[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.

In [53]:
import pytorch_lightning as pl
print(pl.__version__)

2.5.5


In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# 1. Define Lightning Module
class LitNet(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = F.cross_entropy(preds, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

# 2. Data
transform = transforms.ToTensor()
train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# 3. Train
model = LitNet()
trainer = pl.Trainer(max_epochs=3, accelerator="auto")
trainer.fit(model, train_loader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2025-10-06 16:42:04.805004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759749124.975683   10384 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759749125.022928   10384 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759749125.280327   10384 computation_placer.cc:177] computation placer already registered. Please check link

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=3` reached.


In [59]:
# trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices=1)

In [60]:
# trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices=2)