In [1]:
import torch

# Introduction to PyTorch

PyTorch is an open-source machine learning library widely used for building and training deep learning models. Developed by Meta AI (formerly Facebook AI Research), PyTorch provides a flexible and efficient platform for **tensor computation**, **automatic differentiation**, and **neural network development**.

PyTorch supports multiple types of devices, such as CPU and GPU, allowing computations to run efficiently on different hardware. It can automatically detect available devices and easily move tensors or models between them.

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available")
else:
    device = torch.device("cpu")
    print("CUDA is not available")

MPS is available


# Tensors

Tensors are the basic data type in PyTorch, similar to `ndarray` in NumPy.  
They represent multi-dimensional arrays and support a wide range of mathematical operations.

Tensors can be processed on both CPU and GPU, making them suitable for efficient numerical and deep learning computations.

In [3]:
# Create a tensor
A = torch.tensor([[1, 2], [3, 4]])
print("A:")
print(A)

# Basic tensor operation
B = A + 2
print("\nB = A + 2:")
print(B)

# Move tensor to the selected device
A = A.to(device)
print(f"\nA on {device}:")
print(A)

A:
tensor([[1, 2],
        [3, 4]])

B = A + 2:
tensor([[3, 4],
        [5, 6]])

A on mps:
tensor([[1, 2],
        [3, 4]], device='mps:0')


# Automatic Differentiation

In [4]:
# Create a tensor with requires_grad=True to track computation
x = torch.tensor([[1.0], [3.0]], requires_grad=True, device=device)
print("Original Tensor:")
print(x)

# Perform some operations
y = x**2 + 2
print("\nAfter Operations:")
print(y)

out = y.mean()
print("\nAfter Averaging:")
print(out)

# Perform backpropagation
out.backward()
print("\nGradients:")
print(x.grad)

Original Tensor:
tensor([[1.],
        [3.]], device='mps:0', requires_grad=True)

After Operations:
tensor([[ 3.],
        [11.]], device='mps:0', grad_fn=<AddBackward0>)

After Averaging:
tensor(7., device='mps:0', grad_fn=<MeanBackward0>)

Gradients:
tensor([[1.],
        [3.]], device='mps:0')


# Data Preparation

In [5]:
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

In [None]:
basic_transforms = transforms.Compose([
    transforms.ToTensor(),                      # Convert PIL image to tensor
    transforms.Lambda(lambda x: x.view(-1)),    # Flatten the image -> Convert it to vector
    transforms.Lambda(lambda x: x / 255.0)      # Normalize pixel values to [0, 1]
])

batch_size = 32

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=basic_transforms,
    download=True
)
# Split the train_dataset into train and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=basic_transforms,
    download=True
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100.0%

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






# Neural Network Definition

In [7]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x
# Note that the softmax function is not needed in the last layer, as it is included in the loss function.

In [9]:
class SimpleNetWithRegularization(nn.Module):
    def __init__(self, dropout_rate=0.2):
        super(SimpleNetWithRegularization, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.fc3 = nn.Linear(64, 10)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        return x

In [10]:
# Training Loop with Early Stopping
def train_model(model, train_loader, val_loader, loss_fn, 
                optimizer, epochs, device, 
                patience=5, min_delta=0.001):
    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(epochs):
        model.train()
        train_loss, train_steps = 0.0, 0
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            # Forward pass  
            y_pred = model(x_batch)
            # Compute loss
            loss = loss_fn(y_pred, y_batch)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            # Update weights
            optimizer.step()
            # Accumulate loss
            train_loss += loss.item()
            train_steps += 1
        train_loss_avg = train_loss / train_steps
        train_losses.append(train_loss_avg)

        # Evaluate on validation set
        model.eval()
        val_loss, val_steps = 0.0, 0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = model(x_batch)
                loss = loss_fn(y_pred, y_batch)
                val_loss += loss.item()
                val_steps += 1
        val_loss_avg = val_loss / val_steps
        val_losses.append(val_loss_avg)
        
        # Early stopping logic
        if val_loss_avg < best_val_loss - min_delta:
            best_val_loss = val_loss_avg
            patience_counter = 0
            # Save the best model state
            best_model_state = model.state_dict().copy()
            print(f"Epoch {epoch + 1:3d}/{epochs}, Loss: {train_loss_avg:.4f},", 
                f"Validation Loss: {val_loss_avg:.4f} (Best)")
        else:
            patience_counter += 1
            print(f"Epoch {epoch + 1:3d}/{epochs}, Loss: {train_loss_avg:.4f},", 
                f"Validation Loss: {val_loss_avg:.4f}", 
                f"(Patience: {patience_counter}/{patience})")
        
        # Check for early stopping
        if patience_counter >= patience:
            print(f"\nEarly stopping triggered at epoch {epoch + 1}")
            print(f"Best validation loss: {best_val_loss:.4f}")
            break
    
    # Restore the best model state
    if best_model_state is not None:
        # Save the best model state to a file
        torch.save(best_model_state, 'best_model.pth')
        model.load_state_dict(best_model_state)
        print("Restored best model state")
    
    return train_losses, val_losses

In [11]:
# Evaluate the model on test set and calculate accuracy
def evaluate_model(model, test_loader, loss_fn, device):
    model.eval()
    test_loss, test_steps = 0.0, 0
    correct_predictions = 0
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()
            test_steps += 1
            preds = y_pred.argmax(dim=1)
            correct_predictions += (preds == y_batch).sum().item()
    test_loss_avg = test_loss / test_steps
    test_accuracy = correct_predictions / len(test_loader.dataset)
    print(f"Loss: {test_loss_avg:.4f}, Accuracy: {test_accuracy:.4f}")

In [12]:
model1 = SimpleNet().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

epochs = 20
train_model(model1, train_loader, val_loader, loss_fn, optimizer, epochs, device)

print("Evaluating on training set...")
evaluate_model(model1, train_loader, loss_fn, device)

print("Evaluating on validation set...")
evaluate_model(model1, val_loader, loss_fn, device)

print("Evaluating on test set...")
evaluate_model(model1, test_loader, loss_fn, device)

Epoch   1/20, Loss: 1.0494, Validation Loss: 0.5961 (Best)
Epoch   2/20, Loss: 0.5192, Validation Loss: 0.4529 (Best)
Epoch   3/20, Loss: 0.4196, Validation Loss: 0.3988 (Best)
Epoch   4/20, Loss: 0.3737, Validation Loss: 0.3565 (Best)
Epoch   5/20, Loss: 0.3405, Validation Loss: 0.3218 (Best)
Epoch   6/20, Loss: 0.3114, Validation Loss: 0.3010 (Best)
Epoch   7/20, Loss: 0.2868, Validation Loss: 0.2772 (Best)
Epoch   8/20, Loss: 0.2618, Validation Loss: 0.2550 (Best)
Epoch   9/20, Loss: 0.2409, Validation Loss: 0.2387 (Best)
Epoch  10/20, Loss: 0.2234, Validation Loss: 0.2222 (Best)
Epoch  11/20, Loss: 0.2066, Validation Loss: 0.2069 (Best)
Epoch  12/20, Loss: 0.1924, Validation Loss: 0.1968 (Best)
Epoch  13/20, Loss: 0.1793, Validation Loss: 0.1907 (Best)
Epoch  14/20, Loss: 0.1679, Validation Loss: 0.1820 (Best)
Epoch  15/20, Loss: 0.1574, Validation Loss: 0.1718 (Best)
Epoch  16/20, Loss: 0.1482, Validation Loss: 0.1588 (Best)
Epoch  17/20, Loss: 0.1396, Validation Loss: 0.1525 (Bes

In [14]:
# Print layers and the number of parameters in each layer
total = 0
for name, param in model1.named_parameters():
    print(f"{name}:\t{param.numel()}")
    total += param.numel()
print(f"Total parameters: {total}")

fc1.weight:	100352
fc1.bias:	128
fc2.weight:	8192
fc2.bias:	64
fc3.weight:	640
fc3.bias:	10
Total parameters: 109386


In [15]:
# Model with regularization
model2 = SimpleNetWithRegularization(dropout_rate=0.2)
model2.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001, weight_decay=0.001)

epochs = 20
train_model(model2, train_loader, val_loader, loss_fn, optimizer, epochs, device)

print("Evaluating on training set...")
evaluate_model(model2, train_loader, loss_fn, device)

print("Evaluating on validation set...")
evaluate_model(model2, val_loader, loss_fn, device)

print("Evaluating on test set...")
evaluate_model(model2, test_loader, loss_fn, device)


Epoch   1/20, Loss: 0.3686, Validation Loss: 0.1510 (Best)
Epoch   2/20, Loss: 0.2035, Validation Loss: 0.1120 (Best)
Epoch   3/20, Loss: 0.1695, Validation Loss: 0.1041 (Best)
Epoch   4/20, Loss: 0.1643, Validation Loss: 0.0994 (Best)
Epoch   5/20, Loss: 0.1541, Validation Loss: 0.0942 (Best)
Epoch   6/20, Loss: 0.1499, Validation Loss: 0.0944 (Patience: 1/5)
Epoch   7/20, Loss: 0.1436, Validation Loss: 0.1004 (Patience: 2/5)
Epoch   8/20, Loss: 0.1437, Validation Loss: 0.0944 (Patience: 3/5)
Epoch   9/20, Loss: 0.1453, Validation Loss: 0.0963 (Patience: 4/5)
Epoch  10/20, Loss: 0.1431, Validation Loss: 0.0878 (Best)
Epoch  11/20, Loss: 0.1427, Validation Loss: 0.0884 (Patience: 1/5)
Epoch  12/20, Loss: 0.1405, Validation Loss: 0.0935 (Patience: 2/5)
Epoch  13/20, Loss: 0.1391, Validation Loss: 0.0897 (Patience: 3/5)
Epoch  14/20, Loss: 0.1408, Validation Loss: 0.0877 (Patience: 4/5)
Epoch  15/20, Loss: 0.1385, Validation Loss: 0.0928 (Patience: 5/5)

Early stopping triggered at epoch

In [16]:
# Print layers and the number of parameters in each layer
total = 0
for name, param in model2.named_parameters():
    print(f"{name}:\t{param.numel()}")
    total += param.numel()
print(f"Total parameters: {total}")

fc1.weight:	100352
fc1.bias:	128
bn1.weight:	128
bn1.bias:	128
fc2.weight:	8192
fc2.bias:	64
bn2.weight:	64
bn2.bias:	64
fc3.weight:	640
fc3.bias:	10
Total parameters: 109770
