In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Device configuration: run on the GPU when CUDA is usable, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyper-parameters
input_size = 784        # length of one flattened input vector fed to the first layer
hidden_size = 500       # width of the hidden layer
num_classes = 10        # number of scores the network outputs per sample
num_epochs = 1          # full passes over the training set
batch_size = 100        # samples per mini-batch for both loaders
learning_rate = 0.001   # step size handed to the optimizer

# MNIST dataset (only the training split requests a download into ../data)
train_dataset = torchvision.datasets.MNIST(
    root='../data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

test_dataset = torchvision.datasets.MNIST(
    root='../data',
    train=False,
    transform=transforms.ToTensor(),
)

# Data loaders: training batches are shuffled, test batches keep dataset order
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    """Feed-forward network: linear layer, ReLU, then a second linear layer.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of scores produced for each input vector.
    """

    def __init__(self, input_size, hidden_size, num_classes) -> None:
        super().__init__()
        # Layers are created in the order they are applied in ``forward``.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw output of the second linear layer for ``x``.

        No softmax is applied here; the result is whatever ``fc2`` emits.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
# Build the network, then move its parameters to the configured device
model = NeuralNet(input_size, hidden_size, num_classes)
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)  # number of mini-batches per epoch
for epoch in range(num_epochs):
    # Steps are counted from 1 so the counter can be logged directly.
    for step, (images, labels) in enumerate(train_loader, start=1):
        # Flatten each image to a 28*28 vector and move the batch to the device
        images = images.reshape(-1, 28 * 28).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, backpropagate, update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Only log every 100th step
        if step % 100 != 0:
            continue
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, step, total_step, loss.item()))
# Test the model
# Gradients are not needed for evaluation, so they are disabled to save memory.
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        # Flatten each image to a 28*28 vector, as was done during training
        images = images.reshape(-1, 28 * 28).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # torch.max along dim 1 returns (max values, indices of the max).
        # `position` therefore holds the per-sample maximum scores, while
        # `predicted` holds the index of that maximum, i.e. the predicted class.
        position, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Report once, after every batch has been counted. Printing inside the
    # loop would emit a running partial accuracy for each batch under a
    # message that claims to cover the whole test set.
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
# torch.save(model.state_dict(), 'model.ckpt')

Epoch [1/1], Step [100/600], Loss: 0.3349
Epoch [1/1], Step [200/600], Loss: 0.2878
Epoch [1/1], Step [300/600], Loss: 0.2518
Epoch [1/1], Step [400/600], Loss: 0.2729
Epoch [1/1], Step [500/600], Loss: 0.3009
Epoch [1/1], Step [600/600], Loss: 0.1985
tensor([ 9.3139,  7.9711,  5.3000,  9.7696,  5.5520,  6.4282,  6.1638,  3.9411,
         4.1401,  6.0941,  8.7580,  3.8585,  4.8550,  8.7746,  7.4626,  4.8847,
         5.5072, 10.1513,  3.5605,  6.4902,  5.2220,  6.1701,  4.2951,  7.4851,
         4.7876, 13.6762,  7.3241,  6.8057, 11.3670,  5.1448,  7.7864,  4.5690,
         6.7488,  3.4413,  8.0540,  8.1771,  7.9047,  6.1554,  4.0920,  5.6182,
         4.5257,  6.8406,  6.3633,  4.3204,  4.4233,  4.3440,  4.9474,  6.2645,
         9.4521,  7.3716,  6.5807,  6.6081,  4.3724,  3.7100,  6.8963,  7.6057,
         9.0066,  5.0082,  5.6643,  2.9526, 10.6298,  4.8379,  2.2741,  5.6855,
         8.4844,  3.9979,  1.7085,  6.2493,  8.4133,  9.0932, 10.5384, 12.9735,
         7.7507,  4.2259,  5