## CMPE 297 Reinforcement Learning
### Deep Learning Primer

Use the convnet example provided on Canvas as a starting point and add the following two features:
1. Add He initialization and compare the training results with the base model.
2. Add Nadam optimization and compare the training results with the base model.
3. Combine the two modification and explain the overall impact of these two enhancements

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda', index=0)

In [4]:
# Hyperparameters
num_epochs = 5
num_classes = 10
batch_size = 100
learning_rate = 0.001

In [5]:
train_dataset = torchvision.datasets.MNIST(root="./minst/", train=True, # misspelled mnist
                                           transform=transforms.ToTensor(), download=True)

In [6]:
test_dataset = torchvision.datasets.MNIST(root="./minst/", train=False, 
                                          transform=transforms.ToTensor())

In [7]:
# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [8]:
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

## Default Network

In [9]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [10]:
model = ConvNet(num_classes).to(device)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 0.1642
Epoch [1/5], Step [200/600], Loss: 0.1952
Epoch [1/5], Step [300/600], Loss: 0.0635
Epoch [1/5], Step [400/600], Loss: 0.0808
Epoch [1/5], Step [500/600], Loss: 0.1633
Epoch [1/5], Step [600/600], Loss: 0.0749
Epoch [2/5], Step [100/600], Loss: 0.0548
Epoch [2/5], Step [200/600], Loss: 0.0367
Epoch [2/5], Step [300/600], Loss: 0.0284
Epoch [2/5], Step [400/600], Loss: 0.0219
Epoch [2/5], Step [500/600], Loss: 0.0054
Epoch [2/5], Step [600/600], Loss: 0.0222
Epoch [3/5], Step [100/600], Loss: 0.0041
Epoch [3/5], Step [200/600], Loss: 0.0605
Epoch [3/5], Step [300/600], Loss: 0.0118
Epoch [3/5], Step [400/600], Loss: 0.0146
Epoch [3/5], Step [500/600], Loss: 0.0274
Epoch [3/5], Step [600/600], Loss: 0.1384
Epoch [4/5], Step [100/600], Loss: 0.0365
Epoch [4/5], Step [200/600], Loss: 0.0090
Epoch [4/5], Step [300/600], Loss: 0.0298
Epoch [4/5], Step [400/600], Loss: 0.0317
Epoch [4/5], Step [500/600], Loss: 0.0057
Epoch [4/5], Step [600/600], Loss:

In [13]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 98.64 %


In [14]:
# Save the model checkpoint
torch.save(model.state_dict(), 'default.ckpt')

## He initialization

Reference: https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_uniform_

In [15]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        # Apply HE Initialization
        nn.init.kaiming_uniform_(self.layer1[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layer2[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc.weight, mode='fan_out', nonlinearity='relu')
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [16]:
model = ConvNet(num_classes).to(device)

In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 0.8453
Epoch [1/5], Step [200/600], Loss: 0.3935
Epoch [1/5], Step [300/600], Loss: 0.4081
Epoch [1/5], Step [400/600], Loss: 0.3112
Epoch [1/5], Step [500/600], Loss: 0.4268
Epoch [1/5], Step [600/600], Loss: 0.1883
Epoch [2/5], Step [100/600], Loss: 0.1137
Epoch [2/5], Step [200/600], Loss: 0.1338
Epoch [2/5], Step [300/600], Loss: 0.2112
Epoch [2/5], Step [400/600], Loss: 0.1894
Epoch [2/5], Step [500/600], Loss: 0.0248
Epoch [2/5], Step [600/600], Loss: 0.0872
Epoch [3/5], Step [100/600], Loss: 0.0534
Epoch [3/5], Step [200/600], Loss: 0.0761
Epoch [3/5], Step [300/600], Loss: 0.0813
Epoch [3/5], Step [400/600], Loss: 0.0441
Epoch [3/5], Step [500/600], Loss: 0.0121
Epoch [3/5], Step [600/600], Loss: 0.1362
Epoch [4/5], Step [100/600], Loss: 0.0373
Epoch [4/5], Step [200/600], Loss: 0.1048
Epoch [4/5], Step [300/600], Loss: 0.0045
Epoch [4/5], Step [400/600], Loss: 0.1885
Epoch [4/5], Step [500/600], Loss: 0.0039
Epoch [4/5], Step [600/600], Loss:

In [19]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 97.69 %


In [20]:
# Save the model checkpoint
torch.save(model.state_dict(), 'he_init.ckpt')

## Apply Nadam optimization

In [21]:
!pip install imageio neuralnet-pytorch

Collecting imageio
  Downloading imageio-2.9.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 1.9 MB/s eta 0:00:01
[?25hCollecting neuralnet-pytorch
  Downloading neuralnet_pytorch-0.0.3-py3-none-any.whl (29 kB)
Collecting visdom
  Downloading visdom-0.1.8.9.tar.gz (676 kB)
[K     |████████████████████████████████| 676 kB 3.4 MB/s eta 0:00:01
Collecting jsonpatch
  Downloading jsonpatch-1.26-py2.py3-none-any.whl (11 kB)
Collecting torchfile
  Downloading torchfile-0.1.0.tar.gz (5.2 kB)
Collecting websocket-client
  Downloading websocket_client-0.57.0-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 3.7 MB/s eta 0:00:01
Collecting jsonpointer>=1.9
  Downloading jsonpointer-2.0-py2.py3-none-any.whl (7.6 kB)
Building wheels for collected packages: visdom, torchfile
  Building wheel for visdom (setup.py) ... [?25ldone
[?25h  Created wheel for visdom: filename=visdom-0.1.8.9-py3-none-any.whl size=668534 sha256=654f06ef1c49385f5

In [22]:
import neuralnet_pytorch
# Ref: https://neuralnet-pytorch.readthedocs.io/en/latest/manual/optimization.html

In [23]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [24]:
model = ConvNet(num_classes).to(device)

In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = neuralnet_pytorch.NAdam(model.parameters(), lr=learning_rate)

In [26]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1. - beta1, grad)


Epoch [1/5], Step [100/600], Loss: 0.2135
Epoch [1/5], Step [200/600], Loss: 0.1246
Epoch [1/5], Step [300/600], Loss: 0.0648
Epoch [1/5], Step [400/600], Loss: 0.1498
Epoch [1/5], Step [500/600], Loss: 0.0406
Epoch [1/5], Step [600/600], Loss: 0.0714
Epoch [2/5], Step [100/600], Loss: 0.0347
Epoch [2/5], Step [200/600], Loss: 0.1074
Epoch [2/5], Step [300/600], Loss: 0.1216
Epoch [2/5], Step [400/600], Loss: 0.1042
Epoch [2/5], Step [500/600], Loss: 0.1053
Epoch [2/5], Step [600/600], Loss: 0.0048
Epoch [3/5], Step [100/600], Loss: 0.0132
Epoch [3/5], Step [200/600], Loss: 0.0411
Epoch [3/5], Step [300/600], Loss: 0.0242
Epoch [3/5], Step [400/600], Loss: 0.0403
Epoch [3/5], Step [500/600], Loss: 0.0912
Epoch [3/5], Step [600/600], Loss: 0.0205
Epoch [4/5], Step [100/600], Loss: 0.0586
Epoch [4/5], Step [200/600], Loss: 0.0093
Epoch [4/5], Step [300/600], Loss: 0.0256
Epoch [4/5], Step [400/600], Loss: 0.0199
Epoch [4/5], Step [500/600], Loss: 0.0215
Epoch [4/5], Step [600/600], Loss:

In [27]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 98.62 %


In [28]:
# Save the model checkpoint
torch.save(model.state_dict(), 'nadam.ckpt')

## Nadam and HE initialization combined

In [29]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        # Apply HE Initialization
        nn.init.kaiming_uniform_(self.layer1[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layer2[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc.weight, mode='fan_out', nonlinearity='relu')
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [30]:
model = ConvNet(num_classes).to(device)

In [31]:
criterion = nn.CrossEntropyLoss()
optimizer = neuralnet_pytorch.NAdam(model.parameters(), lr=learning_rate)

In [32]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 1.4416
Epoch [1/5], Step [200/600], Loss: 0.5379
Epoch [1/5], Step [300/600], Loss: 0.4891
Epoch [1/5], Step [400/600], Loss: 0.1039
Epoch [1/5], Step [500/600], Loss: 0.4146
Epoch [1/5], Step [600/600], Loss: 0.2820
Epoch [2/5], Step [100/600], Loss: 0.2315
Epoch [2/5], Step [200/600], Loss: 0.1208
Epoch [2/5], Step [300/600], Loss: 0.0760
Epoch [2/5], Step [400/600], Loss: 0.1192
Epoch [2/5], Step [500/600], Loss: 0.1658
Epoch [2/5], Step [600/600], Loss: 0.0931
Epoch [3/5], Step [100/600], Loss: 0.1842
Epoch [3/5], Step [200/600], Loss: 0.0097
Epoch [3/5], Step [300/600], Loss: 0.0518
Epoch [3/5], Step [400/600], Loss: 0.1655
Epoch [3/5], Step [500/600], Loss: 0.0483
Epoch [3/5], Step [600/600], Loss: 0.0426
Epoch [4/5], Step [100/600], Loss: 0.0600
Epoch [4/5], Step [200/600], Loss: 0.1235
Epoch [4/5], Step [300/600], Loss: 0.1638
Epoch [4/5], Step [400/600], Loss: 0.0333
Epoch [4/5], Step [500/600], Loss: 0.1088
Epoch [4/5], Step [600/600], Loss:

In [33]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 97.28 %


In [34]:
# Save the model checkpoint
torch.save(model.state_dict(), 'he_nadam_combined.ckpt')

### Writeup

| Configuration           | Accuracy |
|-------------------|-------|
| Default           | 98.64 |
| He initialization | 97.69 |
| Nadam optim       | 98.62 |
| He + Nadam        | 97.28 |