## CMPE 297 Reinforcement Learning
### Deep Learning Primer

Use the convnet example provided on Canvas as a starting point and add the following two features:
1. Add He initialization and compare the training results with the base model.
2. Add Nadam optimization and compare the training results with the base model.
3. Combine the two modification and explain the overall impact of these two enhancements

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda', index=0)

In [4]:
# Hyperparameters
num_epochs = 5
num_classes = 10
batch_size = 100
learning_rate = 0.001

In [6]:
train_dataset = torchvision.datasets.MNIST(root="./minst/", train=True, # misspelled mnist
                                           transform=transforms.ToTensor(), download=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./minst/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Extracting ./minst/MNIST/raw/train-images-idx3-ubyte.gz to ./minst/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./minst/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Extracting ./minst/MNIST/raw/train-labels-idx1-ubyte.gz to ./minst/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./minst/MNIST/raw/t10k-images-idx3-ubyte.gz



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Extracting ./minst/MNIST/raw/t10k-images-idx3-ubyte.gz to ./minst/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./minst/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Extracting ./minst/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./minst/MNIST/raw
Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [8]:
test_dataset = torchvision.datasets.MNIST(root="./minst/", train=False, 
                                          transform=transforms.ToTensor())

In [9]:
# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

## Default Network

In [15]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        return out




In [16]:
model = ConvNet(num_classes).to(device)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 2.1873
Epoch [1/5], Step [200/600], Loss: 2.3314
Epoch [1/5], Step [300/600], Loss: 2.1164
Epoch [1/5], Step [400/600], Loss: 2.0874
Epoch [1/5], Step [500/600], Loss: 2.0558
Epoch [1/5], Step [600/600], Loss: 2.2396
Epoch [2/5], Step [100/600], Loss: 2.0795
Epoch [2/5], Step [200/600], Loss: 2.0834
Epoch [2/5], Step [300/600], Loss: 2.1748
Epoch [2/5], Step [400/600], Loss: 1.9754
Epoch [2/5], Step [500/600], Loss: 2.0241
Epoch [2/5], Step [600/600], Loss: 1.9306
Epoch [3/5], Step [100/600], Loss: 2.0278
Epoch [3/5], Step [200/600], Loss: 2.1669
Epoch [3/5], Step [300/600], Loss: 2.2082
Epoch [3/5], Step [400/600], Loss: 1.9605
Epoch [3/5], Step [500/600], Loss: 1.9214
Epoch [3/5], Step [600/600], Loss: 1.9061
Epoch [4/5], Step [100/600], Loss: 1.9747
Epoch [4/5], Step [200/600], Loss: 2.0738
Epoch [4/5], Step [300/600], Loss: 1.9923
Epoch [4/5], Step [400/600], Loss: 1.9831
Epoch [4/5], Step [500/600], Loss: 1.9613
Epoch [4/5], Step [600/600], Loss:

In [26]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 31.97 %


In [27]:
# Save the model checkpoint
torch.save(model.state_dict(), 'default.ckpt')

## He initialization

Reference: https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_uniform_

In [58]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        # Apply HE Initialization
        nn.init.kaiming_uniform_(self.layer1[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layer2[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc.weight, mode='fan_out', nonlinearity='relu')
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        return out

In [59]:
model = ConvNet(num_classes).to(device)

In [60]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [61]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 4.6500
Epoch [1/5], Step [200/600], Loss: 3.9968
Epoch [1/5], Step [300/600], Loss: 3.4714
Epoch [1/5], Step [400/600], Loss: 3.1483
Epoch [1/5], Step [500/600], Loss: 2.9116
Epoch [1/5], Step [600/600], Loss: 2.7317
Epoch [2/5], Step [100/600], Loss: 3.0870
Epoch [2/5], Step [200/600], Loss: 2.6206
Epoch [2/5], Step [300/600], Loss: 2.7175
Epoch [2/5], Step [400/600], Loss: 2.5413
Epoch [2/5], Step [500/600], Loss: 2.6289
Epoch [2/5], Step [600/600], Loss: 2.5128
Epoch [3/5], Step [100/600], Loss: 2.6133
Epoch [3/5], Step [200/600], Loss: 2.4364
Epoch [3/5], Step [300/600], Loss: 2.2037
Epoch [3/5], Step [400/600], Loss: 2.3789
Epoch [3/5], Step [500/600], Loss: 2.2477
Epoch [3/5], Step [600/600], Loss: 2.2576
Epoch [4/5], Step [100/600], Loss: 2.2401
Epoch [4/5], Step [200/600], Loss: 2.2923
Epoch [4/5], Step [300/600], Loss: 2.2653
Epoch [4/5], Step [400/600], Loss: 2.3265
Epoch [4/5], Step [500/600], Loss: 2.1498
Epoch [4/5], Step [600/600], Loss:

In [62]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 30.34 %


In [42]:
# Save the model checkpoint
torch.save(model.state_dict(), 'he_init.ckpt')

## Apply Nadam optimization

In [66]:
!pip install neuralnet-pytorch imageio

Collecting neuralnet-pytorch
  Downloading neuralnet_pytorch-0.0.3-py3-none-any.whl (29 kB)
Collecting visdom
  Downloading visdom-0.1.8.9.tar.gz (676 kB)
[K     |████████████████████████████████| 676 kB 3.2 MB/s eta 0:00:01
Collecting jsonpatch
  Downloading jsonpatch-1.26-py2.py3-none-any.whl (11 kB)
Collecting torchfile
  Downloading torchfile-0.1.0.tar.gz (5.2 kB)
Collecting websocket-client
  Downloading websocket_client-0.57.0-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 3.1 MB/s eta 0:00:01
Collecting jsonpointer>=1.9
  Downloading jsonpointer-2.0-py2.py3-none-any.whl (7.6 kB)
Building wheels for collected packages: visdom, torchfile
  Building wheel for visdom (setup.py) ... [?25ldone
[?25h  Created wheel for visdom: filename=visdom-0.1.8.9-py3-none-any.whl size=668534 sha256=99c5731a9ce541d375a6fd930767b59d28dedb8feb43e3045f148ff18c14bba3
  Stored in directory: /root/.cache/pip/wheels/2d/cd/fb/005445070865d4e45365b2946ee88085a7392370f152cf

In [72]:
import neuralnet_pytorch

In [73]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        return out

In [74]:
model = ConvNet(num_classes).to(device)

In [77]:
criterion = nn.CrossEntropyLoss()
optimizer = neuralnet_pytorch.NAdam(model.parameters(), lr=learning_rate)

In [78]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1. - beta1, grad)


Epoch [1/5], Step [100/600], Loss: 5.2880
Epoch [1/5], Step [200/600], Loss: 4.4494
Epoch [1/5], Step [300/600], Loss: 4.0515
Epoch [1/5], Step [400/600], Loss: 3.5833
Epoch [1/5], Step [500/600], Loss: 3.6037
Epoch [1/5], Step [600/600], Loss: 3.3639
Epoch [2/5], Step [100/600], Loss: 3.3315
Epoch [2/5], Step [200/600], Loss: 3.2858
Epoch [2/5], Step [300/600], Loss: 3.1180
Epoch [2/5], Step [400/600], Loss: 2.9006
Epoch [2/5], Step [500/600], Loss: 2.7481
Epoch [2/5], Step [600/600], Loss: 2.8206
Epoch [3/5], Step [100/600], Loss: 2.5571
Epoch [3/5], Step [200/600], Loss: 2.6786
Epoch [3/5], Step [300/600], Loss: 2.5965
Epoch [3/5], Step [400/600], Loss: 2.7189
Epoch [3/5], Step [500/600], Loss: 2.5686
Epoch [3/5], Step [600/600], Loss: 2.3803
Epoch [4/5], Step [100/600], Loss: 2.4954
Epoch [4/5], Step [200/600], Loss: 2.5260
Epoch [4/5], Step [300/600], Loss: 2.4473
Epoch [4/5], Step [400/600], Loss: 2.2835
Epoch [4/5], Step [500/600], Loss: 2.3270
Epoch [4/5], Step [600/600], Loss:

In [79]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 28.17 %


In [80]:
# Save the model checkpoint
torch.save(model.state_dict(), 'nadam.ckpt')

## Nadam and HE initialization combined

In [81]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
          nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(16),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
          nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        # Apply HE Initialization
        nn.init.kaiming_uniform_(self.layer1[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layer2[0].weight, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc.weight, mode='fan_out', nonlinearity='relu')
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        return out

In [82]:
model = ConvNet(num_classes).to(device)

In [83]:
criterion = nn.CrossEntropyLoss()
optimizer = neuralnet_pytorch.NAdam(model.parameters(), lr=learning_rate)

In [84]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 5.3969
Epoch [1/5], Step [200/600], Loss: 4.6659
Epoch [1/5], Step [300/600], Loss: 4.1772
Epoch [1/5], Step [400/600], Loss: 4.0554
Epoch [1/5], Step [500/600], Loss: 3.8239
Epoch [1/5], Step [600/600], Loss: 3.3925
Epoch [2/5], Step [100/600], Loss: 3.0167
Epoch [2/5], Step [200/600], Loss: 3.2880
Epoch [2/5], Step [300/600], Loss: 2.9667
Epoch [2/5], Step [400/600], Loss: 3.2483
Epoch [2/5], Step [500/600], Loss: 2.9733
Epoch [2/5], Step [600/600], Loss: 2.5645
Epoch [3/5], Step [100/600], Loss: 2.6773
Epoch [3/5], Step [200/600], Loss: 2.6239
Epoch [3/5], Step [300/600], Loss: 2.7314
Epoch [3/5], Step [400/600], Loss: 2.6112
Epoch [3/5], Step [500/600], Loss: 2.6661
Epoch [3/5], Step [600/600], Loss: 2.6085
Epoch [4/5], Step [100/600], Loss: 2.2669
Epoch [4/5], Step [200/600], Loss: 2.3850
Epoch [4/5], Step [300/600], Loss: 2.3502
Epoch [4/5], Step [400/600], Loss: 2.5026
Epoch [4/5], Step [500/600], Loss: 2.3448
Epoch [4/5], Step [600/600], Loss:

In [85]:
# Test the model
model.eval
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 27.53 %


In [86]:
# Save the model checkpoint
torch.save(model.state_dict(), 'he_nadam_combined.ckpt')

### Writeup