# Model Playground

**Various tests and small experiments on toy networks.**

In [22]:
from importlib import reload  # `imp` is deprecated and was removed in Python 3.12

import nupic.research.frameworks.dynamic_sparse.networks as networks
import nupic.research.frameworks.dynamic_sparse.networks.layers as layers

# Re-import the modules under development so edits are picked up without
# restarting the kernel. Previously both aliases pointed at the `layers`
# submodule, so it was reloaded twice and the `networks` package never was.
# The submodule is reloaded first, then the package that contains it.
reload(layers);
reload(networks);

In [25]:
from collections import OrderedDict

import numpy as np
import torch
from torchvision import models
from nupic.research.frameworks.dynamic_sparse.networks.layers import DSConv2d
from nupic.torch.models.sparse_cnn import gsc_sparse_cnn, gsc_super_sparse_cnn, GSCSparseCNN, MNISTSparseCNN
from nupic.research.frameworks.dynamic_sparse.networks import mnist_sparse_dscnn, GSCSparseFullCNN, gsc_sparse_dscnn_fullyconv
from torchsummary import summary

from torchviz import make_dot

# Load Models

In [540]:
# Instantiate the toy networks inspected below. Alternatives that are not
# currently needed stay commented out.
# resnet18 = models.resnet18()
alexnet = models.alexnet()
# mnist_scnn = MNISTSparseCNN()
gsc_scnn = GSCSparseCNN()
# dscnn = mnist_sparse_dscnn({})
# `gscf` is required by the forward-pass / summary cell that follows; leaving
# it commented out makes that cell fail with a NameError on a fresh kernel.
gscf = gsc_sparse_dscnn_fullyconv({'prune_methods': ["none", "static"]})  # alt: GSCSparseFullCNN(cnn_out_channels=(32, 64, 1))

In [541]:
# Echo the repr of one model; uncomment a different line to inspect another.
# resnet18
# alexnet
# mnist_scnn
gsc_scnn
# dscnn
# gscf

GSCSparseCNN(
  (cnn1): Conv2d(1, 64, kernel_size=(5, 5), stride=(1, 1))
  (cnn1_batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (cnn1_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn1_kwinner): KWinners2d(channels=64, n=0, percent_on=0.095, boost_strength=1.5, boost_strength_factor=0.9, k_inference_factor=1.5, duty_cycle_period=1000)
  (cnn2): Conv2d(64, 64, kernel_size=(5, 5), stride=(1, 1))
  (cnn2_batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (cnn2_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn2_kwinner): KWinners2d(channels=64, n=0, percent_on=0.125, boost_strength=1.5, boost_strength_factor=0.9, k_inference_factor=1.5, duty_cycle_period=1000)
  (flatten): Flatten()
  (linear): SparseWeights(
    weight_sparsity=0.4
    (module): Linear(in_features=1600, out_features=1000, bias=True)
  )
  (linear_bn

In [412]:
# Random batch of two single-channel 32x32 inputs.
inp = torch.rand(2, 1, 32, 32)
# Smoke-test a forward pass through each model. Neither `.shape` is the last
# expression of the cell, so neither value is displayed.
gsc_scnn(inp).shape
gscf(inp).shape

# Print the per-layer output shapes and parameter counts for a (1, 32, 32) input.
summary(gscf, input_size=(1, 32, 32))

hi <class 'nupic.torch.models.sparse_cnn.GSCSparseCNN'>
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             832
       BatchNorm2d-2           [-1, 32, 28, 28]               0
        KWinners2d-3           [-1, 32, 28, 28]               0
         MaxPool2d-4           [-1, 32, 14, 14]               0
      SparseConv2d-5           [-1, 64, 10, 10]          51,264
       BatchNorm2d-6           [-1, 64, 10, 10]               0
        KWinners2d-7           [-1, 64, 10, 10]               0
         MaxPool2d-8             [-1, 64, 5, 5]               0
           Flatten-9                 [-1, 1600]               0
           Linear-10                   [-1, 12]          19,212
       LogSoftmax-11                   [-1, 12]               0
Total params: 71,308
Trainable params: 71,308
Non-trainable params: 0
-----------------------------------------

# Fun with sequentials.

In [336]:
# Shared layer spec: one named 3x3 convolution. It was previously undefined in
# this notebook (it only existed as leftover kernel state). Both `sq1` and the
# inner Sequential of `sq2` are built from it, so they wrap the same Conv2d.
od = OrderedDict([('cnn1', torch.nn.Conv2d(3, 3, 3))])

# Named Sequential nested inside a named Sequential.
sq0 = torch.nn.Sequential(OrderedDict([('sq1', torch.nn.Sequential(OrderedDict([('cnn1', torch.nn.Conv2d(3, 3, 3))])) )]))
# Named children only.
sq1 = torch.nn.Sequential(od)
# Positional (auto-numbered) children, the first of which has a named child.
sq2 = torch.nn.Sequential(torch.nn.Sequential(od), torch.nn.Conv2d(3, 3, 3))
# Named container holding both of the above.
sq3 = torch.nn.Sequential(OrderedDict([('sq1', sq1), ('sq2', sq2)]))
# Positional wrapper around everything; its only child is named '0'.
sq4 = torch.nn.Sequential(sq3)

In [339]:
# Walk every (sub)module of `sq4` and show, for each dotted path component,
# whether it is an auto-generated numeric index or an explicit name.
for qualified_name, module in sq4.named_modules():
    components = qualified_name.split('.')
    print([component.isdigit() for component in components])
    print('name')
    print(qualified_name, module)

# Alternative: iterate over the direct children of `sq2` only.
# for n, m in sq2._modules.items():
#     print(n, m)

[False]
name
 Sequential(
  (0): Sequential(
    (sq1): Sequential(
      (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
    )
    (sq2): Sequential(
      (0): Sequential(
        (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
      )
      (1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
    )
  )
)
[True]
name
0 Sequential(
  (sq1): Sequential(
    (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
  )
  (sq2): Sequential(
    (0): Sequential(
      (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
    )
    (1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
  )
)
[True, False]
name
0.sq1 Sequential(
  (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
)
[True, False, False]
name
0.sq1.cnn1 Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
[True, False]
name
0.sq2 Sequential(
  (0): Sequential(
    (cnn1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
  )
  (1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1))
)
[True, False, True]
name
0.sq2.0

## Fun with grads

In [114]:
# Three leaf tensors; v1 is all zeros so only v2 contributes to the numerator.
v1 = torch.zeros(3, requires_grad=True)
v2 = torch.tensor([1., 2., 3.], requires_grad=True)
v3 = torch.tensor([5.], requires_grad=True)

numerator = v1.sum() + v2.sum()
v4 = numerator / v3

# Scale the gradient flowing into v3 by 1.5 (not a doubling): the raw value
# -6/25 = -0.24 becomes -0.36.
h = v3.register_hook(lambda grad: 1.5 * grad)

v4.backward(torch.ones(1))
v1.grad, v2.grad, v3.grad

(tensor([0.2000, 0.2000, 0.2000]),
 tensor([0.2000, 0.2000, 0.2000]),
 tensor([-0.3600]))

In [118]:
# Same experiment with a non-zero v1 and a larger hook multiplier.
v1 = torch.tensor([1., 4., 1.], requires_grad=True)
v2 = torch.tensor([1., 2., 3.], requires_grad=True)
v3 = torch.tensor([5.], requires_grad=True)

numerator = v1.sum() + v2.sum()
v4 = numerator / v3

# Scale the gradient flowing into v3 by 3 (not a doubling): the raw value
# -12/25 = -0.48 becomes -1.44.
h = v3.register_hook(lambda grad: 3.0 * grad)

v4.backward(torch.ones(1))
v1.grad, v2.grad, v3.grad

(tensor([0.2000, 0.2000, 0.2000]),
 tensor([0.2000, 0.2000, 0.2000]),
 tensor([-1.4400]))

# Wide RESNET

In [262]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Residual unit: BN -> ReLU -> 3x3 conv, twice, plus a shortcut connection.

    When the input and output channel counts match, the shortcut is the raw
    input. Otherwise the shortcut is a 1x1 convolution applied to the
    normalised, activated input.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: stride of the first 3x3 convolution (and of the 1x1 shortcut).
        dropRate: dropout probability applied between the two convolutions;
            dropout is skipped when it is not greater than zero.
    """

    def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
        super().__init__()
        conv3x3 = dict(kernel_size=3, padding=1, bias=False)

        # Modules are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unchanged.
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_planes, out_planes, stride=stride, **conv3x3)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, stride=1, **conv3x3)
        self.droprate = dropRate
        self.equalInOut = in_planes == out_planes
        if self.equalInOut:
            self.convShortcut = None
        else:
            self.convShortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                          stride=stride, padding=0, bias=False)

    def forward(self, x):
        """Return the shortcut branch plus the residual branch computed from `x`."""
        # Both branches start from the normalised + activated input.
        activated = self.relu1(self.bn1(x))

        out = self.relu2(self.bn2(self.conv1(activated)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, training=self.training)
        out = self.conv2(out)

        if self.equalInOut:
            shortcut = x
        else:
            shortcut = self.convShortcut(activated)
        return torch.add(shortcut, out)

class NetworkBlock(nn.Module):
    """A stack of `nb_layers` blocks applied one after another.

    Args:
        nb_layers: number of blocks to stack; coerced with `int()` so that
            float counts from callers are accepted.
        in_planes: input channels of the first block.
        out_planes: output channels of every block (and input channels of all
            blocks after the first).
        block: callable `block(in_planes, out_planes, stride, dropRate)`
            returning an `nn.Module`.
        stride: stride of the first block; later blocks use stride 1.
        dropRate: dropout probability forwarded to every block.
    """

    def __init__(self, nb_layers, in_planes, out_planes, block, stride, dropRate=0.0):
        super().__init__()
        self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, dropRate)

    def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, dropRate):
        """Build the sequential stack; only the first block changes width/stride."""
        # Conditional expressions replace the `cond and a or b` idiom, which
        # silently falls through to `b` whenever `a` is falsy.
        blocks = [
            block(
                in_planes if index == 0 else out_planes,
                out_planes,
                stride if index == 0 else 1,
                dropRate,
            )
            for index in range(int(nb_layers))
        ]
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Run `x` through the stacked blocks."""
        return self.layer(x)

class WideResNet(nn.Module):
    """Wide residual network built from three groups of `BasicBlock`s.

    Args:
        depth: total depth; must satisfy `(depth - 4) % 6 == 0`. Each of the
            three groups gets `(depth - 4) // 6` blocks.
        num_classes: size of the final linear layer's output.
        widen_factor: multiplier applied to the base widths 16 / 32 / 64.
        dropRate: dropout probability forwarded to every block.

    Raises:
        AssertionError: if `depth` is not of the form `6n + 4`.
    """

    def __init__(self, depth, num_classes, widen_factor=1, dropRate=0.0):
        super().__init__()
        nChannels = [16, 16*widen_factor, 32*widen_factor, 64*widen_factor]
        assert (depth - 4) % 6 == 0, "depth must be of the form 6n + 4"
        # Floor division keeps the per-group block count an int; true division
        # produced a float that only worked because NetworkBlock coerces it.
        n = (depth - 4) // 6
        block = BasicBlock
        # 1st conv before any network block
        self.conv1 = nn.Conv2d(3, nChannels[0], kernel_size=3, stride=1,
                               padding=1, bias=False)
        # 1st block (stride 1: keeps spatial resolution)
        self.block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, dropRate)
        # 2nd block (stride 2)
        self.block2 = NetworkBlock(n, nChannels[1], nChannels[2], block, 2, dropRate)
        # 3rd block (stride 2)
        self.block3 = NetworkBlock(n, nChannels[2], nChannels[3], block, 2, dropRate)
        # global average pooling and classifier
        self.bn1 = nn.BatchNorm2d(nChannels[3])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(nChannels[3], num_classes)
        self.nChannels = nChannels[3]

        # Weight initialisation: Kaiming-normal for convolutions, unit scale /
        # zero shift for batch norm, zero bias for the linear classifier.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.bias.data.zero_()

    def forward(self, x):
        """Return the `fc` output for a batch of 3-channel images."""
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        # Fixed 8x8 pooling window followed by a flatten to `nChannels` columns.
        # NOTE(review): presumably expects an 8x8 feature map here (e.g. 32x32
        # inputs); other sizes would change the number of rows - confirm.
        out = F.avg_pool2d(out, 8)
        out = out.view(-1, self.nChannels)
        return self.fc(out)

In [267]:
WideResNet(16, 10)

WideResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (block1): NetworkBlock(
    (layer): Sequential(
      (0): BasicBlock(
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (1): BasicBlock(
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
      

# Fun with Learning Rates and Decays

In [586]:
import torch
import numpy as np

np.random.seed(123)
np.set_printoptions(8, suppress=True)

# Two independent but numerically identical copies of the input and of the
# weights: copy 1 is trained without weight decay, copy 2 with it.
x_numpy = np.random.random((3, 4)).astype(np.double)
x_torch = torch.tensor(x_numpy, requires_grad=True)
x_torch2 = torch.tensor(x_numpy, requires_grad=True)

w_numpy = np.random.random((4, 5)).astype(np.double)
w_torch = torch.tensor(w_numpy, requires_grad=True)
w_torch2 = torch.tensor(w_numpy, requires_grad=True)

def log_grad(grad):
    """Print the gradient as it reaches the hooked tensor during backward."""
    print(grad)

w_torch.register_hook(log_grad)
w_torch2.register_hook(log_grad)

lr = 0.00001
weight_decay = 0.9
sgd = torch.optim.SGD([w_torch], lr=lr, weight_decay=0)
sgd2 = torch.optim.SGD([w_torch2], lr=lr, weight_decay=weight_decay)

y_torch = torch.matmul(x_torch, w_torch)
y_torch2 = torch.matmul(x_torch2, w_torch2)

loss = y_torch.sum()
loss2 = y_torch2.sum()

sgd.zero_grad()
sgd2.zero_grad()

loss.backward()
loss2.backward()

sgd.step()
sgd2.step()

w_grad = w_torch.grad.data.numpy()
w_grad2 = w_torch2.grad.data.numpy()

# Effective gradient that the decayed optimizer applied, recovered from the
# parameter update itself: w_new = w - lr * (grad + weight_decay * w).
# Reading it back from `.grad` only works if the optimizer adds the decay
# term to `.grad` in place, which is an implementation detail.
w_update2 = (w_numpy - w_torch2.detach().numpy()) / lr

# Both printed matrices should agree: removing the decay term from the
# effective update leaves the plain gradient.
print("check_grad")
print(w_grad)
print(w_update2 - weight_decay * w_numpy)


tensor([[1.8969, 1.8969, 1.8969, 1.8969, 1.8969],
        [1.1014, 1.1014, 1.1014, 1.1014, 1.1014],
        [1.5508, 1.5508, 1.5508, 1.5508, 1.5508],
        [1.9652, 1.9652, 1.9652, 1.9652, 1.9652]], dtype=torch.float64)
tensor([[1.8969, 1.8969, 1.8969, 1.8969, 1.8969],
        [1.1014, 1.1014, 1.1014, 1.1014, 1.1014],
        [1.5508, 1.5508, 1.5508, 1.5508, 1.5508],
        [1.9652, 1.9652, 1.9652, 1.9652, 1.9652]], dtype=torch.float64)
check_grad
[[1.89687006 1.89687006 1.89687006 1.89687006 1.89687006]
 [1.10136331 1.10136331 1.10136331 1.10136331 1.10136331]
 [1.55079367 1.55079367 1.55079367 1.55079367 1.55079367]
 [1.96519422 1.96519422 1.96519422 1.96519422 1.96519422]]
[[1.89687006 1.89687006 1.89687006 1.89687006 1.89687006]
 [1.10136331 1.10136331 1.10136331 1.10136331 1.10136331]
 [1.55079367 1.55079367 1.55079367 1.55079367 1.55079367]
 [1.96519422 1.96519422 1.96519422 1.96519422 1.96519422]]


# More fun with Gradients

In [1]:
import torch

# -----------------
# Helper function
# -----------------
def shape(t):
    """Return `t.shape`, or for a tuple the tuple of its items' shapes.

    `None` entries inside a tuple are passed through as `None`.
    """
    if not isinstance(t, tuple):
        return t.shape
    return tuple(None if item is None else item.shape for item in t)

# -----------------
# Grad hooks
# -----------------

# Zeros grad for weights
def w_hook(grad):
    """Log the weight gradient's shape and replace the gradient with zeros.

    Returns a new zero tensor with the same shape, dtype and device as `grad`.
    The incoming tensor is left untouched: tensor hooks should not modify
    their argument, only optionally return a replacement.
    """
    print(' '*8, 'w-grad shape = ', shape(grad))
    return torch.zeros_like(grad)

# No change for biases.
def b_hook(grad):
    """Log the bias gradient's shape and pass the gradient through as-is."""
    indent = ' ' * 8
    print(indent, 'b-grad shape = ', shape(grad))
    return grad

# -----------------------
# Test layers with biases
# -----------------------

# The following should confirm whether non-zero biases with non-zero gradient flows
# yield changes to the weights of the layer - independent of those weights' grad flows.

# One convolutional and one fully-connected layer, each with a random input.
layer1 = torch.nn.Conv2d(3, 3, 3)
layer2 = torch.nn.Linear(10, 100)
in1 = torch.rand(10, 3, 10, 10)
in2 = torch.rand(10, 10, 10)

pad_section = ' ' * 5
pad_detail = ' ' * 8

for layer, input_ in ((layer1, in1), (layer2, in2)):

    print('-------', type(layer).__name__, '--------\n')

    # The weight hook zeroes the gradient; the bias hook only logs it.
    layer.weight.register_hook(w_hook)
    layer.bias.register_hook(b_hook)

    optim = torch.optim.SGD(layer.parameters(), lr=0.01)

    # Start from a known state: every weight and bias equal to 1.
    with torch.no_grad():
        layer.weight.fill_(1)
        layer.bias.fill_(1)

    optim.zero_grad()
    o = layer(input_)
    loss = o.mean()

    print(pad_section, 'Computing grads...')
    loss.backward()
    optim.step()

    # After one SGD step the weights should still all be 1 (their gradient
    # was zeroed by the hook) while the biases should have moved.
    weights_unchanged = torch.eq(layer.weight, 1).all()
    biases_unchanged = torch.eq(layer.bias, 1).all()
    print()
    print(pad_section, 'Checking results...')
    print(pad_detail, 'Optimized weight - All close to 1:', weights_unchanged)
    print(pad_detail, 'Optimized Bias - All close to 1:', biases_unchanged)
    print()


------- Conv2d --------

      Computing grads...
         w-grad shape =  torch.Size([3, 3, 3, 3])
         b-grad shape =  torch.Size([3])

      Checking results...
         Optimized weight - All close to 1: tensor(True)
         Optimized Bias - All close to 1: tensor(False)

------- Linear --------

      Computing grads...
         b-grad shape =  torch.Size([100])
         w-grad shape =  torch.Size([100, 10])

      Checking results...
         Optimized weight - All close to 1: tensor(True)
         Optimized Bias - All close to 1: tensor(False)

