In [0]:
import torch

In [0]:
def simple_batch_norm_1d(x, gamma, beta):
    eps = 1e-5
    x_mean = torch.mean(x, dim=0, keepdim=True)
    x_var = torch.mean((x - x_mean) ** 2, dim =0, keepdim=True)
    x_hat = (x - x_mean) / torch.sqrt(x_var +eps)
    return gamma.view_as(x_mean) * x_hat + beta.view_as(x_mean)

In [6]:
x = torch.arange(15).view(5,3)
gamma = torch.ones(x.shape[1])
beta = torch.zeros(x.shape[1])
print('before bn:')
print(x)
y = simple_batch_norm_1d(x.float(), gamma, beta)
print('after bn:')
print(y)

before bn:
tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11],
        [12, 13, 14]])
after bn:
tensor([[-1.4142, -1.4142, -1.4142],
        [-0.7071, -0.7071, -0.7071],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.7071,  0.7071,  0.7071],
        [ 1.4142,  1.4142,  1.4142]])


In [0]:
def batch_norm_1d(x,gamma, beta, is_training, moving_mean, moving_var, moving_momentum=0.1):
    eps = 1e-5
    x_mean = torch.mean(x, dim=0, keepdim=True)
    x_var = torch.mean((x - x_mean) ** 2, dim=0, keepdim=True)
    if is_training:
        x_hat = (x - x_mean) / torch.sqrt(x_var + eps)
        moving_mean[:] = moving_momentum * moving_mean + (1. - moving_momentum) * x_mean
        moving_var[:] = moving_momentum * moving_var +(1. - moving_momentum) * x_var
    else:
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    return gamma.view_as(x_mean) * x_hat + beta.view_as(x_mean)

In [0]:
import numpy as np
from torchvision.datasets import mnist
from torch.utils.data import DataLoader
from torch import nn
from torch.autograd import Variable

In [0]:
def data_tf(x):
    x = np.array(x, dtype='float32') / 255
    x = (x - 0.5) / 0.5
    x = x.reshape((-1,))
    x = torch.from_numpy(x)
    return x

In [11]:
train_set = mnist.MNIST('./data', train=True, download=True, transform=data_tf)
test_set = mnist.MNIST('./data', train=False, download=True, transform=data_tf)
train_data = DataLoader(train_set, batch_size=64, shuffle=True)
test_data = DataLoader(test_set, batch_size=128, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [0]:
class multi_network(nn.Module):
    def __init__(self):
        super(multi_network, self).__init__()
        self.layer1 = nn.Linear(784, 100)
        self.relu = nn.ReLU(True)
        self.layer2 = nn.Linear(100, 10)
        
        self.gamma = nn.Parameter(torch.randn(100))
        self.beta = nn.Parameter(torch.randn(100))
        if torch.cuda.is_available():
            self.moving_mean = Variable(torch.zeros(100)).cuda()
            self.moving_var = Variable(torch.zeros(100)).cuda()
        else:
            self.moving_mean = Variable(torch.zeros(100))
            self.moving_var = Variable(torch.zeros(100))
        
    def forward(self, x, is_train=True):
        x = self.layer1(x)
        x = batch_norm_1d(x, self.gamma, self.beta, is_train, self.moving_mean, self.moving_var)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [0]:
net = multi_network()

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), 1e-1)

In [47]:
from Myutils import train
train(net, train_data, test_data, 10, optimizer, criterion)

  im = Variable(im.cuda(), volatile=True)
  label = Variable(label.cuda(), volatile=True)


Epoch 0. Train Loss: 0.306892, Train Acc: 0.912963, Valid Loss: 0.171380, Valid Acc: 0.951444, Time 00:00:06
Epoch 1. Train Loss: 0.167229, Train Acc: 0.951592, Valid Loss: 0.136442, Valid Acc: 0.960047, Time 00:00:07
Epoch 2. Train Loss: 0.131591, Train Acc: 0.961754, Valid Loss: 0.120915, Valid Acc: 0.965190, Time 00:00:07
Epoch 3. Train Loss: 0.108258, Train Acc: 0.968850, Valid Loss: 0.109535, Valid Acc: 0.966574, Time 00:00:07
Epoch 4. Train Loss: 0.093886, Train Acc: 0.972648, Valid Loss: 0.106067, Valid Acc: 0.968058, Time 00:00:07
Epoch 5. Train Loss: 0.082421, Train Acc: 0.975430, Valid Loss: 0.098624, Valid Acc: 0.970036, Time 00:00:07
Epoch 6. Train Loss: 0.074356, Train Acc: 0.977545, Valid Loss: 0.097599, Valid Acc: 0.971717, Time 00:00:07
Epoch 7. Train Loss: 0.067164, Train Acc: 0.979744, Valid Loss: 0.094806, Valid Acc: 0.972508, Time 00:00:07
Epoch 8. Train Loss: 0.061420, Train Acc: 0.981343, Valid Loss: 0.090122, Valid Acc: 0.972903, Time 00:00:07
Epoch 9. Train Loss

In [48]:
print(net.moving_mean[:10])

tensor([ 0.2481, -0.3017,  0.3342,  1.5678,  0.3352, -0.9297,  1.9812, -0.4364,
        -1.2494,  1.0724], device='cuda:0', grad_fn=<SliceBackward>)


In [50]:
no_bn_net = nn.Sequential(
    nn.Linear(784, 100),
    nn.ReLU(True),
    nn.Linear(100, 10)
)

optimizer = torch.optim.SGD(no_bn_net.parameters(), 1e-1)
train(no_bn_net, train_data, test_data, 10, optimizer, criterion)

  im = Variable(im.cuda(), volatile=True)
  label = Variable(label.cuda(), volatile=True)


Epoch 0. Train Loss: 0.401558, Train Acc: 0.876149, Valid Loss: 0.286918, Valid Acc: 0.908426, Time 00:00:05
Epoch 1. Train Loss: 0.181660, Train Acc: 0.945446, Valid Loss: 0.151143, Valid Acc: 0.954213, Time 00:00:06
Epoch 2. Train Loss: 0.133588, Train Acc: 0.959438, Valid Loss: 0.178269, Valid Acc: 0.943236, Time 00:00:06
Epoch 3. Train Loss: 0.109306, Train Acc: 0.966751, Valid Loss: 0.147290, Valid Acc: 0.952235, Time 00:00:06
Epoch 4. Train Loss: 0.092547, Train Acc: 0.971532, Valid Loss: 0.167697, Valid Acc: 0.951938, Time 00:00:06
Epoch 5. Train Loss: 0.081924, Train Acc: 0.974997, Valid Loss: 0.093806, Valid Acc: 0.970926, Time 00:00:06
Epoch 6. Train Loss: 0.073301, Train Acc: 0.977595, Valid Loss: 0.084829, Valid Acc: 0.973596, Time 00:00:06
Epoch 7. Train Loss: 0.066118, Train Acc: 0.978811, Valid Loss: 0.099598, Valid Acc: 0.969244, Time 00:00:06
Epoch 8. Train Loss: 0.059102, Train Acc: 0.980993, Valid Loss: 0.132098, Valid Acc: 0.958663, Time 00:00:06
Epoch 9. Train Loss

In [0]:
def data_tf(x):
    x = np.array(x ,dtype='float32') / 255
    x = (x - 0.5) / 0.5
    x = torch.from_numpy(x)
    x = x.unsqueeze(0)
    return x

train_set = mnist.MNIST('./data', train=True, transform=data_tf)
test_set = mnist.MNIST('./data', train=False, transform=data_tf)
train_data = DataLoader(train_set, batch_size=64, shuffle=True)
test_data = DataLoader(test_set, batch_size=128, shuffle=False)

In [54]:
class conv_bn_net(nn.Module):
    def __init__(self):
        super(conv_bn_net,self).__init__()
        self.stage1 = nn.Sequential(
            nn.Conv2d(1, 6, 3, padding=1),
            nn.BatchNorm2d(6),
            nn.ReLU(True),
            nn.MaxPool2d(2,2),
            nn.Conv2d(6,16,5),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.MaxPool2d(2,2)
        )
        self.classfy = nn.Linear(400, 10)
    
    def forward(self,x):
        x = self.stage1(x)
        x = x.view(x.shape[0], -1)
        x = self.classfy(x)
        return x
    
net = conv_bn_net()
optimizer = torch.optim.SGD(net.parameters(), 1e-1)

train(net, train_data, test_data, 10, optimizer, criterion)

  im = Variable(im.cuda(), volatile=True)
  label = Variable(label.cuda(), volatile=True)


Epoch 0. Train Loss: 0.164731, Train Acc: 0.952192, Valid Loss: 0.090327, Valid Acc: 0.970431, Time 00:00:09
Epoch 1. Train Loss: 0.065708, Train Acc: 0.979861, Valid Loss: 0.049022, Valid Acc: 0.983287, Time 00:00:10
Epoch 2. Train Loss: 0.051395, Train Acc: 0.984142, Valid Loss: 0.050148, Valid Acc: 0.983782, Time 00:00:10
Epoch 3. Train Loss: 0.042643, Train Acc: 0.986890, Valid Loss: 0.047928, Valid Acc: 0.984771, Time 00:00:10
Epoch 4. Train Loss: 0.036959, Train Acc: 0.988623, Valid Loss: 0.053632, Valid Acc: 0.983386, Time 00:00:10
Epoch 5. Train Loss: 0.033479, Train Acc: 0.989772, Valid Loss: 0.040186, Valid Acc: 0.986551, Time 00:00:10
Epoch 6. Train Loss: 0.029992, Train Acc: 0.990422, Valid Loss: 0.042218, Valid Acc: 0.986551, Time 00:00:10
Epoch 7. Train Loss: 0.027558, Train Acc: 0.991121, Valid Loss: 0.034755, Valid Acc: 0.988825, Time 00:00:10
Epoch 8. Train Loss: 0.025135, Train Acc: 0.991921, Valid Loss: 0.037770, Valid Acc: 0.988232, Time 00:00:10
Epoch 9. Train Loss

In [56]:
class conv_no_net(nn.Module):
    def __init__(self):
        super(conv_no_net,self).__init__()
        self.stage1 = nn.Sequential(
            nn.Conv2d(1, 6, 3, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(2,2),
            nn.Conv2d(6,16,5),
            nn.ReLU(True),
            nn.MaxPool2d(2,2)
        )
        self.classfy = nn.Linear(400, 10)
    
    def forward(self,x):
        x = self.stage1(x)
        x = x.view(x.shape[0], -1)
        x = self.classfy(x)
        return x
    
net = conv_no_net()
optimizer = torch.optim.SGD(net.parameters(), 1e-1)

train(net, train_data, test_data, 10, optimizer, criterion)

  im = Variable(im.cuda(), volatile=True)
  label = Variable(label.cuda(), volatile=True)


Epoch 0. Train Loss: 0.211028, Train Acc: 0.934818, Valid Loss: 0.085779, Valid Acc: 0.972013, Time 00:00:08
Epoch 1. Train Loss: 0.069709, Train Acc: 0.978412, Valid Loss: 0.043145, Valid Acc: 0.985858, Time 00:00:09
Epoch 2. Train Loss: 0.053283, Train Acc: 0.983725, Valid Loss: 0.050343, Valid Acc: 0.982100, Time 00:00:09
Epoch 3. Train Loss: 0.044643, Train Acc: 0.986174, Valid Loss: 0.044429, Valid Acc: 0.985661, Time 00:00:09
Epoch 4. Train Loss: 0.038860, Train Acc: 0.987757, Valid Loss: 0.040852, Valid Acc: 0.986155, Time 00:00:09
Epoch 5. Train Loss: 0.033940, Train Acc: 0.989289, Valid Loss: 0.054258, Valid Acc: 0.983485, Time 00:00:09
Epoch 6. Train Loss: 0.031334, Train Acc: 0.989739, Valid Loss: 0.033199, Valid Acc: 0.989320, Time 00:00:09
Epoch 7. Train Loss: 0.028923, Train Acc: 0.991305, Valid Loss: 0.034567, Valid Acc: 0.988924, Time 00:00:09
Epoch 8. Train Loss: 0.026832, Train Acc: 0.991071, Valid Loss: 0.036810, Valid Acc: 0.988133, Time 00:00:09
Epoch 9. Train Loss