# Batch Norm

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import math
import time
import pandas as pd
import simplejson as json
from IPython.display import display
from IPython.display import clear_output

from collections import namedtuple
from itertools import product
from collections import OrderedDict
torch.set_printoptions(linewidth=150)

In [2]:
class RunBuilder():
    @staticmethod
    def get_runs(params):
        # Build runs for us, based on the params we passed in
        Run = namedtuple("Run", params.keys())
        
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))
            
        return runs

In [3]:
# Run Manager Class for separating tensorboard code
class RunManager():
    def __init__(self):
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None
        self.tb = None
    
    def begin_run(self, run, network, loader):
        self.run_start_time = time.time()
        
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f"-{run}")
        
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        
        self.tb.add_image("images", grid)
        
        # allow to try CUDA => move parameters from network to CUDA
        self.tb.add_graph(self.network, images.to(getattr(run, "device", "cpu"))) # allow device, see if we have a device attribute and if it is cpu or ntos
    
    def end_run(self):
        self.tb.close()
        self.epoch_count = 0
    
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        
    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        
        self.tb.add_scalar("Loss", loss, self.epoch_count)
        self.tb.add_scalar("Accuracy", accuracy, self.epoch_count)
        
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f"{name}.grad", param.grad, self.epoch_count)
          
        # built pandas to analyze data outside of TB
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration
        for k,v in self.run_params._asdict().items(): results[k] = v # allow us to see what results match with what param
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient="columns")
        
        # update in ipynb in real time
        clear_output(wait=True)
        display(df)
    
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size
    
    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)
        
    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self, fileName):
        pd.DataFrame.from_dict(
            self.run_data,
            orient="columns"
        ).to_csv(f"{fileName}.csv") # save in csv
        
        # to create in tensorboard 
        with open(f"{fileName}.json", "w", encoding="utf-8") as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent = 4)

In [4]:
# way 1 - Sequential - no need forward, contain ReLU, MaxPool
torch.manual_seed(50)
network1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=12*4*4, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10)
)

In [5]:
torch.manual_seed(50)
network2 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.BatchNorm2d(6), # batch norm
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=12*4*4, out_features=120),
    nn.ReLU(),
    nn.BatchNorm1d(120), # batch norm 1 d since we already flatten out our images
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10)
)

In [6]:
# torch.manual_seed(50)
# layers = OrderedDict([
#     ('conv1', nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)),
#     ('relu1', nn.ReLU()),
#     ('maxpool1', nn.MaxPool2d(kernel_size=2, stride=2)),
    
#     ('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)),
#     ('relu2', nn.ReLU()),
#     ('maxpool2', nn.MaxPool2d(kernel_size=2, stride=2)),
    
#     ('flatten', nn.Flatten(start_dim=1)),
#     ('fc1', nn.Linear(in_features=12*4*4, out_features=120)),
#     ('relu3', nn.ReLU()),
    
#     ('fc2', nn.Linear(in_features=120, out_features=60)),
#     ('relu4', nn.ReLU()),
#     ('out', nn.Linear(in_features=60, out_features=10))
# ])

# sequential2 = nn.Sequential(layers)

In [7]:
# torch.manual_seed(50)
# sequential3 = nn.Sequential()
# sequential3.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5))
# sequential3.add_module('relu1', nn.ReLU())
# sequential3.add_module('maxpool1', nn.MaxPool2d(kernel_size=2, stride=2))

# sequential3.add_module('conv2', nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5))
# sequential3.add_module('relu2', nn.ReLU())
# sequential3.add_module('maxpool2', nn.MaxPool2d(kernel_size=2, stride=2))

# sequential3.add_module('flatten', nn.Flatten(start_dim=1))
# sequential3.add_module('fc1', nn.Linear(in_features=12*4*4, out_features=120))
# sequential3.add_module('relu3', nn.ReLU())

# sequential3.add_module('fc2', nn.Linear(in_features=120, out_features=60))
# sequential3.add_module('relu4', nn.ReLU())
# sequential3.add_module('out', nn.Linear(in_features=60, out_features=10))

In [8]:
train_set = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor() 
    ])
)

In [9]:
loader = DataLoader(train_set, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
mean = data[0].mean()
std = data[0].std()
mean, std

(tensor(0.2860), tensor(0.3530))

In [10]:
train_set_normal = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
)

In [11]:
trainsets = {
    'not_normal': train_set,
    'normal': train_set_normal
}

In [12]:
networks  = {
    'no_batch_norm_net': network1,
    'batch_norm_net': network2
}

In [13]:
params = OrderedDict(
    lr = [0.01],
    batch_size = [1000],
    num_workers = [0],
    shuffle = [True],
    device = ['cpu'],
    trainset = ['not_normal', 'normal'],
    network = list(networks.keys()) # second way to query beside the "trainset"
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    device = torch.device(run.device) # allow to try CUDA
    network = networks[run.network].to(device) # allow to try CUDA
    loader = torch.utils.data.DataLoader(trainsets[run.trainset], batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers) # num worker to speed up process for dataloader
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images = batch[0].to(device) # allow to try CUDA
            labels = batch[1].to(device) # allow to try CUDA
            preds = network(images) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradient
            loss.backward() # back prop for calculating gradient
            optimizer.step() # update weights
            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        
        m.end_epoch()
    m.end_run()
m.save("results")

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset,network
0,1,1,1.039251,0.599417,26.770857,32.274853,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net
1,1,2,0.549068,0.784467,26.683042,59.389897,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net
2,1,3,0.466096,0.827,27.326407,86.969292,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net
3,1,4,0.415494,0.847267,23.68005,110.854354,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net
4,1,5,0.381524,0.86015,24.746218,135.974568,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net
5,2,1,0.567356,0.7947,30.690202,32.367694,0.01,1000,0,True,cpu,not_normal,batch_norm_net
6,2,2,0.340967,0.872083,25.816479,58.559174,0.01,1000,0,True,cpu,not_normal,batch_norm_net
7,2,3,0.304555,0.88475,20.761148,79.63432,0.01,1000,0,True,cpu,not_normal,batch_norm_net
8,2,4,0.283832,0.893817,21.909546,101.811868,0.01,1000,0,True,cpu,not_normal,batch_norm_net
9,2,5,0.269676,0.8976,25.283419,127.335291,0.01,1000,0,True,cpu,not_normal,batch_norm_net


In [14]:
# sort pandas
pd.DataFrame.from_dict(m.run_data, orient="columns").sort_values("accuracy", ascending=False)

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset,network
19,4,5,0.225961,0.914917,29.925764,155.096028,0.01,1000,0,True,cpu,normal,batch_norm_net
18,4,4,0.232873,0.91245,30.411423,124.928265,0.01,1000,0,True,cpu,normal,batch_norm_net
17,4,3,0.248761,0.90675,31.334944,94.26484,0.01,1000,0,True,cpu,normal,batch_norm_net
16,4,2,0.261743,0.901633,29.691013,62.514211,0.01,1000,0,True,cpu,normal,batch_norm_net
9,2,5,0.269676,0.8976,25.283419,127.335291,0.01,1000,0,True,cpu,not_normal,batch_norm_net
8,2,4,0.283832,0.893817,21.909546,101.811868,0.01,1000,0,True,cpu,not_normal,batch_norm_net
7,2,3,0.304555,0.88475,20.761148,79.63432,0.01,1000,0,True,cpu,not_normal,batch_norm_net
15,4,1,0.308846,0.884267,30.437222,32.385219,0.01,1000,0,True,cpu,normal,batch_norm_net
14,3,5,0.32607,0.878233,29.557731,145.6999,0.01,1000,0,True,cpu,normal,no_batch_norm_net
13,3,4,0.338816,0.875767,27.33922,115.950202,0.01,1000,0,True,cpu,normal,no_batch_norm_net
