# Adding Normalization to Conv Net Layers and Experimenting with Networks

- Using batch normalization for faster congergence

In [15]:
import my_model

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.tensorboard import SummaryWriter

from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import time
from collections import OrderedDict, namedtuple
from itertools import product

import pandas as pd
import json

from IPython.display import clear_output


#### Without BatchNorm

In [6]:
torch.manual_seed(50)
network1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
    , nn.ReLU()
    , nn.MaxPool2d(kernel_size=2, stride=2)
    , nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
    , nn.ReLU()
    , nn.MaxPool2d(kernel_size=2, stride=2)
    , nn.Flatten(start_dim=1)  
    , nn.Linear(in_features=12*4*4, out_features=120)
    , nn.ReLU()
    , nn.Linear(in_features=120, out_features=60)
    , nn.ReLU()
    , nn.Linear(in_features=60, out_features=10)
)

#### With BatchNorm

In [7]:
torch.manual_seed(50)
network2 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
    , nn.ReLU()
    , nn.MaxPool2d(kernel_size=2, stride=2)
    , nn.BatchNorm2d(6)
    , nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
    , nn.ReLU()
    , nn.MaxPool2d(kernel_size=2, stride=2)
    , nn.Flatten(start_dim=1)  
    , nn.Linear(in_features=12*4*4, out_features=120)
    , nn.ReLU()
    , nn.BatchNorm1d(120)
    , nn.Linear(in_features=120, out_features=60)
    , nn.ReLU()
    , nn.Linear(in_features=60, out_features=10)
)

#### Test

In [4]:
train_set = torchvision.datasets.FashionMNIST(
    root='./Documents/data'
     ,train=True
    ,download=True # downloads it locally (checks existence beforehand)
    ,transform=transforms.Compose([
        transforms.ToTensor()
    ])
)

loader = DataLoader(train_set, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
mean = data[0].mean(), 
std = data[0].std()

train_set_normal = torchvision.datasets.FashionMNIST(
    root='./Documents/data'
     ,train=True
    ,download=True # downloads it locally (checks existence beforehand)
    ,transform=transforms.Compose([
        transforms.ToTensor(), # butilt in tensor transformer
        # TODO: Normalize
        transforms.Normalize(mean, std)
    ])
)

In [11]:
class RunBuilder():
    @staticmethod
    def get_runs(params):
        
        Run = namedtuple('Run', params.keys())
        
        runs = []
        
        for v in product(*params.values()):
            runs.append(Run(*v))
        
        return runs

class RunManager():
    def __init__(self):
        
        # we will need to extract a class out of these epoch values
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None
        self.tb = None
        
    def begin_run(self, run, network, loader):
        
        self.run_start_time = time.time()
        
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')
        
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        
    def end_run(self):
        self.tb.close()
        self.epoch_count = 0
        
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        
    def end_epoch(self):
        
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        
        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)
        
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)
            
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration
        
        for k,v in self.run_params._asdict().items():
            results[k] = v
        
        self.run_data.append(results)
        
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        
        clear_output(wait=True)
        display(df)
        
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size
        
    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)
        
    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        # underscore indicates that this method should not really 
        # be used outside this class (not sure about this)
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self,fileName):
        
        pd.DataFrame.from_dict(
            self.run_data,
            orient='columns',
        ).to_csv(f'{fileName}.csv')
        
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [13]:
trainsets = {
    'not_normal': train_set,
    'normal': train_set_normal
}

networks = {
    'network1': network1,
    'network2': network2
}

params = OrderedDict(
    lr = [.01],
    batch_size = [1000],
    num_workers = [1],
    device = ['cpu'],
    trainset = ['not_normal', 'normal'],
    network = list(networks.keys())
)

In [16]:

m = RunManager()

for run in RunBuilder.get_runs(params):
    
    network = networks[run.network]
    
    loader = DataLoader(trainsets[run.trainset], 
                        batch_size=run.batch_size, 
                        num_workers=run.num_workers)
    
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            
            images = batch[0]
            labels = batch[1]
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
            
        m.end_epoch()
    m.end_run()
m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device,trainset,network
0,1,1,1.009978,0.60725,10.09456,13.081701,0.01,1000,1,cpu,not_normal,network1
1,1,2,0.54463,0.78925,10.575189,23.735007,0.01,1000,1,cpu,not_normal,network1
2,1,3,0.462403,0.830467,10.291737,34.102988,0.01,1000,1,cpu,not_normal,network1
3,1,4,0.409779,0.84965,10.114872,44.287799,0.01,1000,1,cpu,not_normal,network1
4,1,5,0.371018,0.864167,9.886716,54.242584,0.01,1000,1,cpu,not_normal,network1
5,2,1,0.572865,0.792333,11.304698,11.497324,0.01,1000,1,cpu,not_normal,network2
6,2,2,0.340645,0.873917,11.214774,22.798577,0.01,1000,1,cpu,not_normal,network2
7,2,3,0.306449,0.884767,11.752799,34.635646,0.01,1000,1,cpu,not_normal,network2
8,2,4,0.285718,0.894,11.675694,46.412911,0.01,1000,1,cpu,not_normal,network2
9,2,5,0.268853,0.89995,12.565993,59.065538,0.01,1000,1,cpu,not_normal,network2


In [17]:
pd.DataFrame.from_dict(m.run_data).sort_values('accuracy', ascending=False)

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device,trainset,network
19,4,5,0.226433,0.914833,11.592896,61.078544,0.01,1000,1,cpu,normal,network2
18,4,4,0.235218,0.911217,12.181427,49.399874,0.01,1000,1,cpu,normal,network2
17,4,3,0.246076,0.90805,12.60149,37.128583,0.01,1000,1,cpu,normal,network2
16,4,2,0.262082,0.90215,11.645466,24.420243,0.01,1000,1,cpu,normal,network2
9,2,5,0.268853,0.89995,12.565993,59.065538,0.01,1000,1,cpu,not_normal,network2
8,2,4,0.285718,0.894,11.675694,46.412911,0.01,1000,1,cpu,not_normal,network2
15,4,1,0.309273,0.884783,12.379713,12.678247,0.01,1000,1,cpu,normal,network2
7,2,3,0.306449,0.884767,11.752799,34.635646,0.01,1000,1,cpu,not_normal,network2
14,3,5,0.317883,0.882233,10.079409,53.281494,0.01,1000,1,cpu,normal,network1
13,3,4,0.328869,0.877317,10.225703,43.126849,0.01,1000,1,cpu,normal,network1


## Conclusion

- BatchNorm smokes the setup without batch norm
    - Much faster convergence

# Resources

- https://deeplizard.com/learn/video/bCQ2cNhUWQ8
- https://arxiv.org/pdf/1502.03167.pdf