# Customized Convolutional Neural Network Hyperparameters Sampling on MNIST Dataset
- Show network architectures (optimization + Hyperparameter tunning)
- Basically Hyperparameters tunning
- Applied Dataset Normalization Techniques
- Analysis with integrated Tensorboard and Pandas

In [1]:
!pip install tensorflow



In [2]:
!pip install tensorboard



In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import time
import pandas as pd
from IPython.display import display
from IPython.display import clear_output
import simplejson as json

torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

from collections import OrderedDict
from collections import namedtuple
from itertools import product

### Non Batch Normalization Network

In [4]:
# Class to create customized network (this is determined by the user)
class CustomedNetwork(nn.Module):
    # CONSTRUCTOR
    def __init__(self):
        """Initialize 5 distinct layers of the network for building forward step
        """
        super().__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5, stride=1)
        
        # Fully connection layers
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120, bias=True)
        self.fc2 = nn.Linear(in_features = 120, out_features=60, bias=True)
        self.out = nn.Linear(in_features = 60, out_features=10, bias=True) 
    
    # PUBLIC METHOD
    def forward(self, x):
        """Forward propagation of the Customed Neural Network
        
        Parameters
        ----------
        x:
            input batch of images
        """
        # Input layers
        x = x
        
        # Convolution layer 1
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        
        # Convolution layer 2 
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        
        # Process input from convolution input to 1 input for fully connected layer
        x = x.reshape(-1, 12*4*4)
        
        # Linear layer 1
        x = self.fc1(x)
        x = F.relu(x)
        
        # Linear layer 2
        x = self.fc2(x)
        x = F.relu(x)
        
        # Output layer
        x = self.out(x)
        
        return x

customed_net = CustomedNetwork()
print(customed_net)

CustomedNetwork(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)


### Batch Normalzation Network

In [5]:
batch_norm_network = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.BatchNorm2d(6), # batch norm
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=12*4*4, out_features=120),
    nn.ReLU(),
    nn.BatchNorm1d(120), # batch norm 1 d since we already flatten out our images
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10)
)

print(batch_norm_network)

Sequential(
  (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (5): ReLU()
  (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (7): Flatten(start_dim=1, end_dim=-1)
  (8): Linear(in_features=192, out_features=120, bias=True)
  (9): ReLU()
  (10): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): Linear(in_features=120, out_features=60, bias=True)
  (12): ReLU()
  (13): Linear(in_features=60, out_features=10, bias=True)
)


In [6]:
# Class to create an object to run the surveying parameters lists combinations
class RunBuilder():
    @staticmethod
    def get_runs(params):
        """Get the lists of parameters' values 
        
        Parameters
        ----------
        params:
            list of parameters contained of different related parameters
        """
        Run = namedtuple("Run", params.keys())
        
        runs_list = []
        
        for value in product(*params.values()):
            runs_list.append(Run(*value))
            
        return runs_list

In [7]:
# Class Run Manage that run the surveys of combinations of values of the RunBuilder() object
class RunManager():
    # CONSTRUCTORS
    def __init__(self):
        """Initialize parameters
        """
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0;
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None
        self.tb = None # Tensorboard
    
    # PUBLIC METHODS    
    def begin_run(self, run, network, loaders):
        """Start running the values combinations surveys
        
        Parameters
        ----------
        run:
            run list
        network:
            neural network
        loader:
            DataLoader - basically preprocessed data objects
        """
        self.run_start_time = time.time() # used for keep track of run time
        
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f"-{run}")
        
        images, labels = next(iter(self.loader)) # get the first batch of images and labels
        grid = torchvision.utils.make_grid(images)
        
        self.tb.add_image("image", grid)
        
        # Try CUDA
        self.tb.add_graph(self.network, images.to(getattr(run, "device", "cpu")))   
        
    def end_run(self):
        """End runningthe values combinations surveys
        """
        self.tb.close() # close tensorboard
        self.epoch_count = 0 # reinitialized the epoch
    
    def begin_epoch(self):
        """Begin the epoch, initialize related variables
        """
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        
    def end_epoch(self):
        """End the epoch, calculated initialized variables above 
        """
        # Calculate run time
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        # Calculate the loss and accuracy of the trained dataset
        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        
        # Calculate the average loss and accuracy
        self.tb.add_scalar("Loss", loss, self.epoch_count)
        self.tb.add_scalar("Accuracy", accuracy, self.epoch_count)
        
        # Draw historgram 
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f"{name}.grad", param.grad, self.epoch_count)
            
        # Build pandas to data output of tensorboard
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration
        
        # Add data in the DataFrames
        for k,v in self.run_params._asdict().items(): 
            results[k] = v # allow us to see what results match with what param
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient="columns")
        
        # Update Dataframe in .ipynb in real time
        clear_output(wait=True)
        display(df)
    
    def track_loss(self, loss):
        """Track the loss
        
        Parameters
        ----------
        loss:
            loss of the training process of a batch
        """
        self.epoch_loss += loss.item() * self.loader.batch_size
        
    def track_num_correct(self, preds, labels):
        """Track total number of correct of a batch
        
        Parameters
        ----------
        preds:
            list of predictions in training process
        labels:
            list of labels given in the dataset
        """
        self.epoch_num_correct += self._get_num_correct(preds, labels)
        
    def save(self, file_name):
        """Save the Dataframe to .csv file
        """
        pd.DataFrame.from_dict(
            self.run_data,
            orient="columns"
        ).to_csv(f"{file_name}.csv") # save in csv
        
        # to create in tensorboard 
        with open(f"{file_name}.json", "w", encoding="utf-8") as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent = 4)
        
    # PRIVATE METHODS
    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        """Get the total number that the prediction is correct with the labels

        Parameters
        ----------
        preds:
            list of predictions
        labels:
            list of labels

        Return
        ----------
        total number that prediction and label are equal when comparing 2 lists
        """
        return preds.argmax(dim=1).eq(labels).sum().item()

## Get Dataset - MNIST
- Normalization: Standardization is a specific type of normalization technique and sometime is referred to as z-score normalization or the standard score.
    - z = (x-mean)/std

### Download Original Dataset

In [8]:
train_set = torchvision.datasets.MNIST(
    root="./data/MNIST",
    train=True,
    download=True,
    transform=transforms.Compose([ # convert image to 
        transforms.ToTensor()
    ]))
print(train_set.data.size())

test_set = torchvision.datasets.MNIST(
    root="./data/MNIST", 
    train = False, 
    transform=transforms.Compose([ # convert image to 
        transforms.ToTensor()
    ]))
print(test_set.data.size())

torch.Size([60000, 28, 28])
torch.Size([10000, 28, 28])


### Calculate the mean and standard of deviation for normalization

In [9]:
loader = DataLoader(train_set, batch_size=1000, num_workers=1) # create dataloader
num_of_pixels = len(train_set) * 28 * 28 # number of total pixel in the image, 28x28 = height and width of the image

# Mean
total_sum = 0 
for batch in loader: total_sum += batch[0].sum() # total sum of all pixel in 1 image
mean = total_sum / num_of_pixels

# Standard of Dev
sum_of_squared_error = 0
for batch in loader: sum_of_squared_error += ((batch[0]-mean).pow(2)).sum()
std = torch.sqrt(sum_of_squared_error / num_of_pixels)

print(mean)
print(std)

tensor(0.1307)
tensor(0.3081)


### Create a new normalized MNIST processed dataset

In [10]:
train_set_normal = torchvision.datasets.MNIST(
    root="./data/MNIST",
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
)
print(train_set_normal.data.size())

test_set_normal = torchvision.datasets.MNIST(
    root="./data/MNIST", 
    train = False, 
    transform=transforms.Compose([ # convert image to 
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]))
print(test_set_normal.data.size())

torch.Size([60000, 28, 28])
torch.Size([10000, 28, 28])


### Create a DataLoader and analyse the batch

In [11]:
loader = DataLoader(train_set_normal, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
data[0].mean(), data[0].std()

(tensor(6.1284e-08), tensor(1.))

### Create trainsets list

In [12]:
trainsets = {
    'not_normal': train_set,
    'normal': train_set_normal
}

### Create networks list

In [13]:
customed_network = CustomedNetwork()# allow to try CUDA

networks  = {
    'no_batch_norm_net': customed_network,
    'batch_norm_net': batch_norm_network
}

### Training & Testing Process 

In [24]:
params = OrderedDict(
    lr = [0.01, 0.02, 0.03],
    batch_size = [1000],
    num_workers = [0],
    shuffle = [True],
    device = ['cpu'],
    trainset = ['not_normal', 'normal'],
    network = list(networks.keys()),
    num_epochs = [2],
    test_accuracy = [0]
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    params['test_accuracy'] = 0 # restart writing test_accuracy
    
    device = torch.device(run.device) # allow to try CUDA
    network = networks[run.network].to(device) # allow to try CUDA
    
    network.train() # mark network as train
    
    train_loader = torch.utils.data.DataLoader(trainsets[run.trainset], batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers) # num worker to speed up process for dataloader
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, train_loader)
    for epoch in range(run.num_epochs):
        m.begin_epoch()
        for batch in train_loader:
            images = batch[0].to(device) # allow to try CUDA
            labels = batch[1].to(device) # allow to try CUDA
            preds = network(images) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradient
            loss.backward() # back prop for calculating gradient
            optimizer.step() # update weights
            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        
        m.end_epoch()
    m.end_run()
    
    if(run.trainset == 'not_normal'):
        # Get the testing dataset
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers)
        
        network.eval()
        with torch.no_grad():
            for batch in test_loader:
                images = batch[0].to(device) # allow to try CUDA
                labels = batch[1].to(device) # allow to try CUDA
                preds = network(images) # pass batch
                accuracy = (preds.argmax(dim=1).eq(labels).sum().item()) / float(run.batch_size)
        print('Test Accuracy of the not-normalized-dataset model on the 10000 test images: %.2f' % accuracy)
#         params['test_accuracy'] = accuracy
        
    elif(run.trainset == "normal"):
        # Get the testing dataset
        test_loader_normal = torch.utils.data.DataLoader(test_set_normal, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers)
        
        network.eval()
        with torch.no_grad():
            for batch in test_loader:
                images = batch[0].to(device) # allow to try CUDA
                labels = batch[1].to(device) # allow to try CUDA
                preds = network(images) # pass batch     
                accuracy = (preds.argmax(dim=1).eq(labels).sum().item()) / float(run.batch_size)
        print('Test Accuracy of the normalized-dataset model on the 10000 test images: %.2f' % accuracy)
#         params['test_accuracy'] = accuracy # Just write this to another data frame then append it then save
        
m.save("results")

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset,network,num_epochs,test_accuracy
0,1,1,1.481577,0.992417,28.928096,147.617012,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net,2,0
1,1,2,1.081974,0.993983,27.011807,174.878378,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net,2,0
2,2,1,1.263599,0.9932,29.295499,99.482012,0.01,1000,0,True,cpu,not_normal,batch_norm_net,2,0
3,2,2,0.922963,0.995017,28.016947,127.987806,0.01,1000,0,True,cpu,not_normal,batch_norm_net,2,0
4,3,1,2.454989,0.991833,33.790269,98.908297,0.01,1000,0,True,cpu,normal,no_batch_norm_net,2,0
5,3,2,0.744353,0.9963,31.787043,131.156647,0.01,1000,0,True,cpu,normal,no_batch_norm_net,2,0
6,4,1,1.077831,0.994183,32.442022,96.841218,0.01,1000,0,True,cpu,normal,batch_norm_net,2,0
7,4,2,0.763967,0.995767,33.016057,130.183293,0.01,1000,0,True,cpu,normal,batch_norm_net,2,0


Test Accuracy of the normalized-dataset model on the 10000 test images: 0.60


In [25]:
# Sort DataFrame by Accuracy
pd.DataFrame.from_dict(m.run_data, orient="columns").sort_values("accuracy", ascending=False)

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset,network,num_epochs,test_accuracy
5,3,2,0.744353,0.9963,31.787043,131.156647,0.01,1000,0,True,cpu,normal,no_batch_norm_net,2,0
7,4,2,0.763967,0.995767,33.016057,130.183293,0.01,1000,0,True,cpu,normal,batch_norm_net,2,0
3,2,2,0.922963,0.995017,28.016947,127.987806,0.01,1000,0,True,cpu,not_normal,batch_norm_net,2,0
6,4,1,1.077831,0.994183,32.442022,96.841218,0.01,1000,0,True,cpu,normal,batch_norm_net,2,0
1,1,2,1.081974,0.993983,27.011807,174.878378,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net,2,0
2,2,1,1.263599,0.9932,29.295499,99.482012,0.01,1000,0,True,cpu,not_normal,batch_norm_net,2,0
0,1,1,1.481577,0.992417,28.928096,147.617012,0.01,1000,0,True,cpu,not_normal,no_batch_norm_net,2,0
4,3,1,2.454989,0.991833,33.790269,98.908297,0.01,1000,0,True,cpu,normal,no_batch_norm_net,2,0


### Evaluate on Testing dataset