# CNN Hyperparameters Testing
- Hyperparameter Testing
- Dataset Normalization

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import time 
import pandas as pd
from IPython.display import display
from IPython.display import clear_output
import simplejson as json

torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

from collections import OrderedDict
from collections import namedtuple
from itertools import product

In [11]:
def get_num_correct(preds, labels):
    return preds.argmax(dim=1).eq(labels).sum().item()

In [12]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        # Convolutional layers
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1) # in_channel = 1 = grayscale, hyperparam, hyperparam
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5, stride=1) # we in crease the output channel when have extra conv layers
                
        # Fully connected layers
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120, bias=True) # we also shrink the number of features to number of class that we have
        self.fc2 = nn.Linear(in_features = 120, out_features=60, bias=True)
        self.out = nn.Linear(in_features = 60, out_features=10, bias=True) 
        
    def forward(self, t):
        # input layer
        t = t
        
        # convolution 1, not 
        t = self.conv1(t)
        t = F.relu(t) # operation do not use weight, unlike layers
        t = F.max_pool2d(t, kernel_size=2, stride=2) # operation do not use weight, unlike layers
        
        # convolution 2: => relu => maxpool
        t = self.conv2(t)
        # WHY do we need these 2 layers?
        t = F.relu(t) 
        t = F.max_pool2d(t, kernel_size=2, stride=2) # how to determine these values?
        
        # Transition from Conv to Linear will require flatten
        t = t.reshape(-1, 12*4*4) # 4x4 = shape of reduce image (originally 28x28)
        
        # linear 1:
        t = self.fc1(t)
        t = F.relu(t)
        
        # linear 2:
        t = self.fc2(t)
        t = F.relu(t)
        
        # output:
        t = self.out(t)
        
        return t

In [13]:
class RunBuilder():
    @staticmethod
    def get_runs(params):
        # Build runs for us, based on the params we passed in
        Run = namedtuple("Run", params.keys())
        
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))
            
        return runs

In [14]:
# Run Manager Class for separating tensorboard code
class RunManager():
    def __init__(self):
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None
        self.tb = None
    
    def begin_run(self, run, network, loader):
        self.run_start_time = time.time()
        
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f"-{run}")
        
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        
        self.tb.add_image("images", grid)
        
        # allow to try CUDA => move parameters from network to CUDA
        self.tb.add_graph(self.network, images.to(getattr(run, "device", "cpu"))) # allow device, see if we have a device attribute and if it is cpu or ntos
    
    def end_run(self):
        self.tb.close()
        self.epoch_count = 0
    
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        
    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        
        self.tb.add_scalar("Loss", loss, self.epoch_count)
        self.tb.add_scalar("Accuracy", accuracy, self.epoch_count)
        
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f"{name}.grad", param.grad, self.epoch_count)
          
        # built pandas to analyze data outside of TB
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration
        for k,v in self.run_params._asdict().items(): results[k] = v # allow us to see what results match with what param
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient="columns")
        
        # update in ipynb in real time
        clear_output(wait=True)
        display(df)
    
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size
    
    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)
        
    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self, fileName):
        pd.DataFrame.from_dict(
            self.run_data,
            orient="columns"
        ).to_csv(f"{fileName}.csv") # save in csv
        
        # to create in tensorboard 
        with open(f"{fileName}.json", "w", encoding="utf-8") as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent = 4)

In [15]:
train_set = torchvision.datasets.FashionMNIST(
    root="./data/FashionMNIST",
    train=True,
    download=True,
    transform=transforms.Compose([ # convert image to 
        transforms.ToTensor()
    ]))

In [16]:
params = OrderedDict(
    lr = [0.01],
    batch_size = [1000],
    num_workers = [0],
    shuffle = [True],
    device = ['cpu']
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    device = torch.device(run.device) # allow to try CUDA
    network = Network().to(device) # allow to try CUDA
    loader = torch.utils.data.DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers) # num worker to speed up process for dataloader
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images = batch[0].to(device) # allow to try CUDA
            labels = batch[1].to(device) # allow to try CUDA
            preds = network(images) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradient
            loss.backward() # back prop for calculating gradient
            optimizer.step() # update weights
            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        
        m.end_epoch()
    m.end_run()
m.save("results")

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device
0,1,1,0.924833,0.650133,62.053895,69.35288,0.01,1000,0,True,cpu
1,1,2,0.503888,0.808533,59.851633,129.52651,0.01,1000,0,True,cpu
2,1,3,0.416354,0.8471,26.284248,156.128757,0.01,1000,0,True,cpu
3,1,4,0.354021,0.869517,24.94105,181.390814,0.01,1000,0,True,cpu
4,1,5,0.3271,0.880883,25.064579,206.79319,0.01,1000,0,True,cpu


In [16]:
torch.cuda.is_available()

False

In [17]:
# sort pandas
pd.DataFrame.from_dict(m.run_data, orient="columns").sort_values("epoch duration")

NameError: name 'm' is not defined

## Dataset Normalization
- Transform data in the dataset to a new set of dataset
- This can also means features scaling = normalize dataset
- Rescale features in a similar scale = Data normalization

## Standardization
- Standardization is a specific type of normalization technique and sometime is referred to as z-score normalization or the standard score.
- z = (x-mean)/std

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [19]:
train_set = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor() 
        
        # normalization
    ])
)

## Easy way to do normalization
 - Build in method pytorch


In [20]:
loader = DataLoader(train_set, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
data[0].mean(), data[0].std()

(tensor(0.2860), tensor(0.3530))

## Harder way
- If dataset is too large

In [21]:
loader = DataLoader(train_set, batch_size=1000, num_workers=1) # create dataloader
num_of_pixels = len(train_set) * 28 * 28 # number of total pixel in the image, 28x28 = height and width of the image

# Mean
total_sum = 0 
for batch in loader: total_sum += batch[0].sum() # total sum of all pixel in 1 image
mean = total_sum / num_of_pixels

# Standard of Dev
sum_of_squared_error = 0
for batch in loader: sum_of_squared_error += ((batch[0]-mean).pow(2)).sum()
std = torch.sqrt(sum_of_squared_error / num_of_pixels)

mean, std

(tensor(0.2860), tensor(0.3530))

## Using the mean and std values
- We use the same mean and std values fo

In [22]:
train_set_normal = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
)

In [23]:
loader = DataLoader(train_set_normal, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
data[0].mean(), data[0].std()

(tensor(-1.8468e-07), tensor(1.))

In [24]:
# plt.hist(data[0].flatten())
# plt.axvline(data[0].mean())

In [25]:
trainsets = {
    'not_normal': train_set,
    'normal': train_set_normal
}

In [26]:
params = OrderedDict(
    lr = [0.01],
    batch_size = [1000],
    num_workers = [0],
    shuffle = [True],
    device = ['cpu'],
    trainset = ['not_normal', 'normal']
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    device = torch.device(run.device) # allow to try CUDA
    network = Network().to(device) # allow to try CUDA
    loader = torch.utils.data.DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers) # num worker to speed up process for dataloader
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images = batch[0].to(device) # allow to try CUDA
            labels = batch[1].to(device) # allow to try CUDA
            preds = network(images) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradient
            loss.backward() # back prop for calculating gradient
            optimizer.step() # update weights
            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        
        m.end_epoch()
    m.end_run()
m.save("results")

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset
0,1,1,0.978055,0.620267,16.149005,25.343438,0.01,1000,0,True,cpu,not_normal
1,1,2,0.539666,0.7922,16.098004,41.667437,0.01,1000,0,True,cpu,not_normal
2,1,3,0.448208,0.830767,25.96891,67.831344,0.01,1000,0,True,cpu,not_normal
3,1,4,0.389815,0.855467,24.679956,92.906079,0.01,1000,0,True,cpu,not_normal
4,1,5,0.356474,0.866833,24.228186,117.399278,0.01,1000,0,True,cpu,not_normal
5,2,1,0.957169,0.633233,20.135358,21.310358,0.01,1000,0,True,cpu,normal
6,2,2,0.527017,0.79665,18.862527,40.369882,0.01,1000,0,True,cpu,normal
7,2,3,0.443414,0.836483,25.036893,65.614778,0.01,1000,0,True,cpu,normal
8,2,4,0.391875,0.8556,23.576757,89.406533,0.01,1000,0,True,cpu,normal
9,2,5,0.364971,0.867217,21.431822,111.037355,0.01,1000,0,True,cpu,normal


In [28]:
# sort pandas
pd.DataFrame.from_dict(m.run_data, orient="columns").sort_values("accuracy", ascending=False)

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,shuffle,device,trainset
9,2,5,0.364971,0.867217,21.431822,111.037355,0.01,1000,0,True,cpu,normal
4,1,5,0.356474,0.866833,24.228186,117.399278,0.01,1000,0,True,cpu,not_normal
8,2,4,0.391875,0.8556,23.576757,89.406533,0.01,1000,0,True,cpu,normal
3,1,4,0.389815,0.855467,24.679956,92.906079,0.01,1000,0,True,cpu,not_normal
7,2,3,0.443414,0.836483,25.036893,65.614778,0.01,1000,0,True,cpu,normal
2,1,3,0.448208,0.830767,25.96891,67.831344,0.01,1000,0,True,cpu,not_normal
6,2,2,0.527017,0.79665,18.862527,40.369882,0.01,1000,0,True,cpu,normal
1,1,2,0.539666,0.7922,16.098004,41.667437,0.01,1000,0,True,cpu,not_normal
5,2,1,0.957169,0.633233,20.135358,21.310358,0.01,1000,0,True,cpu,normal
0,1,1,0.978055,0.620267,16.149005,25.343438,0.01,1000,0,True,cpu,not_normal
