# Classifying Street View House Numbers

In [1]:
import torch
from scipy.io import loadmat
import torchvision.transforms as T
from torchvision.io import read_image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import h5py

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## Online data loader
A single image contains multiple labels, so the images are first cropped into separate digits according to the boundary boxes defined in digitStruct.mat. The data loader reads images from storage until we have full batch, and then it yields the batch for processing on the GPU. 

In [3]:
class OnlineLoader:
    def __init__(self, batch_size=32, shuffle=True):
        self.batch_size = batch_size
        self.shuffle = shuffle
        
        self.matdata = h5py.File('lazy_data/train/digitStruct.mat')
        self.size = self.matdata['/digitStruct/name'].size
        self.indices = list(range(self.size))
            
    def __iter__(self):
        """  Generate batches. Optionally shuffle at start of each epoch. """
        if self.shuffle:
            self.indices = torch.randperm(self.size)
            
        n = 0
        all_xs, all_ys = [], []
        for i in self.indices:
            xs, ys = self._load(i)
            n += len(ys)
            all_xs.append(xs)
            all_ys.append(ys)
            if n >= self.batch_size:
                n = 0
                x = torch.cat(all_xs)[:self.batch_size]
                y = torch.cat(all_ys)[:self.batch_size]
                y[y==10] = 0
                all_xs, all_ys = [], []
                yield x / 255, y.type(torch.LongTensor)
    
    def _load(self, i):
        """ Load a .png image and separate digits by cropping according to the boundary boxes. The
        images are also resized to maintain a consistent tensor shape. """
        filename = get_name(i, self.matdata)
        boxdata = get_box_data(i, self.matdata)
        img = read_image(f'lazy_data/train/{filename}')
        xs = []
        for j, label in enumerate(boxdata['label']):
            top = int(boxdata['top'][j])
            left = int(boxdata['left'][j])
            height = int(boxdata['height'][j])
            width = int(boxdata['width'][j])
            xs.append(T.Resize([32, 32])(T.functional.crop(img, top, left, height, width)))
        return torch.stack(xs), torch.as_tensor(boxdata['label'])
        
    def __len__(self):
        return self.size
    
# https://stackoverflow.com/questions/41176258/h5py-access-data-in-datasets-in-svhn
def get_box_data(index, hdf5_data):
    """
    get `left, top, width, height` of each picture
    :param index:
    :param hdf5_data:
    :return:
    """
    meta_data = dict()
    meta_data['height'] = []
    meta_data['label'] = []
    meta_data['left'] = []
    meta_data['top'] = []
    meta_data['width'] = []

    def print_attrs(name, obj):
        vals = []
        if obj.shape[0] == 1:
            vals.append(obj[0][0])
        else:
            for k in range(obj.shape[0]):
                vals.append(int(hdf5_data[obj[k][0]][0][0]))
        meta_data[name] = vals

    box = hdf5_data['/digitStruct/bbox'][index]
    hdf5_data[box[0]].visititems(print_attrs)
    return meta_data

def get_name(index, hdf5_data):
    """ Get file path to image. """
    name = hdf5_data['/digitStruct/name']
    return ''.join([chr(v[0]) for v in hdf5_data[name[index][0]].value])

## Network architecture
Below is a fairly simple ResNet implementation consisting of some number or same-size ResBlocks, followed by two fully connected layers.

In [4]:
class ResNet(nn.Module):
    def __init__(self, num_blocks=5):
        super().__init__()
        in_channels = 3
        conv_dim = 64
        pool_size = 2
        image_size = 32
        conv_out = image_size // pool_size
        fc_in = int(conv_out * conv_out * conv_dim)
        fc_dim = 512
        n_classes = 10
        
        self.conv = nn.Conv2d(in_channels, conv_dim, kernel_size=3, stride=1, padding=1)
        self.res_blocks = nn.ModuleList([ResBlock(conv_dim, conv_dim) for _ in range(num_blocks)])
        self.max_pool = nn.MaxPool2d(pool_size)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(fc_in, fc_dim)
        self.fc2 = nn.Linear(fc_dim, n_classes)
        
    def forward(self, x):
        x = self.conv(x)
        for res_block in self.res_blocks:
            x = res_block(x)
        x = self.max_pool(x)
        x = self.dropout(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x
    
class ResBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
    def forward(self, x):
        identity = x
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out += identity # Res connection
        out = self.relu(out)
        return out

The online dataloader is very slow compared to the offline approach, so below is just a demonstration that the online approach works (the models learns since it keeps improving the loss). This is far from an optimal solution.

I have not applied data augmentation here since that would make this solution even slower, but to implement it I would follow the same approach as in the offline solution: convert the tensors into PIL images, then apply some transformations and convert the images back to tensors.

In [5]:
def run_one_epoch(dataloader, model, optimizer, loss_function, device, should_update=True):
    model.to(device)
    loss_tot, correct_tot = 0, 0
    N = 0
    for i, batch in enumerate(dataloader):
        x, y = batch[0].to(device), batch[1].to(device)
        N += len(y)
        out = model(x)
        loss = loss_function(out, y)
        if should_update:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss_tot += loss.item()
        _, y_hat = torch.max(out.data, 1)
        correct_tot += (y_hat == y).sum().item()
        if i % 100 == 0:
            print(f'Batch: {i} Loss: {loss_tot / N:.3f} Acc: {correct_tot / N}')

    return loss_tot / N, correct_tot / N

online = OnlineLoader()
net = ResNet(num_blocks=1)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

run_one_epoch(online, net, optimizer, loss_function, device)

  self.matdata = h5py.File('lazy_data/train/digitStruct.mat')
  return ''.join([chr(v[0]) for v in hdf5_data[name[index][0]].value])


Batch: 0 Loss: 0.072 Acc: 0.0625
Batch: 100 Loss: 0.070 Acc: 0.19492574257425743
Batch: 200 Loss: 0.064 Acc: 0.29322139303482586
Batch: 300 Loss: 0.056 Acc: 0.38932724252491696
Batch: 400 Loss: 0.050 Acc: 0.4661783042394015
Batch: 500 Loss: 0.045 Acc: 0.5225174650698603
Batch: 600 Loss: 0.042 Acc: 0.5645278702163061
Batch: 700 Loss: 0.039 Acc: 0.5974054921540656
Batch: 800 Loss: 0.037 Acc: 0.6240246566791511
Batch: 900 Loss: 0.035 Acc: 0.6439372918978913
Batch: 1000 Loss: 0.033 Acc: 0.661245004995005
Batch: 1100 Loss: 0.032 Acc: 0.6753235694822888
Batch: 1200 Loss: 0.031 Acc: 0.6872398001665279
Batch: 1300 Loss: 0.030 Acc: 0.6985491929285165
Batch: 1400 Loss: 0.029 Acc: 0.7083110278372591
Batch: 1500 Loss: 0.028 Acc: 0.7170011658894071
Batch: 1600 Loss: 0.027 Acc: 0.7252498438475953
Batch: 1700 Loss: 0.027 Acc: 0.7324368018812464
Batch: 1800 Loss: 0.026 Acc: 0.7389297612437534
Batch: 1900 Loss: 0.026 Acc: 0.7451834560757497
Batch: 2000 Loss: 0.025 Acc: 0.7503591954022989
Batch: 2100 Lo

(0.02414551179239617, 0.7613890129522108)