In [None]:
import os
import sys
import pickle

import scipy.io as sio
import numpy as np
import torch
import torchvision
from torchvision import transforms

import h5py
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib import patches,  lines
import time

# Root directory of the project
ROOT_DIR = os.path.abspath("")

# To find local version of the library
sys.path.append(ROOT_DIR)

from humanware.svhn.utils import load_obj

%matplotlib inline

In [None]:
# Directories holding the SVHN "train" and "extra" splits.
traindata_dir = 'data/SVHN/train/'
extradata_dir = 'data/SVHN/extra/'

# Base name of the stored metadata object, looked up in each split directory.
# NOTE(review): presumably load_obj unpickles '<dir>/labels.pkl' — confirm in
# humanware.svhn.utils.
filename = 'labels'
metadata_train, metadata_extra = (
    load_obj(split_dir, filename)
    for split_dir in (traindata_dir, extradata_dir)
)


## Create custom dataloader.

https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

In [None]:
from torch.utils import data

class SVHNDataset(data.Dataset):
    """Map-style dataset yielding SVHN images with their digit labels and boxes.

    Every item is a dict ``{'image': <PIL image>, 'metadata': {'labels': ...,
    'boxes': ...}}``, optionally passed through ``transform``.
    """

    def __init__(self, metadata, data_dir, transform=None):
        """
        Args:
            metadata: Indexable collection of per-image records. Each record
                must provide a 'filename' entry and a 'metadata' entry (the
                raw labels and box geometry).
            data_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.metadata = metadata
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Return the number of records in ``metadata``."""
        return len(self.metadata)

    def __getitem__(self, index):
        '''
        Generate one sample of data.

        Parameters
        ----------
        index : int
            The index of the dataset

        Returns
        -------
        sample : dict
            ``{'image': PIL image, 'metadata': {'labels': ..., 'boxes': ...}}``
            or whatever ``transform`` returns for that dict.

        '''
        record = self.metadata[index]
        img_name = os.path.join(self.data_dir, record['filename'])

        # Image.open is lazy and keeps the file handle open; copy the pixel
        # data while the file is open so the handle is closed deterministically.
        with Image.open(img_name) as img_file:
            image = img_file.copy()

        # Raw metadata -> (labels, boxes)
        labels, boxes = extract_labels_boxes(record['metadata'])

        sample = {'image': image,
                  'metadata': {'labels': labels, 'boxes': boxes}}

        if self.transform:
            sample = self.transform(sample)

        return sample



## Create custom transformations

In [None]:
class FirstCrop(object):
    """Crop the image to the padded box that encloses all digit bounding boxes.

    Args:
        pad_size (float): Relative padding forwarded to ``extract_outer_box``
            as its ``padding`` argument (e.g. 0.3 for +30% in x and y).
    """

    def __init__(self, pad_size):

        self.pad_size = pad_size

    def __call__(self, sample):
        """Return a new sample cropped to the outer box.

        The box coordinates (x1, x2, y1, y2 per row) are shifted so they stay
        aligned with the cropped image. The input sample is left untouched.
        """
        image = sample['image']

        labels, boxes = sample['metadata']['labels'], sample['metadata']['boxes']

        outer_box = extract_outer_box(sample, padding=self.pad_size)
        # Pixel coordinates must be integers for the crop and the box shift.
        outer_box = np.round(outer_box).astype('int')

        x1_tot, x2_tot, y1_tot, y2_tot = outer_box

        # Work on a copy: shifting in place would silently corrupt the boxes
        # of the sample the caller still holds.
        boxes_cropped = np.array(boxes, copy=True)
        boxes_cropped[:, 0:2] -= x1_tot
        boxes_cropped[:, 2:] -= y1_tot

        # PIL crop box order is (left, upper, right, lower).
        img_cropped = image.crop((x1_tot, y1_tot, x2_tot, y2_tot))

        metadata = {'boxes': boxes_cropped, 'labels': labels}

        return {'image': img_cropped, 'metadata': metadata}


class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, it is read
            as (height, width) and the output is matched to it. If int, the
            smaller of the image edges is matched to output_size keeping the
            aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        """Return a new sample with the image resized and the boxes rescaled."""
        image, boxes, labels = sample['image'], sample['metadata']['boxes'], sample['metadata']['labels']

        h, w = np.asarray(image).shape[:2]

        if isinstance(self.output_size, int):
            # Match the smaller edge and keep the aspect ratio.
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        # PIL's resize expects (width, height), i.e. the reverse of the
        # (height, width) order used for array shapes.
        image_scaled = image.resize((new_w, new_h))

        # Box columns are (x1, x2, y1, y2): x scales with the width ratio and
        # y with the height ratio. astype() copies, so the input is untouched.
        boxes = boxes.astype('float64')
        boxes[:, :2] *= (new_w / w)
        boxes[:, 2:] *= (new_h / h)
        boxes = boxes.astype('int64')

        metadata = {'boxes': boxes, 'labels': labels}

        return {'image': image_scaled, 'metadata': metadata}

    
class RandomCrop(object):
    """Crop randomly the image in a sample.

    Args:
        output_size (tuple or int): Desired output size as (height, width).
            If int, square crop is made.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, sample):
        """Return a new sample cropped at a random offset.

        Boxes (x1, x2, y1, y2 per row) are shifted by the crop offset and
        clipped to the cropped image. The input sample is left untouched.

        Raises:
            ValueError: If the image is smaller than the requested crop.
        """
        image, boxes, labels = sample['image'], sample['metadata']['boxes'], sample['metadata']['labels']

        h, w = np.asarray(image).shape[:2]
        new_h, new_w = self.output_size

        if new_h > h or new_w > w:
            raise ValueError(
                "crop size {} is larger than the image size {}".format(
                    (new_h, new_w), (h, w)))

        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and keeps an image of exactly the crop size legal.
        top = np.random.randint(0, h - new_h + 1)
        left = np.random.randint(0, w - new_w + 1)

        # PIL crop box order is (left, upper, right, lower).
        image_cropped = image.crop((left, top, left + new_w, top + new_h))

        # Copy before shifting so the caller's boxes are not modified in place.
        boxes = np.array(boxes, copy=True)
        boxes[:, 0:2] -= left
        boxes[:, 2:] -= top

        boxes[:, :2] = np.clip(boxes[:, :2], 0, new_w - 1)
        boxes[:, 2:] = np.clip(boxes[:, 2:], 0, new_h - 1)

        metadata = {'boxes': boxes, 'labels': labels}

        return {'image': image_cropped, 'metadata': metadata}


class ToTensor(object):
    """Convert the image and labels of a sample to torch tensors."""

    # Expected (H, W, C) shape of the incoming image.
    IMAGE_SHAPE = (54, 54, 3)
    # Number of digit slots stored after the length entry of the target.
    MAX_DIGITS = 5

    def __call__(self, sample):
        """Return ``{'image': float tensor (C, H, W), 'target': int tensor}``.

        The image is centred by subtracting its own mean. The target has
        ``MAX_DIGITS + 1`` entries: ``target[0]`` is the number of digits and
        the following entries are the digits, padded with -1. Digits beyond
        ``MAX_DIGITS`` are dropped; ``target[0]`` still reports the full count.
        """
        image, labels = sample['image'], sample['metadata']['labels']

        image = np.asarray(image)
        image = image - np.mean(image)
        assert image.shape == self.IMAGE_SHAPE, \
            "expected image of shape {}, got {}".format(self.IMAGE_SHAPE, image.shape)

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        image = torch.from_numpy(image).float()

        # TODO
        # Process boxes

        labels = np.asarray(labels)

        # i.e. the sequence 157 is represented by target [3, 1, 5, 7, -1, -1]
        target = -np.ones(self.MAX_DIGITS + 1)
        target[0] = len(labels)

        # Longer sequences used to raise IndexError here; keep only the
        # digits that fit into the fixed-size target.
        n_stored = min(len(labels), self.MAX_DIGITS)
        target[1:n_stored + 1] = labels[:n_stored]

        target = torch.from_numpy(target).int()

        return {'image': image,
                'target': target}


In [None]:
## code to extract bboxes
## Inspiration: https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/visualize.py

def extract_labels_boxes(meta):
    '''Split raw per-image metadata into digit labels and bounding boxes.

    ``meta`` is read through its 'label', 'top', 'height', 'left' and 'width'
    entries, one value per digit. Returns ``(labels, boxes)`` where ``labels``
    is a list and ``boxes`` is an array with one (x1, x2, y1, y2) row per digit.
    '''
    digit_labels = []
    digit_boxes = []

    for digit in range(len(meta['label'])):
        digit_labels.append(meta['label'][digit])

        # vertical extent: top edge plus height
        upper = meta['top'][digit]
        lower = upper + meta['height'][digit]

        # horizontal extent: left edge plus width
        near = meta['left'][digit]
        far = near + meta['width'][digit]

        digit_boxes.append((near, far, upper, lower))

    return digit_labels, np.asarray(digit_boxes)


def extract_outer_box(sample, padding=0.3):
    """Return the padded box enclosing every digit box of a sample.

    Args:
        sample (dict): Must provide ``sample['image']`` (anything
            ``np.asarray`` turns into an H x W [x C] array) and
            ``sample['metadata']['boxes']`` with one (x1, x2, y1, y2) row
            per digit.
        padding (float): Total relative enlargement per axis; half of it is
            added on each side (0.3 -> 15% of the width left and right, 15%
            of the height above and below).

    Returns:
        tuple: ``(x1, x2, y1, y2)`` clamped to ``[0, width - 1]`` and
        ``[0, height - 1]`` of the image.
    """
    img_shape = np.asarray(sample['image']).shape
    boxes = sample['metadata']['boxes']

    x1_tot = np.min(boxes[:, 0])
    x2_tot = np.max(boxes[:, 1])
    y1_tot = np.min(boxes[:, 2])
    y2_tot = np.max(boxes[:, 3])

    # Compute both margins from the ORIGINAL extent. Updating x1/y1 first and
    # then re-measuring the width/height made the right/bottom margin larger
    # than the left/top one.
    pad_x = padding / 2 * (x2_tot - x1_tot)
    pad_y = padding / 2 * (y2_tot - y1_tot)

    x1_tot = max(0, x1_tot - pad_x)
    x2_tot = min(x2_tot + pad_x, img_shape[1] - 1)
    y1_tot = max(0, y1_tot - pad_y)
    y2_tot = min(y2_tot + pad_y, img_shape[0] - 1)

    outer_bbox = (x1_tot, x2_tot, y1_tot, y2_tot)

    return outer_bbox

    
def visualize_sample(sample, outer_bbox=None):
    """Show a sample's image with its digit boxes and labels drawn on top.

    ``sample`` provides 'image' and 'metadata' ('boxes' with one
    (x1, x2, y1, y2) row per digit, and 'labels'). When ``outer_bbox``
    (x1, x2, y1, y2) is given it is drawn as an additional blue rectangle.
    """
    meta = sample['metadata']
    img, boxes, labels = sample['image'], meta['boxes'], meta['labels']

    # Image without axes
    _, ax = plt.subplots(1)
    plt.axis('off')
    plt.imshow(img)

    n_digits = len(labels)
    edge_colors = ['r', 'k']  # alternate so neighbouring boxes are distinguishable

    if boxes is not None:
        for digit in range(n_digits):
            x1, x2, y1, y2 = boxes[digit]

            # Dashed outline around the digit
            outline = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
                                        alpha=0.7, linestyle="dashed",
                                        edgecolor=edge_colors[digit % 2],
                                        facecolor='none')
            ax.add_patch(outline)

            # Digit label just inside the top-left corner
            ax.text(x1, y1 + 8, labels[digit],
                    color='w', size=11, backgroundcolor="none")

    if outer_bbox is None:
        return

    x1_tot, x2_tot, y1_tot, y2_tot = outer_bbox
    enclosing = patches.Rectangle((x1_tot, y1_tot), x2_tot - x1_tot, y2_tot - y1_tot, linewidth=2,
                                  alpha=0.7, linestyle="dashed",
                                  edgecolor='blue', facecolor='none')
    ax.add_patch(enclosing)


In [None]:
from torchvision import transforms, utils


# Untransformed dataset: items come back as PIL images with raw boxes.
traindata = SVHNDataset(metadata_train, traindata_dir)

# Individual pipeline stages, kept as separate objects so each one can be
# applied and inspected on its own below.
firstcrop, rescale = FirstCrop(0.3), Rescale((64, 64))
random_crop, to_tensor = RandomCrop((54, 54)), ToTensor()

preview_stages = [firstcrop, rescale, random_crop]
transform = transforms.Compose(preview_stages)

# Pick a random training image and show it before any transformation ...
index = np.random.randint(len(traindata))
print("Index: ", index)
sample = traindata[index]
visualize_sample(sample)

# ... then after each successive stage.
for tsfrm in preview_stages:
    sample = tsfrm(sample)
    visualize_sample(sample)


In [None]:
# Create dataloader

from torch.utils.data import DataLoader

batch_size = 32

# Full preprocessing pipeline: crop around the digits, rescale, random crop,
# then convert to tensors.
firstcrop = FirstCrop(0.3)
rescale = Rescale((64, 64))
random_crop = RandomCrop((54, 54))
to_tensor = ToTensor()

transform = transforms.Compose([firstcrop, rescale, random_crop, to_tensor])

transformed_dataset = SVHNDataset(metadata_train,
                                  data_dir=traindata_dir,
                                  transform=transform)

# Shuffle all indices once, then split 80% / 20% into train / validation.
indices = np.random.permutation(np.arange(len(metadata_train)))
split_at = round(0.8 * len(indices))

train_idx = indices[:split_at]
valid_idx = indices[split_at:]
sample_idx = indices[:100]  # first 100 shuffled indices, for quick experiments

train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)
sample_sampler = torch.utils.data.SubsetRandomSampler(sample_idx)

# Options shared by all loaders; ordering is left to the samplers.
loader_options = dict(batch_size=batch_size, shuffle=False, num_workers=4)

train_loader = DataLoader(transformed_dataset, sampler=train_sampler, **loader_options)
valid_loader = DataLoader(transformed_dataset, sampler=valid_sampler, **loader_options)
sample_loader = DataLoader(transformed_dataset, sampler=sample_sampler, **loader_options)

In [None]:
# Define a placeholder CNN

import torch.nn as nn
import torch.nn.functional as F


class BaselineCNN(nn.Module):
    """Small placeholder CNN: two conv + max-pool stages and two linear layers.

    The first linear layer expects 7744 flattened features and the last one
    produces 7 outputs per input.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)

        # One pooling layer, reused after each convolution
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head
        self.fc1 = nn.Linear(in_features=7744, out_features=4096)
        self.fc2 = nn.Linear(in_features=4096, out_features=7)

    def forward(self, x):
        """Map a batch of images to a (batch, 7) tensor of raw scores."""
        features = F.relu(self.conv1(x))
        features = self.pool(features)
        features = F.relu(self.conv2(features))
        features = self.pool(features)

        # Keep the batch dimension, flatten everything else
        batch = features.size(0)
        flat = features.view(batch, -1)

        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)




In [None]:
baseline_cnn = BaselineCNN()

# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device used: ", device)

In [None]:
import time
from torch.nn import CrossEntropyLoss


def _run_ndigits_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Weights are updated when ``optimizer`` is given; otherwise the pass only
    evaluates. Returns NaN for an empty loader.
    """
    total_loss = 0.0
    n_batches = 0

    for batch in loader:
        inputs = batch['image'].to(device)
        # Column 0 of the target holds the number of digits. Tensor.to() is
        # not in-place, so its result has to be kept.
        target_ndigits = batch['target'][:, 0].long().to(device)

        if optimizer is not None:
            optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, target_ndigits)

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches if n_batches else float('nan')


def train_model(model, train_loader, valid_loader, device, num_epochs=10, lr=0.001, model_out=None):
    """Train ``model`` with SGD to predict the number of digits per image.

    Args:
        model: Module mapping a batch of images to per-class scores.
        train_loader: Iterable of batches used for the weight updates. Each
            batch is a dict with an 'image' tensor and a 'target' tensor whose
            column 0 is the number of digits.
        valid_loader: Iterable of batches (same format) used for validation.
        device: Device on which the model and the batches are placed.
        num_epochs (int): Number of passes over ``train_loader``.
        lr (float): SGD learning rate.
        model_out (str, optional): If given, the whole model is saved to this
            path with ``torch.save``; missing parent directories are created.

    Returns:
        tuple: ``(train_loss_history, valid_loss_history)``, two lists with
        the mean per-batch loss of every epoch.
    """
    since = time.time()
    model.to(device)
    train_loss_history = []
    valid_loss_history = []

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_ndigits = CrossEntropyLoss()

    print("# Start training #")
    for epoch in range(num_epochs):

        # Training pass over the loader that was passed in (not a global one)
        model.train()
        train_loss = _run_ndigits_epoch(model, train_loader, loss_ndigits,
                                        device, optimizer=optimizer)

        # Validation pass: no weight updates, so no autograd graph is needed
        model.eval()
        with torch.no_grad():
            valid_loss = _run_ndigits_epoch(model, valid_loader, loss_ndigits,
                                            device)

        train_loss_history.append(train_loss)
        valid_loss_history.append(valid_loss)

        print('\nEpoch: {}/{}'.format(epoch + 1, num_epochs))
        print('\tTrain Loss: {:.4f}'.format(train_loss))
        print('\tValid Loss: {:.4f}'.format(valid_loss))

    time_elapsed = time.time() - since

    print('\n\nTraining complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    if model_out:
        print('Saving model ...')
        # torch.save does not create missing directories.
        out_dir = os.path.dirname(model_out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        torch.save(model, model_out)
        print('Model saved to :', model_out)

    return train_loss_history, valid_loss_history


In [None]:
model_out = "models/my_model.pth"

# torch.save does not create missing parent directories, so make sure the
# output folder exists before training starts.
os.makedirs(os.path.dirname(model_out), exist_ok=True)

# NOTE(review): the 100-image sample_loader is passed for both training and
# validation — presumably a quick smoke run; switch to train_loader /
# valid_loader for a real training run.
train_model(baseline_cnn, train_loader=sample_loader,
            valid_loader=sample_loader,
            device=device, model_out=model_out)

Get useful insights into the data

In [None]:
def _tally_sequences(metadata, hist):
    """Count the sequence lengths of ``metadata`` into ``hist``.

    Sequences with 6 or more digits share bin 6. Returns the total number of
    digits seen.
    """
    digit_total = 0
    for key in metadata:
        seq_len = int(len(metadata[key]['metadata']['label']))
        digit_total += seq_len
        hist[min(seq_len, 6)] += 1
    return digit_total


# Histogram over sequence lengths 0..5, plus a shared bin for 6 and above,
# accumulated over both the train and the extra split.
n_digits_hist = np.zeros(7)

n_digits_train = _tally_sequences(metadata_train, n_digits_hist)
n_digits_extra = _tally_sequences(metadata_extra, n_digits_hist)

print('total number of digits: ', n_digits_train + n_digits_extra)
print('total number of sequences', sum(n_digits_hist))

In [None]:
# Bar chart of the sequence-length histogram: one bar per bin of n_digits_hist.
plt.bar(np.arange(len(n_digits_hist)), n_digits_hist)

In [None]:
# Recompute the digit total from the histogram: sum of (bin index * count).
tot = sum(n_in_seq * n_seqs for n_in_seq, n_seqs in enumerate(n_digits_hist))

print(tot)

In [None]:
# Collect the width and height of every training image.
# traindata has no transform, so each item's image is a PIL image; its
# ``size`` attribute is (width, height), so there is no need to convert every
# image to a NumPy array just to read its shape.

im_width = []
im_height = []
for jj in range(len(traindata)):

    width, height = traindata[jj]['image'].size
    im_height.append(height)
    im_width.append(width)

im_width = np.asarray(im_width)
im_height = np.asarray(im_height)

In [None]:
# Explore dataset for cleaning

# Smallest width and height found among the images
print("minimum image width", im_width.min())
print("minimum image height", im_height.min())


# Number of images with at least one side shorter than 28 pixels
total = np.logical_or(im_height < 28, im_width < 28).sum()

print('total number of image in dataset: ', len(traindata))
print('total number of images that are too small', total)

In [None]:
## Show sample image that is too small

index = np.argmin(im_height)

# visualize_sample takes a sample dict (it has no ``idx`` parameter), so the
# sample has to be fetched from the dataset first.
sample = traindata[index]
visualize_sample(sample)

# The dataset stores the digit labels under the 'labels' key.
print(sample['metadata']['labels'])
np.asarray(sample['image']).shape

In [None]:
# Add example of at least one transform
# use imgaug