# Bengali.AI - Training and Evaluation

The notebook was created in Kaggle. It contains the training and evaluation (with a validation set) pipeline.

I changed the default collate function from `torch.utils.data._utils.collate` so that it uses `torch.cat()` instead of `torch.stack()` (at line 24 below). This makes the behaviour of the `DataLoader` simpler, and it was needed to prevent errors caused by the different tensor shapes of the tuples retrieved from `BengaliDataset.__getitem__()`. So please do not be alarmed by the enormously complex code in the cell below; I did not write it and do not understand it myself. I only needed to change it to get the preferred `DataLoader` behaviour, i.e. no errors.

In [1]:
import re

from torch._six import container_abcs, string_classes, int_classes

np_str_obj_array_pattern = re.compile(r'[SaUO]')
default_collate_err_msg_format = (
    "default_collate: batch must contain tensors, numpy arrays, numbers, "
    "dicts or lists; found {}")

def _new_default_collate(batch):
    r"""Puts each data field into a tensor with outer dimension batch size"""

    elem = batch[0]
    elem_type = type(elem)
    if isinstance(elem, torch.Tensor):
        out = None
        if torch.utils.data.get_worker_info() is not None:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = elem.storage()._new_shared(numel)
            out = elem.new(storage)
        return torch.cat(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        elem = batch[0]
        if elem_type.__name__ == 'ndarray':
            # array of string classes and object
            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                raise TypeError(default_collate_err_msg_format.format(elem.dtype))

            return _new_default_collate([torch.as_tensor(b) for b in batch])
        elif elem.shape == ():  # scalars
            return torch.as_tensor(batch)
    elif isinstance(elem, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(elem, int_classes):
        return torch.tensor(batch)
    elif isinstance(elem, string_classes):
        return batch
    elif isinstance(elem, container_abcs.Mapping):
        return {key: _new_default_collate([d[key] for d in batch]) for key in elem}
    elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
        return elem_type(*(_new_default_collate(samples) for samples in zip(*batch)))
    elif isinstance(elem, container_abcs.Sequence):
        transposed = zip(*batch)
        return [_new_default_collate(samples) for samples in transposed]

    raise TypeError(default_collate_err_msg_format.format(elem_type))

In [3]:
import gc
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision import transforms
from torchsummary import summary  # pip install torchsummary
from tqdm.notebook import tqdm
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib notebook

In [4]:
# Load the training images from a NumPy file in the working directory.
train_images = np.load('train_image_data.npy')
# Show the array shape as the cell output; the recorded run displayed
# (200840, 128, 128).
train_images.shape

(200840, 128, 128)

In [5]:
# Read only the three target columns, selected by name rather than by
# position, so the labels stay correct if columns are added or reordered.
train_labels = pd.read_csv(
    'train.csv',
    usecols=['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']
)
train_labels.head()

Unnamed: 0,grapheme_root,vowel_diacritic,consonant_diacritic
0,15,9,5
1,159,0,0
2,22,3,5
3,53,2,2
4,71,9,5


In [6]:
# train-validation split 80/20
# Split images and labels in a single call so that every image is guaranteed
# to stay paired with its own labels.
# NOTE: this cell rebinds train_images/train_labels, so re-running it without
# reloading the data splits the training subset again.
train_images, val_images, train_labels, val_labels = train_test_split(
    train_images, train_labels, test_size=0.20, random_state=2020
)
gc.collect() # garbage collection

31

In [7]:
class ZeroNet(nn.Module):
    """Small CNN with a shared trunk and one linear head per subproblem.

    Two convolution + max-pooling stages feed a fully-connected layer that
    produces a 256-dimensional latent vector per image. Three linear heads
    map the latent vectors to 168 grapheme_root, 11 vowel_diacritic and
    7 consonant_diacritic class scores.

    Attributes:
        conv1      = [nn.Conv2d] 1 -> 10 channels
        conv2      = [nn.Conv2d] 10 -> 20 channels
        conv2_drop = [nn.Dropout2d] channel dropout after conv2
        fc1        = [nn.Linear] flattened features -> 256 latent features
        fc2        = [nn.Linear] grapheme_root head (168 classes)
        fc3        = [nn.Linear] vowel_diacritic head (11 classes)
        fc4        = [nn.Linear] consonant_diacritic head (7 classes)
        device     = [torch.device] device the model and inputs are moved to
    """

    _POOL_SIZE = 3  # kernel size (and stride) of both max-pooling steps

    def __init__(self, device, kernel_size=3, image_size=128):
        """Initializes the layers and moves the model to the device.

        Args:
            device      = [torch.device] device to put the model on
            kernel_size = [int] kernel size of both convolutions
            image_size  = [int] height and width of the square input images

        Raises:
            ValueError: if the images are too small for the two
                        convolution and pooling stages.
        """
        super(ZeroNet, self).__init__()

        # conv channels based on practice from MNIST networks
        self.conv1 = nn.Conv2d(1, 10, kernel_size=kernel_size)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=kernel_size)
        self.conv2_drop = nn.Dropout2d()

        # Spatial size after each stage: an unpadded convolution removes
        # kernel_size - 1 pixels and the pooling divides by _POOL_SIZE
        # (rounding down). With the defaults: 128 -> 126 -> 42 -> 40 -> 13,
        # so fc1 receives 20 * 13 * 13 = 3380 features.
        side = image_size
        for _ in range(2):
            side = (side - kernel_size + 1) // self._POOL_SIZE
        if side < 1:
            raise ValueError(
                f'image_size={image_size} is too small for '
                f'kernel_size={kernel_size}')
        num_features = self.conv2.out_channels * side * side

        # extra fully-connected layers to determine labels
        self.fc1 = nn.Linear(num_features, 256)
        self.fc2 = nn.Linear(256, 168)
        self.fc3 = nn.Linear(256, 11)
        self.fc4 = nn.Linear(256, 7)

        # put model on the requested device
        self.device = device
        self.to(self.device)

    def _split_vectors(self, vectors, num_augments):
        """Splits the latent vectors into tensors for each subproblem.

        Every original image occupies max(num_augments[i]) consecutive rows
        of vectors. For each subproblem only the first num_augments[i, j]
        rows of image i are selected.

        Args:
            vectors      = [torch.Tensor] the latent vectors to be split
            num_augments = [torch.Tensor] number of augmentations per sub-
                                          problem with shape (BATCH_SIZE, 3)

        Returns [torch.Tensor]*3:
            The latent vectors for the grapheme_root, vowel_diacritic,
            and consonant_diacritic subproblems.
        """
        if num_augments is None:
            return vectors, vectors, vectors

        # first row of every image: exclusive cumulative sum of block sizes
        block_sizes, _ = num_augments.max(dim=1, keepdim=True)
        starts = (torch.cumsum(block_sizes, dim=0) - block_sizes).view(-1)
        starts = starts.tolist()
        counts = num_augments.tolist()

        def rows_for(column):
            # indices of the rows that belong to one subproblem
            return torch.cat([
                torch.arange(start, start + count[column],
                             device=vectors.device)
                for start, count in zip(starts, counts)
            ])

        return vectors[rows_for(0)], vectors[rows_for(1)], vectors[rows_for(2)]

    def forward(self, x, num_augments=None):
        """Forward pass of the CNN.

        Args:
            x            = [torch.Tensor] images with shape (N, 1, SIZE, SIZE)
            num_augments = [torch.Tensor] number of augmentations per sub-
                                          problem with shape (BATCH_SIZE, 3)

        Returns [torch.Tensor]*3:
            Non-normalized predictions for each class for each subproblem.
        """
        # put images on the same device as the model
        x = x.to(self.device)

        x = F.relu(F.max_pool2d(self.conv1(x), self._POOL_SIZE))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)),
                                self._POOL_SIZE))
        x = x.view(len(x), -1)  # flatten representation
        x = F.relu(self.fc1(x))

        # each head only sees the latent vectors of its own subproblem
        x_graph, x_vowel, x_conso = self._split_vectors(x, num_augments)
        y_graph = self.fc2(x_graph)
        y_vowel = self.fc3(x_vowel)
        y_conso = self.fc4(x_conso)
        return y_graph, y_vowel, y_conso

In [8]:
class Cutout(object):
    """Class to augment images with cutout: https://arxiv.org/abs/1708.04552.
    
    Attributes:
        num_squares = [int] number of squares to cut out of the image
        length      = [int] the length (in pixels) of each square
    """

    def __init__(self, num_squares, length):    
        """Initialize cutout augmentation.
        
        Args:
            num_squares = [int] number of squares to cut out of the image
            length      = [int] the length (in pixels) of each square
        """
        self.num_squares = num_squares
        self.length = length

    def __call__(self, image):
        """Randomly mask out one or more squares from an image.

        The image is modified in place and also returned. Squares whose
        center lies close to a border are clipped to the image.

        Args:
            image = [torch.Tensor] image of shape (C, HEIGHT, WIDTH)

        Returns [torch.Tensor]:
            Image with num_squares of dimension length x length cut out of it.
        """
        # take the bounds from the image itself instead of a global constant
        height, width = image.shape[-2:]
        half = self.length // 2

        # determine center of squares
        center_x = torch.randint(high=width, size=(self.num_squares,))
        center_y = torch.randint(high=height, size=(self.num_squares,))

        # determine top-left and bottom-right corners of squares
        left = torch.clamp(center_x - half, 0, width).tolist()
        right = torch.clamp(center_x + half, 0, width).tolist()
        top = torch.clamp(center_y - half, 0, height).tolist()
        bottom = torch.clamp(center_y + half, 0, height).tolist()

        # cut squares out of image
        for x_start, y_start, x_end, y_end in zip(left, top, right, bottom):
            image[:, y_start:y_end, x_start:x_end] = 0

        return image
    
    
class BengaliDataset(Dataset):
    """Class to get images and labels.
    
    Attributes:
        images         = [ndarray] images array with shape (N, SIZE, SIZE)
        transform      = [Compose] applies a random affine transformation,
                                   normalizes to z-scores, and applies cutout
                                   transformation to a Numpy array image
        normalize      = [Compose] normalizes Numpy array image to z-scores
        labels         = [torch.Tensor] images labels tensor of shape (N, 3)
        mod_counts     = [torch.Tensor] remainders of dividing the highest
                                        class frequency by each class
                                        frequency, per subproblem
        ratio_counts   = [torch.Tensor] floors of dividing the highest class
                                        frequency by each class frequency,
                                        per subproblem
        current_counts = [torch.Tensor] number of retrieved items of each
                                        class in current iteration of epoch
        augment        = [bool] whether or not the images are transformed
        balance        = [bool] whether or not the classes are balanced

    Note:
        With balance=True, __getitem__ updates current_counts. The counters
        live on this object, so each copy of the dataset (for example one
        held by a DataLoader worker process) keeps its own counts.
    """
    
    def __init__(self, images, labels, augment=False, balance=False):
        """Initialize dataset.
        
        Args:
            images  = [ndarray] images array with shape (N, SIZE, SIZE)
            labels  = [DataFrame] image labels DataFrame of shape (N, 3)
            augment = [bool] whether or not the images are transformed
            balance = [bool] whether or not the classes are balanced            
        """
        super(BengaliDataset, self).__init__()
        
        # initialize transformations from torchvision.transforms
        self.images = images
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.RandomAffine(
                degrees=(-8, 8),
                translate=(1/24, 1/24),
                scale=(8/9, 10/9)
            ),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.071371482,), std=(0.20764154,)),
            Cutout(8, 12)
        ])
        self.normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.071371482,), std=(0.20764154,))
        ])
        
        # initialize labels and counts for class balancing
        self.labels = torch.tensor(labels.to_numpy())

        # Per-subproblem class frequencies with shape (3, num_classes). The
        # table is indexed by label value below, so reindex it to
        # 0..max_label: a class id that occurs in no column must not shift
        # the rows. Classes absent from a subproblem are NaN.
        num_classes = int(labels.to_numpy().max()) + 1
        counts = (labels.apply(pd.Series.value_counts)
                        .reindex(range(num_classes))
                        .to_numpy().T)
        max_counts = np.nanmax(counts, axis=1, keepdims=True)
        self.mod_counts = torch.tensor(max_counts % counts)
        self.ratio_counts = torch.tensor(max_counts // counts)
        self.current_counts = torch.zeros_like(self.mod_counts)
        
        self.augment = augment
        self.balance = balance
        
    def reset(self):
        """Reset number of retrieved items of each class in current epoch."""
        self.current_counts = torch.zeros_like(self.mod_counts)

    def __len__(self):
        """Returns the number of original (non-augmented) images."""
        return len(self.images)
    
    def _num_augmentations(self, labels):
        """Computes number of augmentations for given image labels.
        
        Args:
            labels = [torch.Tensor] image labels of shape (3,)
            
        Returns [torch.Tensor]:
            If self.balance is False, a tensor filled with ones is returned.
            Otherwise, the number of augmentations will ensure that all the
            classes are seen the same number of times for each subproblem.
        """
        if not self.balance:  # one augmentation
            return torch.tensor([1]*len(labels))
        
        # select current and modular counts for given labels
        subproblems = [0, 1, 2]
        seen_so_far = self.current_counts[subproblems, labels]  # a copy
        self.current_counts[subproblems, labels] += 1
        remainders = self.mod_counts[subproblems, labels]

        # the first `remainder` items of a class get one extra augmentation
        extra_augment = seen_so_far < remainders
        num_augments = self.ratio_counts[subproblems, labels] + extra_augment

        return num_augments.long()

    def _augment_or_normalize(self, image):
        """Augments (including normalization) or normalizes image.
        
        Args:
            image = [ndarray] Numpy array image of shape (SIZE, SIZE)
            
        Returns [torch.Tensor]
            Augmented or normalized image with shape (1, 1, SIZE, SIZE).
        """
        if self.augment:  # random affine, normalize, cutout
            image = self.transform(image)
        else:  # normalize
            image = self.normalize(image)

        return image.unsqueeze(0)

    def __getitem__(self, idx):
        """Get images, labels, and number of augmentations.
        
        Args:
            idx = [int] index of original image and labels
            
        Returns [torch.Tensor]*5:
            images       = images tensor of shape (N, 1, SIZE, SIZE)
            labels_graph = labels tensor of grapheme_root subproblem
            labels_vowel = labels tensor of vowel_diacritic subproblem
            labels_conso = labels tensor of consonant_diacritic subproblem
            num_augments = number of augmentations of shape (1, 3)
        """
        # select image and labels
        image = self.images[idx]
        labels = self.labels[idx]
        
        # determine number of augmentations per subproblem
        num_augments = self._num_augmentations(labels)
        
        # transform or normalize the image once per needed copy (at least
        # once) and concatenate all copies in a single operation
        num_copies = max(int(num_augments.max()), 1)
        images = torch.cat([self._augment_or_normalize(image)
                            for _ in range(num_copies)])

        # repeat labels given number of augmentations
        labels = [labels[i].repeat(int(num_augments[i]))
                  for i in range(len(labels))]

        # return images, labels, and number of augmentations as a 5-tuple
        return (images,) + tuple(labels) + (num_augments.unsqueeze(0),)

In [9]:
def eval_metric(pred_dict, true_dict):
    """
    Competition evaluation metric adapted from:
    https://www.kaggle.com/c/bengaliai-cv19/overview/evaluation
    Computes the macro-averaged recall of every component and their
    weighted average, in which grapheme_root counts twice.
    
    Args:
        pred_dict = [dict] dictionary with components as keys and
                           lists of predictions as values
        true_dict = [dict] dictionary with components as key and
                           lists of targets as values
    
    Returns [list] of four scores:
        grapheme  = grapheme_root component macro-average recall
        vowel     = vowel_diacritic component macro-average recall
        consonant = consonant_diacritic component macro-average recall
        total     = weighted average of component macro-averaged recalls
    """
    components = ('grapheme', 'vowel', 'consonant')
    recalls = [
        recall_score(true_dict[name], pred_dict[name], average='macro')
        for name in components
    ]

    # the combined score is appended after the three component scores
    total = np.average(recalls, weights=[2, 1, 1])
    recalls.append(total)
    return recalls

In [10]:
def update_dicts(pred_dict, true_dict, preds, targets):
    """Appends a batch of predicted and true classes to two dictionaries.
    
    The predicted class of a sample is the index of its highest raw score.
    Both dictionaries are modified in place.
    
    Args:
        pred_dict = [dict] dictionary with components as keys and
                           lists of predictions as values
        true_dict = [dict] dictionary with components as key and
                           lists of targets as values
        preds     = [tuple] sequence of tensors of (raw) predictions
        targets   = [tuple] sequence of tensors of targets
    """
    components = ['grapheme', 'vowel', 'consonant']
    for name, scores, labels in zip(components, preds, targets):
        # torch.max over the class dimension returns (values, indices)
        predicted = torch.max(scores.data, 1)[1]
        pred_dict[name] += predicted.tolist()
        true_dict[name] += labels.tolist()

Naming convention:
- `x` = input
- `t` = target
- `y` = predicted output


In [11]:
class CrossEntropySumLoss(nn.Module):
    """Neural network module to compute sum of cross entropy losses.

    Attributes:
        device = [torch.device] device to compute the loss on
    """

    def __init__(self, device):
        """Initializes the loss module

        Args:
            device = [torch.device] device to compute the loss on
        """
        super(CrossEntropySumLoss, self).__init__()
        self.device = device

    def forward(self, input, target):
        """Sums cross entropy losses of given predictions and targets.
        
        Args:
            input  = [tuple] sequence of tensors of (raw) predictions
            target = [tuple] sequence of tensors of targets
        
        Returns [torch.Tensor]:
            One loss per (prediction, target) pair in the order they were
            given, followed by their sum as the last element. For the usual
            three pairs these are the grapheme_root, vowel_diacritic,
            consonant_diacritic, and combined losses.
        """
        # keep the losses in input order so that index i of the result
        # always belongs to the i-th subproblem
        components = [
            F.cross_entropy(y, t.to(self.device)).view(1)
            for y, t in zip(input, target)
        ]

        if not components:  # nothing to compare: the combined loss is zero
            return torch.zeros(1, dtype=torch.float32, device=self.device)

        per_task = torch.cat(components)
        return torch.cat((per_task, per_task.sum().view(1)))

In [12]:
# running state shared by all calls of show_metrics
num_iterations = 0                # training iterations since start of run
num_batches = 0                   # batches accumulated since last flush
running_losses = torch.zeros(4)   # summed losses since last flush
pred_dict = {'grapheme': [], 'vowel': [], 'consonant': []}
true_dict = {'grapheme': [], 'vowel': [], 'consonant': []}

def show_metrics(writer, losses=None, preds=None, targets=None,
                 inc=True, eval_freq=100, end=False):
    """Show the losses and scores on TensorBoard.

    Metrics are accumulated in module-level variables and written as
    averages (losses) or over all accumulated samples (scores), after which
    the accumulators are reset. Nothing is written when nothing has been
    accumulated since the previous write.
    
    Args:
        writer    = [SummaryWriter] TensorBoard writer of metrics
        losses    = [torch.Tensor] subproblem losses and combined loss
        preds     = [tuple] sequence of tensors of (raw) predictions
        targets   = [tuple] sequence of tensors of targets
        inc       = [bool] whether to increment the number of iterations
        eval_freq = [int] number of iterations before the next TensorBoard
                          update; if zero or negative (e.g. -1), TensorBoard
                          never updates periodically
        end       = [bool] always shows metrics after epoch has ended
    """
    global num_iterations
    global num_batches
    global running_losses
    global pred_dict
    global true_dict   
    
    if not end:
        # increment total number of training iterations during run
        num_iterations += inc

        # increment number of batches during current epoch
        num_batches += 1
        
        # accumulate metrics to smooth plots
        running_losses += losses.detach().cpu()
        update_dicts(pred_dict, true_dict, preds, targets)
    
    # show metrics every eval_freq iterations or at the end of an epoch
    periodic = eval_freq > 0 and num_iterations % eval_freq == eval_freq - 1
    if not (periodic or end):
        return

    # nothing accumulated (e.g. the previous call already flushed):
    # averaging would divide by zero and there is nothing to score
    if num_batches == 0:
        return

    tags = ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic', 'total')

    # show losses in TensorBoard
    mean_losses = running_losses / num_batches
    for index, tag in enumerate(tags):
        writer.add_scalar(f'Loss/{tag}', mean_losses[index], num_iterations)

    # show scores in TensorBoard
    scores = eval_metric(pred_dict, true_dict)
    for index, tag in enumerate(tags):
        writer.add_scalar(f'Score/{tag}', scores[index], num_iterations)
    
    # reset running variables
    num_batches = 0
    running_losses = torch.zeros(4)
    pred_dict = {'grapheme': [], 'vowel': [], 'consonant': []}
    true_dict = {'grapheme': [], 'vowel': [], 'consonant': []}

In [13]:
def validation(model, val_loader, val_writer, criterion):
    """Computes loss and score of current state of model on validation dataset.

    The model is evaluated in evaluation mode without gradient tracking.
    Afterwards the model is put back in the mode it had when this function
    was called, also when the evaluation raises an exception.
    
    Args:
        model      = [nn.Module] model to test with validation dataset
        val_loader = [DataLoader] validation data loader
        val_writer = [SummaryWriter] TensorBoard writer of validation metrics
        criterion  = [nn.Module] neural network module to compute loss
    """
    # remember the current mode and set model mode to evaluation
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for x, t_graph, t_vowel, t_conso, _ in val_loader:
                # predict
                y = model(x)

                # loss
                t = t_graph, t_vowel, t_conso
                losses = criterion(y, t)

                # accumulate but do not show validation metrics
                show_metrics(val_writer, losses, y, t, inc=False, eval_freq=-1)

        # show validation metrics on TensorBoard
        show_metrics(val_writer, end=True)
    finally:
        # set model mode back to what it was before validation
        model.train(was_training)

In [14]:
def train(model, train_dataset, train_loader, train_writer,
          val_loader, val_writer, optimizer, criterion, num_epochs=10):
    """Runs the training loop with a validation pass after every epoch.

    Per batch: predict, compute the losses, take an optimizer step on the
    combined (last) loss, and report the train metrics. Per epoch: flush the
    train metrics, evaluate on the validation data, and reset the class
    counters of the train dataset.
    
    Args:
        model         = [nn.Module] model to train and validate
        train_dataset = [Dataset] train dataset
        train_loader  = [DataLoader] train data loader
        train_writer  = [SummaryWriter] TensorBoard writer of train metrics
        val_loader    = [DataLoader] validation data loader
        val_writer    = [SummaryWriter] TensorBoard writer of validation metrics
        optimizer     = [Optimizer] optimizer to update the model
        criterion     = [nn.Module] neural network module to compute loss
        num_epochs    = [int] number of iterations of the train dataset
    """
    for epoch_number in range(1, num_epochs + 1):
        progress = tqdm(train_loader,
                        desc=f'Epoch {epoch_number}/{num_epochs}')

        # every batch is the 5-tuple produced by Dataset.__getitem__
        for x, t_graph, t_vowel, t_conso, num_augments in progress:
            t = (t_graph, t_vowel, t_conso)

            # forward pass and losses
            y = model(x, num_augments)
            losses = criterion(y, t)

            # backpropagate the combined loss and update the weights
            optimizer.zero_grad()
            losses[-1].backward()
            optimizer.step()

            # accumulate train metrics; shown every 100 iterations
            show_metrics(train_writer, losses, y, t)

        # flush the remaining train metrics of this epoch
        show_metrics(train_writer, end=True)

        # evaluate model on validation data
        validation(model, val_loader, val_writer, criterion)

        # reset dataset to keep class balance
        train_dataset.reset()

In [15]:
SIZE = 128
BATCH_SIZE = 128
PATH = 'model.pt'

# load data in 4 worker processes on Linux and in the main process elsewhere
NUM_WORKERS = 4 if sys.platform.startswith('linux') else 0

# use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

# initialize network and show summary
model = ZeroNet(device).train()
summary(model, (1, SIZE, SIZE), device=str(device))  # input_size = (1, SIZE, SIZE)

# initialize optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropySumLoss(device)

# training set
train_dataset = BengaliDataset(train_images, train_labels,
                               augment=False, balance=False)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS,
                          collate_fn=_new_default_collate, pin_memory=True)

# validation set
val_dataset = BengaliDataset(val_images, val_labels,
                             augment=False, balance=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=NUM_WORKERS,
                        collate_fn=_new_default_collate, pin_memory=True)

# TensorBoard writers
current_time = datetime.now().strftime("%Y-%m-%d/%H'%M'%S")
train_writer = SummaryWriter(f'runs/{current_time}/train')
# trace the model graph with the images of one training batch; the built-in
# next() works for every iterator, unlike the Python 2 style .next() method
train_writer.add_graph(model, next(iter(train_loader))[0])
val_writer = SummaryWriter(f'runs/{current_time}/validation')

Device: cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 10, 126, 126]             100
            Conv2d-2           [-1, 20, 40, 40]           1,820
         Dropout2d-3           [-1, 20, 40, 40]               0
            Linear-4                  [-1, 256]         865,536
            Linear-5                  [-1, 168]          43,176
            Linear-6                   [-1, 11]           2,827
            Linear-7                    [-1, 7]           1,799
Total params: 915,258
Trainable params: 915,258
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.06
Forward/backward pass size (MB): 1.70
Params size (MB): 3.49
Estimated Total Size (MB): 5.26
----------------------------------------------------------------




OSError: [Errno 22] Invalid argument

In [26]:
# train for 50 epochs with a validation pass after every epoch
train(
    model=model,
    train_dataset=train_dataset,
    train_loader=train_loader,
    train_writer=train_writer,
    val_loader=val_loader,
    val_writer=val_writer,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=50,
)

# store the learned parameters (not the whole model object) at PATH
torch.save(obj=model.state_dict(), f=PATH)

HBox(children=(IntProgress(value=0, description='Epoch 1/50', max=1256, style=ProgressStyle(description_width=…

KeyboardInterrupt: ignored