Copyright © 2017-2021 ABBYY Production LLC

In [1]:
#@title
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cifar-10 neural net tutorial

This tutorial contains the following steps:

* Download CIFAR-10 dataset
* Prepare CIFAR-10 dataset
* Build the net for training
* Train the net on the dataset
* Prepare the net for inference
* Serialize the net
* Evaluate the net

## Download dataset

*Note*: This section doesn't contain any NeoML-specific code. It just downloads the dataset from the internet. If you are only reading this notebook and not running it, you may [skip](#Prepare-dataset) this section.

In [2]:
import os

def calc_md5(file_name):
    """Returns the hexadecimal md5 digest of an existing file

    The file is read in 8192-byte pieces, so it never has to be
    loaded into memory as a whole.
    """
    import hashlib
    digest = hashlib.md5()
    with open(file_name, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for piece in iter(lambda: stream.read(8192), b''):
            digest.update(piece)
    return digest.hexdigest()


# Download data
url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
file_name = url[url.rfind('/')+1:]  # Archive is saved under its remote name
ARCHIVE_SIZE = 170498071  # Expected size of the archive in bytes
ARCHIVE_MD5 = 'c58f30108f718f92721af3b95e74349a'  # Expected md5 of the archive

# Download when archive is missing or broken
# Checks go from cheap to expensive: md5 is calculated only for
# an existing file of the expected size
if (not os.path.isfile(file_name)) \
        or os.path.getsize(file_name) != ARCHIVE_SIZE \
        or calc_md5(file_name) != ARCHIVE_MD5:
    import requests
    # Stream the response to disk in chunks instead of keeping it in memory
    with requests.get(url, stream=True) as url_stream:
        url_stream.raise_for_status()  # Fail early on HTTP errors
        with open(file_name, 'wb') as file_out:
            for chunk in url_stream.iter_content(chunk_size=8192):
                file_out.write(chunk)

# Unpack data
import tarfile
# Context manager closes the archive even when extraction fails
with tarfile.open(file_name, 'r:gz') as tar:
    tar.extractall()

## Prepare dataset

In this section data is loaded from files into numpy arrays and pre-processed. Pre-processing includes the following:

* Data type conversion (NeoML works with 32-bit types for both integer and float data)
* Normalization
* Image format conversion (NeoML works with channel-last images)

In [3]:
import numpy as np

np.random.seed(666)


def load_batch_file(file_name):
    """Reads one pickled batch file and returns the unpickled object

    The file is unpickled with encoding='bytes', which is why the loaded
    dictionaries are indexed with bytes keys (e.g. b'data') in this notebook.

    NOTE: unpickling can execute arbitrary code, load trusted files only.
    """
    import pickle
    with open(file_name, 'rb') as batch_file:
        return pickle.load(batch_file, encoding='bytes')


def transform_data(X):
    """Normalizes and transposes data for NeoML

    X is expected to hold one flattened 3x32x32 (channel-first) image per row.
    Returns a float32 array of shape (n_images, 32, 32, 3).
    """
    n_images = X.shape[0]
    # Shift and scale: 0 maps to -0.5 and 255 maps to 0.5
    normalized = (X.astype(np.float32) - 127.5) / 255.
    channel_first = normalized.reshape((n_images, 3, 32, 32))
    # NeoML uses channel-last pack
    return channel_first.transpose((0, 2, 3, 1))


# Preparing data
# Train set is stored in 5 batch files (data_batch_1 .. data_batch_5)
batch_name = 'cifar-10-batches-py/data_batch_{0}'
train_data = [load_batch_file(batch_name.format(i)) for i in range(1, 6)]
# Join the batches into single arrays of images and labels
X_train = transform_data(
    np.concatenate([batch[b'data'] for batch in train_data], axis=0))
y_train = np.concatenate(
    [batch[b'labels'] for batch in train_data], axis=0).astype(np.int32)

# Test set is stored in a single file
test_data = load_batch_file('cifar-10-batches-py/test_batch')
X_test = transform_data(test_data[b'data'])
y_test = np.array(test_data[b'labels'], dtype=np.int32)

## Build the network

### Create math engine (choose device)

First of all we need to create a math engine. It's an entity responsible for computational operations and data allocation for neural networks. It determines the device used for neural network training and inference.

For faster training in this tutorial we'll use GPU.

*Note:* If NeoML fails to find a compatible GPU, it'll create a CPU math engine instead. You may check which math engine was created by looking at its `info` attribute.

In [4]:
import neoml

# Create a GPU math engine; 0 is presumably the GPU index (TODO confirm)
# To run on CPU instead, use neoml.MathEngine.CpuMathEngine()
math_engine = neoml.MathEngine.GpuMathEngine(0)
# `info` shows which device the created math engine actually uses
print('Device: ', math_engine.info)

Device:  CUDA: GeForce RTX 2060


### Build the DNN

First of all we need the `neoml.Dnn.Dnn` object which represents a neural network (a graph of layers). Every net needs a math engine to perform its operations and its math engine can't be changed after creation.

In [5]:
# Create an empty net; it stays bound to this math engine for its whole life
dnn = neoml.Dnn.Dnn(math_engine)

The data is fed to the network via a special `neoml.Dnn.Source` layer.

In [6]:
# Source layer for images; input blobs are passed to the net under the 'data' name
data = neoml.Dnn.Source(dnn, 'data')

The net in this tutorial consists of a few convolutional blocks. Here you can see how layers are connected to each other.

In [7]:
class ConvBlock:
    """Convolutional block: dropout -> conv -> batch_norm -> relu6

    The layers are available as the `dropout`, `conv` and `bn` attributes.
    `output` is the last layer of the block, connect the following layers to it.
    Every layer name is the block name followed by a layer-specific suffix.
    """
    def __init__(self, inputs, filter_count, name):
        dropout = neoml.Dnn.Dropout(
            inputs, rate=0.1, spatial=True, batchwise=True,
            name=name+'_dropout')
        # 3x3 convolution with stride 2 and padding 1
        conv = neoml.Dnn.Conv(
            dropout, filter_count=filter_count, filter_size=(3, 3),
            stride_size=(2, 2), padding_size=(1, 1), name=name+'_conv')
        batch_norm = neoml.Dnn.BatchNormalization(
            conv, channel_based=True, name=name+'_bn')
        # ReLU with the upper threshold of 6 (ReLU6)
        relu6 = neoml.Dnn.ReLU(batch_norm, threshold=6., name=name+'_relu6')

        self.dropout = dropout
        self.conv = conv
        self.bn = batch_norm
        self.output = relu6


# Add a few convolutional blocks
# Every block contains a stride-2 convolution, so it halves image height and
# width; trailing comments show (height, width) after the block for 32x32 input
# The first convolutional block takes the Source layer's data as input
block1 = ConvBlock(data, filter_count=16, name='block1')  # -> (16, 16)
# Each following block takes the output of the previous block as its input
block2 = ConvBlock(block1.output, filter_count=32, name='block2')  # -> (8, 8)
block3 = ConvBlock(block2.output, filter_count=64, name='block3')  # -> (4, 4)

Afterwards we'll use a fully-connected layer to generate logits (unnormalized scores, which softmax would turn into probabilities) over classes.

In [8]:
# Fully connected layer flattens its input automatically,
# so it can be connected directly to the last convolutional block
n_classes = 10  # Number of classes in CIFAR-10 dataset
# One output per class: this layer produces the logits
fc = neoml.Dnn.FullyConnected(block3.output, n_classes, name='fc')

In order to train the net we need to define a loss function. In NeoML this is done by adding one (or more) loss layers. In this tutorial we'll be optimizing cross-entropy loss.

*Note*: in case of multiple loss layers you may use `neoml.Dnn.Loss.loss_weight` properties to balance loss layers between each other.

In [9]:
# Before the loss layer itself we need a source layer for the correct labels
# Label blobs are passed to the net under the 'labels' name
labels = neoml.Dnn.Source(dnn, 'labels')
# Here you can see how to create a layer with multiple inputs:
# the inputs are passed as a tuple (network response first, correct labels second)
# Softmax is applied within cross-entropy (no need for an explicit softmax layer)
loss = neoml.Dnn.CrossEntropyLoss((fc, labels), name='loss')

We'll calculate accuracy by means of NeoML. To do this we'll need a special `neoml.Dnn.Accuracy` layer (and a `neoml.Dnn.Sink` layer for extracting the accuracy layer's output).

In [10]:
# Auxiliary layers used only for gathering statistics
# Accuracy layer takes the same inputs as the loss: (logits, correct labels)
accuracy = neoml.Dnn.Accuracy((fc, labels), name='accuracy')
# Accuracy layer writes its result to its output
# We need an additional sink layer to extract it
accuracy_sink = neoml.Dnn.Sink(accuracy, name='accuracy_sink')

### Create solver

A solver is an object responsible for weight optimization (based on gradient values). In this sample we'll use the `neoml.Dnn.AdaptiveGradient` solver (NeoML's implementation of [Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam)).

In [11]:
lr = 1e-3 # Learning rate

# Create solver and attach it to the net
# The solver is created on the same math engine as the net
# NOTE(review): decay rates presumably correspond to Adam's beta1/beta2 - confirm
dnn.solver = neoml.Dnn.AdaptiveGradient(math_engine, learning_rate=lr,
                                        l1=0., l2=0.,  # No regularization
                                        max_gradient_norm=1.,  # clip grad
                                        moment_decay_rate=0.9,
                                        second_moment_decay_rate=0.999)

## Train network

NeoML nets accept data only in the form of `neoml.Blob.Blob` objects.

Blobs are 7-dimensional arrays located in device memory. Each dimension has a specific purpose:

1. `BatchLength` - temporal axis (used in recurrent layers)
2. `BatchWidth` - classic batch
3. `ListSize` - list axis, used when objects are related to the same entity, but without ordering (unlike `BatchLength`)
4. `Height` - height of the image
5. `Width` - width of the image
6. `Depth` - depth of the 3-dimensional image
7. `Channels` - channels of the image (also used when object is a 1-dimensional vector)

In our case we will use `ndarray` to split data into batches. Blobs will be created based on these batches right before feeding them to the net.

In [12]:
def make_blob(data, math_engine):
    """Wraps numpy data into neoml blob

    Supported inputs:
      * 4-dimensional array: batch of 2-dimensional multi-channel images
        in (batch, height, width, channels) order
      * 1-dimensional array: dense labels (batch of integers)

    The blob shape is given in the (BatchLength, BatchWidth, ListSize,
    Height, Width, Depth, Channels) order, unused dimensions are set to 1.

    Returns the blob created by neoml.Blob.asblob on the given math engine.
    Raises ValueError if data has an unsupported number of dimensions.
    """
    shape = data.shape
    if len(shape) == 4:  # images
        # Data is a batch of 2-dimensional multi-channel images
        # Wrap it into (BatchWidth, Height, Width, Channels) blob
        blob_shape = (1, shape[0], 1, shape[1], shape[2], 1, shape[3])
    elif len(shape) == 1:  # dense labels
        # Data contains dense labels (batch of integers)
        # Wrap it into blob of (BatchWidth,) shape
        blob_shape = (1, shape[0], 1, 1, 1, 1, 1)
    else:
        # Explicit exception instead of assert: asserts are stripped
        # under `python -O` and the function would silently return None
        raise ValueError('make_blob supports only 1- or 4-dimensional data, '
                         f'got data of shape {shape}')
    return neoml.Blob.asblob(math_engine, data, blob_shape)


def cifar10_array_iter(X, y, batch_size):
    """Slices numpy arrays into batches

    Yields (X_batch, y_batch) pairs, each made of at most batch_size
    consecutive elements; the last pair may be smaller.
    The number of elements is taken from y.

    Raises ValueError if batch_size isn't positive. As this is a generator,
    the check happens when the iteration starts.
    """
    if batch_size <= 0:
        # Non-positive batch size would never advance through the data
        raise ValueError(f'batch_size must be positive, got {batch_size}')
    for start in range(0, y.shape[0], batch_size):
        yield X[start : start+batch_size], y[start : start+batch_size]


def cifar10_blob_iter(X, y, batch_size, math_engine):
    """Yields batches of X and y as pairs of neoml blobs

    Batches are produced by cifar10_array_iter; each numpy batch is wrapped
    by make_blob on the given math engine right before it is yielded.
    """
    numpy_batches = cifar10_array_iter(X, y, batch_size)
    for images, targets in numpy_batches:
        image_blob = make_blob(images, math_engine)
        target_blob = make_blob(targets, math_engine)
        yield image_blob, target_blob

In order to train the net you should call `dnn.learn` with data as its argument.

In order to run the net without training you should call `dnn.run` with data as its argument.

The data argument is a `dict` where keys are `neoml.Dnn.Source` layers' names and values are corresponding `neoml.Blob.Blob`s.

In [13]:
def run_net(X, y, batch_size, dnn, is_train):
    """Runs dnn on given data

    Makes one pass over (X, y) in batches of batch_size. The net is trained
    (dnn.learn) when is_train is True and only run (dnn.run) otherwise.
    The net must contain the 'loss', 'accuracy' and 'accuracy_sink' layers.

    Returns (avg_loss, avg_acc, run_time) where
      * avg_loss is the per-sample loss averaged over the whole pass
      * avg_acc is the value read from the accuracy sink after the last batch
      * run_time is the duration of the pass in seconds
    """
    # Imported locally, so the function doesn't rely on `time`
    # being imported by some other cell before the call
    import time

    start = time.time()
    total_loss = 0.
    run_iter = dnn.learn if is_train else dnn.run
    math_engine = dnn.math_engine
    # Layers are fetched by name on every call: layer objects
    # are re-created when the net is loaded from a checkpoint
    layers = dnn.layers
    loss = layers['loss']
    accuracy = layers['accuracy']
    sink = layers['accuracy_sink']

    accuracy.reset = True  # Reset previous statistics
    # Iterate over batches
    for X_batch, y_batch in cifar10_blob_iter(X, y, batch_size, math_engine):
        # Run the network on the batch data
        run_iter({'data': X_batch, 'labels': y_batch})
        # last_loss is weighted by the batch size because
        # the last batch may be smaller than the others
        total_loss += loss.last_loss * y_batch.batch_width  # Update epoch loss
        accuracy.reset = False  # Don't reset statistics within one epoch

    avg_loss = total_loss / y.shape[0]
    avg_acc = sink.get_blob().asarray()[0]
    run_time = time.time() - start
    return avg_loss, avg_acc, run_time

For educational purposes we store and load progress during training.

The training progress can be stored by the `dnn.store_checkpoint` method.

You may resume training from the checkpoint by calling `dnn.load_checkpoint`.

**Important**: neoml's checkpoints contain all the information required for training (*including the net architecture*). That allows us to `load_checkpoint` into any `neoml.Dnn.Dnn` object without the need to re-create the architecture or the solver before loading. However, this approach leads to the creation of new layer/solver/blob objects during each `dnn.load_checkpoint`. If you had any previously created python variables which were pointing to the objects of the net *before loading* (like the `data`, `labels` and `loss` variables here), you must re-initialize them with the new ones.

In [14]:
import time

# Training params
batch_size = 50

n_epoch = 10
for epoch in range(n_epoch):
    # Train: one pass over the train set with weight updates
    avg_loss, acc, run_time = run_net(X_train, y_train, batch_size,
                                      dnn, is_train=True)
    print(f'Train #{epoch}\tLoss: {avg_loss:.4f}\t'
          f'Accuracy: {acc:.4f}\tTime: {run_time:.2f} sec')
    # Test: one pass over the test set without training
    avg_loss, acc, run_time = run_net(X_test, y_test, batch_size,
                                      dnn, is_train=False)
    print(f'Test  #{epoch}\tLoss: {avg_loss:.4f}\t'
          f'Accuracy: {acc:.4f}\tTime: {run_time:.2f} sec')
    # Epochs 1 and 5 below are arbitrary, chosen for demonstration only
    if epoch == 1:
        # If you want to save training progress you can do it via checkpoints
        # A checkpoint stores dnn weights and other training data
        # (solver stats, etc.)
        print('Creating checkpoint...')
        dnn.store_checkpoint('cifar10_sample.checkpoint')
    if epoch == 5:
        # If you want, you can resume training from the checkpoint
        # Here it discards the progress made since the checkpoint was stored
        print('Loading checkpoint... (this will roll dnn back to epoch #1)')
        dnn.load_checkpoint('cifar10_sample.checkpoint')
        # Be careful! dnn now contains newly created layer/solver objects
        # Python variables created before loading (layers, solver)
        # are still pointing to the objects of the old net!

Train #0	Loss: 1.5371	Accuracy: 0.4499	Time: 6.12 sec
Test  #0	Loss: 1.2951	Accuracy: 0.5341	Time: 0.71 sec
Train #1	Loss: 1.2288	Accuracy: 0.5630	Time: 5.87 sec
Test  #1	Loss: 1.1361	Accuracy: 0.5951	Time: 0.71 sec
Creating checkpoint...
Train #2	Loss: 1.1138	Accuracy: 0.6064	Time: 5.90 sec
Test  #2	Loss: 1.2091	Accuracy: 0.5761	Time: 0.70 sec
Train #3	Loss: 1.0385	Accuracy: 0.6321	Time: 5.91 sec
Test  #3	Loss: 1.0687	Accuracy: 0.6177	Time: 0.71 sec
Train #4	Loss: 0.9907	Accuracy: 0.6520	Time: 5.92 sec
Test  #4	Loss: 1.0566	Accuracy: 0.6293	Time: 0.70 sec
Train #5	Loss: 0.9494	Accuracy: 0.6647	Time: 5.93 sec
Test  #5	Loss: 1.0407	Accuracy: 0.6361	Time: 0.70 sec
Loading checkpoint... (this will roll dnn back to epoch #1)
Train #6	Loss: 1.1142	Accuracy: 0.6035	Time: 5.91 sec
Test  #6	Loss: 1.0858	Accuracy: 0.6211	Time: 0.70 sec
Train #7	Loss: 1.0434	Accuracy: 0.6333	Time: 5.95 sec
Test  #7	Loss: 1.1010	Accuracy: 0.6111	Time: 0.71 sec
Train #8	Loss: 0.9904	Accuracy: 0.6490	Time: 5.89 sec

## Prepare net for inference

First of all, before inference we must delete training-only layers. Every layer requiring correct labels should be deleted.

In [15]:
# Remove training-only layers: everything that requires correct labels
# (the labels source, the loss, the accuracy and the accuracy's sink)
# Layers are deleted by name, python variables may be stale after load_checkpoint
dnn.delete_layer('labels')
dnn.delete_layer('loss')
dnn.delete_layer('accuracy')
dnn.delete_layer('accuracy_sink')

But we need a sink layer to extract logits. If you need *exact* probabilities you should add a `neoml.Dnn.Softmax` layer before the sink. In our case we're interested only in the index of the maximum, that's why softmax can be omitted.

In [16]:
# Add a sink for dnn output (the logits produced by the 'fc' layer)
# The layer is taken from dnn.layers by name: the `fc` variable
# was created before load_checkpoint and points to the old net
sink = neoml.Dnn.Sink(dnn.layers['fc'], name='sink')

Also we can fuse batch normalization with previous convolution or fully connected layer. That'll reduce the number of operations during inference.

In [17]:
def fuse_batch_norm(dnn, block_name):
    """Merges batch normalization of the block into its convolution

    Reduces inference time, should be used after training.
    Works with the '<block_name>_conv', '<block_name>_bn' and
    '<block_name>_relu6' layers of dnn.
    Does nothing when the net has no '<block_name>_bn' layer
    (batch norm has already been fused).
    """
    bn_name = block_name + '_bn'
    if dnn.has_layer(bn_name):
        layers = dnn.layers
        batch_norm = layers[bn_name]
        conv = layers[block_name + '_conv']
        # Convolution absorbs the batch normalization...
        conv.apply_batch_normalization(batch_norm)
        # ...so the batch norm layer itself is no longer needed in the net
        dnn.delete_layer(bn_name)
        # The layer which followed the batch norm lost its input,
        # re-connect it directly to the convolution
        dnn.layers[block_name + '_relu6'].connect(conv)


# Fuse batchnorms into convolutions
# Every convolutional block of the net has its own batch norm to fuse
fuse_batch_norm(dnn, 'block1')
fuse_batch_norm(dnn, 'block2')
fuse_batch_norm(dnn, 'block3')

## Serialize trained net

Use `dnn.store` to save the trained net. This method stores all the info required for inference. Later you may load the saved net by `dnn.load` into any `neoml.Dnn.Dnn` object.

In [18]:
# Store trained net
# For inference it's better to use store/load than checkpoints:
# unlike checkpoints they don't save training-related data
# and, as a result, use less disk space
dnn.store('cifar10_sample.dnn')

In [19]:
# Load trained net
# It's not needed here (dnn already contains this net)
# and is done for tutorial purposes only
dnn.load('cifar10_sample.dnn')

Once again, `load` leads to the creation of new layer objects, so the previously created `sink` variable now points to a layer object which doesn't belong to the dnn. Let's fix it!

In [20]:
# Get the sink layer object of the loaded net by its name
sink = dnn.layers['sink']

## Evaluate trained net

We get the blob with the results of the last `dnn.run` by `sink.get_blob`. Then we convert it into a numpy array via `blob.asarray` and calculate accuracy by means of numpy.

We should get the same accuracy here as during the test of the last epoch!

In [21]:
# Evaluate inference
# inference_acc first accumulates the number of correct predictions
# and is turned into the accuracy after the loop
inference_acc = 0.
for X_b, y_b in cifar10_array_iter(X_test, y_test, batch_size):
    # The inference net has only one source layer left ('data')
    dnn.run({'data': make_blob(X_b, math_engine)})
    # Extract data from sink:
    # unnormalized probs of shape (batch_size, n_classes)
    logits = sink.get_blob().asarray()
    # Predicted class is the index of the maximum logit
    # Count the predictions which match the correct labels
    inference_acc += (np.argmax(logits, axis=1) == y_b).sum()
inference_acc /= len(X_test)

# This number must be equal to the test accuracy of the last epoch
print(f'Inference net test accuracy: {inference_acc:.4f}')

Inference net test accuracy: 0.6476
