In [1]:
from caffe2.python import core, workspace, model_helper, net_drawer, memonger, brew, optimizer
from caffe2.python import data_parallel_model as dpm
from caffe2.python.models import alexnet
from caffe2.proto import caffe2_pb2

import numpy as np
import time
import os
from IPython import display
    
# Initialise Caffe2 once for this kernel, passing the log level as a
# command-line style flag.
workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])

# Locations of the training and testing databases.
# NOTE: `current_folder` is not used in the cells shown here; it is kept
# because later cells may rely on it.
current_folder = os.path.join(os.path.expanduser('~'), 'caffe2_notebooks')
data_folder = "/Data"

# Train/test data.
# The original author also left a small alternative configuration for quick
# experiments: data_folder "/data" with databases "imagenet_cars_boats_train"
# (count 1280) and "imagenet_cars_boats_val" (count 96).
train_data_db = os.path.join(data_folder, "train_db")
train_data_db_type = "lmdb"
train_data_count = 1281167
test_data_db = os.path.join(data_folder, "val_db")
test_data_db_type = "lmdb"
test_data_count = 50000

# Make the data folder if it doesn't exist
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
else:
    print("Data folder found at {}".format(data_folder))

# Check that the databases themselves are present. Without this a missing
# database only shows up later, when the reader is created or first used.
for db_path in (train_data_db, test_data_db):
    if not os.path.exists(db_path):
        print("WARNING: database not found at {}".format(db_path))

Ignoring @/caffe2/caffe2/contrib/nccl:nccl_ops as it is not a valid file.
Ignoring @/caffe2/caffe2/contrib/gloo:gloo_ops as it is not a valid file.
Ignoring @/caffe2/caffe2/contrib/gloo:gloo_ops_gpu as it is not a valid file.
Data folder found at /Data


In [2]:
# Configure how you want to train the model and with how many GPUs.
# `gpus` lists the device ids to train on (six devices here). Shrink or extend
# the list to match your machine, e.g. [0, 1] or [0, 1, 2, ..., n].
gpus = [0, 1, 2, 3, 4, 5]

# Batch size of 128 sums up to roughly 5GB of memory per device
batch_per_device = 128
num_gpus = len(gpus)
# Parenthesised form prints the same text on Python 2 and Python 3.
print(num_gpus)
# Number of images consumed by one training iteration across all devices.
total_batch_size = batch_per_device * num_gpus

# Number of output classes. The original note described a two-label
# (car or boat) model; this configuration uses 1000 labels.
num_labels = 1000

# Initial learning rate (scale with total batch size)
base_learning_rate = 0.01

# Step size for a stepped learning-rate schedule (would only influence the
# learning rate after 10 epochs). Left disabled, matching the "fixed" policy
# used by add_optimizer:
#stepsize = int(10 * train_data_count / total_batch_size)

# Weight decay (L2 regularization)
weight_decay = 1e-4

6


In [3]:

# Start from a clean workspace so re-running this cell does not pick up
# blobs or nets left over from a previous run.
workspace.ResetWorkspace()

# 1. Build the ModelHelper that will hold the training network.
#    The arg scope supplies default operator arguments: NCHW layout and cuDNN
#    with exhaustive algorithm search. (The original also carried a disabled
#    'ws_nbytes_limit' entry that depended on an `args` object which does not
#    exist in this notebook.)
train_arg_scope = dict(
    order='NCHW',
    use_cudnn=True,
    cudnn_exhaustive_search=True,
)
# The name is arbitrary; it is how the network is referenced in the workspace.
train_model = model_helper.ModelHelper(name="train", arg_scope=train_arg_scope)

# 2. Create the database reader.
#    A single reader named "train_reader" is shared between all GPUs; per the
#    original notes, each GPU's image input op pulls its own batch from it.
#    It uses the database path and type configured in the first cell.
reader = train_model.CreateDB(
    "train_reader", db=train_data_db, db_type=train_data_db_type
)

In [4]:
def add_image_input_ops(model):
    """Add the image reading and preprocessing ops to `model`.

    Intended to be passed to `dpm.Parallelize` as `input_builder_fun`.
    Reads batches of `batch_per_device` images through the notebook-level
    `reader` and writes them to the blobs "data" and "label".

    Args:
        model: the ModelHelper being built. Its `_device_type` decides
            whether the image transform runs on the GPU.

    Returns:
        None; the ops are added to `model` as a side effect.
    """
    # Use the GPU-side transform only when the model is built for CUDA devices.
    on_cuda = model._device_type == caffe2_pb2.CUDA
    data, _ = brew.image_input(
        model,
        reader,
        ["data", "label"],
        batch_size=batch_per_device,
        use_gpu_transform=on_cuda,
        use_caffe_datum=True,
        # mean / std: normalisation constants handed to the ImageInput op
        # (presumably applied as (pixel - mean) / std -- confirm against the
        # operator documentation)
        mean=128.,
        std=128.,
        # scale to rescale each image to a common size
        scale=256,
        # crop each image to a square of exact dimensions
        crop=227,
        # not running in test mode
        is_test=False,
        # mirroring of the images will occur randomly
        mirror=1
    )
    # prevent back-propagation into the input blob: optional performance
    # improvement; may not be observable at small scale
    model.net.StopGradient(data, data)

In [5]:
def create_alexnet_model_ops(model, loss_scale=1.0):
    """Add the AlexNet forward pass, loss and accuracy ops to `model`.

    Intended to be passed to `dpm.Parallelize` as `forward_pass_builder_fun`.
    Reads the "data" and "label" blobs and uses the notebook-level
    `num_labels` for the size of the classifier output.

    Args:
        model: the ModelHelper to add operators to.
        loss_scale: factor applied to the loss before it is returned.

    Returns:
        A one-element list containing the scaled loss blob.
    """
    # Network body only: create_alexnet is asked not to attach its own loss.
    prediction = alexnet.create_alexnet(
        model,
        "data",
        num_input_channels=3,
        num_labels=num_labels,
        no_bias=True,
        no_loss=True,
    )
    # Softmax and loss are attached here, writing blobs 'softmax' and 'loss'.
    probabilities, raw_loss = model.SoftmaxWithLoss(
        [prediction, 'label'], ['softmax', 'loss']
    )
    scaled_loss = model.Scale(raw_loss, scale=loss_scale)
    # Accuracy is recorded in the "accuracy" blob for monitoring.
    brew.accuracy(model, [probabilities, "label"], "accuracy")
    return [scaled_loss]

In [6]:
def add_optimizer(model):
    """Add weight decay and a multi-precision SGD optimizer to `model`.

    Intended to be passed to `dpm.Parallelize` as `optimizer_builder_fun`.
    Uses the notebook-level `weight_decay` and `base_learning_rate`.

    Args:
        model: the ModelHelper to add the optimizer operators to.

    Returns:
        The optimizer object returned by `build_multi_precision_sgd`.
    """
    optimizer.add_weight_decay(model, weight_decay)
    # The learning rate policy is "fixed"; the original author left a stepped
    # schedule (stepsize) disabled because it depended on an `args` object
    # that does not exist in this notebook.
    sgd_settings = dict(
        momentum=0.9,
        nesterov=1,
        policy="fixed",
        gamma=0.1,
    )
    return optimizer.build_multi_precision_sgd(
        model, base_learning_rate, **sgd_settings
    )

In [7]:
def add_post_sync_ops(model):
    """Add ops applied after initial parameter sync.

    For every optimization parameter that carries a `blob_copy`, a HalfToFloat
    op is added to the parameter init net, converting the parameter blob into
    its FLOAT copy. Parameters without a copy are left alone.

    Args:
        model: the ModelHelper whose `param_init_net` receives the ops.
    """
    init_net = model.param_init_net
    for info in model.GetOptimizationParamInfo(model.GetParams()):
        if info.blob_copy is None:
            continue
        float_copy = info.blob_copy[core.DataType.FLOAT]
        init_net.HalfToFloat(info.blob, float_copy)

In [None]:
# Replicate the model across the configured GPUs.
# This relies on the builder functions defined in the cells above
# (add_image_input_ops, create_alexnet_model_ops, add_optimizer and
# add_post_sync_ops) and on `train_model` / `gpus` from the earlier cells.
dpm.Parallelize(
    train_model,
    devices=gpus,
    input_builder_fun=add_image_input_ops,
    forward_pass_builder_fun=create_alexnet_model_ops,
    optimizer_builder_fun=add_optimizer,
    post_sync_builder_fun=add_post_sync_ops,
    optimize_gradient_memory=True,
)

# Run the parameter initialisation net once, then instantiate the training
# net in the workspace so the training loop can run it by name.
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net)

No handlers could be found for logger "data_parallel_model"


True

In [None]:
# Loop through epochs, running enough batches per epoch to cover the dataset.
# Usually you would want to run a lot more epochs to increase your model's accuracy
num_epochs = 1
# Print a progress line every `iter_interval` iterations.
iter_interval = 40

# Split up the images evenly: total images / batch size. This does not change
# between epochs, so it is computed once; it also stays defined for the
# summary below when num_epochs is 0. Images that do not fill a whole batch
# are not visited.
num_iters = train_data_count // total_batch_size

# Progress line template; the field widths are sized so the counters line up.
progress_fmt = (
    "Finished iteration {:>" + str(len(str(num_iters))) + "}/{}" +
    " (epoch {:>" + str(len(str(num_epochs))) + "}/{})" +
    " ({:.2f} images/sec)")

# The net was created in the workspace earlier and is run by name.
net_name = train_model.net.Proto().name

T1 = time.time()
for epoch in range(num_epochs):
    for iteration in range(num_iters):
        # Stopwatch start!
        t1 = time.time()
        # Run this iteration!
        workspace.RunNet(net_name)
        t2 = time.time()
        dt = t2 - t1

        # Stopwatch stopped! How'd we do?
        if iteration % iter_interval == 0:
            print(progress_fmt.format(
                iteration + 1, num_iters, epoch + 1, num_epochs,
                total_batch_size / dt))
T2 = time.time()
DT = T2 - T1
# Overall throughput counts the images of every epoch, not just one.
# The first iteration is included in this figure; in the recorded output it
# is far slower than the rest, which pulls the number down.
print (("Overall GPU performance ({:.2f} images/sec)").
      format(num_epochs * num_iters * total_batch_size / DT))

Finished iteration    1/1668 (epoch 1/1) (112.54 images/sec)
Finished iteration   41/1668 (epoch 1/1) (2638.11 images/sec)
Finished iteration   81/1668 (epoch 1/1) (2832.14 images/sec)
Finished iteration  121/1668 (epoch 1/1) (2691.88 images/sec)
Finished iteration  161/1668 (epoch 1/1) (2584.45 images/sec)
Finished iteration  201/1668 (epoch 1/1) (2814.55 images/sec)
Finished iteration  241/1668 (epoch 1/1) (2554.95 images/sec)
Finished iteration  281/1668 (epoch 1/1) (2790.05 images/sec)
Finished iteration  321/1668 (epoch 1/1) (2806.91 images/sec)
Finished iteration  361/1668 (epoch 1/1) (3031.02 images/sec)
Finished iteration  401/1668 (epoch 1/1) (2725.98 images/sec)
Finished iteration  441/1668 (epoch 1/1) (2709.37 images/sec)
Finished iteration  481/1668 (epoch 1/1) (2811.56 images/sec)
Finished iteration  521/1668 (epoch 1/1) (2728.10 images/sec)
Finished iteration  561/1668 (epoch 1/1) (2817.64 images/sec)
Finished iteration  601/1668 (epoch 1/1) (2689.59 images/sec)
Finished 