In [1]:
# GPU: 32*40 images in 17.1s = 74.9 images/s
# CPU: 32*8 images in 40s = 6.4 images/s

## 0. Prepare ResNet50 from PyTorch

In [2]:
import os
# Expose only the GPU with index 1 to this process.
# NOTE(review): presumably this has to be set before any CUDA-using library
# initialises for it to take effect - keep it at the top of the notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # One GPU
import sys
import numpy as np
# Record the platform and library versions the timings below were taken on
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3


In [3]:
!cat /proc/cpuinfo | grep processor | wc -l

12


In [4]:
!nvidia-smi --query-gpu=gpu_name --format=csv

name
Tesla K80
Tesla K80


In [5]:
# Number of samples per mini-batch (used for data generation and inference)
BATCH_SIZE = 32
# Width of each per-sample feature vector stored by predict_fn
RESNET_FEATURES = 2048
# Number of mini-batches generated for the GPU timing run
BATCHES_GPU = 40
# Number of mini-batches generated for the CPU timing run
BATCHES_CPU = 8

In [6]:
def give_fake_data(batches, batch_size=None):
    """ Create reproducible fake image data to run inference on

    Args:
        batches: Number of mini-batches worth of samples to generate.
        batch_size: Samples per mini-batch. When omitted, the notebook-level
            BATCH_SIZE constant is used.

    Returns:
        Tuple (channels_last, channels_first) of float32 arrays with shapes
        (N, 224, 224, 3) and (N, 3, 224, 224), where N = batch_size * batches.
        The channels-first array is a transposed view of the channels-last
        array, so no second copy of the data is made.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    # Fixed seed so every call returns the same values
    np.random.seed(0)
    dta = np.random.rand(batch_size * batches, 224, 224, 3).astype(np.float32)
    # NHWC -> NCHW. swapaxes(1, 3) would produce NCWH instead, i.e. each image
    # would be spatially transposed relative to the channels-last array.
    return dta, np.transpose(dta, (0, 3, 1, 2))

In [7]:
def yield_mb(X, batchsize):
    """ Yield (index, slice) pairs covering X in complete mini-batches only.

    Any trailing samples that do not fill a whole batch are not yielded.
    """
    n_full = len(X) // batchsize
    starts = range(0, n_full * batchsize, batchsize)
    for batch_no, start in enumerate(starts):
        yield batch_no, X[start:start + batchsize]

In [8]:
# Create BATCHES_CPU batches of fake data for the CPU run; the two arrays hold
# the same values in channels-last and channels-first layout respectively
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_CPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(256, 224, 224, 3) (256, 3, 224, 224)


## 1. Run in Caffe2 (CPU)

In [9]:
import onnx
import onnx_caffe2.backend
#from caffe2.python.predictor import mobile_exporter

In [10]:
# Load the ONNX ModelProto object from disk; model_caffe2 is a standard Python
# protobuf object (predict_fn later reads its graph.input[0].name)
model_caffe2 = onnx.load("resnet50_feat.onnx")

In [11]:
# Build a runnable backend from the loaded model. No device is passed here;
# presumably the library default (CPU) is used, matching this section - confirm.
prepared_backend = onnx_caffe2.backend.prepare(model_caffe2)

In [12]:
def predict_fn(back, classifier, data, batchsize):
    """ Run `data` through `back` in mini-batches and collect the features.

    `classifier` supplies the name of the graph's first input, under which each
    batch is fed. Returns a float32 array of shape (len(data), RESNET_FEATURES);
    rows belonging to an incomplete trailing batch are left as zeros because
    yield_mb only produces complete batches.
    """
    features = np.zeros((len(data), RESNET_FEATURES), np.float32)
    for batch_no, batch in yield_mb(data, batchsize):
        feed = {classifier.graph.input[0].name: batch}
        outputs = back.run(feed)
        first = batch_no * batchsize
        # Drop singleton dimensions so the result fits the 2-D feature slice
        features[first:first + batchsize] = outputs[0].squeeze()
    return features

In [13]:
%%time
# CPU: 40s wall time for BATCHES_CPU (8) batches of BATCH_SIZE (32) samples
features_caffe2 = predict_fn(prepared_backend, model_caffe2, fake_input_data_cf, BATCH_SIZE)

CPU times: user 7min 26s, sys: 7.36 s, total: 7min 33s
Wall time: 40 s


## 2. Run in Caffe2 (GPU)

In [14]:
# Prepare a second backend for the same model, targeting a CUDA device.
# NOTE(review): "CUDA:0" is presumably the first *visible* device; with
# CUDA_VISIBLE_DEVICES='1' set at the top, that would be physical GPU 1 - confirm.
gpu_backend = onnx_caffe2.backend.prepare(model_caffe2, device="CUDA:0")

CUDA operators do not support 64-bit doubles, please use arr.astype(np.float32) or np.int32 for ints. Blob: 0 type: float64


In [15]:
# Create BATCHES_GPU batches of fake data for the GPU run; this rebinds the
# names used for the CPU run above to the larger arrays
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_GPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(1280, 224, 224, 3) (1280, 3, 224, 224)


In [16]:
%%time
# GPU: 17.1s wall time for BATCHES_GPU (40) batches of BATCH_SIZE (32) samples
features_caffe2 = predict_fn(gpu_backend, model_caffe2, fake_input_data_cf, BATCH_SIZE)

CPU times: user 14 s, sys: 2.98 s, total: 16.9 s
Wall time: 17.1 s
