In [0]:
# Pin the exact release so that re-running the notebook installs the same
# MXNet build (1.5.1.post0 is the version reported by this cell's output).
!pip install mxnet-cu100==1.5.1.post0

Collecting mxnet-cu100
[?25l  Downloading https://files.pythonhosted.org/packages/3d/84/d098e0607ee6207448b6af65315f5d45946b49e4f48160eade6cdd64ce4e/mxnet_cu100-1.5.1.post0-py2.py3-none-manylinux1_x86_64.whl (540.1MB)
[K     |████████████████████████████████| 540.1MB 30kB/s 
Installing collected packages: mxnet-cu100
Successfully installed mxnet-cu100-1.5.1.post0


In [11]:
import mxnet as mx
from mxnet import nd
# GPU context used by the timing demos below.
ctx = mx.gpu()

# Lazy (deferred) execution can improve program performance: operations are
# queued first and only run once their result is actually needed.

from time import time

start = time()
x = nd.random_uniform(shape=(2000, 2000), ctx=ctx)
y = nd.dot(x, x) # not actually executed yet; it runs when the result is needed

# Elapsed time after the operations have merely been queued.
print('workloads are queued: %f sec' %(time() - start))
# Printing y needs its values, so the queued work has to finish here.
print(y)
# Elapsed time once the result has really been produced.
print('workloads are finished: %f sec' %(time() - start))

workloads are queued: 0.010755 sec

[[479.4833  481.7204  499.30383 ... 481.99655 486.93176 495.87143]
 [479.94864 485.32785 495.4385  ... 495.37183 476.82202 498.1794 ]
 [492.63098 500.33438 507.08658 ... 494.17532 490.71448 500.85248]
 ...
 [492.67075 500.1156  508.92    ... 502.39157 489.79236 510.70807]
 [480.65863 490.68118 498.50598 ... 487.54398 489.11212 504.79608]
 [486.09225 496.91916 505.76697 ... 497.94516 486.3714  507.12802]]
<NDArray 2000x2000 @gpu(0)>
workloads are finished: 0.026173 sec


In [12]:
# To make the computation run immediately, block on the result like this:
start = time()
y = nd.dot(x, x)
y.wait_to_read()  # wait until y has actually been computed
time() - start

0.010281801223754883

In [14]:
# Alternatively, wait for every queued operation at once:
start = time()
y = nd.dot(x, x)
z = nd.dot(x, x)
nd.waitall()  # block until all pending computations (y and z here) are done
time() - start


[[479.4833  481.7204  499.30383 ... 481.99655 486.93176 495.87143]
 [479.94864 485.32785 495.4385  ... 495.37183 476.82202 498.1794 ]
 [492.63098 500.33438 507.08658 ... 494.17532 490.71448 500.85248]
 ...
 [492.67075 500.1156  508.92    ... 502.39157 489.79236 510.70807]
 [480.65863 490.68118 498.50598 ... 487.54398 489.11212 504.79608]
 [486.09225 496.91916 505.76697 ... 497.94516 486.3714  507.12802]]
<NDArray 2000x2000 @gpu(0)>

In [16]:
# Show the GPU model, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Thu Oct 31 03:37:19 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.50       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    58W / 149W |    342MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
from mxnet import nd
from mxnet import gluon

# Multiplier that shrinks the randomly drawn initial weights.
scale = .01

# Convolution layers: weight shape is (num_filter, in_channels, kernel_h, kernel_w).
w1, b1 = nd.random_normal(shape=(20, 1, 3, 3)) * scale, nd.zeros(shape=20)
w2, b2 = nd.random_normal(shape=(50, 20, 5, 5)) * scale, nd.zeros(shape=50)

# Fully connected layers: weight shape is (in_features, out_features).
w3, b3 = nd.random_normal(shape=(800, 128)) * scale, nd.zeros(shape=128)
w4, b4 = nd.random_normal(shape=(128, 10)) * scale, nd.zeros(shape=10)

# Flat list in the order lenet() indexes it: weight then bias, layer by layer.
params = [
  w1, b1,
  w2, b2,
  w3, b3,
  w4, b4,
]

In [0]:
def lenet(x, params):
  """Run the LeNet-style forward pass and return the raw class scores.

  Args:
    x: input batch fed to the first convolution.
    params: sequence of eight arrays indexed as
      [conv1 weight, conv1 bias, conv2 weight, conv2 bias,
       fc1 weight, fc1 bias, fc2 weight, fc2 bias].

  Returns:
    The output of the last fully connected layer (no softmax applied).
  """
  def conv_relu_pool(inputs, weight, bias, kernel, num_filter):
    # Convolution -> ReLU -> 2x2 average pooling with stride 2.
    conv = nd.Convolution(data=inputs, weight=weight, bias=bias,
                          kernel=kernel, num_filter=num_filter)
    activated = nd.relu(conv)
    return nd.Pooling(data=activated, pool_type='avg', kernel=(2, 2), stride=(2, 2))

  def dense(inputs, weight, bias):
    # Fully connected layer: matrix product plus bias.
    return nd.dot(inputs, weight) + bias

  features = conv_relu_pool(x, params[0], params[1], (3, 3), 20)
  features = conv_relu_pool(features, params[2], params[3], (5, 5), 50)
  # Collapse each sample's feature maps into one vector for the dense layers.
  features = nd.flatten(features)

  hidden = nd.relu(dense(features, params[4], params[5]))
  return dense(hidden, params[6], params[7])

In [21]:
# Softmax cross-entropy loss; train_batch() applies it to the lenet() output and the labels.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

from mxnet import gpu

# Distribute the parameters to a GPU.
def get_params(params, ctx):
  """Copy every parameter to `ctx` and attach a gradient buffer to each copy.

  Args:
    params: iterable of arrays providing `copyto(ctx)`.
    ctx: target device context.

  Returns:
    A new list with the copies, in the same order as `params`.
  """
  replicas = []
  for source in params:
    replicas.append(source.copyto(ctx))
  # Gradients are attached only after all copies have been created.
  for replica in replicas:
    replica.attach_grad()
  return replicas


# Copy the parameters to gpu(0), then show the first bias and its attached gradient buffer.
new_params = get_params(params, gpu(0))
print('b1 weight=', new_params[1])
print('b1 grad=', new_params[1].grad)

b1 weight= 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 20 @gpu(0)>
b1 grad= 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 20 @gpu(0)>


In [23]:
# Given data spread across several GPUs, sum it all up and then broadcast the
# result back to every GPU.

def allreduce(data):
  """Sum all arrays into `data[0]`, then copy that sum into every other array.

  Args:
    data: list of arrays, one per device; all of them are updated in place.
  """
  if len(data) < 2:
    # Nothing to combine or broadcast.
    return
  root, peers = data[0], data[1:]
  # Reduce: bring each peer onto the root's device and accumulate it there.
  for peer in peers:
    root[:] += peer.copyto(root.context)
  # Broadcast: overwrite each peer with the accumulated total.
  for peer in peers:
    root.copyto(peer)


# range(1) builds a single array on gpu(0), so allreduce has nothing to add or
# broadcast here; with more GPUs, array i would hold the value i+1.
data = [nd.ones((1, 2), ctx=gpu(i))*(i+1) for i in range(1)]
allreduce(data)
print(data)

[
[[1. 1.]]
<NDArray 1x2 @gpu(0)>]


In [26]:
def split_and_load(data, ctx):
  """Split `data` along its first axis and place one shard on each device.

  Every row ends up in exactly one shard. When the number of rows is not
  divisible by the number of devices, the first `n % k` shards receive one
  extra row, so trailing rows are no longer dropped. For evenly divisible
  input the result is the same as plain `n // k` slicing.

  Args:
    data: array-like with `.shape`, slicing and `as_in_context(device)`.
    ctx: non-empty list of device contexts.

  Returns:
    A list with one shard per entry of `ctx`, in the same order. Shards may
    be empty when there are fewer rows than devices.

  Raises:
    ValueError: if `ctx` is empty.
  """
  n, k = data.shape[0], len(ctx)
  if k == 0:
    raise ValueError('ctx must contain at least one device')
  m, extra = divmod(n, k)
  shards = []
  begin = 0
  for i, device in enumerate(ctx):
    # The first `extra` devices absorb the remainder, one row each.
    end = begin + m + (1 if i < extra else 0)
    shards.append(data[begin:end].as_in_context(device))
    begin = end
  return shards


# Build a small 4x4 batch and load it onto the single available GPU.
batch = nd.arange(16).reshape((4, 4))
# NOTE: this rebinds the global `ctx` to a list of devices.
ctx = [gpu(0)]
splitted = split_and_load(batch, ctx)
print('input: ', batch)
print('load into', ctx)
print('output: ', splitted)

input:  
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]]
<NDArray 4x4 @cpu(0)>
load into [gpu(0)]
output:  [
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]]
<NDArray 4x4 @gpu(0)>]


In [0]:
from mxnet import autograd

import utils


def train_batch(data, label, params, ctx, lr):
  """Run one data-parallel training step on a single mini-batch.

  Args:
    data: input batch; split along its first axis across the devices.
    label: labels matching `data`; split the same way.
    params: one parameter list per device (params[d][i] is parameter i on
      device d).
    ctx: list of device contexts, parallel to `params`.
    lr: learning rate; divided by the batch size before the update.
  """
  data_shards = split_and_load(data, ctx)
  label_shards = split_and_load(label, ctx)

  # Forward pass on every device, each with its own parameter copy.
  with autograd.record():
    losses = []
    for shard_x, shard_y, device_params in zip(data_shards, label_shards, params):
      losses.append(loss(lenet(shard_x, device_params), shard_y))

  # Backward pass per device.
  for device_loss in losses:
    device_loss.backward()

  # Sum the gradient of each parameter over all devices and share the total.
  num_devices = len(ctx)
  for index in range(len(params[0])):
    grads = [params[device][index].grad for device in range(num_devices)]
    allreduce(grads)

  # utils.SGD is a project helper; presumably it updates the parameters in
  # place with the given step size (verify against utils).
  step_size = lr/data.shape[0]
  for device_params in params:
    utils.SGD(device_params, step_size)

In [0]:
from time import time

def train(num_gpus, batch_size, lr, num_epochs=5):
  """Train LeNet on Fashion-MNIST with data parallelism and report progress.

  After each epoch, prints the time spent training and the validation
  accuracy measured with the parameter copy on the first device.

  Args:
    num_gpus: number of GPUs to train on (gpu(0) .. gpu(num_gpus - 1)).
    batch_size: mini-batch size passed to the data loader.
    lr: learning rate forwarded to train_batch().
    num_epochs: number of passes over the training data (default 5, the
      value that used to be hard-coded).

  Raises:
    ValueError: if `num_gpus` is smaller than 1.
  """
  if num_gpus < 1:
    # Fail before loading data instead of deep inside the first batch.
    raise ValueError('num_gpus must be at least 1')

  train_data, test_data = utils.load_data_fashion_mnist_new(batch_size=batch_size)
  ctx = [gpu(i) for i in range(num_gpus)]
  # One independent copy of the initial parameters per device.
  dev_params = [get_params(params, c) for c in ctx]

  def net(data):
    # Evaluation uses the parameter copy that lives on the first device.
    return lenet(data, dev_params[0])

  for epoch in range(num_epochs):
    start = time()
    for data, label in train_data:
      train_batch(data, label, dev_params, ctx, lr)
    # Wait for all queued work so the measured time covers the real computation.
    nd.waitall()
    print('Epoch %d, training time = %f sec' %(epoch, time() - start))
    test_acc = utils.evaluate_accuracy(test_data, net, ctx[0])
    print('     validatioin accuracy = %f' %(test_acc))

In [35]:
# Train on 1 GPU with batch size 256 and learning rate 0.3.
train(1, 256, 0.3)

Epoch 0, training time = 2.574747 sec
     validatioin accuracy = 0.100060
Epoch 1, training time = 2.458804 sec
     validatioin accuracy = 0.737179
Epoch 2, training time = 2.442271 sec
     validatioin accuracy = 0.790765
Epoch 3, training time = 2.441624 sec
     validatioin accuracy = 0.782953
Epoch 4, training time = 2.462935 sec
     validatioin accuracy = 0.825921
