In [1]:
import os
import argparse
import random
import time
import numpy as np
# import moxing as mox

from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import Callback, LossMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init
import mindspore.dataset.engine as de

from dataset import create_dataset
from resnet import resnet50

# Cluster size; PerformanceCallback reads this module-level value when it
# reports the samples-per-second figure.
device_num = 1

# Pin the seed of every random source configured in this notebook
# (MindSpore dataset engine, NumPy, Python's `random`).
de.config.set_seed(1)
np.random.seed(1)
random.seed(1)

MindSpore version 1.1.1 and "topi" wheel package version 0.6.0 does not match, reference to the match info on: https://www.mindspore.cn/install




In [2]:
class PerformanceCallback(Callback):
    """
    Training performance callback.

    Measures the time between ``step_begin`` and ``step_end`` and prints the
    elapsed time, the number of training steps covered since the previous
    report, the average time per step and the sample throughput.

    Args:
        batch_size (int): Number of samples in one batch (i.e. per step).
        device_num (int, optional): Number of devices used to scale the
            reported throughput. Default: None, in which case the
            module-level ``device_num`` is read at report time.
    """
    def __init__(self, batch_size, device_num=None):
        super(PerformanceCallback, self).__init__()
        self.batch_size = batch_size
        self.device_num = device_num
        # cur_step_num observed at the previous report; used to derive how
        # many steps the current report covers.
        self.last_step = 0
        self.epoch_begin_time = 0

    def step_begin(self, run_context):
        """Remember when the measured interval started."""
        # perf_counter is monotonic, so the measured interval cannot become
        # negative when the system clock is adjusted.
        self.epoch_begin_time = time.perf_counter()

    def step_end(self, run_context):
        """Print timing and throughput for the interval that just ended."""
        params = run_context.original_args()
        cost_time = time.perf_counter() - self.epoch_begin_time
        train_steps = params.cur_step_num - self.last_step

        # Fall back to the module-level device_num when none was given, which
        # is what this callback always did before the argument existed.
        cluster_size = device_num if self.device_num is None else self.device_num

        # Report NaN instead of raising ZeroDivisionError when no step was
        # counted or the timer did not advance.
        undefined = float('nan')
        step_time_ms = 1000*cost_time/train_steps if train_steps else undefined
        samples_per_sec = (cluster_size*train_steps*self.batch_size/cost_time
                           if cost_time > 0 else undefined)

        print(f'epoch {params.cur_epoch_num} cost time = {cost_time}, train step num: {train_steps}, '
              f'one step time: {step_time_ms} ms, '
              f'train samples per second of cluster: {samples_per_sec:.1f}\n')
        self.last_step = params.cur_step_num


In [3]:
def get_lr(global_step,
           total_epochs,
           steps_per_epoch,
           lr_init=0.01,
           lr_max=0.1,
           warmup_epochs=5):
    """
    Build the per-step learning rate schedule.

    During the warm-up steps the rate grows linearly from `lr_init` towards
    `lr_max`. From the first post-warm-up step on it follows
    ``lr_max * (1 - progress) ** 2``, where ``progress`` is the fraction of
    the post-warm-up steps already done; negative results are clamped to 0.

    Args:
        global_step (int): Initial step of training; the schedule is returned
            starting from this index.
        total_epochs (int): Total epoch of training.
        steps_per_epoch (float): Steps of one epoch.
        lr_init (float): Initial learning rate. Default: 0.01.
        lr_max (float): Maximum learning rate. Default: 0.1.
        warmup_epochs (int): The number of warming up epochs. Default: 5.

    Returns:
        np.array, float32 learning rate array.
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    # Linear increment applied at every warm-up step.
    if warmup_steps == 0:
        slope = 0
    else:
        slope = (float(lr_max) - float(lr_init)) / float(warmup_steps)

    # Work in float64 and cast once at the end.
    step_idx = np.arange(int(total_steps), dtype=np.float64)
    schedule = np.empty(step_idx.shape, dtype=np.float64)

    warming = step_idx < warmup_steps
    if warming.any():
        schedule[warming] = float(lr_init) + slope * step_idx[warming]

    cooling = ~warming
    if cooling.any():
        span = float(total_steps) - float(warmup_steps)
        remaining = 1.0 - (step_idx[cooling] - float(warmup_steps)) / span
        decayed = float(lr_max) * remaining * remaining
        schedule[cooling] = np.where(decayed < 0.0, 0.0, decayed)

    return schedule.astype(np.float32)[global_step:]

In [4]:
def resnet50_train(device_id=0,
                   device_num=1,
                   epoch_size=90,
                   batch_size=32,
                   class_num=10,
                   loss_scale_num=1024,
                   local_data_path='/home/share/dataset/cifar-10-batches-bin/'):
    """
    Train ResNet-50 and evaluate it once training has finished.

    Called without arguments, the function uses the settings that were
    previously hard-coded in its body.

    Args:
        device_id (int): Ascend device to run on. Default: 0.
        device_num (int): Number of devices. Values above 1 enable data
            parallel training and make the dataset be read from the
            `device_id` sub-directory of `local_data_path`. Default: 1.
        epoch_size (int): Number of training epochs. Default: 90.
        batch_size (int): Batch size given to `create_dataset`. Default: 32.
        class_num (int): Number of output classes of the network. Default: 10.
        loss_scale_num (int): Fixed loss scale, shared by the optimizer and
            the loss scale manager. Default: 1024.
        local_data_path (str): Directory holding the CIFAR-10 binary files.
    """
    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id)

    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    # Nothing is copied here: the data must already be present under
    # local_data_path (the ModelArts `mox.file.copy_parallel` step is disabled).
    print('Download data.')

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # One learning rate per training step, covering the whole run.
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    # The optimizer and the loss scale manager are given the same fixed scale.
    # The second FixedLossScaleManager argument is `drop_overflow_update`
    # (see the MindSpore API reference).
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training:
    # the whole network except batchnorm is cast into float16 format.
    # 'keep_batchnorm_fp32 = False' means that batchnorm uses the float16 format as well.
    # Loss scaling is fixed (not dynamic), as configured by the manager above.
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    # NOTE: PerformanceCallback scales throughput by the module-level
    # `device_num`, not by this function's `device_num` argument.
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # Evaluate on a single device only.
    if device_num == 1 or device_id == 0:
        print('=================================Start run evaluation.=================================')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')

In [5]:
# Run the training and evaluation pipeline, then print a completion message.
resnet50_train()
print('ResNet50 training success!')

Download data.
Create train and evaluate dataset.
Create dataset success.




Start run training, total epoch: 20.
epoch 1 cost time = 129.5071153640747, train step num: 1562, one step time: 82.91108538032952 ms, train samples per second of cluster: 386.0

epoch: 1 step: 1562, loss is 1.4099786
epoch 2 cost time = 26.813992261886597, train step num: 1562, one step time: 17.166448311066965 ms, train samples per second of cluster: 1864.1

epoch: 2 step: 1562, loss is 1.7900116
epoch 3 cost time = 26.816537618637085, train step num: 1562, one step time: 17.168077860843205 ms, train samples per second of cluster: 1863.9

epoch: 3 step: 1562, loss is 1.2243073
epoch 4 cost time = 26.817380666732788, train step num: 1562, one step time: 17.168617584335973 ms, train samples per second of cluster: 1863.9

epoch: 4 step: 1562, loss is 1.0838339
epoch 5 cost time = 26.81809091567993, train step num: 1562, one step time: 17.16907228916769 ms, train samples per second of cluster: 1863.8

epoch: 5 step: 1562, loss is 0.62525547
epoch 6 cost time = 26.817250967025757, train s