In [1]:
"""
######################## train YOLOv3 example ########################
train YOLOv3 and get network model files(.ckpt) :
python train.py --image_dir /data --anno_path /data/coco/train_coco.txt --mindrecord_dir=/data/Mindrecord_train

If the mindrecord_dir is empty, it will generate the mindrecord file from image_dir and anno_path.
Note if mindrecord_dir isn't empty, it will use mindrecord_dir rather than image_dir and anno_path.
"""

import os
import argparse
import ast
from easydict import EasyDict as edict

import numpy as np
import mindspore.nn as nn
from mindspore import context, Tensor
from mindspore.communication.management import init
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common.initializer import initializer
from mindspore.common import set_seed

import sys
sys.path.insert(0,'./yolov3/yolov3_resnet18/')
print(sys.path)
from src.yolov3 import yolov3_resnet18, YoloWithLossCell, TrainingWrapper
from src.dataset import create_yolo_dataset, data_to_mindrecord_byte_image
from src.config import ConfigYOLOV3ResNet18

import moxing as mox

# Seed MindSpore's global random state with a fixed value (1).
set_seed(1)

def get_lr(learning_rate, start_step, global_step, decay_step, decay_rate, steps=False):
    """Build a per-step exponentially decayed learning-rate schedule.

    The rate at step ``i`` is ``learning_rate * decay_rate ** e`` where ``e``
    is ``i // decay_step`` (staircase decay) when ``steps`` is true and
    ``i / decay_step`` (continuous decay) otherwise.

    Args:
        learning_rate: Base learning rate used at step 0.
        start_step: Number of leading steps to drop from the schedule.
        global_step: Total number of steps the full schedule is computed for.
        decay_step: Number of steps per decay period.
        decay_rate: Multiplicative factor applied per decay period.
        steps: Use staircase (integer-division) decay when true.

    Returns:
        A ``numpy.float32`` array holding the rates for steps
        ``start_step`` .. ``global_step - 1``.
    """
    def _exponent(step):
        # Staircase decay only changes the exponent every `decay_step` steps.
        return step // decay_step if steps else step / decay_step

    schedule = [learning_rate * (decay_rate ** _exponent(step)) for step in range(global_step)]
    return np.array(schedule).astype(np.float32)[start_step:]


def init_net_param(network, init_value='ones'):
    """Re-initialize the trainable parameters of ``network`` in place.

    Parameters whose data is not a ``Tensor``, or whose name contains
    'beta', 'gamma' or 'bias', are left untouched.

    Args:
        network: Object exposing ``trainable_params()``.
        init_value: Initializer spec forwarded to ``initializer`` together
            with each parameter's shape and dtype.
    """
    skipped_markers = ('beta', 'gamma', 'bias')
    for param in network.trainable_params():
        if not isinstance(param.data, Tensor):
            continue
        if any(marker in param.name for marker in skipped_markers):
            continue
        param.set_data(initializer(init_value, param.data.shape, param.data.dtype))


def main(args_opt):
    """Optionally build the MindRecord dataset, then train YOLOv3-ResNet18.

    Args:
        args_opt: Attribute-style configuration. Fields read here:
            ``device_id``, ``distribute``, ``device_num``, ``mindrecord_dir``,
            ``mode``, ``image_dir`` (only when ``mode == "source"``),
            ``only_create_dataset``, ``loss_scale``, ``batch_size``,
            ``save_checkpoint_epochs``, ``ckpt_dir``, ``pre_trained``,
            ``pre_trained_epoch_size``, ``lr``, ``dataset_sink_mode`` and
            ``epoch_size``.

    Raises:
        ValueError: If ``args_opt.mode`` is neither "source" nor "mindspore".
        KeyError: If ``pre_trained`` is set but ``pre_trained_epoch_size``
            is not greater than 0.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # The mindrecord file is generated in args_opt.mindrecord_dir,
    # and the file name is yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(args_opt.mindrecord_dir):
        os.makedirs(args_opt.mindrecord_dir)

    prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix)
    if args_opt.mode == "source":
        # Convert the raw images under image_dir into a single mindrecord file.
        print("Create Mindrecord.")
        data_to_mindrecord_byte_image(args_opt.image_dir,
                                      args_opt.mindrecord_dir,
                                      prefix,
                                      1)
        print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
        all_files = os.listdir(args_opt.mindrecord_dir)
        print(all_files)
        # mox.file.copy_parallel(src_url=args_opt.mindrecord_dir, dst_url=os.path.join(args_opt.data_url, 'train'))
    elif args_opt.mode == "mindspore":
        # mindrecord_dir is expected to already contain the mindrecord file.
        pass
    else:
        # The failure here is an unsupported mode, so report the mode itself.
        raise ValueError('mode must be "source" or "mindspore", got {!r}'.format(args_opt.mode))

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating the MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        print(mindrecord_file)
        dataset = create_yolo_dataset(mindrecord_file,
                                      batch_size=args_opt.batch_size, device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print(dataset_size)
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint: save every `save_checkpoint_epochs` epochs into a per-rank
        # sub-directory of the configured checkpoint directory. The directory is
        # read from args_opt (not a module-level global) so main() is self-contained.
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3",
                                     directory=os.path.join(args_opt.ckpt_dir, 'ckpt_' + str(rank) + '/'),
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        # NOTE(review): the schedule spans total_epoch_size epochs (60, or 160 when
        # distributed) regardless of args_opt.epoch_size -- confirm how the optimizer
        # behaves once training runs past the end of the schedule.
        lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = args_opt.dataset_sink_mode
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)

['./yolov3/yolov3_resnet18/', '/home/ma-user/work', '', '/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages', '/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg', '/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg', '/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages', '/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe', '/home/ma-user/work', '/home/ma-user/miniconda3/envs/Mindspore-1.0.0-python3.7-aarch64/lib/python37.zip', '/home/ma-user/miniconda3/envs/Mindspore-1.0.0-python3.7-aarch64/lib/python3.7', '/home/ma-user/miniconda3/envs/Mindspore-1.0.0-python3.7-aarch64/lib/python3.7/lib-dynload', '/home/ma-user/miniconda3/envs/Mindspore-1.0.0-python3.7-aarch64/lib/python3.7/site-packages', '/home/ma-user/miniconda3/envs/Mindspore-1.0.0-python3.7-aarch64/lib/python3.7/site-packages/IPython/extensions', '/home/ma-user/.ipython', '/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe']


INFO:root:Using MoXing-v1.17.3-43fbf97f
INFO:root:Using OBS-Python-SDK-3.20.7


In [2]:
#if __name__ == '__main__':
# Training configuration passed to main().
cfg = edict({
    "only_create_dataset": False,
    "distribute": False,
    "device_id": 0,
    "device_num": 1,
    "dataset_sink_mode": False,

    "lr": 0.001,
    "epoch_size": 90,
    "batch_size": 32,
    "loss_scale" : 1024,

    "pre_trained": None,
    "pre_trained_epoch_size":0,

    "ckpt_dir": "./ckpt",
    "save_checkpoint_epochs" :1,

    "mode": "source",       #{mindspore,source }

    "mindrecord_dir": "./mindrecord_dir"

})
# OBS locations used to stage data in and checkpoints out.
# NOTE(review): '{user_obs}' looks like a placeholder for your own bucket name -- confirm and replace.
args_opt = edict({
    "data_url": 's3://{user_obs}/mask_detection_500/mask_detection_500',
    #"data_url": 's3://yyq-2/DATA/code/yolov3/mask_detection_500/mask_detection_500',
    "ckpt_url": None,
    "train_url": 's3://{user_obs}/'    # ckpt path
    #"train_url": 's3://yyq-2/DATA/code/yolov3/yolov3_out/60/'
})

import moxing as mox
if args_opt.ckpt_url is not None:
    # Stage the pre-trained checkpoint locally before training.
    ckpt_path = './ckpt/'
    mox.file.copy_parallel(src_url=args_opt.ckpt_url, dst_url=ckpt_path)
    # NOTE(review): ckpt_path is a directory, while main() passes cfg.pre_trained
    # straight to load_checkpoint -- confirm whether a checkpoint file path is
    # required here. main() also requires cfg.pre_trained_epoch_size > 0 in this case.
    cfg.pre_trained = ckpt_path

data_path = './data/'
#  WAY1: copy dataset from your own OBS bucket to container/cache.
mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=data_path)
# WAY2: copy dataset from other's OBS bucket, which has been set public read or public read&write.
# mox.file.copy_parallel(src_url="s3://share-course/dataset/YOLOv3_ResNet18", dst_url=data_path)

if cfg.mode == "mindspore":
    # Ready-made mindrecord files are read directly from the copied data directory.
    #cfg.mindrecord_dir = os.path.join(data_path,'train')
    cfg.mindrecord_dir = data_path
else:
    cfg.image_dir = os.path.join(data_path, "train")

main(cfg)
# Upload the checkpoints produced by training back to OBS.
mox.file.copy_parallel(src_url=cfg.ckpt_dir, dst_url=args_opt.train_url)

INFO:root:Listing OBS: 1000
INFO:root:pid: None.	1000/1008


Start create dataset!
Create Mindrecord.
Create Mindrecord Done, at ./mindrecord_dir
['yolo.mindrecord', 'yolo.mindrecord.db']
./mindrecord_dir/yolo.mindrecord
15
Create dataset done!
Start train YOLOv3, the first epoch will be slower because of the graph compilation.
epoch: 1 step: 1, loss is 10275.359
epoch: 1 step: 2, loss is 10550.436
epoch: 1 step: 3, loss is 8843.978
epoch: 1 step: 4, loss is 7310.397
epoch: 1 step: 5, loss is 6131.6533
epoch: 1 step: 6, loss is 5151.744
epoch: 1 step: 7, loss is 4435.633
epoch: 1 step: 8, loss is 3946.457
epoch: 1 step: 9, loss is 3400.7925
epoch: 1 step: 10, loss is 3160.4407
epoch: 1 step: 11, loss is 2798.0378
epoch: 1 step: 12, loss is 2579.184
epoch: 1 step: 13, loss is 2400.7332
epoch: 1 step: 14, loss is 2208.5928
epoch: 1 step: 15, loss is 2049.6133
Epoch time: 150613.805, per step time: 10040.920
epoch: 2 step: 1, loss is 1919.8008
epoch: 2 step: 2, loss is 1902.464
epoch: 2 step: 3, loss is 1707.8522
epoch: 2 step: 4, loss is 1698.6813

epoch: 14 step: 8, loss is 183.9448
epoch: 14 step: 9, loss is 186.68384
epoch: 14 step: 10, loss is 183.80074
epoch: 14 step: 11, loss is 182.07248
epoch: 14 step: 12, loss is 162.40765
epoch: 14 step: 13, loss is 150.38365
epoch: 14 step: 14, loss is 176.02692
epoch: 14 step: 15, loss is 166.25607
Epoch time: 5916.991, per step time: 394.466
epoch: 15 step: 1, loss is 169.77774
epoch: 15 step: 2, loss is 177.01846
epoch: 15 step: 3, loss is 171.16464
epoch: 15 step: 4, loss is 166.72235
epoch: 15 step: 5, loss is 165.75577
epoch: 15 step: 6, loss is 163.01215
epoch: 15 step: 7, loss is 178.79477
epoch: 15 step: 8, loss is 161.75522
epoch: 15 step: 9, loss is 166.5491
epoch: 15 step: 10, loss is 158.17113
epoch: 15 step: 11, loss is 171.13185
epoch: 15 step: 12, loss is 178.6719
epoch: 15 step: 13, loss is 160.92082
epoch: 15 step: 14, loss is 155.83699
epoch: 15 step: 15, loss is 169.15526
Epoch time: 5842.688, per step time: 389.513
epoch: 16 step: 1, loss is 151.31537
epoch: 16 ste

epoch: 28 step: 1, loss is 114.913605
epoch: 28 step: 2, loss is 104.93718
epoch: 28 step: 3, loss is 106.80825
epoch: 28 step: 4, loss is 112.80114
epoch: 28 step: 5, loss is 114.969284
epoch: 28 step: 6, loss is 108.41026
epoch: 28 step: 7, loss is 117.989365
epoch: 28 step: 8, loss is 106.6445
epoch: 28 step: 9, loss is 124.08214
epoch: 28 step: 10, loss is 113.623436
epoch: 28 step: 11, loss is 102.339386
epoch: 28 step: 12, loss is 125.6375
epoch: 28 step: 13, loss is 99.5077
epoch: 28 step: 14, loss is 104.77777
epoch: 28 step: 15, loss is 89.29025
Epoch time: 6244.668, per step time: 416.311
epoch: 29 step: 1, loss is 108.030014
epoch: 29 step: 2, loss is 107.94013
epoch: 29 step: 3, loss is 98.15129
epoch: 29 step: 4, loss is 115.30843
epoch: 29 step: 5, loss is 109.11078
epoch: 29 step: 6, loss is 99.09792
epoch: 29 step: 7, loss is 88.239494
epoch: 29 step: 8, loss is 112.89288
epoch: 29 step: 9, loss is 113.5801
epoch: 29 step: 10, loss is 96.890816
epoch: 29 step: 11, loss 

epoch: 41 step: 13, loss is 92.1104
epoch: 41 step: 14, loss is 70.86305
epoch: 41 step: 15, loss is 91.4418
Epoch time: 5789.803, per step time: 385.987
epoch: 42 step: 1, loss is 79.04148
epoch: 42 step: 2, loss is 79.705864
epoch: 42 step: 3, loss is 79.1788
epoch: 42 step: 4, loss is 85.3883
epoch: 42 step: 5, loss is 95.09233
epoch: 42 step: 6, loss is 75.32596
epoch: 42 step: 7, loss is 75.33815
epoch: 42 step: 8, loss is 75.721436
epoch: 42 step: 9, loss is 77.22475
epoch: 42 step: 10, loss is 78.97667
epoch: 42 step: 11, loss is 89.99948
epoch: 42 step: 12, loss is 81.50026
epoch: 42 step: 13, loss is 76.33093
epoch: 42 step: 14, loss is 76.958176
epoch: 42 step: 15, loss is 79.61762
Epoch time: 5943.258, per step time: 396.217
epoch: 43 step: 1, loss is 93.88044
epoch: 43 step: 2, loss is 69.30841
epoch: 43 step: 3, loss is 90.43352
epoch: 43 step: 4, loss is 72.675995
epoch: 43 step: 5, loss is 73.18686
epoch: 43 step: 6, loss is 60.315243
epoch: 43 step: 7, loss is 81.64166


epoch: 55 step: 10, loss is 68.963684
epoch: 55 step: 11, loss is 64.97585
epoch: 55 step: 12, loss is 61.932304
epoch: 55 step: 13, loss is 57.23661
epoch: 55 step: 14, loss is 53.13035
epoch: 55 step: 15, loss is 78.71594
Epoch time: 6484.315, per step time: 432.288
epoch: 56 step: 1, loss is 82.22478
epoch: 56 step: 2, loss is 70.13024
epoch: 56 step: 3, loss is 75.975075
epoch: 56 step: 4, loss is 67.112854
epoch: 56 step: 5, loss is 75.177414
epoch: 56 step: 6, loss is 58.473312
epoch: 56 step: 7, loss is 58.548737
epoch: 56 step: 8, loss is 63.88556
epoch: 56 step: 9, loss is 75.889404
epoch: 56 step: 10, loss is 67.68087
epoch: 56 step: 11, loss is 67.41693
epoch: 56 step: 12, loss is 73.01726
epoch: 56 step: 13, loss is 60.901134
epoch: 56 step: 14, loss is 56.930298
epoch: 56 step: 15, loss is 60.213463
Epoch time: 6235.470, per step time: 415.698
epoch: 57 step: 1, loss is 73.26108
epoch: 57 step: 2, loss is 65.820564
epoch: 57 step: 3, loss is 58.716064
epoch: 57 step: 4, lo

epoch: 69 step: 6, loss is 64.13274
epoch: 69 step: 7, loss is 68.82736
epoch: 69 step: 8, loss is 54.879498
epoch: 69 step: 9, loss is 56.895363
epoch: 69 step: 10, loss is 61.79829
epoch: 69 step: 11, loss is 62.024574
epoch: 69 step: 12, loss is 62.128143
epoch: 69 step: 13, loss is 59.290714
epoch: 69 step: 14, loss is 65.853165
epoch: 69 step: 15, loss is 82.14555
Epoch time: 6775.775, per step time: 451.718
epoch: 70 step: 1, loss is 64.6479
epoch: 70 step: 2, loss is 49.580482
epoch: 70 step: 3, loss is 64.91866
epoch: 70 step: 4, loss is 69.952
epoch: 70 step: 5, loss is 54.642727
epoch: 70 step: 6, loss is 75.38677
epoch: 70 step: 7, loss is 77.47923
epoch: 70 step: 8, loss is 53.594807
epoch: 70 step: 9, loss is 60.814262
epoch: 70 step: 10, loss is 65.286575
epoch: 70 step: 11, loss is 60.667694
epoch: 70 step: 12, loss is 59.543434
epoch: 70 step: 13, loss is 49.14731
epoch: 70 step: 14, loss is 64.03902
epoch: 70 step: 15, loss is 55.27012
Epoch time: 6394.318, per step ti

epoch: 83 step: 1, loss is 59.656765
epoch: 83 step: 2, loss is 67.32318
epoch: 83 step: 3, loss is 57.732433
epoch: 83 step: 4, loss is 59.699314
epoch: 83 step: 5, loss is 66.48266
epoch: 83 step: 6, loss is 61.32727
epoch: 83 step: 7, loss is 54.86912
epoch: 83 step: 8, loss is 64.705696
epoch: 83 step: 9, loss is 56.86838
epoch: 83 step: 10, loss is 52.7367
epoch: 83 step: 11, loss is 61.413784
epoch: 83 step: 12, loss is 72.64162
epoch: 83 step: 13, loss is 60.533943
epoch: 83 step: 14, loss is 54.858315
epoch: 83 step: 15, loss is 80.20487
Epoch time: 6387.272, per step time: 425.818
epoch: 84 step: 1, loss is 77.51143
epoch: 84 step: 2, loss is 50.30311
epoch: 84 step: 3, loss is 65.095314
epoch: 84 step: 4, loss is 59.79352
epoch: 84 step: 5, loss is 71.996994
epoch: 84 step: 6, loss is 61.19602
epoch: 84 step: 7, loss is 61.989502
epoch: 84 step: 8, loss is 56.101734
epoch: 84 step: 9, loss is 67.25811
epoch: 84 step: 10, loss is 61.57486
epoch: 84 step: 11, loss is 56.90882
e