CUDA error: device-side assert triggered (train a model include neg) #1815

Closed

wendaomu opened this issue Dec 15, 2019 · 6 comments

Comments

@wendaomu

wendaomu commented Dec 15, 2019

I am using the code from the dev/allow_empty_gt branch to train a model. I have some negative samples, so when I built the dataset in COCO format, I set x, y, w, h, and the label to 0 for them. I have 6 classes to classify, so I set num_classes to 7.

I want to know whether the data I made is producing this error.
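
For context, a rough sketch of the kind of COCO-style entry described above (hypothetical ids and file name, not the actual annotation file), where the negative image is given a dummy all-zero box with label 0:

# Rough sketch of the setup described in this issue (hypothetical ids/file name):
# the negative image gets a dummy annotation with x, y, w, h and the label all set to 0.
negative_image = {'id': 101, 'file_name': 'neg_0001.jpg', 'width': 1333, 'height': 800}
negative_annotation = {
    'id': 9001,
    'image_id': 101,
    'category_id': 0,      # "label 0" used to mark the negative sample
    'bbox': [0, 0, 0, 0],  # x, y, w, h all set to 0
    'area': 0,
    'iscrowd': 0,
}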

python tools/train.py configs/my_cascade_rcnn_r50_fpn_1x.py --work_dir result/
2019-12-15 09:30:21,276 - INFO - Distributed training: False
2019-12-15 09:30:21,276 - INFO - MMDetection Version: 1.0.rc0+unknown
2019-12-15 09:30:21,276 - INFO - Config: # model settings
model = dict(
    type='CascadeRCNN',
    num_stages=3,
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_scales=[8],
        anchor_ratios=[0.5, 1.0, 2.0],
        anchor_strides=[4, 8, 16, 32, 64],
        target_means=[.0, .0, .0, .0],
        target_stds=[1.0, 1.0, 1.0, 1.0],
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=[
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=7,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.1, 0.1, 0.2, 0.2],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=7,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.05, 0.05, 0.1, 0.1],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=7,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.033, 0.033, 0.067, 0.067],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
    ])

# model training and testing settings

train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=0,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        nms_post=2000,
        max_num=2000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=[
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.6,
                neg_iou_thr=0.6,
                min_pos_iou=0.6,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.7,
                min_pos_iou=0.7,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)
    ],
    stage_loss_weights=[1, 0.5, 0.25])
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100),
    keep_all_stages=False)

# dataset settings

dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    imgs_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'images/train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'images/val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'images/val2017/',
        pipeline=test_pipeline))

# optimizer

optimizer = dict(type='SGD', lr=0.0001, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 10,
    step=[8, 11])
checkpoint_config = dict(interval=1)

# yapf:disable

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])

# yapf:enable

# runtime settings

total_epochs = 25
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/cascade_rcnn_r50_fpn_1x'
load_from = None
resume_from = None
workflow = [('train', 1)]

2019-12-15 09:30:21,998 - INFO - load model from: torchvision://resnet50
2019-12-15 09:30:30,103 - WARNING - The model and loaded state dict do not match exactly

unexpected key in source state_dict: fc.weight, fc.bias

loading annotations into memory...
Done (t=1.19s)
creating index...
index created!
2019-12-15 09:30:41,608 - INFO - Start running, host: admin@fuxilabor_labor0_S4_Odps_S98_dsw_prepaid_cnbj_1181_201912080304, work_dir:/data/nas/workspace/jupyter/Project/lb_project/code/mmdetection-empty_gt/result
2019-12-15 09:30:41,608 - INFO - workflow: [('train', 1)], max: 25 epochs
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:57: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [1,0,0], thread: [512,0,0] Assertion cur_target >= 0 && cur_target < n_classes failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:57: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [1,0,0], thread: [513,0,0] Assertion cur_target >= 0 && cur_target < n_classes failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:57: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [1,0,0], thread: [514,0,0] Assertion cur_target >= 0 && cur_target < n_classes failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:57: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensor<Dtype, 2, int, DefaultPtrTraits>, THCDeviceTensor<long, 1, int, DefaultPtrTraits>, THCDeviceTensor<Dtype, 1, int, DefaultPtrTraits>, Dtype *, int, int) [with Dtype = float]: block: [1,0,0], thread: [515,0,0] Assertion cur_target >= 0 && cur_target < n_classes failed.
Traceback (most recent call last):
File "tools/train.py", line 110, in <module>
main()
File "tools/train.py", line 106, in main
logger=logger)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/apis/train.py", line 60, in train_detector
_non_dist_train(model, dataset, cfg, validate=validate)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/apis/train.py", line 232, in _non_dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/data/nas/workspace/envs/python3.6/site-packages/mmcv/runner/runner.py", line 358, in run
epoch_runner(data_loaders[i], **kwargs)
File "/data/nas/workspace/envs/python3.6/site-packages/mmcv/runner/runner.py", line 264, in train
self.model, data_batch, train_mode=True, **kwargs)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/apis/train.py", line 38, in batch_processor
losses = model(**data)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
return self.module(*inputs[0], **kwargs[0])
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/core/fp16/decorators.py", line 49, in new_func
return old_func(*args, **kwargs)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/models/detectors/base.py", line 117, in forward
return self.forward_train(img, img_meta, **kwargs)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/models/detectors/cascade_rcnn.py", line 247, in forward_train
loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/core/fp16/decorators.py", line 127, in new_func
return old_func(*args, **kwargs)
File "/home/admin/jupyter/Project/lb_project/code/mmdetection/mmdet/models/bbox_heads/bbox_head.py", line 120, in loss
pos_bbox_pred = bbox_pred.view(bbox_pred.size(0), 4)[pos_inds]
RuntimeError: copy_if failed to synchronize: device-side assert triggered
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: device-side assert triggered (insert_events at /pytorch/c10/cuda/CUDACachingAllocator.cpp:569)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x33 (0x7f3454350813 in /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: + 0x16126 (0x7f345458b126 in /opt/conda/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: + 0x16b11 (0x7f345458bb11 in /opt/conda/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: c10::TensorImpl::release_resources() + 0x4d (0x7f3454340f0d in /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #4: + 0x4af752 (0x7f3454dda752 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #5: + 0x4af796 (0x7f3454dda796 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_python.so)

frame #25: __libc_start_main + 0xf5 (0x7f345fe22b15 in /lib64/libc.so.6)

Aborted

@yuyijie1995

I got the same error:
RuntimeError: tabulate: failed to synchronize: cudaErrorAssert: device-side assert triggered
terminate called after throwing an instance of 'c10::Error'

After adding my own data augmentation method, the model trains normally for several epochs, and then this error happens.

@wendaomu
Author

I set the labels of the negative images to empty, and that solved the problem.

@hellock
Member

hellock commented Dec 16, 2019

Label 0 in annotation files does not mean negative; just leave the annotations empty.
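
In other words, a negative (background-only) image should still be listed under images, but no entry in annotations should reference it. A minimal sketch of such a COCO annotation file (hypothetical file names, ids, and class names):

import json

# Minimal sketch: image 2 is a negative sample. It is listed under 'images'
# but has no entries under 'annotations', and no label 0 appears anywhere.
coco = {
    'categories': [
        {'id': i + 1, 'name': name}
        for i, name in enumerate(['cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6'])
    ],
    'images': [
        {'id': 1, 'file_name': 'pos_0001.jpg', 'width': 1333, 'height': 800},
        {'id': 2, 'file_name': 'neg_0001.jpg', 'width': 1333, 'height': 800},  # negative image
    ],
    'annotations': [
        # Only the positive image gets annotations; nothing references image_id 2.
        {'id': 1, 'image_id': 1, 'category_id': 1,
         'bbox': [100, 150, 40, 60], 'area': 2400, 'iscrowd': 0},
    ],
}

with open('annotations/instances_train2017.json', 'w') as f:
    json.dump(coco, f)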

@hellock hellock closed this as completed Jan 7, 2020
@SunNYNO1

Hello, I've run into the same problem, but I don't understand 'label 0'. How do I leave it empty?
I added some data to the dataset; this is my XML:
<?xml version="1.0" ?>
<annotation>
  <filename>2008_000032_2020.jpg</filename>
  <size>
    <width>500</width>
    <height>375</height>
    <depth>3</depth>
  </size>
  <object>
    <name>car</name>
    <bndbox>
      <xmin>257</xmin>
      <xmax>500</xmax>
      <ymin>234</ymin>
      <ymax>375</ymax>
    </bndbox>
    <difficult>0</difficult>
  </object>
</annotation>
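
For VOC-style XML like the one above, leaving it empty means the <annotation> element of a negative image contains no <object> node at all. A small sketch (hypothetical paths, using the standard xml.etree.ElementTree module) that strips every <object> from an existing annotation to turn it into a negative sample:

import xml.etree.ElementTree as ET

# Hypothetical paths: remove every <object> element so the annotation
# carries no boxes and no labels, which is what "leave it empty" means here.
tree = ET.parse('Annotations/2008_000032_2020.xml')
root = tree.getroot()
for obj in root.findall('object'):
    root.remove(obj)
tree.write('Annotations/2008_000032_2020_neg.xml')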

@SunNYNO1

Could anyone answer this? At first I thought my XML format was incorrect, but after fixing it I still get the same error. I could not find anywhere that a label is 0. My classes are listed below and are the same as the original Pascal VOC classes:
['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
'tvmonitor']
Any help would be appreciated. Thanks!

@Beastmaster

Please check the ground-truth label range against the number of output channels. If the label range is [0-2], you must output 3 channels to cover that range. If label index 2 is meant to be ignored, pass ignore_index=2 to NLLLoss, and then 2 output channels are enough.
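
A small standalone PyTorch sketch of that relationship (toy tensors, not mmdetection code): every label value must be smaller than the number of logit channels unless it is explicitly excluded with ignore_index:

import torch
import torch.nn.functional as F

logits = torch.randn(8, 3)                       # 3 output channels -> valid labels are 0..2
labels = torch.tensor([0, 1, 2, 0, 1, 2, 0, 1])  # label range [0-2]
loss = F.cross_entropy(logits, labels)           # fine: every label < number of channels

# If label 2 only marks "ignore", 2 channels are enough as long as the loss
# skips it; without ignore_index, a label of 2 with only 2 channels raises
# the same kind of out-of-range assert seen in the log above.
logits2 = torch.randn(8, 2)
loss_ignored = F.cross_entropy(logits2, labels, ignore_index=2)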
