You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
`Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 58, in detach
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
return reduction.recv_handle(conn)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 189, in recv_handle
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return recvfds(s, 1)[0]
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 157, in recvfds
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
return _ForkingPickler.loads(res)
ConnectionResetError: [Errno 104] Connection reset by peer
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 509, in Client
deliver_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 757, in answer_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
2021-07-21 14:23:59 | INFO | yolox.core.trainer:183 - Training of experiment is done and the best AP is 0.00
2021-07-21 14:23:59 | ERROR | yolox.core.launch:104 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (75815), thread 'MainThread' (139758017422912):
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 36
└ <function _main at 0x7f1bea0ad820>
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 129, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7f1bea1178b0>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7f1bea107ee0>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └
│ │ │ └ (<function _distributed_worker at 0x7f1b92170670>, 0, (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:5...
│ │ └
│ └ <function _wrap at 0x7f1bd6a41040>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:57533', (╒══════════════════╤══════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7f1b92170670>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/launch.py", line 104, in _distributed_worker
main_func(*args)
│ └ (╒══════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7f1b092a2160>
File "/data/zhangyong/workspace/detect/YOLOX-main/tools/train.py", line 101, in main
trainer.train()
│ └ <function Trainer.train at 0x7f1b10c650d0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 70, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7f1b09321430>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 80, in train_in_epoch
self.after_epoch()
│ └ <function Trainer.after_epoch at 0x7f1b093298b0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 99, in all_reduce_norm
states = all_reduce(states, op="mean")
│ └ OrderedDict([('module.backbone.backbone.stem.conv.bn.weight', tensor([0.7139, 0.7288, 0.4413, 1.3903, 0.7023, 0.4169, 1.4530,...
└ <function all_reduce at 0x7f1bd66788b0>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 68, in all_reduce
group = _get_global_gloo_group()
└ <functools._lru_cache_wrapper object at 0x7f1bd66783a0>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/dist.py", line 103, in _get_global_gloo_group
return dist.new_group(backend="gloo")
│ └ <function new_group at 0x7f1bd6cfc940>
└ <module 'torch.distributed' from '/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/__init__.py'>
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2032, in new_group
pg = _new_process_group_helper(group_world_size,
│ └ 4
└ <function _new_process_group_helper at 0x7f1bd6cfb790>
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 517, in _new_process_group_helper
pg = ProcessGroupGloo(
└ <class 'torch.distributed.ProcessGroupGloo'>
RuntimeError: [enforce fail at /pytorch/third_party/gloo/gloo/transport/tcp/device.cc:83] ifa != nullptr. Unable to find address for: ib0`
The text was updated successfully, but these errors were encountered:
`Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 58, in detach
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
return reduction.recv_handle(conn)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 189, in recv_handle
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return recvfds(s, 1)[0]
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 157, in recvfds
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
return _ForkingPickler.loads(res)
ConnectionResetError: [Errno 104] Connection reset by peer
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 509, in Client
deliver_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 757, in answer_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
2021-07-21 14:23:59 | INFO | yolox.core.trainer:183 - Training of experiment is done and the best AP is 0.00
2021-07-21 14:23:59 | ERROR | yolox.core.launch:104 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (75815), thread 'MainThread' (139758017422912):
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 36
└ <function _main at 0x7f1bea0ad820>
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 129, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7f1bea1178b0>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7f1bea107ee0>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └
│ │ │ └ (<function _distributed_worker at 0x7f1b92170670>, 0, (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:5...
│ │ └
│ └ <function _wrap at 0x7f1bd6a41040>
└
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:57533', (╒══════════════════╤══════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7f1b92170670>
File "/data/zhangyong/workspace/detect/YOLOX-main/tools/train.py", line 101, in main
trainer.train()
│ └ <function Trainer.train at 0x7f1b10c650d0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 70, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7f1b09321430>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 80, in train_in_epoch
self.after_epoch()
│ └ <function Trainer.after_epoch at 0x7f1b093298b0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 209, in after_epoch
all_reduce_norm(self.model)
│ │ └ DistributedDataParallel(
│ │ (module): YOLOX(
│ │ (backbone): YOLOPAFPN(
│ │ (backbone): CSPDarknet(
│ │ (stem): Focus(
│ │ ...
│ └ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
└ <function all_reduce_norm at 0x7f1bd6678940>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 99, in all_reduce_norm
states = all_reduce(states, op="mean")
│ └ OrderedDict([('module.backbone.backbone.stem.conv.bn.weight', tensor([0.7139, 0.7288, 0.4413, 1.3903, 0.7023, 0.4169, 1.4530,...
└ <function all_reduce at 0x7f1bd66788b0>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 68, in all_reduce
group = _get_global_gloo_group()
└ <functools._lru_cache_wrapper object at 0x7f1bd66783a0>
File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/dist.py", line 103, in _get_global_gloo_group
return dist.new_group(backend="gloo")
│ └ <function new_group at 0x7f1bd6cfc940>
└ <module 'torch.distributed' from '/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/__init__.py'>
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2032, in new_group
pg = _new_process_group_helper(group_world_size,
│ └ 4
└ <function _new_process_group_helper at 0x7f1bd6cfb790>
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 517, in _new_process_group_helper
pg = ProcessGroupGloo(
└ <class 'torch.distributed.ProcessGroupGloo'>
RuntimeError: [enforce fail at /pytorch/third_party/gloo/gloo/transport/tcp/device.cc:83] ifa != nullptr. Unable to find address for: ib0`
The text was updated successfully, but these errors were encountered: