ConnectionResetError #43

Closed
gentlebreeze1 opened this issue Jul 21, 2021 · 1 comment
@gentlebreeze1

```
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 189, in recv_handle
return recvfds(s, 1)[0]
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/reduction.py", line 157, in recvfds
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_SPACE(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 509, in Client
deliver_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/yolox2/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 757, in answer_challenge
response = connection.recv_bytes(256) # reject large message
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
2021-07-21 14:23:59 | INFO | yolox.core.trainer:183 - Training of experiment is done and the best AP is 0.00
2021-07-21 14:23:59 | ERROR | yolox.core.launch:104 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (75815), thread 'MainThread' (139758017422912):
Traceback (most recent call last):

File "", line 1, in
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 36
└ <function _main at 0x7f1bea0ad820>
File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/spawn.py", line 129, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7f1bea1178b0>

File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7f1bea107ee0>

File "/root/anaconda3/envs/yolox2/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └
│ │ │ └ (<function _distributed_worker at 0x7f1b92170670>, 0, (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:5...
│ │ └
│ └ <function _wrap at 0x7f1bd6a41040>

File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7f1b092a2160>, 4, 4, 0, 'nccl', 'tcp://127.0.0.1:57533', (╒══════════════════╤══════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7f1b92170670>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/launch.py", line 104, in _distributed_worker
main_func(*args)
│ └ (╒══════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7f1b092a2160>

File "/data/zhangyong/workspace/detect/YOLOX-main/tools/train.py", line 101, in main
trainer.train()
│ └ <function Trainer.train at 0x7f1b10c650d0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 70, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7f1b09321430>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 80, in train_in_epoch
self.after_epoch()
│ └ <function Trainer.after_epoch at 0x7f1b093298b0>
└ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/core/trainer.py", line 209, in after_epoch
all_reduce_norm(self.model)
│ │ └ DistributedDataParallel(
│ │ (module): YOLOX(
│ │ (backbone): YOLOPAFPN(
│ │ (backbone): CSPDarknet(
│ │ (stem): Focus(
│ │ ...
│ └ <yolox.core.trainer.Trainer object at 0x7f1b092c2070>
└ <function all_reduce_norm at 0x7f1bd6678940>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 99, in all_reduce_norm
states = all_reduce(states, op="mean")
│ └ OrderedDict([('module.backbone.backbone.stem.conv.bn.weight', tensor([0.7139, 0.7288, 0.4413, 1.3903, 0.7023, 0.4169, 1.4530,...
└ <function all_reduce at 0x7f1bd66788b0>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/allreduce_norm.py", line 68, in all_reduce
group = _get_global_gloo_group()
└ <functools._lru_cache_wrapper object at 0x7f1bd66783a0>

File "/data/zhangyong/workspace/detect/YOLOX-main/yolox/utils/dist.py", line 103, in _get_global_gloo_group
return dist.new_group(backend="gloo")
│ └ <function new_group at 0x7f1bd6cfc940>
└ <module 'torch.distributed' from '/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/__init__.py'>

File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2032, in new_group
pg = _new_process_group_helper(group_world_size,
│ └ 4
└ <function _new_process_group_helper at 0x7f1bd6cfb790>
File "/root/anaconda3/envs/yolox2/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 517, in _new_process_group_helper
pg = ProcessGroupGloo(
└ <class 'torch.distributed.ProcessGroupGloo'>

RuntimeError: [enforce fail at /pytorch/third_party/gloo/gloo/transport/tcp/device.cc:83] ifa != nullptr. Unable to find address for: ib0
```

@GOATmessi7
Member

You may pull the latest update and retry. We fixed the NCCL configuration issue in #49.
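
For anyone who still sees the Gloo failure at the bottom of the log (`Unable to find address for: ib0`) on an older checkout, a commonly used workaround is to point Gloo/NCCL at a network interface that actually exists on the machine before any process group is created. This is only an assumption based on the error message, not the fix that landed in #49. A minimal sketch:

```
# Sketch of a possible workaround, NOT the official fix from #49.
# Gloo aborts with "Unable to find address for: ib0" when the interface it
# picks does not exist on the machine. Pointing Gloo/NCCL at a real
# interface (here "eth0" -- a placeholder, check `ip addr`) before any
# process group is created usually avoids the failed lookup.
import os

os.environ.setdefault("GLOO_SOCKET_IFNAME", "eth0")  # read by the gloo backend
os.environ.setdefault("NCCL_SOCKET_IFNAME", "eth0")  # read by the nccl backend
```

The interface name `eth0` is a placeholder; use one listed by `ip addr` on the training node. The `ConnectionResetError`s from the pin-memory threads earlier in the log are most likely a downstream symptom of the crashed workers and should disappear once the Gloo error is resolved.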
