Skip to content

Training crashes when it finishes #276

@segatecm

Description

@segatecm

Training reports a MemoryError while saving the checkpoint at the end of the first epoch. Please help!

Epoch 0: 100%|█| 3/3 [00:47<00:00, 15.70s/it, loss=0.00529, v_num=3, train/loss_simple_step=0.00275, train/loss_vlb_ste
Traceback (most recent call last):
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 379, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 604, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
MemoryError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 380, in save
return
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 259, in __exit__
self.file_like.write_end_of_file()
RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\caffe2\serialize\inline_container.cc:319] . unexpected pos 6137082560 vs 6137082456

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "tutorial_train.py", line 37, in <module>
trainer.fit(model, dataloader)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 735, in fit
self._call_and_handle_interrupt(
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 682, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 770, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1193, in _run
self._dispatch()
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1272, in _dispatch
self.training_type_plugin.start_training(self)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\plugins\training_type\training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1282, in run_stage
return self._run_train()
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1312, in _run_train
self.fit_loop.run()
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\loops\base.py", line 145, in run
self.advance(*args, **kwargs)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\loops\fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\loops\base.py", line 151, in run
output = self.on_run_end()
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\loops\epoch\training_epoch_loop.py", line 302, in on_run_end
self.trainer.call_hook("on_train_epoch_end")
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1477, in call_hook
callback_fx(*args, **kwargs)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\callback_hook.py", line 93, in on_train_epoch_end
callback.on_train_epoch_end(self, self.lightning_module)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py", line 315, in on_train_epoch_end
self.save_checkpoint(trainer)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py", line 384, in save_checkpoint
self._save_none_monitor_checkpoint(trainer, monitor_candidates)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py", line 681, in _save_none_monitor_checkpoint
trainer.save_checkpoint(filepath, self.save_weights_only)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1901, in save_checkpoint
self.checkpoint_connector.save_checkpoint(filepath, weights_only)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\trainer\connectors\checkpoint_connector.py", line 472, in save_checkpoint
self.trainer.training_type_plugin.save_checkpoint(_checkpoint, filepath)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\plugins\training_type\training_type_plugin.py", line 294, in save_checkpoint
return self.checkpoint_io.save_checkpoint(checkpoint, filepath)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\plugins\io\torch_plugin.py", line 37, in save_checkpoint
atomic_save(checkpoint, path)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\pytorch_lightning\utilities\cloud_io.py", line 68, in atomic_save
torch.save(checkpoint, bytesbuffer)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 381, in save
_legacy_save(obj, opened_file, pickle_module, pickle_protocol)
File "C:\Users\ybXunLianJi\miniconda3\envs\controlnet\lib\site-packages\torch\serialization.py", line 225, in __exit__
self.file_like.flush()
ValueError: I/O operation on closed file.

///////////////////////////////////////////////////////////////////
Environment: Python 3.8.5, Windows 10, RTX 4090 GPU

Package Version


absl-py 1.4.0
aiohttp 3.8.4
aiosignal 1.3.1
antlr4-python3-runtime 4.8
async-timeout 4.0.2
attrs 22.2.0
cachetools 5.3.0
certifi 2022.12.7
charset-normalizer 3.1.0
colorama 0.4.6
einops 0.3.0
filelock 3.9.0
frozenlist 1.3.3
fsspec 2023.3.0
ftfy 6.1.1
future 0.18.3
google-auth 2.16.2
google-auth-oauthlib 0.4.6
grpcio 1.51.3
huggingface-hub 0.13.1
idna 3.4
importlib-metadata 6.0.0
Markdown 3.4.1
MarkupSafe 2.1.2
multidict 6.0.4
numpy 1.23.1
oauthlib 3.2.2
omegaconf 2.1.1
open-clip-torch 2.0.2
opencv-python 4.3.0.38
packaging 23.0
Pillow 9.4.0
pip 23.0.1
protobuf 4.22.1
pyasn1 0.4.8
pyasn1-modules 0.2.8
pyDeprecate 0.3.1
pytorch-lightning 1.5.0
PyYAML 6.0
regex 2022.10.31
requests 2.28.2
requests-oauthlib 1.3.1
rsa 4.9
setuptools 65.6.3
six 1.16.0
tensorboard 2.12.0
tensorboard-data-server 0.7.0
tensorboard-plugin-wit 1.8.1
tokenizers 0.12.1
torch 1.12.1+cu113
torchaudio 0.12.1+cu113
torchmetrics 0.11.4
torchvision 0.13.1+cu113
tqdm 4.65.0
transformers 4.19.2
typing_extensions 4.5.0
urllib3 1.26.15
wcwidth 0.2.6
Werkzeug 2.2.3
wheel 0.38.4
wincertstore 0.2
yarl 1.8.2
zipp 3.15.0
///////////////////////////////////////

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions