Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option(s) to do automatic cleanup of "old" models/predictions/... #169

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8d06add
- add cleanup models, training and predictions
KriWay Mar 21, 2024
8abcd33
- add sample project
KriWay Mar 25, 2024
34487ed
- add test cleanup simulate
KriWay Mar 25, 2024
1d11b50
- refactor test_cleanup
KriWay Mar 25, 2024
cfcad9a
- refactor test_cleanup
KriWay Mar 25, 2024
a69405e
- update changelog.md
KriWay Mar 26, 2024
d6d8fc0
Update CHANGELOG.md
KriWay Mar 27, 2024
7507938
- refactor
KriWay Mar 28, 2024
1a8892b
- refactor cleanup
KriWay Mar 28, 2024
60d1440
- refactor
KriWay Mar 28, 2024
8da5ec8
- remove cleanup sample project
KriWay Mar 28, 2024
b5c374c
- removed parameter config_path
KriWay Apr 15, 2024
db57293
- testdata created in the tests itself
KriWay Apr 15, 2024
b800927
- added function clean_project_dir
KriWay Apr 15, 2024
223047d
- refactor cleanup_old_data
KriWay Apr 16, 2024
90b2332
- add test cleanup_not_existing_dir
KriWay Apr 16, 2024
e78fb85
- remove test
KriWay Apr 16, 2024
8de60e1
- add tests for non existing dirs
KriWay Apr 16, 2024
fc3315b
- projects_dir as parameter
KriWay Apr 16, 2024
ea71ae9
- removed default value for simulate
KriWay Apr 16, 2024
3891e10
- refactor logging
KriWay Apr 16, 2024
472cf4e
- refactor test non_existing_dir
KriWay Apr 16, 2024
1f65a3a
- with open replaced by touch
KriWay Apr 16, 2024
d2f3657
- use tmp_path to create project files
KriWay Apr 17, 2024
b95e3e4
- add test aidetection_info
KriWay Apr 17, 2024
2aefea5
- replaced pipelines by commas
KriWay Apr 18, 2024
484fff6
- removed pipeline
KriWay Apr 18, 2024
79e177e
- refactor cleanup_predictions
KriWay Apr 19, 2024
c827b49
- refactor cleanup.py
KriWay Apr 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -7,6 +7,7 @@
- Add support to train subject on different pixel sizes (#143)
- Add support to overrule configuration parameters via command line arguments (#152)
- Several small improvements (#128)
- Add option(s) to do automatic cleanup of "old" models/predictions/... (#52)

### Bugs fixed

Expand Down
53 changes: 53 additions & 0 deletions helper_scripts/cleanup_old_data.py
@@ -0,0 +1,53 @@
import logging
import os
from pathlib import Path

from orthoseg.lib import cleanup

from orthoseg.helpers import config_helper as conf
from orthoseg.util import log_util

logger = logging.getLogger(__name__)


def cleanup_old_data(projects_dir: Path, simulate: bool = False):
    """
    Run cleanup for every project found in a projects directory.

    Loops over all subdirectories of ``projects_dir`` (skipping the ones whose
    name starts with "_"), reads each project's ``<project>.ini`` configuration
    and cleans up old models, training data and predictions for that project.

    Args:
        projects_dir (Path): directory containing the orthoseg projects.
        simulate (bool): True to only log which files would be deleted instead
            of actually deleting them. Defaults to False.
    """
    # Declare upfront: the project-specific log re-init below rebinds the
    # module-level logger.
    global logger

    # Exclude directories whose name starts with '_' (convention for
    # disabled/archived projects).
    projects = [
        subdir
        for subdir in os.listdir(projects_dir)
        if os.path.isdir(projects_dir / subdir) and not subdir.startswith("_")
    ]
    for project in projects:
        config_path = projects_dir / project / f"{project}.ini"
        if not config_path.exists():
            # Use logging rather than print so the message ends up in the
            # normal log output.
            logger.info(f"Config_path ({config_path}) doesn't exist.")
            continue

        conf.read_orthoseg_config(config_path=config_path)
        # Re-init the logger so it writes to the current project's log dir.
        logger = log_util.main_log_init(
            log_dir=conf.dirs.getpath("log_dir"),
            log_basefilename=cleanup_old_data.__name__,
        )
        cleanup.clean_project_dir(
            model_dir=conf.dirs.getpath("model_dir"),
            model_versions_to_retain=conf.cleanup.getint("model_versions_to_retain"),
            training_dir=conf.dirs.getpath("training_dir"),
            training_versions_to_retain=conf.cleanup.getint(
                "training_versions_to_retain"
            ),
            output_vector_dir=conf.dirs.getpath("output_vector_dir"),
            prediction_versions_to_retain=conf.cleanup.getint(
                "prediction_versions_to_retain"
            ),
            simulate=simulate,
        )


# If the script is run directly...
if __name__ == "__main__":
    cleanup_old_data(
        projects_dir=Path(r"X:\Monitoring\OrthoSeg"),
        simulate=True,
    )
2 changes: 2 additions & 0 deletions orthoseg/helpers/config_helper.py
Expand Up @@ -112,6 +112,8 @@ def read_orthoseg_config(config_path: Path, overrules: List[str] = []):
logging = config["logging"]
global email
email = config["email"]
global cleanup
cleanup = config["cleanup"]

# Some checks to make sure the config is loaded properly
segment_subject = general.get("segment_subject")
Expand Down
242 changes: 242 additions & 0 deletions orthoseg/lib/cleanup.py
@@ -0,0 +1,242 @@
"""
Automatic cleanup of 'old' models, predictions and training data directories.
"""

from glob import glob
import logging
import os
import shutil
from pathlib import Path
from typing import List


from orthoseg.model import model_helper
from orthoseg.util.data import aidetection_info

# Get a logger...
logger = logging.getLogger(__name__)


def clean_models(
    model_dir: Path,
    versions_to_retain: int,
    simulate: bool,
) -> List[str]:
    """
    Cleanup old models, retaining only the most recent traindata versions.

    Args:
        model_dir (Path): Path to the directory with the models to be cleaned
        versions_to_retain (int): number of (most recent) traindata versions
            to retain
        simulate (bool): Simulate cleanup, files are logged, no files are deleted

    Raises:
        Exception: ERROR while deleting file

    Returns:
        List[str]: List of models to be cleaned
    """
    logger.info(f"{model_dir=}, {versions_to_retain=}, {simulate=}")
    models_to_cleanup: List[str] = []

    if not model_dir.exists():
        logger.info(f"Directory {model_dir.name} doesn't exist")
        return models_to_cleanup

    models = model_helper.get_models(model_dir=model_dir)
    # Deduplicate: several model files can share the same traindata_id, and
    # versions_to_retain counts traindata versions, not individual files.
    # Without the dedup, duplicates inflate the count and too many versions
    # get cleaned up.
    traindata_ids = sorted({model["traindata_id"] for model in models})
    # Keep the last versions_to_retain ids; max() guards against a negative
    # slice end when there are fewer versions than versions_to_retain.
    traindata_ids_to_cleanup = traindata_ids[
        : max(len(traindata_ids) - versions_to_retain, 0)
    ]
    models_to_cleanup = [
        model["basefilename"]
        for model in models
        if model["traindata_id"] in traindata_ids_to_cleanup
    ]

    for model in models_to_cleanup:
        # All files belonging to a model share its base filename.
        file_list = glob(pathname=f"{model_dir}/{model}*.*")
        for file in file_list:
            removed_file = Path(file).name
            if simulate:
                logger.info(f"{removed_file=}")
                continue
            try:
                os.remove(file)
                logger.info(f"{removed_file=}")
            except OSError as ex:
                message = f"ERROR while deleting file {file}"
                logger.exception(message)
                raise Exception(message) from ex

    return models_to_cleanup


def clean_training_data_directories(
    training_dir: Path,
    versions_to_retain: int,
    simulate: bool,
) -> List[str]:
    """
    Cleanup training data directories, retaining the most recent versions.

    Args:
        training_dir (Path): Path to the directory with the training data
        versions_to_retain (int): Versions to retain
        simulate (bool): Simulate cleanup, files are logged, no files are deleted

    Raises:
        Exception: ERROR while deleting file

    Returns:
        List[str]: List of training directories to be cleaned
    """
    logger.info(f"{training_dir=}, {versions_to_retain=}, {simulate=}")
    traindata_dirs_to_cleanup: List[str] = []

    if not training_dir.exists():
        logger.info(f"Directory {training_dir.name} doesn't exist")
        return traindata_dirs_to_cleanup

    training_dirs = [d for d in os.listdir(training_dir) if d.isnumeric()]
    # Sort numerically: a plain (lexicographic) sort would order "10" before
    # "9" and so clean up the wrong versions once versions reach 2 digits.
    training_dirs.sort(key=int)
    # max() guards against a negative slice end when there are fewer
    # directories than versions_to_retain.
    traindata_dirs_to_cleanup = training_dirs[
        : max(len(training_dirs) - versions_to_retain, 0)
    ]
    for traindata_dir in traindata_dirs_to_cleanup:
        removed_dir = traindata_dir
        if simulate:
            logger.info(f"{removed_dir=}")
            continue
        try:
            shutil.rmtree(f"{training_dir}/{traindata_dir}")
            logger.info(f"{removed_dir=}")
        except Exception as ex:
            message = f"ERROR while deleting directory {training_dir}/{traindata_dir}"
            logger.exception(message)
            raise Exception(message) from ex

    return traindata_dirs_to_cleanup


def clean_predictions(
    output_vector_dir: Path,
    versions_to_retain: int,
    simulate: bool,
) -> List:
    """
    Cleanup old prediction files, per postprocessing type.

    Args:
        output_vector_dir (Path): Path to the directory containing
            the vector predictions
        versions_to_retain (int): Versions to retain
        simulate (bool): Simulate cleanup, files are logged, no files are deleted

    Raises:
        Exception: ERROR while deleting file

    Returns:
        List: the aidetection infos of the predictions to be cleaned
    """
    predictions_to_cleanup: List = []

    if not output_vector_dir.exists():
        logger.info(f"Directory {output_vector_dir.name} doesn't exist")
        return predictions_to_cleanup

    file_list = glob(pathname=f"{output_vector_dir}/*.*")
    # Only the filename parsing is treated as best-effort: a file name that
    # can't be parsed aborts the cleanup of this directory with a log message.
    # The broad try must NOT wrap the deletion loop below, otherwise a
    # deliberately raised deletion error would be swallowed here as well.
    try:
        ai_detection_infos = [aidetection_info(path=Path(file)) for file in file_list]
    except Exception as ex:
        logger.info(f"{ex}")
        return predictions_to_cleanup

    # Unique postprocessing types, in first-seen order.
    postprocessing = list(
        dict.fromkeys(info.postprocessing for info in ai_detection_infos)
    )
    logger.info(f"{output_vector_dir=}, {versions_to_retain=}, {simulate=}")
    for p in postprocessing:
        traindata_versions = sorted(
            info.traindata_version
            for info in ai_detection_infos
            if info.postprocessing == p
        )
        # max() guards against a negative slice end when there are fewer
        # versions than versions_to_retain.
        traindata_versions_to_cleanup = traindata_versions[
            : max(len(traindata_versions) - versions_to_retain, 0)
        ]
        predictions_to_cleanup.extend(
            info
            for info in ai_detection_infos
            if info.traindata_version in traindata_versions_to_cleanup
            and info.postprocessing == p
        )

    for prediction in predictions_to_cleanup:
        removed_prediction = prediction.path.name
        if simulate:
            logger.info(f"{removed_prediction=}")
            continue
        try:
            os.remove(prediction.path)
            logger.info(f"{removed_prediction=}")
        except Exception as ex:
            message = f"ERROR while deleting file {prediction.path}"
            logger.exception(message)
            raise Exception(message) from ex

    return predictions_to_cleanup


def clean_project_dir(
    model_dir: Path,
    model_versions_to_retain: int,
    training_dir: Path,
    training_versions_to_retain: int,
    output_vector_dir: Path,
    prediction_versions_to_retain: int,
    simulate: bool,
):
    """
    Cleanup project directory.

    Args:
        model_dir (Path): Path to the directory with the models to be cleaned
        model_versions_to_retain (int): Model versions to retain
        training_dir (Path): Path to the directory with the training data to be cleaned
        training_versions_to_retain (int): Training data versions to retain
        output_vector_dir (Path): Path to the directory
            with the predictions to be cleaned
        prediction_versions_to_retain (int): Prediction versions to retain
        simulate (bool): Simulate cleanup, files are logged, no files are deleted

    Returns:
        dict: per category ("models", "training_dirs", "predictions"), the
            items that were (or would be) cleaned.
    """
    # Initialize all keys upfront so callers can always index "predictions",
    # even when the predictions parent directory doesn't exist.
    removed = {
        "models": clean_models(
            model_dir=model_dir,
            versions_to_retain=model_versions_to_retain,
            simulate=simulate,
        ),
        "training_dirs": clean_training_data_directories(
            training_dir=training_dir,
            versions_to_retain=training_versions_to_retain,
            simulate=simulate,
        ),
        "predictions": [],
    }

    # Predictions are cleaned for every subdirectory of the parent of
    # output_vector_dir (one subdir per image layer).
    output_vector_parent_dir = output_vector_dir.parent
    if output_vector_parent_dir.exists():
        for prediction_dir in os.listdir(output_vector_parent_dir):
            removed["predictions"].extend(
                clean_predictions(
                    output_vector_dir=output_vector_parent_dir / prediction_dir,
                    versions_to_retain=prediction_versions_to_retain,
                    simulate=simulate,
                )
            )
    else:
        logger.info(f"Directory {output_vector_parent_dir.name} doesn't exist")

    return removed
21 changes: 18 additions & 3 deletions orthoseg/predict.py
Expand Up @@ -18,7 +18,7 @@

from orthoseg.helpers import config_helper as conf
from orthoseg.helpers import email_helper
from orthoseg.lib import predicter
from orthoseg.lib import cleanup, predicter
import orthoseg.model.model_factory as mf
import orthoseg.model.model_helper as mh
from orthoseg.util import log_util
Expand Down Expand Up @@ -108,8 +108,6 @@ def predict(config_path: Path, config_overrules: List[str] = []):
if not input_image_dir.exists():
raise Exception(f"input image dir doesn't exist: {input_image_dir}")

# TODO: add something to delete old data, predictions???

# Create base filename of model to use
# TODO: is force data version the most logical, or rather implement
# force weights file or ?
Expand Down Expand Up @@ -307,6 +305,23 @@ def predict(config_path: Path, config_overrules: List[str] = []):
message = f"Completed predict for config {config_path.stem}"
logger.info(message)
email_helper.sendmail(message)

# Cleanup old data
cleanup.clean_models(
model_dir=conf.dirs.getpath("model_dir"),
versions_to_retain=conf.cleanup.getint("model_versions_to_retain"),
simulate=conf.cleanup.getboolean("simulate"),
)
cleanup.clean_training_data_directories(
training_dir=conf.dirs.getpath("training_dir"),
versions_to_retain=conf.cleanup.getint("training_versions_to_retain"),
simulate=conf.cleanup.getboolean("simulate"),
)
cleanup.clean_predictions(
output_vector_dir=conf.dirs.getpath("output_vector_dir"),
versions_to_retain=conf.cleanup.getint("prediction_versions_to_retain"),
simulate=conf.cleanup.getboolean("simulate"),
)
except Exception as ex:
message = f"ERROR while running predict for task {config_path.stem}"
logger.exception(message)
Expand Down
8 changes: 7 additions & 1 deletion orthoseg/project_defaults.ini
Expand Up @@ -480,4 +480,10 @@ logconfig = {
"level": "INFO",
"handlers": ["console", "file"]
}
}
}

[cleanup]
simulate = False
model_versions_to_retain = 3
training_versions_to_retain = 3
prediction_versions_to_retain = 3