Skip to content

Commit

Permalink
Merge pull request #712 from mv1388/fix_job_scheduler_logging_upload
Browse files Browse the repository at this point in the history
Iterated logging when using job scheduler
  • Loading branch information
mv1388 committed Jul 22, 2022
2 parents 7c55462 + 5d3690f commit 2dacdf5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 11 deletions.
19 changes: 19 additions & 0 deletions bin/AWS/run_experiment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ function usage()
-p, --project-root STR path to the project root on the execution server/AWS
-l, --log-path STR path to the local log file which will be uploaded to s3
--log-s3-upload-dir STR path to the logs folder on S3 to which the training log should be uploaded
-i, --log-iteration INT index of the job executed via the scheduler
-c, --cleanup-script post execution cleanup script
--aws-region STR create the instance in the specified region. Default is Ireland (eu-west-1)
-h, --help show this help message and exit
Expand All @@ -30,6 +31,7 @@ experiment_script_file="aws_run_experiments_project.sh"
project_root_path=~/project
log_file_path=
log_s3_dir_path="s3://model-result/training_logs"
log_iteration=-1
post_experiment_run_cleanup=false
aws_region="eu-west-1"

Expand Down Expand Up @@ -57,6 +59,10 @@ case $key in
log_s3_dir_path="$2"
shift 2 # past argument value
;;
-i|--log-iteration)
log_iteration="$2"
shift 2 # past argument value
;;
-c|--cleanup-script)
post_experiment_run_cleanup=true
shift 1 # past argument value
Expand Down Expand Up @@ -98,6 +104,19 @@ source $project_root_path/AWS_run_scripts/AWS_core_scripts/$experiment_script_fi

if [[ $log_file_path != "" && -f $log_file_path ]]; then
filtered_log_file_path="$(dirname $log_file_path)/filtered_$(basename $log_file_path)"

if [ "$log_iteration" -ge 0 ]; then
log_file_path_without_ext="${log_file_path%.*}"
extension="${log_file_path##*.}"
log_file_path_iteration="${log_file_path_without_ext}_train_job_${log_iteration}.${extension}"

cp $log_file_path $log_file_path_iteration
log_file_path=$log_file_path_iteration

filtered_log_file_path_without_ext="${filtered_log_file_path%.*}"
filtered_log_file_path="${filtered_log_file_path_without_ext}_train_job_${log_iteration}.${extension}"
fi

sed -n -e '/ STARTING THE TRAINING JOB /, $p' $log_file_path | grep -v '%|.*|' > $filtered_log_file_path

s3_log_path="$log_s3_dir_path/$(basename $log_file_path)"
Expand Down
13 changes: 2 additions & 11 deletions bin/training_job_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def run_jobs(self, logging_path, log_s3_dir_path, aws_region):
self.job_queue.loc[job_todo.index, 'job_status'] = 'running'
self.job_queue.to_csv(self.job_queue_file_path, index=False)

logging_path_iteration = self.get_job_logging_path(logging_path)
log_upload_setting = f"--log-path {logging_path_iteration} --log-s3-upload-dir {log_s3_dir_path}"
log_upload_setting = \
f"--log-path {logging_path} --log-s3-upload-dir {log_s3_dir_path} --log-iteration {self.job_counter}"

process_return = subprocess.run(
f"{os.path.expanduser('~/project/run_experiment.sh')} "
Expand All @@ -57,15 +57,6 @@ def is_job_available(self):
self.job_queue = pd.read_csv(self.job_queue_file_path)
return not all(el == 'done' for el in self.job_queue['job_status'])

def get_job_logging_path(self, logging_path):
    """Build a per-job log file path by tagging the filename with the job counter.

    E.g. ``train.log`` with ``job_counter == 3`` becomes ``train_train_job_3.log``.

    Args:
        logging_path (str): path to the base log file; ``~`` is expanded.
            Must have a file extension.

    Returns:
        str: expanded path with ``_train_job_<job_counter>`` inserted before
            the extension.

    Raises:
        ValueError: if ``logging_path`` has no file extension.
    """
    expanded_path = os.path.expanduser(logging_path)
    # splitext (unlike naive str.split('.')) tolerates dots elsewhere in the
    # path, e.g. /home/user.name/run.v1.log — only the final extension is cut.
    root, extension = os.path.splitext(expanded_path)
    if not extension:
        raise ValueError(f"Logging path has no file extension: {logging_path!r}")

    # extension from splitext already includes the leading dot.
    return f'{root}_train_job_{self.job_counter}{extension}'

def add_job(self, experiment_script_file, project_root_path):
if os.path.exists(self.job_queue_file_path):
self.job_queue = pd.read_csv(self.job_queue_file_path)
Expand Down

0 comments on commit 2dacdf5

Please sign in to comment.