Merge pull request #694 from mv1388/training_job_queue_scheduler
Training job queue scheduler
mv1388 committed Jul 16, 2022
2 parents d5eeb7d + 8bece2d commit b56f846
Showing 5 changed files with 221 additions and 10 deletions.
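
The new scheduler (bin/training_job_scheduler.py, below) keeps its job queue as a plain CSV file on the execution instance, ~/training_job_queue.csv by default. As a minimal sketch with illustrative values only, a queue holding one finished and one waiting job would look roughly like this (columns as created in add_job(); pandas writes the return code as a float because the column holds NaN for unfinished jobs):

    job_status,experiment_script_file,project_root_path,job_return_code,timestamp
    done,aws_run_experiments_project.sh,~/project,0.0,2022-07-16_10-15-30
    waiting,aws_run_experiments_project.sh,~/project,,2022-07-16_10-20-01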
3 changes: 3 additions & 0 deletions bin/AWS/prepare_instance.sh
@@ -147,6 +147,7 @@ if [ $pypi_install == false ]; then
fi
scp -i $key_path download_data.sh $username@$ec2_instance_address:~/project
scp -i $key_path run_experiment.sh $username@$ec2_instance_address:~/project
scp -i $key_path ../training_job_scheduler.py $username@$ec2_instance_address:~

echo "#!/usr/bin/env bash
@@ -166,6 +167,8 @@ pip install -U numpy
pip install --ignore-installed greenlet
pip install seaborn==0.9.0
pip install 'typer[all]'
#conda install -y -c conda-forge jsonnet
#conda install -y -c anaconda seaborn=0.9.0
13 changes: 10 additions & 3 deletions bin/AWS/run_experiment.sh
@@ -4,8 +4,6 @@
# ./finish_prepare_instance.sh
# ./run_experiment.sh (optional: -t / --terminate)

project_root_path=~/project
export PYTHONPATH=${PYTHONPATH}:$project_root_path

# usage function
function usage()
@@ -17,9 +15,10 @@ function usage()
optional arguments:
-t, --terminate the instance will be terminated when training is done
-e, --experiment-script STR name of the experiment bash script to be executed in order to start the training
-p, --project-root STR path to the project root on the execution server/AWS
-l, --log-path STR path to the local log file which will be uploaded to s3
--log-s3-upload-dir STR path to the logs folder on S3 to which the training log should be uploaded
-c, --cleanup-script STR post execution cleanup script
-c, --cleanup-script post execution cleanup script
--aws-region STR create the instance in the specified region. Default is Ireland (eu-west-1)
-h, --help show this help message and exit
@@ -28,6 +27,7 @@ HEREDOC

terminate_cmd=false
experiment_script_file="aws_run_experiments_project.sh"
project_root_path=~/project
log_file_path=
log_s3_dir_path="s3://model-result/training_logs"
post_experiment_run_cleanup=false
@@ -45,6 +45,10 @@ case $key in
experiment_script_file="$2"
shift 2 # past argument value
;;
-p|--project-root)
project_root_path="$2"
shift 2 # past argument value
;;
-l|--log-path)
log_file_path="$2"
shift 2 # past argument value
@@ -73,6 +77,9 @@ case $key in
esac
done


export PYTHONPATH=${PYTHONPATH}:$project_root_path

# Set the region either to Ireland or Frankfurt
export AWS_DEFAULT_REGION=$aws_region

16 changes: 14 additions & 2 deletions bin/AWS/submit_job.sh
@@ -30,6 +30,7 @@ function usage()
-o, --os-name STR username depending on the OS chosen. Default is ubuntu
-t, --terminate the instance will be terminated when training is done
-s, --ssh-start automatically ssh into the instance when the training starts
--without-scheduler run experiment without the training job scheduler
--on-demand create on-demand instance instead of spot instance
--central-region create the instance in the central region (Frankfurt)
--pypi install package from PyPI instead of the local package version
@@ -52,6 +53,7 @@ default_log=false
username="ubuntu"
terminate_cmd=false
ssh_at_start=false
run_with_scheduler=true
spot_instance=true
aws_region="eu-west-1"
local_pypi_install=""
@@ -128,6 +130,10 @@ case $key in
ssh_at_start=true
shift 1 # past argument value
;;
--without-scheduler)
run_with_scheduler=false
shift 1 # past argument value
;;
--on-demand)
spot_instance=false
shift 1 # past argument value
@@ -229,8 +235,14 @@ echo "Preparing instance"
#########################################################
printf "\n========================================================\n"
echo "Running the job"
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; ./run_experiment.sh $terminate_setting --experiment-script $experiment_script_file $log_upload_setting --cleanup-script --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"
if [ "$run_with_scheduler" == true ]; then
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; python ~/training_job_scheduler.py add-job --experiment-script $experiment_script_file ; python ~/training_job_scheduler.py run $terminate_setting $log_upload_setting --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"

else
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; ./run_experiment.sh $terminate_setting --experiment-script $experiment_script_file $log_upload_setting --cleanup-script --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"
fi

echo "Instance IP: $ec2_instance_address"
echo "To easily ssh connect into the running job session execute:"
37 changes: 32 additions & 5 deletions bin/AWS/update_experiments_on_AWS.sh
@@ -8,12 +8,15 @@ function usage()
Usage: ./update_experiments_on_AWS.sh [--address STR] [--project STR]
arguments:
-a, --address STR ec2 instance Public DNS address
-p, --project STR path to the project to be optionally uploaded to the running ec2 instance
-h, --help show this help message and exit
optional arguments:
-k, --key STR path to ssh key
-k, --key STR path to ssh key
--add-job add training job to the running training job scheduler
--experiment-script STR name of the experiment bash script to be executed in order to start the training
--aws-project-root STR path to the aws-based project root
HEREDOC
}
@@ -22,6 +25,14 @@ key_path=$(jq -r '.key_path' configs/my_config.json)
ec2_instance_address=
local_project_path=

add_training_scheduler_job=false
experiment_script_file="aws_run_experiments_project.sh"
aws_project_root_path=~/project

username="ubuntu"
py_env="pytorch_latest_p36"


while [[ $# -gt 0 ]]; do
key="$1"

@@ -38,6 +49,18 @@ case $key in
local_project_path="$2"
shift 2 # past argument value
;;
--add-job)
add_training_scheduler_job=true
shift 1 # past argument value
;;
-e|--experiment-script)
experiment_script_file="$2"
shift 2 # past argument value
;;
--aws-project-root)
aws_project_root_path="$2"
shift 2 # past argument value
;;
-h|--help )
usage;
exit;
@@ -59,4 +82,8 @@ fi

echo Re-uploading project folder $local_project_path

source $local_project_path/AWS_run_scripts/AWS_core_scripts/aws_project_upload.sh $key_path $ec2_instance_address "~/project" $local_project_path
source $local_project_path/AWS_run_scripts/AWS_core_scripts/aws_project_upload.sh $key_path $ec2_instance_address $aws_project_root_path $local_project_path

if [ $add_training_scheduler_job == true ]; then
ssh -i $key_path $username@$ec2_instance_address "source activate $py_env ; python ~/training_job_scheduler.py add-job --experiment-script $experiment_script_file --project-root $aws_project_root_path"
fi
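
For reference, adding a job to a scheduler already running on an instance could then be done from the local machine roughly like this (a sketch only; the address is a placeholder and ~/my_project is a hypothetical local project path):

    ./update_experiments_on_AWS.sh -a <ec2_public_dns_address> -p ~/my_project --add-job -e aws_run_experiments_project.sh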
162 changes: 162 additions & 0 deletions bin/training_job_scheduler.py
@@ -0,0 +1,162 @@
import os
import subprocess
import time
import datetime
import pandas as pd

import typer


class TrainingJobScheduler:
def __init__(self, job_queue_file_path):
"""Model training job queue scheduler
Args:
job_queue_file_path (str): File path of the job queue on the execution server/AWS
"""
self.job_queue_file_path = os.path.expanduser(job_queue_file_path)
self.job_queue = None

self.job_counter = 0

def run_jobs(self, logging_path, log_s3_dir_path, aws_region):
self.job_queue = pd.read_csv(self.job_queue_file_path)

while self.is_job_available():
if len(self.job_queue[self.job_queue['job_status'] == 'running']):
                raise ValueError("A job is already marked as 'running' in the queue; only one job may run at a time")

jobs_waiting = self.job_queue[self.job_queue['job_status'] == 'waiting']
job_todo = jobs_waiting.head(1)

self.job_queue.loc[job_todo.index, 'job_status'] = 'running'
self.job_queue.to_csv(self.job_queue_file_path, index=False)

logging_path_iteration = self.get_job_logging_path(logging_path)
log_upload_setting = f"--log-path {logging_path_iteration} --log-s3-upload-dir {log_s3_dir_path}"

process_return = subprocess.run(
f"{os.path.expanduser('~/project/run_experiment.sh')} "
f"--experiment-script {job_todo.iloc[0]['experiment_script_file']} "
f"--project-root {job_todo.iloc[0]['project_root_path']} "
f"{log_upload_setting} "
f"--cleanup-script "
f"--aws-region {aws_region}",
shell=True
)

            # re-read the queue file to pick up any jobs added to the queue during the training run
self.job_queue = pd.read_csv(self.job_queue_file_path)
self.job_queue.loc[job_todo.index, 'job_status'] = 'done'
self.job_queue.loc[job_todo.index, 'job_return_code'] = process_return.returncode
self.job_queue.to_csv(self.job_queue_file_path, index=False)

self.job_counter += 1

def is_job_available(self):
self.job_queue = pd.read_csv(self.job_queue_file_path)
return not all(el == 'done' for el in self.job_queue['job_status'])

def get_job_logging_path(self, logging_path):
path_extension = os.path.expanduser(logging_path).split('.')
if len(path_extension) != 2:
            raise ValueError(f"Logging path should contain exactly one file extension: {logging_path}")

logging_path, extension = path_extension
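        # e.g. 'training.log' executed as job number 1 becomes 'training_train_job_1.log'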

return f'{logging_path}_train_job_{self.job_counter}.{extension}'

def add_job(self, experiment_script_file, project_root_path):
if os.path.exists(self.job_queue_file_path):
self.job_queue = pd.read_csv(self.job_queue_file_path)
else:
self.job_queue = pd.DataFrame(columns=['job_status', 'experiment_script_file',
'project_root_path', 'job_return_code', 'timestamp'])

self.job_queue = self.job_queue.append({
'job_status': 'waiting',
'experiment_script_file': experiment_script_file,
'project_root_path': project_root_path,
'timestamp': datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
}, ignore_index=True)

self.job_queue.to_csv(self.job_queue_file_path, index=False)

def __str__(self):
return str(pd.read_csv(self.job_queue_file_path))


app = typer.Typer(help='Training Job Scheduler CLI')


@app.command(help='Run the training job execution loop, which works through the jobs waiting in the queue')
def run(
log_path: str = typer.Option(
os.path.expanduser(f"~/project/training_{datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H_%M_%S')}.log"),
help='Logging file path on the execution server'
),
log_s3_upload_dir: str = typer.Option(
's3://model-result/training_logs',
help='Path to the logs folder on S3 to which the training log should be uploaded'
),
aws_region: str = typer.Option(
'eu-west-1',
help='AWS region code'
),
terminate: bool = typer.Option(
False,
help='The instance will be terminated when all the training is done'
),
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
job_scheduler = TrainingJobScheduler(job_queue_file_path)
print('Jobs currently in the queue:')
print(job_scheduler)

job_scheduler.run_jobs(log_path, log_s3_upload_dir, aws_region)

if terminate:
print('Terminating the instance')
subprocess.run(
'aws ec2 terminate-instances --instance-ids $(ec2metadata --instance-id | cut -d " " -f 2)',
shell=True
)


@app.command(help='Add a new training job to the job queue')
def add_job(
experiment_script: str = typer.Option(
'aws_run_experiments_project.sh',
help='Name of the experiment bash script to be executed in order to start the training'
),
project_root: str = typer.Option(
'~/project',
help='Path to the project root on the execution server/AWS'
),
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
job_scheduler = TrainingJobScheduler(job_queue_file_path)
job_scheduler.add_job(experiment_script, project_root)

print('Job added!')
print(job_scheduler)


@app.command(help='List the job queue contents')
def list_queue(
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
print(TrainingJobScheduler(job_queue_file_path))


if __name__ == "__main__":
app()
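
For reference, a minimal end-to-end use of the CLI defined above might look as follows (a sketch relying on the defaults baked into the typer options; follow_up_experiment.sh is a hypothetical script name):

    python ~/training_job_scheduler.py add-job --experiment-script aws_run_experiments_project.sh --project-root ~/project
    python ~/training_job_scheduler.py add-job --experiment-script follow_up_experiment.sh
    python ~/training_job_scheduler.py list-queue
    python ~/training_job_scheduler.py run --terminate

Typer derives the command names from the function names, so add_job and list_queue are invoked as add-job and list-queue.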
