Merge pull request #694 from mv1388/training_job_queue_scheduler
Training job queue scheduler
mv1388 committed Jul 16, 2022
2 parents d5eeb7d + 8bece2d commit b56f846
Showing 5 changed files with 221 additions and 10 deletions.
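
The new scheduler (bin/training_job_scheduler.py, below) keeps its job queue as a plain CSV file on the execution instance, ~/training_job_queue.csv by default. As a minimal sketch with illustrative values only, a queue holding one finished and one waiting job would look roughly like this (columns as created in add_job(); pandas writes the return code as a float because the column holds NaN for unfinished jobs):

    job_status,experiment_script_file,project_root_path,job_return_code,timestamp
    done,aws_run_experiments_project.sh,~/project,0.0,2022-07-16_10-15-30
    waiting,aws_run_experiments_project.sh,~/project,,2022-07-16_10-20-01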
3 changes: 3 additions & 0 deletions bin/AWS/prepare_instance.sh
@@ -147,6 +147,7 @@ if [ $pypi_install == false ]; then
fi
scp -i $key_path download_data.sh $username@$ec2_instance_address:~/project
scp -i $key_path run_experiment.sh $username@$ec2_instance_address:~/project
scp -i $key_path ../training_job_scheduler.py $username@$ec2_instance_address:~

echo "#!/usr/bin/env bash
@@ -166,6 +167,8 @@ pip install -U numpy
pip install --ignore-installed greenlet
pip install seaborn==0.9.0
pip install 'typer[all]'
#conda install -y -c conda-forge jsonnet
#conda install -y -c anaconda seaborn=0.9.0
13 changes: 10 additions & 3 deletions bin/AWS/run_experiment.sh
@@ -4,8 +4,6 @@
# ./finish_prepare_instance.sh
# ./run_experiment.sh (optional: -t / --terminate)

project_root_path=~/project
export PYTHONPATH=${PYTHONPATH}:$project_root_path

# usage function
function usage()
@@ -17,9 +15,10 @@ function usage()
optional arguments:
-t, --terminate the instance will be terminated when training is done
-e, --experiment-script STR name of the experiment bash script to be executed in order to start the training
-p, --project-root STR path to the project root on the execution server/AWS
-l, --log-path STR path to the local log file which will be uploaded to s3
--log-s3-upload-dir STR path to the logs folder on S3 to which the training log should be uploaded
-c, --cleanup-script STR post execution cleanup script
-c, --cleanup-script post execution cleanup script
--aws-region STR create the instance in the specified region. Default is Ireland (eu-west-1)
-h, --help show this help message and exit
@@ -28,6 +27,7 @@ HEREDOC

terminate_cmd=false
experiment_script_file="aws_run_experiments_project.sh"
project_root_path=~/project
log_file_path=
log_s3_dir_path="s3://model-result/training_logs"
post_experiment_run_cleanup=false
@@ -45,6 +45,10 @@ case $key in
experiment_script_file="$2"
shift 2 # past argument value
;;
-p|--project-root)
project_root_path="$2"
shift 2 # past argument value
;;
-l|--log-path)
log_file_path="$2"
shift 2 # past argument value
@@ -73,6 +77,9 @@ case $key in
esac
done


export PYTHONPATH=${PYTHONPATH}:$project_root_path

# Set the region either to Ireland or Frankfurt
export AWS_DEFAULT_REGION=$aws_region

16 changes: 14 additions & 2 deletions bin/AWS/submit_job.sh
@@ -30,6 +30,7 @@ function usage()
-o, --os-name STR username depending on the OS chosen. Default is ubuntu
-t, --terminate the instance will be terminated when training is done
-s, --ssh-start automatically ssh into the instance when the training starts
--without-scheduler run experiment without the training job scheduler
--on-demand create on-demand instance instead of spot instance
--central-region create the instance in the central region (Frankfurt)
--pypi install package from PyPI instead of the local package version
@@ -52,6 +53,7 @@ default_log=false
username="ubuntu"
terminate_cmd=false
ssh_at_start=false
run_with_scheduler=true
spot_instance=true
aws_region="eu-west-1"
local_pypi_install=""
@@ -128,6 +130,10 @@ case $key in
ssh_at_start=true
shift 1 # past argument value
;;
--without-scheduler)
run_with_scheduler=false
shift 1 # past argument value
;;
--on-demand)
spot_instance=false
shift 1 # past argument value
@@ -229,8 +235,14 @@ echo "Preparing instance"
#########################################################
printf "\n========================================================\n"
echo "Running the job"
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; ./run_experiment.sh $terminate_setting --experiment-script $experiment_script_file $log_upload_setting --cleanup-script --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"
if [ "$run_with_scheduler" == true ]; then
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; python ~/training_job_scheduler.py add-job --experiment-script $experiment_script_file ; python ~/training_job_scheduler.py run $terminate_setting $log_upload_setting --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"

else
ssh -i $key_path $username@$ec2_instance_address \
"source activate $py_env ; tmux new-session -d -s 'training' './finish_prepare_instance.sh ; cd project ; ./run_experiment.sh $terminate_setting --experiment-script $experiment_script_file $log_upload_setting --cleanup-script --aws-region $aws_region' \; pipe-pane 'cat > $logging_path'"
fi

echo "Instance IP: $ec2_instance_address"
echo "To easily ssh connect into the running job session execute:"
37 changes: 32 additions & 5 deletions bin/AWS/update_experiments_on_AWS.sh
@@ -8,12 +8,15 @@ function usage()
Usage: ./update_experiments_on_AWS.sh [--address STR] [--project STR]
arguments:
-a, --address STR ec2 instance Public DNS address
-p, --project STR path to the project to be optionally uploaded to the running ec2 instance
-h, --help show this help message and exit
optional arguments:
-k, --key STR path to ssh key
-k, --key STR path to ssh key
--add-job add training job to the running training job scheduler
--experiment-script STR name of the experiment bash script to be executed in order to start the training
--aws-project-root STR path to the aws-based project root
HEREDOC
}
@@ -22,6 +25,14 @@ key_path=$(jq -r '.key_path' configs/my_config.json)
ec2_instance_address=
local_project_path=

add_training_scheduler_job=false
experiment_script_file="aws_run_experiments_project.sh"
aws_project_root_path=~/project

username="ubuntu"
py_env="pytorch_latest_p36"


while [[ $# -gt 0 ]]; do
key="$1"

@@ -38,6 +49,18 @@ case $key in
local_project_path="$2"
shift 2 # past argument value
;;
--add-job)
add_training_scheduler_job=true
shift 1 # past argument value
;;
-e|--experiment-script)
experiment_script_file="$2"
shift 2 # past argument value
;;
--aws-project-root)
aws_project_root_path="$2"
shift 2 # past argument value
;;
-h|--help )
usage;
exit;
@@ -59,4 +82,8 @@ fi

echo Re-uploading project folder $local_project_path

source $local_project_path/AWS_run_scripts/AWS_core_scripts/aws_project_upload.sh $key_path $ec2_instance_address "~/project" $local_project_path
source $local_project_path/AWS_run_scripts/AWS_core_scripts/aws_project_upload.sh $key_path $ec2_instance_address $aws_project_root_path $local_project_path

if [ $add_training_scheduler_job == true ]; then
ssh -i $key_path $username@$ec2_instance_address "source activate $py_env ; python ~/training_job_scheduler.py add-job --experiment-script $experiment_script_file --project-root $aws_project_root_path"
fi
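
For reference, adding a job to a scheduler already running on an instance could then be done from the local machine roughly like this (a sketch only; the address is a placeholder and ~/my_project is a hypothetical local project path):

    ./update_experiments_on_AWS.sh -a <ec2_public_dns_address> -p ~/my_project --add-job -e aws_run_experiments_project.sh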
162 changes: 162 additions & 0 deletions bin/training_job_scheduler.py
@@ -0,0 +1,162 @@
import os
import subprocess
import time
import datetime
import pandas as pd

import typer


class TrainingJobScheduler:
def __init__(self, job_queue_file_path):
"""Model training job queue scheduler
Args:
job_queue_file_path (str): File path of the job queue on the execution server/AWS
"""
self.job_queue_file_path = os.path.expanduser(job_queue_file_path)
self.job_queue = None

self.job_counter = 0

def run_jobs(self, logging_path, log_s3_dir_path, aws_region):
self.job_queue = pd.read_csv(self.job_queue_file_path)

while self.is_job_available():
if len(self.job_queue[self.job_queue['job_status'] == 'running']):
                raise ValueError("A job is already marked as 'running' in the queue; only one job may run at a time")

jobs_waiting = self.job_queue[self.job_queue['job_status'] == 'waiting']
job_todo = jobs_waiting.head(1)

self.job_queue.loc[job_todo.index, 'job_status'] = 'running'
self.job_queue.to_csv(self.job_queue_file_path, index=False)

logging_path_iteration = self.get_job_logging_path(logging_path)
log_upload_setting = f"--log-path {logging_path_iteration} --log-s3-upload-dir {log_s3_dir_path}"

process_return = subprocess.run(
f"{os.path.expanduser('~/project/run_experiment.sh')} "
f"--experiment-script {job_todo.iloc[0]['experiment_script_file']} "
f"--project-root {job_todo.iloc[0]['project_root_path']} "
f"{log_upload_setting} "
f"--cleanup-script "
f"--aws-region {aws_region}",
shell=True
)

            # re-read the queue file to pick up any jobs added to the queue during the training run
self.job_queue = pd.read_csv(self.job_queue_file_path)
self.job_queue.loc[job_todo.index, 'job_status'] = 'done'
self.job_queue.loc[job_todo.index, 'job_return_code'] = process_return.returncode
self.job_queue.to_csv(self.job_queue_file_path, index=False)

self.job_counter += 1

def is_job_available(self):
self.job_queue = pd.read_csv(self.job_queue_file_path)
return not all(el == 'done' for el in self.job_queue['job_status'])

def get_job_logging_path(self, logging_path):
path_extension = os.path.expanduser(logging_path).split('.')
if len(path_extension) != 2:
            raise ValueError(f"Logging path should contain exactly one file extension: {logging_path}")

logging_path, extension = path_extension
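        # e.g. 'training.log' executed as job number 1 becomes 'training_train_job_1.log'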

return f'{logging_path}_train_job_{self.job_counter}.{extension}'

def add_job(self, experiment_script_file, project_root_path):
if os.path.exists(self.job_queue_file_path):
self.job_queue = pd.read_csv(self.job_queue_file_path)
else:
self.job_queue = pd.DataFrame(columns=['job_status', 'experiment_script_file',
'project_root_path', 'job_return_code', 'timestamp'])

self.job_queue = self.job_queue.append({
'job_status': 'waiting',
'experiment_script_file': experiment_script_file,
'project_root_path': project_root_path,
'timestamp': datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
}, ignore_index=True)

self.job_queue.to_csv(self.job_queue_file_path, index=False)

def __str__(self):
return str(pd.read_csv(self.job_queue_file_path))


app = typer.Typer(help='Training Job Scheduler CLI')


@app.command(help='Run the training job execution loop, which works through the jobs waiting in the queue')
def run(
log_path: str = typer.Option(
os.path.expanduser(f"~/project/training_{datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H_%M_%S')}.log"),
help='Logging file path on the execution server'
),
log_s3_upload_dir: str = typer.Option(
's3://model-result/training_logs',
help='Path to the logs folder on S3 to which the training log should be uploaded'
),
aws_region: str = typer.Option(
'eu-west-1',
help='AWS region code'
),
terminate: bool = typer.Option(
False,
help='The instance will be terminated when all the training is done'
),
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
job_scheduler = TrainingJobScheduler(job_queue_file_path)
print('Jobs currently in the queue:')
print(job_scheduler)

job_scheduler.run_jobs(log_path, log_s3_upload_dir, aws_region)

if terminate:
print('Terminating the instance')
subprocess.run(
'aws ec2 terminate-instances --instance-ids $(ec2metadata --instance-id | cut -d " " -f 2)',
shell=True
)


@app.command(help='Add a new training job to the job queue')
def add_job(
experiment_script: str = typer.Option(
'aws_run_experiments_project.sh',
help='Name of the experiment bash script to be executed in order to start the training'
),
project_root: str = typer.Option(
'~/project',
help='Path to the project root on the execution server/AWS'
),
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
job_scheduler = TrainingJobScheduler(job_queue_file_path)
job_scheduler.add_job(experiment_script, project_root)

print('Job added!')
print(job_scheduler)


@app.command(help='List the job queue contents')
def list_queue(
job_queue_file_path: str = typer.Option(
'~/training_job_queue.csv',
help='File path of the job queue on the execution server/AWS'
)
):
print(TrainingJobScheduler(job_queue_file_path))


if __name__ == "__main__":
app()
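
For reference, a minimal end-to-end use of the CLI defined above might look as follows (a sketch relying on the defaults baked into the typer options; follow_up_experiment.sh is a hypothetical script name):

    python ~/training_job_scheduler.py add-job --experiment-script aws_run_experiments_project.sh --project-root ~/project
    python ~/training_job_scheduler.py add-job --experiment-script follow_up_experiment.sh
    python ~/training_job_scheduler.py list-queue
    python ~/training_job_scheduler.py run --terminate

Typer derives the command names from the function names, so add_job and list_queue are invoked as add-job and list-queue.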
