In [10]:
import subprocess
from concurrent.futures import ThreadPoolExecutor
import time

def run_slurm_job(reg, train_size):
    """Write an sbatch script for one (region, train size) pair and submit it.

    The script is written to ``sbatch_{reg}_{train_size}.sh`` in the current
    working directory and then handed to ``sbatch``. The job itself runs
    ``python main.py with 'reg=...' 'train_size=...'`` inside the conda
    environment activated by the script.

    Args:
        reg: Region identifier; interpolated into the job name, the log file
            names, the script file name and the ``main.py`` arguments.
        train_size: Training-set size; interpolated in the same places.

    Raises:
        FileNotFoundError: If the ``sbatch`` executable cannot be found.
        subprocess.CalledProcessError: If ``sbatch`` exits with a non-zero
            status (the submission was rejected).
    """
    # Local import so this function does not depend on edits to the
    # file-level import block.
    import os

    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={reg}-{train_size} # Job name
#SBATCH --nodes=1 # number of nodes
#SBATCH --ntasks-per-node=1 # number of tasks
#SBATCH --time=0-06:00 # time limit (D-HH:MM)
#SBATCH -p gpuq # partition
#SBATCH --gres=gpu:1 # number of GPUs
#SBATCH --mem=64G # memory pool for all cores
#SBATCH --output ./sbatch/run-%j-{reg}-{train_size}.txt       # Standard out goes to this file
#SBATCH --error ./sbatch/error-%j-{reg}-{train_size}.txt      # Standard err goes to this file

begin=`date +%s`
echo node: $HOSTNAME
echo start time: `date`
echo ...........

source /home/${{USER}}/.bash_profile
conda activate /mnt/beegfs/nragu/tsunami/env
sleep 10
python main.py with 'reg={reg}' 'train_size={train_size}'

echo ...........
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed seconds
"""

    # The --output/--error paths above point into ./sbatch. Slurm is not
    # expected to create that directory itself, so make sure it exists
    # before submitting; otherwise the job's logs have nowhere to go.
    os.makedirs("sbatch", exist_ok=True)

    sbatch_file = f"sbatch_{reg}_{train_size}.sh"
    with open(sbatch_file, "w") as f:
        f.write(sbatch_script)

    # Pass an argument list (no shell) so unusual characters in reg or
    # train_size cannot be interpreted by a shell, and use check=True so a
    # rejected submission raises instead of being silently ignored.
    subprocess.run(["sbatch", sbatch_file], check=True)

# Parameter sweep: run_jobs() submits one Slurm job for every
# (region, train_size) combination. The values are strings that
# run_slurm_job() interpolates into the job name, file names and the
# main.py command line.
train_sizes = ["2400", "1200", "600", "300"]
regions = ["SR", "CT"]

# Number of worker threads used to call sbatch, i.e. how many submissions
# are in flight at once (1 = strictly sequential submission).
# NOTE: sbatch is invoked without --wait, so this only throttles the
# submission calls; how many jobs actually run concurrently is decided by
# the Slurm scheduler, not by this setting.
max_concurrent_jobs = 1

def run_jobs():
    """Submit one Slurm job per (region, train size) combination.

    Each combination from ``regions`` x ``train_sizes`` is queued on a thread
    pool with ``max_concurrent_jobs`` workers, where it is handled by
    ``run_slurm_job``. The function returns once every queued call has
    finished; an exception raised by any call is re-raised here.
    """
    with ThreadPoolExecutor(max_workers=max_concurrent_jobs) as pool:
        pending = []
        # Regions vary slowest, train sizes fastest.
        for reg in regions:
            for size in train_sizes:
                pending.append(pool.submit(run_slurm_job, reg, size))

        # Block on each task in submission order; result() also surfaces
        # any exception raised inside the worker.
        for task in pending:
            task.result()


# Kick off the whole sweep as soon as this cell/script is executed.
# NOTE: this runs at import time too, since it is not behind a __main__ guard.
run_jobs()

Submitted batch job 48899
Submitted batch job 48900
Submitted batch job 48901
Submitted batch job 48902
Submitted batch job 48903
Submitted batch job 48904
Submitted batch job 48905
Submitted batch job 48906
