In [16]:
import numpy as np
import pandas as pd
import os
import shutil
from subprocess import Popen, PIPE
from IPython.display import clear_output
import time

In [17]:
# Sweep configuration consumed by make_bash_file() and the submission loop.

# Root directory under which bash scripts, SLURM logs and model outputs are written.
output_dir = '/home/gridsan/shibal/MoEBERT-results'

# GLUE task; passed to run_glue.py as --task_name and used as a path component.
dataset = "mnli"

# Hyper-parameter grid: one job is submitted per combination of these values.
batch_sizes = [16, 32]
weight_decays = [0.0, 0.01]
learning_rates = [1e-5, 2e-5]

# Run version; only affects the SLURM log directory suffix (".../v{version}").
version = 1

In [18]:
def make_bash_file(version, batch_size, weight_decay, learning_rate, experiment):
    """Write the SLURM batch script for one hyper-parameter configuration.

    Reads the module-level ``output_dir`` and ``dataset`` settings, creates the
    script, log and model output directories if they do not exist, and writes a
    bash script that runs ``examples/text-classification/run_glue.py`` with the
    given hyper-parameters.

    Args:
        version: Run version; only used as the ``v{version}`` suffix of the
            SLURM log directory.
        batch_size: Value passed as ``--per_device_train_batch_size``.
        weight_decay: Value passed as ``--weight_decay``.
        learning_rate: Value passed as ``--learning_rate``.
        experiment: Experiment index; used in the script and log directory
            names (``experiment_{experiment}``).

    Returns:
        The path of the generated ``.sh`` file.

    Note:
        The model/log output directory (``saving_dir``) depends only on the
        hyper-parameters, not on ``version`` or ``experiment``, so re-running
        the same configuration writes to the same location.
    """
    config_tag = f"batch_size_{batch_size}_weight_decay_{weight_decay}_lr_{learning_rate}"

    bash_folder_path = f"{output_dir}/bashes/MoEBERT/{dataset}/experiment_{experiment}/"
    bash_file_path = os.path.join(bash_folder_path, f"{config_tag}.sh")
    log_path = f"{output_dir}/logs/MoEBERT/{dataset}/experiment_{experiment}/{config_tag}/v{version}"
    saving_dir = f"{output_dir}/{dataset}/{config_tag}"

    for directory in (bash_folder_path, log_path, saving_dir):
        os.makedirs(directory, exist_ok=True)

    # The swept weight decay is passed exactly once. A second, hard-coded
    # "--weight_decay 0.0" later on the command line would take precedence
    # (the last occurrence of a repeated option wins) and silently disable
    # the sweep.
    # NOTE(review): evaluation runs every 1000 steps while checkpoints are
    # saved per epoch; some transformers versions require the two strategies
    # to match when --load_best_model_at_end is set -- confirm against the
    # forked version used here.
    train_command = (
        "python examples/text-classification/run_glue.py"
        " --model_name_or_path bert-base-uncased"
        f" --task_name {dataset}"
        f" --per_device_train_batch_size {batch_size}"
        f" --weight_decay {weight_decay}"
        f" --learning_rate {learning_rate}"
        " --do_train --do_eval --do_predict"
        " --max_seq_length 128"
        " --num_train_epochs 10"
        f" --output_dir {saving_dir}/model"
        " --logging_steps 20"
        f" --logging_dir {saving_dir}/log"
        " --report_to tensorboard"
        " --evaluation_strategy steps --eval_steps 1000"
        " --save_strategy epoch"
        " --load_best_model_at_end True"
        " --warmup_ratio 0.0"
        " --seed 0"
        " --fp16 \n\n"
    )

    script_lines = [
        # SLURM resource requests and notification settings.
        "#!/bin/bash\n",
        "#SBATCH --gres=gpu:volta:1\n",
        "#SBATCH --cpus-per-task=20\n",
        "#SBATCH --mem=180G\n",
        "#SBATCH --time=4-00:00\n",
        "#SBATCH --mail-type=FAIL\n",
        "#SBATCH --mail-user=shibal@mit.edu\n",
        f"#SBATCH -o {log_path}/%j.out\n",
        f"#SBATCH -e {log_path}/%j.err\n\n",
        # Environment activation and determinism-related settings.
        "source activate moebert\n\n",
        "export num_gpus=1\n",
        # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
        "export CUBLAS_WORKSPACE_CONFIG=':16:8'\n",
        "export PYTHONHASHSEED=0\n",
        # Copy the Hugging Face cache to a second directory and run offline from it.
        "HF_USER_DIR='/home/gridsan/shibal/.cache/huggingface'\n",
        "HF_LOCAL_DIR='/state/partition1/user/shibal/cache/huggingface'\n",
        "mkdir -p $HF_LOCAL_DIR\n",
        "rsync -a --ignore-existing $HF_USER_DIR/ ${HF_LOCAL_DIR}\n",
        "export HF_HOME=${HF_LOCAL_DIR}\n",
        "export TRANSFORMERS_OFFLINE=1\n",
        "export HF_DATASETS_OFFLINE=1\n",
        "export WANDB_DISABLED='true'\n",
        "export BACKEND='pytorch'\n\n",
        "export HDF5_USE_FILE_LOCKING=FALSE\n\n",
        # Run the training script from the project checkout.
        "cd /home/gridsan/shibal/projects/MoEBERT-fork\n\n",
        train_command,
    ]

    with open(bash_file_path, "w") as f:
        f.writelines(script_lines)

    return bash_file_path


In [19]:
# bash_files = []
# bash_files.append(make_bash_file(version))

# torun = [0]
# torun = range(1,6)


In [20]:
# Submit one SLURM job per (batch_size, weight_decay, learning_rate) combination
# and collect the job ids reported by sbatch.

# Seconds to wait before retrying a failed sbatch call.
# NOTE(review): 10000 s is roughly 2.8 hours and blocks the cell for that long
# on every failure -- confirm this is intended (rather than e.g. 10 s).
RETRY_DELAY_SECONDS = 10000

submitted = []  # job ids of successfully submitted jobs, in submission order

hyperparameter_grid = [
    (batch_size, weight_decay, learning_rate)
    for batch_size in batch_sizes
    for weight_decay in weight_decays
    for learning_rate in learning_rates
]

i = 0  # keeps the counter defined even when the grid is empty
for i, (batch_size, weight_decay, learning_rate) in enumerate(hyperparameter_grid, start=1):
    if i % 100 == 0:
        # Keep the cell output from growing without bound on large sweeps.
        clear_output(wait=True)
    print(i)
    sh = make_bash_file(version, batch_size, weight_decay, learning_rate, i)

    # Retry until sbatch accepts the job.
    while True:
        # Pipe stderr as well; otherwise communicate() returns None for it and
        # the reason for a failed submission is never shown.
        process = Popen(["sbatch", sh], stdout=PIPE, stderr=PIPE)
        output, err = process.communicate()
        exit_code = process.returncode  # communicate() has already waited for exit
        stdout_text = output.decode(errors="replace").strip()
        stderr_text = err.decode(errors="replace").strip()
        print(time.strftime('%Y-%m-%d %H:%M:%S'), stdout_text, stderr_text)

        if exit_code == 0:
            print(sh, "submitted!")
            # sbatch reports "Submitted batch job <id>"; take the numeric token
            # instead of a fixed-width slice so ids of any length are handled.
            job_id = next((token for token in stdout_text.split() if token.isdigit()), None)
            if job_id is None:
                # Unexpected output format: keep the raw text so nothing is lost.
                job_id = stdout_text
            print("job id:", job_id)
            submitted.append(job_id)
            break

        time.sleep(RETRY_DELAY_SECONDS)

1
2023-09-06 10:30:34 b'Submitted batch job 23773655\n' None
/home/gridsan/shibal/MoEBERT-results/bashes/MoEBERT/mnli/experiment_1/batch_size_16_weight_decay_0.0_lr_1e-05.sh submitted!
job id: 23773655
2
2023-09-06 10:30:34 b'Submitted batch job 23773656\n' None
/home/gridsan/shibal/MoEBERT-results/bashes/MoEBERT/mnli/experiment_2/batch_size_16_weight_decay_0.0_lr_2e-05.sh submitted!
job id: 23773656
3
2023-09-06 10:30:34 b'Submitted batch job 23773657\n' None
/home/gridsan/shibal/MoEBERT-results/bashes/MoEBERT/mnli/experiment_3/batch_size_16_weight_decay_0.01_lr_1e-05.sh submitted!
job id: 23773657
4
2023-09-06 10:30:34 b'Submitted batch job 23773658\n' None
/home/gridsan/shibal/MoEBERT-results/bashes/MoEBERT/mnli/experiment_4/batch_size_16_weight_decay_0.01_lr_2e-05.sh submitted!
job id: 23773658
5
2023-09-06 10:30:34 b'Submitted batch job 23773659\n' None
/home/gridsan/shibal/MoEBERT-results/bashes/MoEBERT/mnli/experiment_5/batch_size_32_weight_decay_0.0_lr_1e-05.sh submitted!
job i