In [2]:
def get_slurm_script(code):
    """Wrap a shell command in a complete SLURM batch script.

    The script is made of a fixed header (``#SBATCH`` directives plus
    environment setup), the caller-supplied command, and a fixed footer
    that waits for background jobs and prints the date.

    Args:
        code: Shell text placed between header and footer. It is inserted
            verbatim (formatted like an f-string field), followed by a
            newline and a blank line.

    Returns:
        The full batch script as a single string.
    """
    # Static part preceding the command: resource requests, working
    # directory, log destinations and conda/W&B environment setup.
    header = '''#!/bin/bash
#SBATCH -p rise # partition (queue)
#SBATCH -N 1 # number of nodes requested
#SBATCH -n 1 # number of tasks (i.e. processes)
#SBATCH --cpus-per-task=8 # number of cores per task
#SBATCH --gres=gpu:1
##SBATCH --nodelist=bombe # if you need specific nodes
#SBATCH --exclude=zanino,ace,blaze,flaminio,freddie,luigi,pavia,r[10,16],atlas,como,havoc,steropes,blaze
#SBATCH -t 7-00:00 # time requested (D-HH:MM)
#SBATCH -D /data/yyaoqing/Good_vs_bad_data/NLP_metrics
#SBATCH -o slurm_logs/slurm.%N.%j..out # STDOUT
#SBATCH -e slurm_logs/slurm.%N.%j..err # STDERR
pwd
hostname
date
echo starting job...
source ~/.bashrc
export PATH="/data/yyaoqing/anaconda3/bin:$PATH"
conda activate pytorch-transformer
export WANDB_PROJECT=WW_EVAL_GLUE_MEASURES
export PYTHONUNBUFFERED=1
'''
    # Static part following the command: `wait` blocks until commands the
    # caller put in the background with `&` have finished.
    footer = '''wait
date
'''
    # One blank line separates the header from the command, and one blank
    # line separates the command from the footer.
    return f"{header}\n{code}\n\n{footer}"

In [4]:
# Generate one SLURM batch script per command listed in commands_GLUE.txt.
# Each non-blank line of that file is treated as one shell command.
with open('commands_GLUE.txt', 'r') as f:
    # Skip blank / whitespace-only lines: they carry no command, and a
    # script generated from one would request a GPU allocation and then do
    # nothing. Kept lines are passed through unchanged (including their
    # trailing newline).
    commands = [line for line in f if line.strip()]

# The input file is closed before any output is written. Scripts are
# numbered consecutively from 0 in the order the commands appear.
for file_id, command in enumerate(commands):
    with open(f'GLUE_eval_measures_{file_id}.sh', 'w') as fwrite:
        fwrite.write(get_slurm_script(command))

In [5]:
# ---- Schedule / optimiser settings ------------------------------------------
save_steps = 200
max_steps = 10000
weight_decay = 0.01
# Kept as a string so it is inserted verbatim into flags and folder names.
lr = '2e-5'
#lr = '4e-5'

# ---- Tasks ------------------------------------------------------------------
# Full list, for reference:
#GLUE_tasks = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
# The new GLUE tasks are only for classification
GLUE_tasks = ["mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "wnli"]

# ---- Toggles: each is a (name fragment, command-line fragment) pair ---------
# Switch between randomize/not randomize layers
#randomize = ("_randomize", " --randomize_layers")
randomize = ("", "")

# Switch between different types of models
#ckpt_command = ("", "")
#ckpt_command = ("_BERT", " --model_checkpoint bert-base-uncased")
ckpt_command = ("_ALBERT", " --model_checkpoint albert-base-v2")
#ckpt_command = ("_DISTILBERT", "")

# ---- Derived command pieces -------------------------------------------------
# Despite its name, this is placed at the *start* of every generated command.
common_suffix = "srun -N 1 -n 1 --gres=gpu:1 python"

# Flags appended to the evaluation command.
eval_measure_command = f" --save_steps {save_steps} --max_steps {max_steps}{ckpt_command[1]}"

# Flags appended to the training command.
common_command = (
    f" --pretrain --save_steps {save_steps} --max_steps {max_steps}"
    f" --weight_decay {weight_decay} --lr {lr}{randomize[1]} --eval --eval_ww{ckpt_command[1]}"
)

# Base of the checkpoint folder name; encodes the settings above.
common_name = (
    f"pretrain{ckpt_command[0]}{randomize[0]}_lr_{lr}_wd_{weight_decay}"
    f"_save_steps_{save_steps}_max_steps_{max_steps}"
)

def get_ckpt_folder(task, name):
    """Return the checkpoint path ``../checkpoint/<task>/<name>``.

    Args:
        task: Task identifier used as the first path component.
        name: Run name used as the second path component.

    Returns:
        The relative checkpoint folder path as a string.
    """
    return "../checkpoint/{}/{}".format(task, name)

def get_python_name(task):
    """Map a task identifier to the base name of its Python script.

    Args:
        task: Task identifier.

    Returns:
        ``"SQuAD"`` for ``"SQuAD"``, ``"language_modeling"`` for ``"MLM"``,
        and ``"GLUE"`` for every other value.
    """
    if task == "SQuAD":
        return "SQuAD"
    if task == "MLM":
        return "language_modeling"
    # Anything else is handled by the GLUE script.
    return "GLUE"

def get_commands(task, GLUE_task = None):
    """Print the evaluation command for each randomized-layer setting of a task.

    For ``randomize_layers_num`` in 2 and 3, the checkpoint folder is derived
    from ``common_name`` and three commands are built: a ``mkdir -p``, a
    training command and an evaluation command. Only the evaluation command
    is printed; the other two can be enabled via the commented-out prints.

    Args:
        task: Task identifier. It selects the checkpoint sub-folder and,
            through ``get_python_name``, the training script.
        GLUE_task: If truthy, `` --GLUE_task <GLUE_task>`` is added to the
            training and evaluation commands.

    Returns:
        None. The commands are written to stdout.
    """
    task_flag = f" --GLUE_task {GLUE_task}" if GLUE_task else ""

    for num_layers in [2, 3]:
        ckpt_folder = get_ckpt_folder(task, f"{common_name}_randlayer_{num_layers}")

        mkdir_command = f"mkdir -p {ckpt_folder}"
        train_command = (
            f"{common_suffix} {get_python_name(task)}.py --ckpt_folder {ckpt_folder}"
            f"{common_command}{task_flag} --randomize_layers_num {num_layers}"
            f" 1>{ckpt_folder}/train.log 2>{ckpt_folder}/train.err &"
        )
        eval_command = (
            f"{common_suffix} eval_measures_GLUE.py --ckpt_folder {ckpt_folder}"
            f"{eval_measure_command}{task_flag}"
            f" 1>{ckpt_folder}/eval.log 2>{ckpt_folder}/eval.err &"
        )

        #print(mkdir_command)
        #print(train_command)
        print(eval_command)

# Print the commands in three groups, each preceded by blank lines:
# SQuAD, then every GLUE task, then MLM.

# Group 1: SQuAD (no --GLUE_task flag).
print("\n")
get_commands("SQuAD", GLUE_task=None)

# Group 2: each GLUE task is both the task and the --GLUE_task value.
print("\n")
for task in GLUE_tasks:
    get_commands(task, task)

# Group 3: MLM (no --GLUE_task flag).
print("\n")
get_commands("MLM", GLUE_task=None)



srun -N 1 -n 1 --gres=gpu:1 python eval_measures_GLUE.py --ckpt_folder ../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_2 --save_steps 200 --max_steps 10000 --model_checkpoint albert-base-v2 1>../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_2/eval.log 2>../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_2/eval.err &
srun -N 1 -n 1 --gres=gpu:1 python eval_measures_GLUE.py --ckpt_folder ../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_3 --save_steps 200 --max_steps 10000 --model_checkpoint albert-base-v2 1>../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_3/eval.log 2>../checkpoint/SQuAD/pretrain_ALBERT_lr_2e-5_wd_0.01_save_steps_200_max_steps_10000_randlayer_3/eval.err &


srun -N 1 -n 1 --gres=gpu:1 python eval_measures_GLUE.py --ckpt_folder ../checkpoint/mnli/pretrain