In [2]:
import os
from idr_pytools import gpu_jobs_submitter

# project root path
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of the opaque TypeError that
            `os.path.join(None, ...)` would raise a few lines later.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; it is required to locate the project files."
        )
    return value


# Both locations are read from the environment.
dsdir = _require_env("DSDIR")
scratch = _require_env("SCRATCH")

# Project directory (under SCRATCH) and local HuggingFace model store (under DSDIR).
root = os.path.join(scratch, "pretrain-med-data-qual")
idr_models_dir = os.path.join(dsdir, "HuggingFace_Models")

# Continual BERT MLM pretraining

## defining arguments


The hardware arguments for each pretraining phase are taken from [NVIDIA Pytorch BERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/README.md#pre-training-nvidia-dgx-a100-8x-a100-80gb) and [NVIDIA Tensorflow BioBERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/biobert/README.md#pre-training).

In [3]:
# Args to modify according to needs
# `debug` takes precedence over `phase`: when it is True the short debug
# configuration below is used whatever the value of `phase`.
debug = True
phase = 1

# Local Paths
pubmed_path = f"{root}/data/pubmed_preproc"
bert_path = f"{idr_models_dir}/bert-base-uncased"
run_mlm_path = f"{root}/pretraining/run_mlm_offline.py"
accuracy_path = f"{root}/pretraining/accuracy.py"

# `{exp_name}` is filled in later with str.format().
# NOTE: the trailing space is kept on purpose; code that appends the formatted
# template directly in front of another flag may depend on it.
out_dir_template = f"{root}/pretraining/{{exp_name}} "

# Pretraining Phases Hardware Arguments
if debug:
    # short run on fewer GPUs to check that the pipeline works
    gpu = "a100"
    n_gpu = 4
    sequence_length = 128
    batch_size = 128
    precision = "fp16"
    max_steps = 100
    model_path = bert_path
    acc_steps = None  # gradient accumulation steps

elif phase == 1:
    gpu = "a100"
    n_gpu = 8
    sequence_length = 128
    precision = "fp16"  # tf32,bf16
    batch_size = 256  # 128 if tf32
    max_steps = 19531
    acc_steps = None  # 32 according to NVIDIA
    model_path = bert_path

elif phase == 2:
    gpu = "a100"
    n_gpu = 8
    sequence_length = 512
    batch_size = 32
    precision = "fp16"  # tf32,bf16
    max_steps = 4340
    acc_steps = None  # 128 according to NVIDIA
    # Literal placeholder, not a real path: it must be replaced by the last
    # checkpoint of phase 1 before the command is run.
    model_path = "{model_path}"  # must be last checkpoint of phase 1

else:
    # Without this branch the variables above would stay undefined (or keep
    # stale values from a previous run of the cell) and the error would only
    # show up later, far from its cause.
    raise ValueError(f"Unsupported pretraining phase: {phase!r} (expected 1 or 2)")

In [4]:
# Flags shared by every pretraining experiment, listed in command-line order.
# Entries carry no trailing space: one is appended to each entry when the list
# is flattened at the end, so experiment-specific flags can be concatenated
# straight after `base_cmd`.
shared_args = [
    # Torchrun (distributed training) Arguments
    "torchrun --standalone",
    f"--nproc_per_node {n_gpu}",
    "--nnodes 1",
    f"{run_mlm_path}",
    # Model Arguments
    f"--model_name_or_path {model_path}",
    # Dataset Arguments
    f"--dataset_name {pubmed_path}",
    f"--metric_path {accuracy_path}",
]

# evaluation set is capped in debug runs only
if debug:
    shared_args.append("--max_eval_samples 25600")

shared_args += [
    "--preprocessing_num_workers 8",
    f"--max_seq_length {sequence_length}",
    # Training Arguments
    ## Basic arguments
    "--do_train",
    "--do_eval",
    "--seed 42",
    "--overwrite_output_dir true",  # always on; the debug-only condition is disabled
    ## BERT hyperparameters
    f"--per_device_train_batch_size {batch_size}",
    f"--per_device_eval_batch_size {batch_size}",
    "--learning_rate 1e-4",
    "--weight_decay 0.01",
    "--adam_beta1 0.9",
    "--adam_beta2 0.999",
    "--adam_epsilon 1e-6",  # RoBERTa
    ## Efficiency / Memory
    f"--{precision} true",
    "--eval_accumulation_steps 1",
]

# gradient accumulation is only passed when a (non-zero) value was configured
if acc_steps:
    shared_args.append(f"--gradient_accumulation_steps {acc_steps}")

shared_args += [
    ## Number of steps / epochs
    f"--max_steps {max_steps}",
    f"--warmup_steps {max_steps//10}",  # warmup for 10% of steps
    ## Evaluation / Logging / Model Save
    "--evaluation_strategy no",  # no evaluation during training only at the end for perplexity
    "--logging_strategy steps",
    "--save_strategy steps",
    "--logging_steps 0.1",
    "--save_steps 0.1",
    "--logging_first_step true",
    "--log_on_each_node false",
    "--save_total_limit 1",
    ## Experiment Visualisation
    "--disable_tqdm true",
    "--report_to wandb",
]

# Flatten into one string; every argument is followed by exactly one space.
base_cmd = "".join(f"{arg} " for arg in shared_args)

# Data Filters experiences
# For each filtering metric: a list of (lower threshold, upper threshold, label).
# The label is only used to build the experiment name.
bounds = {
    "none": [
        (None, None, "all"),
    ],
    "random": [
        (0.0, 0.5, "50%"),
        (0.0, 0.25, "25%"),
    ],
    "h-index": [
        (103, 1400, "top50%"),
        (53, 190, "mid50%"),
        (190, 1400, "top25%"),
        (77, 142, "mid25%"),
    ],
    "sjr": [
        (1.312, 100.0, "top25%"),
        (0.462, 0.984, "mid25%"),
    ],
}

# Literal placeholder that the phase-2 configuration puts in the command in
# place of a real model path.
model_path_placeholder = "{model_path}"


def _last_checkpoint(output_dir):
    """Return the highest-numbered `checkpoint-<step>` folder of `output_dir`.

    Returns None when the directory does not exist or holds no such folder.
    """
    if not os.path.isdir(output_dir):
        return None
    prefix = "checkpoint-"
    steps = [
        int(entry[len(prefix):])
        for entry in os.listdir(output_dir)
        if entry.startswith(prefix) and entry[len(prefix):].isdecimal()
    ]
    if not steps:
        return None
    return os.path.join(output_dir, f"{prefix}{max(steps)}")


cmds = []
exp_names = []
for metric, exps in bounds.items():
    for lower_bound, upper_bound, bound_name in exps:
        cmd = base_cmd
        # filtering metric
        if metric != "none":
            cmd += f"--filter_metric {metric} "
            cmd += f"--filter_lower_threshold {lower_bound} "
            cmd += f"--filter_upper_threshold {upper_bound} "
        # experience name
        exp_name = f"{metric}_{bound_name}_debug" if debug else f"{metric}_{bound_name}_p{phase}"
        # output_dir: strip the template and add the separator explicitly, so the
        # next flag stays separated whether or not the template ends with a space
        output_dir = out_dir_template.format(exp_name=exp_name).strip()
        cmd += f"--output_dir {output_dir} "
        # wandb args
        cmd += f"--wandb_group {exp_name}_{gpu}x{n_gpu} "
        cmd += f"--wandb_name {exp_name} "
        # model resuming for phase 2: start from the last checkpoint written by
        # the phase-1 run of the same experiment (named "<metric>_<label>_p1")
        if phase == 2 and not debug:
            phase1_dir = out_dir_template.format(exp_name=f"{metric}_{bound_name}_p1").strip()
            checkpoint = _last_checkpoint(phase1_dir)
            if checkpoint is not None:
                cmd = cmd.replace(model_path_placeholder, checkpoint)
        # append to lists
        cmds.append(cmd)
        exp_names.append(exp_name)


# Only the last experiment of the grid is kept for submission.
cmds = [cmds[-1]]
exp_names = [exp_names[-1]]

# Never hand a command with an unresolved model path to the scheduler: the job
# would only fail once it starts running.
for kept_name, kept_cmd in zip(exp_names, cmds):
    if model_path_placeholder in kept_cmd:
        raise FileNotFoundError(
            f"No phase-1 checkpoint found for experiment {kept_name}; "
            "run phase 1 first or set the model path manually."
        )

## launching jobs

In [5]:
# Extra #SBATCH directives; {exp_name} is filled in for each experiment so that
# every job writes to its own log files.
slurm_addon_template = """#SBATCH --mail-type=ALL
#SBATCH --output=slurm/log/{exp_name}.out 
#SBATCH --error=slurm/log/{exp_name}.err"""

# Shell lines passed to the submitter as `script_addon` (environment setup).
script_addon = """module load python/3.11.5
conda activate transformers_latest"""

# Make sure the log directory named in the directives above exists.
# NOTE(review): assumes the relative path is resolved from the notebook's
# working directory — confirm.
os.makedirs("slurm/log", exist_ok=True)

# Ids of every submitted job (previously only the last submission was kept).
job_ids = []
for cmd, exp_name in zip(cmds, exp_names):
    # change log filename according to experience
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    # send job
    submitted = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        module="cuda/12.1.0",
        n_gpu=n_gpu,
        qos="qos_gpu-dev" if debug else "qos_gpu-t3",
        constraint="v100-32g" if "v100" in gpu else gpu,
        time_max="20:00:00" if not debug else "2:00:00",
        # use the GPU family only ("v100-32g" -> "v100"), as the NER evaluation
        # submission does; identical to the previous value for "a100"
        account=f"aro@{gpu.split('-')[0]}",
        email="mathieu.lai-king@lisn.upsaclay.fr",
        slurm_addon=slurm_addon,
        script_addon=script_addon,
    )
    job_ids.extend(submitted or [])

batch job 0: 4 GPUs distributed on 1 nodes with 4 tasks / 4 gpus per node and 8 cpus per task
Submitted batch job 1551459


In [6]:
# list the SLURM jobs of the current user (job id, partition, name, state, elapsed time)
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1551459    gpu_p5 sjr_mid2  urz45id PD       0:00      1 (Resources)


In [2]:
# sync Weights & Biases runs stored locally under wandb/offline-*
# TODO : handle distributed logging (one wandb run for each GPU used currently)
!wandb sync --include-offline wandb/offline-*

Find logs at: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/debug-cli.urz45id.log
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.


# blurb ner evaluation

## define arguments

In [None]:
# NOTE: `debug`, `gpu` and `base_cmd` reuse the names of the pretraining
# section; running this cell overwrites those values.
debug = True
gpu = "v100-32g"
# GPU family without the memory suffix ("v100-32g" -> "v100"). The precision
# flags below compare against the family: comparing the full name meant that
# "--fp16 true" was never emitted for "v100-32g".
gpu_family = gpu.split("-")[0]

base_cmd = f"python {root}/evaluation/run_ner.py "

# Model Arguments
base_cmd += f"--model_name_or_path {scratch}/models/bert-base-uncased "
base_cmd += f"--cache_dir {scratch}/hf_cache "

# Dataset Arguments
base_cmd += "--dataset_name bigbio/blurb "
base_cmd += f"--seqeval_metric_path {root}/evaluation/seqeval.py "
base_cmd += "--max_seq_length 512 "
# the next four flags are only passed in debug runs
base_cmd += "--pad_to_max_length true " if debug else ""
base_cmd += "--max_train_samples 64 " if debug else ""
base_cmd += "--max_eval_samples 64 " if debug else ""
base_cmd += "--preprocessing_num_workers 10 " if debug else ""

# Training Arguments
## Basic arguments
base_cmd += "--do_train --do_eval --do_predict "
base_cmd += "--seed 42 "
## Efficiency
base_cmd += "--tf32 true " if gpu_family == "a100" else ""
base_cmd += "--bf16 false "
base_cmd += "--fp16 true " if gpu_family == "v100" else ""
## Batchs
base_cmd += "--per_device_train_batch_size 32 "
base_cmd += "--per_device_eval_batch_size 32 "
## Optimizer (default : linear with warmup)
base_cmd += "--learning_rate 2e-5 " # SciBERT
base_cmd += "--weight_decay 0.01 "
base_cmd += "--adam_beta1 0.9 "
base_cmd += "--adam_beta2 0.98 "
base_cmd += "--warmup_ratio 0.1 "
## Number of steps / epochs
base_cmd += "--num_train_epochs 10 "
## Evaluation / Logging / Model Save
base_cmd += "--logging_strategy steps "
base_cmd += "--evaluation_strategy steps "
base_cmd += "--save_strategy steps "
# debug runs log / evaluate / save at a coarser 0.5 interval instead of 0.1
base_cmd += "--logging_steps 0.1 " if not debug else "--logging_steps 0.5 "
base_cmd += "--eval_steps 0.1 " if not debug else "--eval_steps 0.5 "
base_cmd += "--save_steps 0.1 " if not debug else "--save_steps 0.5 "
base_cmd += "--log_on_each_node false "
# NOTE(review): --local_rank is forced to 0 although the script is started with
# plain `python` — confirm this is intended.
base_cmd += "--local_rank 0 "
base_cmd += "--save_total_limit 3 "
base_cmd += "--load_best_model_at_end true "
## Experiment Visualisation
base_cmd += "--disable_tqdm true "
base_cmd += "--report_to none "

In [None]:
n_seeds = 5 # 10 for BioASQ, BIOSSES, and PubMedQA in PubMedBERT paper and 5 for others
blurb_ner_configs = ["bc5chem","bc5disease","bc2gm","jnlpba","ncbi_disease"]

# One command per (dataset config, seed) pair.
# Each command is built from a fresh per-config prefix: previously the same
# string was extended on every seed iteration, so later commands carried the
# --seed / --output_dir flags of all earlier seeds, and the seed was always 42.
cmds = []
for blurb_config in blurb_ner_configs:
    config_cmd = base_cmd + f"--dataset_config_name {blurb_config} "
    for seed in range(n_seeds):
        if debug:
            # all debug runs share one throw-away output directory
            output_dir = "evaluation/debug_output"
        else:
            output_dir = f"evaluation/bert-base-uncased/{blurb_config}/{seed}"
        cmds.append(config_cmd + f"--seed {seed} --output_dir {output_dir} ")

# In debug mode only the first command is kept; `cmds` stays a list so that its
# type does not depend on the mode.
if debug:
    cmds = cmds[:1]

cmds

## launch jobs

In [None]:
# SLURM job name for the NER evaluation; debug runs get a "-debug" suffix.
job_name = "blurb-ner" + ("-debug" if debug else "")

In [None]:
# Submit the NER evaluation command(s) on a single GPU.
job_ids = gpu_jobs_submitter(
    cmds,  # in debug mode this holds a single command (one job to test only)
    name=job_name,
    module="pytorch-gpu/py3/2.1.1",
    n_gpu=1,
    # Same rule as the pretraining submission: the dev QoS is for short debug
    # runs only, the 5-hour full runs go through the regular QoS.
    qos="qos_gpu-dev" if debug else "qos_gpu-t3",
    time_max="02:00:00" if debug else "5:00:00",
    # the account uses the GPU family only ("v100-32g" -> "v100")
    account=f"aro@{gpu.split('-')[0]}"
)

# clean logs, training debug checkpoints, wandb runs, cache, cancel jobs

In [14]:
# delete the SLURM log files (the slurm/log directory itself is kept)
!rm -rf slurm/log/*

In [16]:
# delete debug training checkpoints
# only touches output directories whose name ends with "_debug": removes the
# checkpoint* entries and the saved weight files (*.bin, *.safetensors)
!rm -rf pretraining/*_debug/checkpoint*
!rm -rf pretraining/*_debug/*.bin
!rm -rf pretraining/*_debug/*.safetensors

In [None]:
# delete wandb run dir
# WARNING: removes the whole local wandb directory, including any offline run
# that has not been synced yet
!rm -rf wandb

In [None]:
# remove data cache and tmp files
# (data/.cache, plus the cache-*.arrow and tmp* entries inside each
# sub-directory of data/pubmed_preproc)
!rm -rf data/.cache
!rm -rf data/pubmed_preproc/*/cache-*.arrow
!rm -rf data/pubmed_preproc/*/tmp*

In [None]:
# delete core-python-* entries in the working directory
# (presumably core dumps left by crashed Python processes — TODO confirm)
!rm -rf core-python-*

In [None]:
# cancel all my jobs
# WARNING: this targets every SLURM job of the current user, not only the jobs
# submitted from this notebook
!scancel -u $USER