In [4]:
import os
from idr_pytools import gpu_jobs_submitter

# project root path
# Both locations are read from the environment. os.getenv returns None for an
# unset variable, which would otherwise surface later as an opaque TypeError in
# os.path.join, so the missing variable(s) are reported explicitly here.
dsdir = os.getenv("DSDIR")
scratch = os.getenv("SCRATCH")
missing_env = [
    name for name, value in (("DSDIR", dsdir), ("SCRATCH", scratch)) if value is None
]
if missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_env)}"
    )

# root: working directory of this project (under SCRATCH)
root = os.path.join(scratch,"pretrain-med-data-qual")
# idr_models_dir: directory holding the locally available HuggingFace models (under DSDIR)
idr_models_dir = os.path.join(dsdir,"HuggingFace_Models")

# bert mlm pretrain

## computing number of optimal steps and grad accumulation

We define the optimal number of steps as the number of steps required to perform one full epoch over the training split (95% of the tokens) of the full PubMed dataset (Baseline, last updated January 2024).
Following RoBERTa, we aim for an effective batch size of 8192 sequences.

The Pretraining Phases Hardware arguments are taken from [NVIDIA Pytorch BERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/README.md#pre-training-nvidia-dgx-a100-8x-a100-80gb) and [NVIDIA Tensorflow BioBERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/biobert/README.md#pre-training)

In [5]:
# Gradient accumulation
# Number of micro-batches to accumulate so that
# gpu_nb * max_batch_size_per_gpu * gradient_accumulation == target_batch_size.
sequence_length = 512
gpu_model = "a100"
max_batch_size_per_gpu = 32 # on A100 with 512 seq length
gpu_nb = 2
target_batch_size = 8192
gradient_accumulation = target_batch_size // (gpu_nb*max_batch_size_per_gpu)
print(f"Gradient accumulation needed for effective batch size of {target_batch_size}, with {gpu_nb}*{gpu_model} GPUs with per device batch size of {max_batch_size_per_gpu} = ",gradient_accumulation)
# Optimal steps number
# One optimizer step consumes target_batch_size sequences of sequence_length tokens;
# the optimal step count is the number of such steps needed to see the train tokens once.
total_token_nb =  15888466068 # calculated number of tokens in pubmed
train_token_nb = 0.95*total_token_nb # 95% of the tokens are used for training
token_per_step = target_batch_size * sequence_length
# train_token_nb is a float, so `//` yields a float (3598.0); a step count must be an
# integer to be usable as-is (e.g. as a --max_steps value).
optimal_train_step_nb = int(train_token_nb // token_per_step)
print(f"Number of tokens per step (in an effective batch, with grad acc and gpu parrallel) : {token_per_step}")
print(f"Optimal step number with effective batch size of {target_batch_size} : {optimal_train_step_nb}")

Gradient accumulation needed for effective batch size of 8192, with 2*a100 GPUs with per device batch size of 32 =  128
Number of tokens per step (in an effective batch, with grad acc and gpu parrallel) : 4194304
Optimal step number with effective batch size of 8192 : 3598.0


## defining arguments

In [9]:
# Args to modify according to needs
# debug=True switches to a short run (fewer steps, less accumulation) for quick checks.
debug = True 

# Local Paths
# Paths are stored without trailing whitespace: separators between command-line
# arguments are added where the command is assembled, so a path value can also be
# reused safely for filesystem operations.
pubmed_path = f"{root}/data/pubmed_preproc"
bert_path = f"{idr_models_dir}/bert-base-uncased"
run_mlm_path = f"{root}/pretraining/run_mlm_offline.py"
accuracy_path = f"{root}/pretraining/accuracy.py"
# {{exp_name}} is left as a literal "{exp_name}" placeholder, filled later with str.format
out_dir_template = f"{root}/pretraining/{{exp_name}}"

# Pretraining Phases Hardware Arguments 
gpu = "a100"
n_gpu = 2
sequence_length = 512
batch_size = 32 # per device
precision = "fp16"
# 3598 steps / 128 accumulation steps are the values printed by the
# "computing number of optimal steps and grad accumulation" cell for 2 GPUs.
max_steps = 3598 if not debug else 100
acc_steps = 128 if not debug else 2

In [10]:
# The shared part of every pretraining command is assembled as a list of
# fragments (each one already ends with its separating space) and joined once.

# Torchrun launcher: single node, one worker process per GPU
launcher_args = [
    "torchrun --standalone ",
    f"--nproc_per_node {n_gpu} ",
    "--nnodes 1 ",
    f"{run_mlm_path} ",
]

# Model to start from
model_args = [f"--model_name_or_path {bert_path} "]

# Dataset and evaluation metric (evaluation set is capped in debug mode only)
data_args = [
    f"--dataset_name {pubmed_path} ",
    f"--metric_path {accuracy_path} ",
    "--max_eval_samples 25600 " if debug else "",
    "--preprocessing_num_workers 8 ",
    f"--max_seq_length {sequence_length} ",
]

# Basic training switches (output dir may be overwritten in debug mode only)
basic_args = [
    "--do_train ",
    "--do_eval ",
    "--seed 42 ",
    "--overwrite_output_dir true " if debug else "",
]

# BERT optimisation hyperparameters
hyper_args = [
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 1e-4 ",
    "--weight_decay 0.01 ",
    "--adam_beta1 0.9 ",
    "--adam_beta2 0.999 ",
    "--adam_epsilon 1e-6 ",  # RoBERTa
]

# Efficiency / memory (gradient accumulation is omitted when acc_steps is falsy)
memory_args = [
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
    f"--gradient_accumulation_steps {acc_steps} " if acc_steps else "",
]

# Training length: warmup lasts 10% of the steps
schedule_args = [
    f"--max_steps {max_steps} ",
    f"--warmup_steps {max_steps//10} ",
]

# Evaluation / logging / checkpointing.
# No evaluation during training, only at the end (for perplexity).
tracking_args = [
    "--evaluation_strategy no ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--logging_steps 0.01 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--log_on_each_node false ",
    "--save_total_limit 2 ",
]

# Experiment visualisation
report_args = [
    "--disable_tqdm true ",
    "--report_to wandb ",
]

base_cmd = "".join(
    launcher_args
    + model_args
    + data_args
    + basic_args
    + hyper_args
    + memory_args
    + schedule_args
    + tracking_args
    + report_args
)

# Data-filter experiments: for each filtering metric, a list of
# (lower threshold, upper threshold, short label) triples.
# "none" is the unfiltered run.
bounds = {
    "none": [
        (None, None, "all"),
    ],
    "random": [
        (0.0, 0.5, "50%"),
        (0.0, 0.25, "25%"),
    ],
    "h-index": [
        (103, 1400, "top50%"),
        (53, 190, "mid50%"),
        (190, 1400, "top25%"),
        (77, 142, "mid25%"),
    ],
    "sjr": [
        (1.312, 100.0, "top25%"),
        (0.462, 0.984, "mid25%"),
        (0.759, 100.0, "top50%"),
    ],
}

cmds, exp_names = [], []
for metric_name, experiments in bounds.items():
    for low, high, label in experiments:
        # Unfiltered data gets --streaming; any other metric gets its filter thresholds.
        if metric_name == "none":
            filter_args = "--streaming "
        else:
            filter_args = (
                f"--filter_metric {metric_name} "
                f"--filter_lower_threshold {low} "
                f"--filter_upper_threshold {high} "
            )

        # Experiment name, also used for the output directory and the wandb run
        exp_name = f"{metric_name}_{label}" + ("_debug" if debug else "")
        out_dir = out_dir_template.format(exp_name=exp_name)

        cmd = (
            base_cmd
            + filter_args
            + f"--output_dir  {out_dir} "
            + f"--wandb_group {exp_name}_{gpu}x{n_gpu} "
            + f"--wandb_name {exp_name} "
        )
        cmds.append(cmd)
        exp_names.append(exp_name)

# List the experiments with their index (used to pick one in debug mode)
for idx, name in enumerate(exp_names):
    print(idx, ":", name)

0 : none_all_debug
1 : random_50%_debug
2 : random_25%_debug
3 : h-index_top50%_debug
4 : h-index_mid50%_debug
5 : h-index_top25%_debug
6 : h-index_mid25%_debug
7 : sjr_top25%_debug
8 : sjr_mid25%_debug
9 : sjr_top50%_debug


In [11]:
# In debug mode only one experiment is submitted: the one at index debug_ind in
# the list printed by the command-building cell.
if debug:
    debug_ind = 9
    n_exp = len(cmds)
    if -n_exp <= debug_ind < n_exp:
        cmds = [cmds[debug_ind]]
        exp_names = [exp_names[debug_ind]]
    elif n_exp == 1:
        # This cell overwrites cmds/exp_names, so running it a second time sees the
        # already-narrowed one-element lists; keep them instead of raising IndexError.
        print("Debug selection already applied; re-run the command-building cell to pick another experiment.")
    else:
        raise IndexError(
            f"debug_ind={debug_ind} is out of range for {n_exp} experiment(s)"
        )
    print(cmds)
    print(exp_names)

['torchrun --standalone --nproc_per_node 4 --nnodes 1 /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/pretraining/run_mlm_offline.py  --model_name_or_path /gpfsdswork/dataset/HuggingFace_Models/bert-base-uncased --dataset_name /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/data/pubmed_preproc --metric_path /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/pretraining/accuracy.py --max_eval_samples 25600 --preprocessing_num_workers 8 --max_seq_length 512 --do_train --do_eval --seed 42 --overwrite_output_dir true --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 1e-4 --weight_decay 0.01 --adam_beta1 0.9 --adam_beta2 0.999 --adam_epsilon 1e-6 --fp16 true --eval_accumulation_steps 2 --gradient_accumulation_steps 2 --max_steps 100 --warmup_steps 10 --evaluation_strategy no --logging_strategy steps --save_strategy steps --logging_steps 0.01 --save_steps 0.1 --logging_first_step true --log_on_each_node false --save_total_limit 2 --disable_tqdm true

## launching jobs

In [12]:
# Extra #SBATCH directives added to each job; {exp_name} is filled in per experiment
# so that every job writes its own stdout/stderr files under slurm/log/.
# NOTE(review): some experiment names contain '%' (e.g. "random_50%_debug") and sbatch
# gives '%' a special meaning in --output/--error filename patterns — confirm the log
# files get the expected names.
slurm_addon_template = """#SBATCH --mail-type=ALL
#SBATCH --output=slurm/log/{exp_name}.out 
#SBATCH --error=slurm/log/{exp_name}.err"""

# Shell lines passed as script_addon (environment setup); plain string, nothing to interpolate.
script_addon = """module load python/3.11.5
conda activate transformers_latest"""

# Value returned by gpu_jobs_submitter for each experiment, keyed by experiment name
# (job_ids alone only holds the result of the last submission).
submitted_job_ids = {}

# NOTE(review): cmd already starts its own workers through torchrun --nproc_per_node;
# check how gpu_jobs_submitter maps n_gpu to SLURM tasks so that the launcher is not
# started once per GPU.
for cmd,exp_name in zip(cmds, exp_names) :
    # change log filename according to experience
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    # send job
    job_ids = gpu_jobs_submitter(
        cmd,
        name = exp_name,
        module = "cuda/12.1.0",
        n_gpu = n_gpu,
        qos = "qos_gpu-dev" if debug else "qos_gpu-t3",
        constraint = "v100-32g" if "v100" in gpu else gpu,
        time_max="20:00:00" if not debug else "2:00:00",
        account=f"aro@{gpu}",
        email="mathieu.lai-king@lisn.upsaclay.fr",
        slurm_addon=slurm_addon,
        script_addon=script_addon,
    )
    submitted_job_ids[exp_name] = job_ids

batch job 0: 4 GPUs distributed on 1 nodes with 4 tasks / 4 gpus per node and 8 cpus per task
Submitted batch job 1687501


# blurb ner evaluation

## define arguments

In [17]:
# Args to modify according to needs
# debug=True keeps a single experiment and uses the dev QOS when jobs are submitted.
debug = False 

# Local Paths
# Paths are stored without trailing whitespace: separators between command-line
# arguments are added where the command is assembled.
cache_dir = f"{root}/evaluation/cache"
run_ner_path = f"{root}/evaluation/run_ner_offline.py"
seqeval_path = f"{root}/evaluation/evaluate_seqeval.py"
# {{exp_name}} is left as a literal "{exp_name}" placeholder, filled later with str.format
out_dir_template = f"{root}/evaluation/out/{{exp_name}}"

# Models to evaluate (one set of runs per entry)
models_paths = [
    f"{idr_models_dir}/bert-base-uncased",
]

# NER fine-tuning Hardware Arguments 
gpu = "v100"
n_gpu = 1
max_seq_length = 512
batch_size = 16 # per device
precision = "fp16"

In [18]:
# The shared part of every NER command is assembled as a list of fragments
# (each one already ends with its separating space) and joined once.

# Launcher: the NER script is run directly with python
launcher_args = [f"python {run_ner_path} "]

# Cache location
cache_args = [f"--cache_dir {cache_dir} "]

# Dataset preprocessing and metric script
data_args = [
    "--preprocessing_num_workers 8 ",
    f"--max_seq_length {max_seq_length} ",
    f"--seqeval_path {seqeval_path} ",
]

# Basic training switches (output dir may be overwritten in debug mode only)
basic_args = [
    "--do_train --do_eval --do_predict ",
    "--overwrite_output_dir true " if debug else "",
]

# Hyperparameters
hyper_args = [
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 3e-5 ",
]

# Efficiency / memory
memory_args = [
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
]

# Training length
schedule_args = [
    "--num_train_epochs 3 ",
    "--warmup_ratio 0.1 ",
]

# Evaluation / logging / checkpointing, all on the same step-based schedule
tracking_args = [
    "--evaluation_strategy steps ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--eval_steps 0.1 ",
    "--logging_steps 0.1 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--save_total_limit 2 ",
    "--load_best_model_at_end true ",
]

# Experiment visualisation
report_args = [
    "--disable_tqdm true ",
    "--report_to wandb ",
]

base_cmd = "".join(
    launcher_args
    + cache_args
    + data_args
    + basic_args
    + hyper_args
    + memory_args
    + schedule_args
    + tracking_args
    + report_args
)

# Experiment grid: every model x every (dataset, config) pair x every seed
datasets_configs = [
    ("bigbio/blurb", "bc5chem"),
    ("bigbio/blurb", "bc5disease"),
    ("bigbio/blurb", "bc2gm"),
    ("bigbio/blurb", "jnlpba"),
    ("bigbio/blurb", "ncbi_disease"),
    #("bigbio/ebm_pico", None),
]
seed_nb = 5

cmds, exp_names = [], []

for model_path in models_paths:
    model_tag = model_path.split('/')[-1]
    for dataset_name, dataset_config in datasets_configs:
        # TODO : special treatment for EBM-NLP PICO
        # The config name is optional: it is added to the experiment name and
        # to the command only when it is set.
        dataset_tag = dataset_name.split('/')[-1]
        config_arg = ""
        if dataset_config:
            dataset_tag = f"{dataset_tag}-{dataset_config}"
            config_arg = f"--dataset_config_name {dataset_config} "

        for seed in range(seed_nb):
            # Experiment name, also used for the output directory and the wandb run
            exp_name = f"{model_tag}_{dataset_tag}_seed{seed}"
            if debug:
                exp_name += "_debug"
            out_dir = out_dir_template.format(exp_name=exp_name)

            # A run whose prediction results already exist is not submitted again
            if os.path.exists(os.path.join(out_dir, "predict_results.json")):
                print(exp_name, "already finished")
                continue

            cmd = (
                base_cmd
                + f"--dataset_name {dataset_name} "
                + config_arg
                + f"--seed {seed} "
                + f"--model_name_or_path {model_path} "
                + f"--output_dir {out_dir} "
                + f"--run_name {exp_name}"
            )
            cmds.append(cmd)
            exp_names.append(exp_name)

# Display experiences and chosen debug
for idx, name in enumerate(exp_names):
    print(idx, ":", name)
if debug:
    # Index refers to the list printed just above. That list shrinks as experiments
    # finish (finished ones are skipped), so the index is validated before use.
    i = 24 # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    if not cmds:
        # Every experiment is already finished: nothing is left to submit.
        print("No remaining experiment to debug")
    elif not -len(cmds) <= i < len(cmds):
        raise IndexError(
            f"debug index i={i} is out of range for {len(cmds)} remaining experiment(s)"
        )
    else:
        cmds = [cmds[i]]
        exp_names = [exp_names[i]]
        print("--------------------------")
        print(f"Debugging with only exp n°{i}")
        print(exp_names)

bert-base-uncased_blurb-bc5chem_seed0 already finished
bert-base-uncased_blurb-bc5chem_seed1 already finished
bert-base-uncased_blurb-bc5chem_seed2 already finished
bert-base-uncased_blurb-bc5chem_seed3 already finished
bert-base-uncased_blurb-bc5chem_seed4 already finished
bert-base-uncased_blurb-bc5disease_seed1 already finished
bert-base-uncased_blurb-bc5disease_seed2 already finished
bert-base-uncased_blurb-bc5disease_seed3 already finished
bert-base-uncased_blurb-bc2gm_seed0 already finished
bert-base-uncased_blurb-bc2gm_seed2 already finished
bert-base-uncased_blurb-bc2gm_seed4 already finished
bert-base-uncased_blurb-jnlpba_seed1 already finished
bert-base-uncased_blurb-jnlpba_seed2 already finished
bert-base-uncased_blurb-jnlpba_seed4 already finished
bert-base-uncased_blurb-ncbi_disease_seed0 already finished
bert-base-uncased_blurb-ncbi_disease_seed3 already finished
0 : bert-base-uncased_blurb-bc5disease_seed0
1 : bert-base-uncased_blurb-bc5disease_seed4
2 : bert-base-uncase

## launch jobs

In [14]:
# Extra #SBATCH directives added to each job; {exp_name} is filled in per experiment
# so that every job writes its own stdout/stderr files under slurm/log/.
slurm_addon_template = """#SBATCH --mail-type=ALL
#SBATCH --output=slurm/log/{exp_name}.out 
#SBATCH --error=slurm/log/{exp_name}.err"""

# Shell lines passed as script_addon (environment setup); plain string, nothing to interpolate.
script_addon = """module load python/3.11.5
conda activate transformers_latest"""

# Value returned by gpu_jobs_submitter for each experiment, keyed by experiment name
# (job_ids alone only holds the result of the last submission).
submitted_job_ids = {}

for cmd,exp_name in zip(cmds, exp_names) :   
    # change log filename according to experience
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    # send job
    job_ids = gpu_jobs_submitter(
        cmd,
        name = exp_name,
        module = "cuda/12.1.0",
        n_gpu = n_gpu,
        qos = "qos_gpu-dev" if debug else "qos_gpu-t3",
        constraint = "v100-32g" if "v100" in gpu else gpu,
        time_max="02:00:00",
        account=f"aro@{gpu}",
        email="mathieu.lai-king@lisn.upsaclay.fr",
        slurm_addon=slurm_addon,
        script_addon=script_addon,
    )
    submitted_job_ids[exp_name] = job_ids

batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698870
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698871
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698873
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698875
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698877
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698878
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698879
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 10 cpus per task
Submitted batch job 1698882
batch job 0: 2 G

# job control and wandb synchronization

In [20]:
# list the SLURM jobs of the current user
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1687330    gpu_p5 sjr_top2  urz45id PD       0:00      1 (Priority)
           1687329    gpu_p5 h-index_  urz45id PD       0:00      1 (Priority)
           1687328    gpu_p5 h-index_  urz45id PD       0:00      1 (Priority)
           1687327    gpu_p5 h-index_  urz45id PD       0:00      1 (Priority)
           1687326    gpu_p5 h-index_  urz45id PD       0:00      1 (Priority)
           1687325    gpu_p5 random_2  urz45id PD       0:00      1 (Priority)
           1687324    gpu_p5 random_5  urz45id PD       0:00      1 (Priority)
           1687323    gpu_p5 none_all  urz45id PD       0:00      1 (Priority)
           1687332    gpu_p5 sjr_top5  urz45id PD       0:00      1 (Priority)
           1687331    gpu_p5 sjr_mid2  urz45id PD       0:00      1 (Priority)


In [8]:
# sync Weights & Biases: upload the offline runs stored under wandb/offline-*
# TODO : handle distributed logging (one wandb run for each GPU used currently)
!wandb sync --include-offline wandb/offline-*

Find logs at: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/debug-cli.urz45id.log
done.
done.
done.
done.
done.
done.
done.
.wandb file is empty (header is 0 bytes instead of the expected 7), skipping: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/offline-run-20240423_164946-krgwfod9/run-krgwfod9.wandb
done.
done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/8onv07xy ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/9fw6hnmz ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/k6myos0s ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/rloo2eog ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/saf4exwb ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/utk56m8j ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/k6az5x9l ... done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/l2aj4zks ... done.
Sy

# clean logs, cache, wandb runs, cancel jobs, training debug checkpoints

In [5]:
# delete logs: removes everything inside slurm/log/ (the directory itself is kept)
!rm -rf slurm/log/*

In [16]:
# delete slurm files: removes the *.slurm files directly under slurm/
!rm -rf slurm/*.slurm

In [6]:
# delete debug training checkpoints
# Only directories matching pretraining/*_debug are touched: checkpoint* entries
# and *.bin / *.safetensors files are removed, other files are kept.
!rm -rf pretraining/*_debug/checkpoint*
!rm -rf pretraining/*_debug/*.bin
!rm -rf pretraining/*_debug/*.safetensors

In [7]:
# delete wandb run dir
# WARNING: removes the whole wandb/ directory, including offline runs that
# have not been synced yet.
!rm -rf wandb

In [26]:
# remove data cache and tmp files
# Deletes data/.cache plus the cache-*.arrow and tmp* entries found in the
# sub-directories of data/pubmed_preproc.
!rm -rf data/.cache
!rm -rf data/pubmed_preproc/*/cache-*.arrow
!rm -rf data/pubmed_preproc/*/tmp*

In [8]:
# remove core-python-* entries from the working directory
# (presumably core dumps left by crashed Python processes — TODO confirm)
!rm -rf core-python-*

In [23]:
# cancel all my jobs
# WARNING: scancel -u cancels every job of the current user, not only the jobs
# submitted from this notebook.
!scancel -u $USER

In [9]:
# remove pretraining dirs /!\ to handle with care
# Kept commented out on purpose: uncommenting deletes every sub-directory of pretraining/.
#!rm -rf pretraining/*/