In [2]:
import os
import glob
import json
import shutil
from idr_pytools import gpu_jobs_submitter

# Cluster locations, read from the environment (None when a variable is unset)
dsdir = os.environ.get("DSDIR")
scratch = os.environ.get("SCRATCH")

# Project working directory on SCRATCH, and the shared models directory on DSDIR
root = os.path.join(scratch, "pretrain-med-data-qual")
idr_models_dir = os.path.join(dsdir, "HuggingFace_Models")

# bert mlm pretrain

## computing number of optimal steps and grad accumulation

We define the optimal number of steps as the number of steps required to complete one full epoch over the entire PubMed dataset (baseline, last updated January 2024).
Following RoBERTa, we aim for an effective batch size of 8192.

The Pretraining Phases Hardware arguments are taken from [NVIDIA Pytorch BERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/README.md#pre-training-nvidia-dgx-a100-8x-a100-80gb) and [NVIDIA Tensorflow BioBERT Language Modeling](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/biobert/README.md#pre-training)

In [5]:
# --- Hardware / batch configuration ---
gpu_model = "a100"
gpu_nb = 2
sequence_length = 512
max_batch_size_per_gpu = 32  # on A100 with 512 seq length
target_batch_size = 8192

# Accumulation steps such that gpu_nb * per-device batch * accumulation reaches the target
gradient_accumulation = target_batch_size // (max_batch_size_per_gpu * gpu_nb)
print(
    "Gradient accumulation needed for effective batch size of "
    f"{target_batch_size}, with {gpu_nb}*{gpu_model} GPUs "
    f"with per device batch size of {max_batch_size_per_gpu} = ",
    gradient_accumulation,
)

# --- Number of steps needed to see every training token once ---
total_token_nb = 15888466068  # calculated number of tokens in pubmed
train_token_nb = 0.95 * total_token_nb  # 95% of the total token count
token_per_step = sequence_length * target_batch_size
# floor division on a float: the result is a whole number stored as a float
optimal_train_step_nb = train_token_nb // token_per_step
print(
    "Number of tokens per step (in an effective batch, with grad acc and gpu parrallel) : "
    f"{token_per_step}"
)
print(
    f"Optimal step number with effective batch size of {target_batch_size} : "
    f"{optimal_train_step_nb}"
)

Gradient accumulation needed for effective batch size of 8192, with 2*a100 GPUs with per device batch size of 32 =  128
Number of tokens per step (in an effective batch, with grad acc and gpu parrallel) : 4194304
Optimal step number with effective batch size of 8192 : 3598.0


## defining arguments

In [3]:
# Switch to True for a short debug run (fewer steps, smaller accumulation)
debug = False

# Locations of the dataset, the base model, the training script and the metric
pubmed_path = root + "/data/pubmed_preproc"
bert_path = idr_models_dir + "/bert-base-uncased"
run_mlm_path = root + "/pretraining/run_mlm_offline.py "  # the trailing space is part of the value
accuracy_path = root + "/pretraining/accuracy.py"
# "{exp_name}" placeholder is filled later with .format(exp_name=...)
out_dir_template = root + "/pretraining/{exp_name}"

# Hardware-dependent pretraining settings (A100 GPUs)
n_gpu = 2
sequence_length = 512
batch_size = 32  # per device
precision = "fp16"
if debug:
    max_steps, acc_steps = 100, 2
else:
    max_steps, acc_steps = 3598, 128

In [4]:
# Build the command prefix shared by all pretraining experiments.
# Every fragment ends with a space so that fragments can be appended directly.

# Torchrun (distributed training) Arguments
base_cmd = "torchrun --standalone "
base_cmd += f"--nproc_per_node {n_gpu} "
base_cmd += "--nnodes 1 "
base_cmd += f"{run_mlm_path} "

# Model Arguments
base_cmd += f"--model_name_or_path {bert_path} "

# Dataset Arguments
base_cmd += f"--dataset_name {pubmed_path} "
base_cmd += f"--metric_path {accuracy_path} "
# the evaluation set is only capped in debug runs
base_cmd += f"--max_eval_samples {25600} " if debug else ""
base_cmd += "--preprocessing_num_workers 8 "
base_cmd += f"--max_seq_length {sequence_length} "

# Training Arguments
## Basic arguments
base_cmd += "--seed 42 "
# only debug runs are allowed to overwrite an existing output directory
base_cmd += "--overwrite_output_dir true " if debug else ""
## BERT hyperparameters
base_cmd += f"--per_device_train_batch_size {batch_size} "
base_cmd += f"--per_device_eval_batch_size {batch_size} "
base_cmd += "--learning_rate 1e-4 "
base_cmd += "--weight_decay 0.01 "
base_cmd += "--adam_beta1 0.9 "
base_cmd += "--adam_beta2 0.999 "
base_cmd += "--adam_epsilon 1e-6 " # RoBERTa
## Efficiency / Memory
base_cmd += f"--{precision} true "
base_cmd += "--eval_accumulation_steps 2 "
# the flag is left out entirely when acc_steps is 0 / None
base_cmd += f"--gradient_accumulation_steps {acc_steps} " if acc_steps else ""
## Number of steps / epochs
base_cmd += f"--max_steps {max_steps} "
base_cmd += f"--warmup_steps {max_steps//10} " # warmup for 10% of steps
## Evaluation / Logging / Model Save
base_cmd += "--evaluation_strategy no " # no evaluation during training only at the end for perplexity
base_cmd += "--logging_strategy steps "
base_cmd += "--save_strategy steps "
# NOTE(review): values below 1 are presumably read by the HF Trainer as a
# fraction of the total number of training steps — confirm for the installed version
base_cmd += "--logging_steps 0.01 "
base_cmd += "--save_steps 0.1 "
base_cmd += "--logging_first_step true "
# passed once (the flag used to be appended twice)
base_cmd += "--log_on_each_node false "
base_cmd += "--save_total_limit 3 "
## Experiment Visualisation
base_cmd += "--disable_tqdm true "
base_cmd += "--report_to wandb "

# Data filter experiments: for each filtering metric, a list of
# (lower bound, upper bound, label) triples
bounds = {
    "none": [
        (None, None, "all"),
    ],
    "random": [
        (0.0, 0.5, "50%"),
        (0.0, 0.25, "25%"),
    ],
    "h-index": [
        (103, 1400, "top50%"),
        (53, 190, "mid50%"),
        (190, 1400, "top25%"),
        (77, 142, "mid25%"),
    ],
    "sjr": [
        (1.312, 100.0, "top25%"),
        (0.462, 0.984, "mid25%"),
        (0.759, 100.0, "top50%"),
    ],
}

cmds, exp_names = [], []
for metric in bounds:
    for lower_bound, upper_bound, bound_name in bounds[metric]:
        # experiment name, reused for the output dir, the cache dir and wandb
        exp_name = f"{metric}_{bound_name}"
        if debug:
            exp_name += "_debug"
        out_dir = out_dir_template.format(exp_name=exp_name)

        # nothing to submit once the evaluation results exist
        if os.path.exists(os.path.join(out_dir, "eval_results.json")):
            print(exp_name, "already finished")
            continue

        # filtering flags, or streaming when no filter is applied
        if metric == "none":
            cmd = base_cmd + "--streaming "
        else:
            cmd = base_cmd + (
                f"--filter_metric {metric} "
                f"--filter_lower_threshold {lower_bound} "
                f"--filter_upper_threshold {upper_bound} "
            )
        # per-experiment cache dir, output dir and wandb identifiers
        cmd += (
            f"--evaluate_cache_dir pretraining/.evaluate_cache/{exp_name} "
            f"--output_dir {out_dir} "
            f"--wandb_group {exp_name} "
            f"--wandb_name {exp_name} "
        )

        # training is skipped when train_results.json is already there
        if os.path.exists(os.path.join(out_dir, "train_results.json")):
            print(exp_name, "run eval only")
            cmd += "--do_eval "
        else:
            print(exp_name, "run train and eval")
            cmd += "--do_train --do_eval "
        cmds.append(cmd)
        exp_names.append(exp_name)

# index of each experiment that will be submitted
for i, e in enumerate(exp_names):
    print(i, ":", e)

none_all run train and eval
random_50% already finished
random_25% already finished
h-index_top50% already finished
h-index_mid50% already finished
h-index_top25% already finished
h-index_mid25% already finished
sjr_top25% already finished
sjr_mid25% already finished
sjr_top50% run eval only
0 : none_all
1 : sjr_top50%


In [6]:
# In debug mode, keep a single experiment, picked by its index in the listing above
if debug:
    debug_ind = 4
    cmds, exp_names = [cmds[debug_ind]], [exp_names[debug_ind]]
    print(cmds)
    print(exp_names)

## launching jobs

In [7]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --mail-type=ALL\n"
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# Shell lines handed to the submitter through its script_addon argument
script_addon = "module load python/3.11.5\nconda activate pretrain\n"

# One Slurm job per experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@a100",
        constraint="a100",
        module="cuda/12.1.0",
        n_gpu=n_gpu,
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="2:00:00" if debug else "20:00:00",
        email="mathieu.lai-king@lisn.upsaclay.fr",
        slurm_addon=slurm_addon,
        script_addon=script_addon,
    )

batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 8 cpus per task
Submitted batch job 1977363
batch job 0: 2 GPUs distributed on 1 nodes with 2 tasks / 2 gpus per node and 8 cpus per task
Submitted batch job 1977364


# fine-tune blurb eval

In [3]:
# Shared locations for the BLURB fine-tuning evaluations
cache_dir = root + "/data/.blurb_cache"
# "{exp_name}" placeholder is filled later with .format(exp_name=...)
out_dir_template = root + "/evaluation/out/{exp_name}"

# Models to evaluate: the original BERT first, then the models under pretraining/
models_paths = [idr_models_dir + "/bert-base-uncased"]
models_paths += [
    f"{root}/pretraining/{name}"
    for name in (
        "none_all_ckpt-1440",
        "random_25%",
        "random_50%_ckpt-3240",
        "h-index_mid25%",
        "h-index_mid50%",
        "h-index_top25%",
        "h-index_top50%_ckpt-3240",
        "sjr_mid25%",
        "sjr_top25%",
        "sjr_top50%_ckpt-2880",
    )
]

# Hardware-dependent fine-tuning arguments
# for 1xV100
max_seq_length = 512
batch_size = 16  # per device
precision = "fp16"

## ner

In [55]:
# Switch to True to run a single debug experiment
debug = False

# Fine-tuning script and metric script for NER
run_ner_path = root + "/evaluation/run_ner.py "  # the trailing space is part of the value
seqeval_path = root + "/evaluation/metrics/evaluate_seqeval.py"

In [56]:
# Command prefix shared by all NER runs (plain `python`, single process).
# Every fragment ends with a space so the pieces can be joined directly.
base_cmd = "".join([
    f"python {run_ner_path} ",
    # Model args
    f"--cache_dir {cache_dir} ",
    # Dataset arguments
    "--preprocessing_num_workers 8 ",
    f"--max_seq_length {max_seq_length} ",
    f"--seqeval_path {seqeval_path} ",
    "--return_entity_level_metrics ",
    # Training arguments: basics
    "--do_train --do_eval --do_predict ",
    "--overwrite_output_dir true " if debug else "",
    # Hyperparameters
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 3e-5 ",
    # Efficiency / memory
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
    # Number of steps / epochs
    "--num_train_epochs 5 ",
    "--warmup_ratio 0.1 ",
    # Evaluation / logging / model save
    "--evaluation_strategy steps ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--eval_steps 0.1 ",
    "--logging_steps 0.1 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--save_total_limit 2 ",
    "--load_best_model_at_end true ",
    # Experiment visualisation
    "--disable_tqdm true ",
    "--report_to wandb ",
])

# Different experiments Runs
# (dataset name, dataset config) pairs to fine-tune on
datasets_configs=[
    ("bigbio/blurb","bc5chem"),
    ("bigbio/blurb","bc5disease"),
    ("bigbio/blurb","bc2gm"),
    ("bigbio/blurb","jnlpba"),
    ("bigbio/blurb","ncbi_disease"),
]
seed_nb = 5  # seeds 0 .. seed_nb-1 are run for every model/dataset pair

cmds = []
exp_names = []

# One command per (model, dataset config, seed)
for model_path in models_paths :
    for dataset_name, dataset_config in datasets_configs :
        for seed in range(seed_nb):
            # start from the shared prefix; base_cmd itself must stay untouched
            cmd = base_cmd
            cmd += f"--dataset_name {dataset_name} "
            cmd += f"--dataset_config_name {dataset_config} " if dataset_config else ""
            cmd += f"--seed {seed} "
            cmd += f"--model_name_or_path {model_path} "

            # Experiment name for output directory
            exp_name = f"{model_path.split('/')[-1]}_{dataset_name.split('/')[-1]}"
            exp_name += f"-{dataset_config}" if dataset_config else ""
            exp_name += f"_seed{seed}"
            exp_name += "_debug" if debug else ""
            out_dir = out_dir_template.format(exp_name=exp_name)
            cmd += f"--output_dir {out_dir} "
            # Cache dir (for cache problems)
            # Appended to cmd, not base_cmd: appending to base_cmd left this
            # experiment without the flag and leaked it into every later command.
            cmd += f"--evaluate_cache_dir {root}/evaluation/out/.evaluate_cache/{exp_name} "
            # Weights and Biases (last fragment, hence no trailing space)
            cmd += f"--run_name {exp_name}"
            # fill lists, skipping experiments that already produced predictions
            if os.path.exists(os.path.join(out_dir,"predict_results.json")):
                #print(exp_name, "already finished")
                continue
            cmds.append(cmd)
            exp_names.append(exp_name)

# Display experiments and chosen debug
for i,e in enumerate(exp_names):print(i,":",e)
if debug :
    # index into the listing printed above; must be lower than len(exp_names)
    i = 24 # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    cmds = [cmds[i]]
    exp_names = [exp_names[i]]
    print("--------------------------")
    print(f"Debugging with only exp n°{i}")
    print(exp_names)

In [9]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# Shell lines handed to the submitter through its script_addon argument
script_addon = "module load python/3.11.5\nconda activate pretrain"

# One Slurm job per NER experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@v100",
        constraint="v100-32g",
        module="cuda/12.1.0",
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="02:00:00",
        slurm_addon=slurm_addon,
        script_addon=script_addon,
    )

batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941844
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941845
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941848
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941849
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941852
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941853
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941855
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941856
batch job 0: 1 G

## biosses

In [68]:
# Switch to False to submit every remaining experiment instead of a single one
debug = True

# Evaluation script, local dataset and metric script for BIOSSES
run_biosses_path = root + "/evaluation/run_biosses.py"
biosses_path = root + "/data/biosses"
pearsonr_path = root + "/evaluation/metrics/evaluate_pearsonr.py"

In [70]:
# Command prefix shared by all BIOSSES runs
base_cmd = (
    f"python {run_biosses_path} "
    f"--pearsonr_path {pearsonr_path} "
    f"--biosses_path {biosses_path} "
)

# One command per (model, seed)
seed_nb = 10
cmds, exp_names = [], []
for model_path in models_paths:
    for seed in range(seed_nb):
        # experiment name: last path component of the model + task + seed
        exp_name = f"{model_path.rsplit('/', 1)[-1]}_biosses_seed{seed}"
        if debug:
            exp_name += "_debug"
        out_dir = out_dir_template.format(exp_name=exp_name)
        # experiments that already produced predictions are not resubmitted
        if os.path.exists(os.path.join(out_dir, "predict_results.json")):
            print(exp_name, "already finished")
            continue
        cmd = (
            f"{base_cmd}"
            f"--model_path {model_path} "
            f"--seed {seed} "
            f"--evaluate_cache_dir evaluation/out/.evaluate_cache/{exp_name} "
            f"--output_dir {out_dir} "
        )
        cmds.append(cmd)
        exp_names.append(exp_name)

# List the remaining experiments; in debug mode keep only one of them
for i, e in enumerate(exp_names):
    print(i, ":", e)
if debug:
    i = 9  # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    cmds, exp_names = [cmds[i]], [exp_names[i]]
    print("--------------------------")
    print(f"Debugging with only exp n°{i}")
    print(exp_names)
    print(cmds)

bert-base-uncased_biosses_seed0_debug already finished
random_25%_biosses_seed0_debug already finished
0 : bert-base-uncased_biosses_seed1_debug
1 : bert-base-uncased_biosses_seed2_debug
2 : bert-base-uncased_biosses_seed3_debug
3 : bert-base-uncased_biosses_seed4_debug
4 : bert-base-uncased_biosses_seed5_debug
5 : bert-base-uncased_biosses_seed6_debug
6 : bert-base-uncased_biosses_seed7_debug
7 : bert-base-uncased_biosses_seed8_debug
8 : bert-base-uncased_biosses_seed9_debug
9 : random_25%_biosses_seed1_debug
10 : random_25%_biosses_seed2_debug
11 : random_25%_biosses_seed3_debug
12 : random_25%_biosses_seed4_debug
13 : random_25%_biosses_seed5_debug
14 : random_25%_biosses_seed6_debug
15 : random_25%_biosses_seed7_debug
16 : random_25%_biosses_seed8_debug
17 : random_25%_biosses_seed9_debug
18 : h-index_mid25%_biosses_seed0_debug
19 : h-index_mid25%_biosses_seed1_debug
20 : h-index_mid25%_biosses_seed2_debug
21 : h-index_mid25%_biosses_seed3_debug
22 : h-index_mid25%_biosses_seed4_de

In [71]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# One Slurm job per BIOSSES experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@v100",
        constraint="v100-32g",
        module="pytorch-gpu/py3/2.2.0",
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="01:00:00",
        slurm_addon=slurm_addon,
    )

batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1882820


## hoc

In [53]:
# Switch to True to run a single debug experiment
debug = False

# Fine-tuning script, metric script and local dataset for HoC
run_hoc_path = root + "/evaluation/run_hoc.py"
f1_path = root + "/evaluation/metrics/evaluate_f1.py"
hoc_path = root + "/data/hallmarks_of_cancer"

In [54]:
# Command prefix shared by all HoC runs (plain `python`, single process).
# Every fragment ends with a space so the pieces can be joined directly.
base_cmd = "".join([
    f"python {run_hoc_path} ",
    # Model argument
    "--trust_remote_code true ",
    # Dataset arguments
    "--load_from_disk ",
    f"--dataset_path {hoc_path} ",
    f"--max_seq_length {max_seq_length} ",
    f"--metric_path {f1_path} ",
    # Training arguments: basics
    "--do_train --do_eval --do_predict ",
    "--overwrite_output_dir true " if debug else "",
    # Hyperparameters
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 3e-5 ",
    # Efficiency / memory
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
    # Number of steps / epochs
    "--num_train_epochs 5 ",
    "--warmup_ratio 0.1 ",
    # Evaluation / logging / model save
    "--evaluation_strategy steps ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--eval_steps 0.1 ",
    "--logging_steps 0.1 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--save_total_limit 2 ",
    "--load_best_model_at_end true ",
    # Experiment visualisation
    "--disable_tqdm true ",
    "--report_to wandb ",
])

# One command per (model, seed)
seed_nb = 5
cmds, exp_names = [], []
for model_path in models_paths:
    for seed in range(seed_nb):
        # experiment name: last path component of the model + task + seed
        exp_name = f"{model_path.rsplit('/', 1)[-1]}_hoc_seed{seed}"
        if debug:
            exp_name += "_debug"
        out_dir = out_dir_template.format(exp_name=exp_name)
        # experiments that already produced predictions are not resubmitted
        if os.path.exists(os.path.join(out_dir, "predict_results.json")):
            continue
        cmd = (
            f"{base_cmd}"
            f"--model_name_or_path {model_path} "
            f"--seed {seed} "
            f"--evaluate_cache_dir evaluation/out/.evaluate_cache/{exp_name} "
            f"--output_dir {out_dir} "
        )
        cmds.append(cmd)
        exp_names.append(exp_name)

# List the remaining experiments; in debug mode keep only one of them
for i, e in enumerate(exp_names):
    print(i, ":", e)
if debug:
    i = 5  # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    cmds, exp_names = [cmds[i]], [exp_names[i]]
    print("--------------------------")
    print(f"Debugging with only exp n°{i}")
    print(exp_names)

In [12]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# One Slurm job per HoC experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@v100",
        constraint="v100-32g",
        module="pytorch-gpu/py3/2.2.0",
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="02:00:00",
        slurm_addon=slurm_addon,
    )

batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941889
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941890
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941891
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941892
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941894


## qa

In [51]:
# Switch to True to run a single debug experiment
debug = False

# Fine-tuning script, metric script and local datasets for QA
run_qa_path = root + "/evaluation/run_qa.py"
# note: despite its name, f1_path points to the accuracy metric script here
f1_path = root + "/evaluation/metrics/evaluate_accuracy.py"
datasets_paths = [f"{root}/data/{name}" for name in ("pubmed_qa", "bioasq_task_b")]

In [52]:
# Command prefix shared by all QA runs (plain `python`, single process).
# Every fragment ends with a space so the pieces can be joined directly.
base_cmd = "".join([
    f"python {run_qa_path} ",
    # Model argument
    "--trust_remote_code true ",
    # Dataset arguments
    "--text_column_names question,context ",
    "--text_column_delimiter [SEP] ",
    "--label_column_name answer ",
    f"--max_seq_length {max_seq_length} ",
    f"--metric_path {f1_path} ",
    # Training arguments: basics
    "--do_train --do_eval --do_predict ",
    "--overwrite_output_dir true " if debug else "",
    # Hyperparameters
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 3e-5 ",
    # Efficiency / memory
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
    # Number of steps / epochs
    "--num_train_epochs 5 ",
    "--warmup_ratio 0.1 ",
    # Evaluation / logging / model save
    "--evaluation_strategy steps ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--eval_steps 0.1 ",
    "--logging_steps 0.1 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--save_total_limit 2 ",
    "--load_best_model_at_end true ",
    # Experiment visualisation
    "--disable_tqdm true ",
    "--report_to wandb ",
])

# One command per (dataset, model, seed)
seed_nb = 10  # 10 RUNS FOR QA DATASETS
cmds, exp_names = [], []
for dataset_path in datasets_paths:
    for model_path in models_paths:
        for seed in range(seed_nb):
            # experiment name: model + dataset (underscores turned into dashes) + seed
            exp_name = model_path.rsplit("/", 1)[-1]
            exp_name += "_" + dataset_path.rsplit("/", 1)[-1].replace("_", "-")
            exp_name += f"_seed{seed}"
            if debug:
                exp_name += "_debug"
            out_dir = out_dir_template.format(exp_name=exp_name)
            # experiments that already produced predictions are not resubmitted
            if os.path.exists(os.path.join(out_dir, "predict_results.json")):
                continue
            cmd = (
                f"{base_cmd}"
                f"--dataset_path {dataset_path} "
                f"--model_name_or_path {model_path} "
                f"--seed {seed} "
                f"--evaluate_cache_dir evaluation/out/.evaluate_cache/{exp_name} "
                f"--output_dir {out_dir} "
            )
            cmds.append(cmd)
            exp_names.append(exp_name)

# List the remaining experiments; in debug mode keep only one of them
for i, e in enumerate(exp_names):
    print(i, ":", e)
if debug:
    i = 39  # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    cmds, exp_names = [cmds[i]], [exp_names[i]]
    print("--------------------------")
    print(f"Debugging with only exp n°{i}")
    print(exp_names)

In [15]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# One Slurm job per QA experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@v100",
        constraint="v100-32g",
        module="pytorch-gpu/py3/2.2.0",
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="02:00:00",
        slurm_addon=slurm_addon,
    )

batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941896
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941898
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941899
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941900
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941902
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941903
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941905
batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1941906
batch job 0: 1 G

## rel. extraction

In [4]:
# Switch to True to run a single debug experiment
debug = False

# Fine-tuning script, metric script and local datasets for relation extraction
run_path = root + "/evaluation/run_relation_extraction.py"
metric_path = root + "/evaluation/metrics/evaluate_f1.py"
datasets_paths = [f"{root}/data/{name}" for name in ("chemprot", "ddi_corpus", "gad")]

In [5]:
# Command prefix shared by all relation-extraction runs (plain `python`, single process).
# Every fragment ends with a space so the pieces can be joined directly.
base_cmd = "".join([
    f"python {run_path} ",
    # Model argument
    "--trust_remote_code true ",
    # Dataset arguments
    "--text_column_names text ",
    f"--max_seq_length {max_seq_length} ",
    f"--metric_path {metric_path} ",
    # Training arguments: basics
    "--do_train --do_eval --do_predict ",
    "--overwrite_output_dir true " if debug else "",
    # Hyperparameters
    f"--per_device_train_batch_size {batch_size} ",
    f"--per_device_eval_batch_size {batch_size} ",
    "--learning_rate 3e-5 ",
    # Efficiency / memory
    f"--{precision} true ",
    "--eval_accumulation_steps 2 ",
    # Number of steps / epochs
    "--num_train_epochs 5 ",
    "--warmup_ratio 0.1 ",
    # Evaluation / logging / model save
    "--evaluation_strategy steps ",
    "--logging_strategy steps ",
    "--save_strategy steps ",
    "--eval_steps 0.1 ",
    "--logging_steps 0.1 ",
    "--save_steps 0.1 ",
    "--logging_first_step true ",
    "--save_total_limit 2 ",
    "--load_best_model_at_end true ",
    # Experiment visualisation
    "--disable_tqdm true ",
    "--report_to wandb ",
])

# One command per (dataset, model, seed)
seed_nb = 5
cmds, exp_names = [], []
for dataset_path in datasets_paths:
    for model_path in models_paths:
        for seed in range(seed_nb):
            # experiment name: model + dataset (underscores turned into dashes) + seed
            exp_name = model_path.rsplit("/", 1)[-1]
            exp_name += "_" + dataset_path.rsplit("/", 1)[-1].replace("_", "-")
            exp_name += f"_seed{seed}"
            if debug:
                exp_name += "_debug"
            out_dir = out_dir_template.format(exp_name=exp_name)
            # experiments that already produced predictions are not resubmitted
            if os.path.exists(os.path.join(out_dir, "predict_results.json")):
                continue
            cmd = (
                f"{base_cmd}"
                f"--dataset_path {dataset_path} "
                f"--model_name_or_path {model_path} "
                f"--seed {seed} "
                f"--evaluate_cache_dir evaluation/out/.evaluate_cache/{exp_name} "
                f"--output_dir {out_dir} "
            )
            cmds.append(cmd)
            exp_names.append(exp_name)

# List the remaining experiments; in debug mode keep only one of them
for i, e in enumerate(exp_names):
    print(i, ":", e)
if debug:
    i = 94  # CHANGE THIS VALUE TO CHOOSE WHICH EXPERIENCE TO DEBUG
    cmds, exp_names = [cmds[i]], [exp_names[i]]
    print("--------------------------")
    print(f"Debugging with only exp n°{i}")
    print(exp_names)

In [49]:
# Extra #SBATCH directives; "{exp_name}" is filled in for each job
slurm_addon_template = (
    "#SBATCH --output=slurm/log/{exp_name}.out \n"
    "#SBATCH --error=slurm/log/{exp_name}.err"
)

# One Slurm job per relation-extraction experiment
for cmd, exp_name in zip(cmds, exp_names):
    # log file names follow the experiment name
    slurm_addon = slurm_addon_template.format(exp_name=exp_name)
    job_ids = gpu_jobs_submitter(
        cmd,
        name=exp_name,
        account="aro@v100",
        constraint="v100-32g",
        module="pytorch-gpu/py3/2.2.0",
        qos="qos_gpu-t3" if not debug else "qos_gpu-dev",
        time_max="02:00:00",
        slurm_addon=slurm_addon,
    )

batch job 0: 1 GPUs distributed on 1 nodes with 1 tasks / 1 gpus per node and 10 cpus per task
Submitted batch job 1942752


# job control and wandb synchronization

In [1]:
# list the Slurm jobs of the current user
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1977363    gpu_p5 none_all  urz45id  R    1:25:16      1 jean-zay-iam33
           1977364    gpu_p5 sjr_top5  urz45id  R    1:25:16      1 jean-zay-iam33


In [17]:
# sync weights and biases: upload the offline runs stored under wandb/
# TODO : handle distributed logging (one wandb run for each GPU used currently)
# NOTE(review): --clean-force presumably removes synced run dirs without asking — confirm against the wandb CLI docs
!wandb sync --include-offline wandb/offline-* --clean-force

Find logs at: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/debug-cli.urz45id.log
.wandb file is empty (header is 0 bytes instead of the expected 7), skipping: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/offline-run-20240514_083902-0mxzed0b/run-0mxzed0b.wandb
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
.wandb file is empty (header is 0 bytes instead of the expected 7), skipping: /gpfsssd/scratch/rech/aro/urz45id/pretrain-med-data-qual/wandb/offline-run-20240514_141218-v8abme9d/run-v8abme9d.wandb
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/6h40ksz2 ... done.
done.
done.
done.
done.
Syncing: https://wandb.ai/laiking/pretrain-med-data-qual/runs/yrtyurfe ... done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
done.
don

# postproc

In [66]:
# cancel all my jobs (every Slurm job of the current user, not only those of this notebook)
!scancel -u $USER

In [36]:
# delete logs (everything under slurm/log/, which holds the .out/.err files of the submitted jobs)
!rm -rf slurm/log/*

In [2]:
# delete debug dirs and *_ckpt-* dirs under pretraining/
# NOTE(review): models_paths in the BLURB evaluation section points at several
# pretraining/*_ckpt-* directories — confirm they are no longer needed before running this
!rm -rf pretraining/*_debug/
!rm -rf pretraining/*_ckpt-*

In [3]:
# delete debug output dirs and the outputs of *_ckpt-* models under evaluation/out/
# NOTE(review): experiment names start with the model dir name, so the second
# pattern also removes results of the *_ckpt-* models listed in models_paths — confirm intended
!rm -rf evaluation/out/*_debug
!rm -rf evaluation/out/*_ckpt-*

In [33]:
# delete wandb run dir (removes the whole wandb/ directory, including offline runs not yet synced)
!rm -rf wandb

In [38]:
# delete evaluate cache (the directory the evaluation commands pass as --evaluate_cache_dir)
!rm -rf evaluation/out/.evaluate_cache

In [37]:
# delete slurm files, the seqeval entry of the BLURB cache,
# core-python-* files (presumably core dumps) and notebook checkpoints
!rm -rf slurm/*.slurm
!rm -rf data/.blurb_cache/seqeval
!rm -rf core-python-*
# NOTE(review): `**` only recurses when the shell has globstar enabled; otherwise
# it matches a single directory level — confirm which applies here
!rm -rf **/.ipynb_checkpoints

In [6]:
# copy eval output dirs to WORK
def export_predict_results(pattern, dest_dir):
    """Copy every ``predict_results.json`` matching ``pattern`` into ``dest_dir``.

    Each result is re-serialised as ``<dest_dir>/<parent directory name>.json``.
    Destinations that already exist are left untouched, so the cell can be
    re-run safely.

    Args:
        pattern: glob pattern matching the ``predict_results.json`` files.
        dest_dir: directory receiving the copies; created on the first copy.

    Returns:
        The list of destination paths written by this call.
    """
    written = []
    for pred_f in glob.glob(pattern):
        # the experiment name is the directory holding predict_results.json
        out_f = f"{dest_dir}/{pred_f.split('/')[-2]}.json"
        if os.path.exists(out_f):
            continue
        # created lazily: nothing is touched when there is nothing to copy
        os.makedirs(dest_dir, exist_ok=True)
        # context managers make sure both file handles are closed
        with open(pred_f) as src:
            res = json.load(src)
        with open(out_f, 'w') as dst:
            json.dump(res, dst)
        written.append(out_f)
    return written


out_dir = f"{os.getenv('WORK')}/results/pretrain-med-data-qual"
exported_results = export_predict_results("evaluation/out/*/predict_results.json", out_dir)

In [7]:
# For every experiment that has its predict_results.json: remove the
# checkpoint-* dirs, then every "*.*" entry whose path does not contain "result"
for res_dir in glob.glob("evaluation/out/*/"):
    if not os.path.exists(os.path.join(res_dir, "predict_results.json")):
        continue  # experiment not finished: leave it untouched
    for ckpt in glob.glob(f"{res_dir}/checkpoint-*"):
        shutil.rmtree(ckpt)
    for fpath in glob.glob(f"{res_dir}/*.*"):
        if "result" in fpath:
            continue  # result files are kept
        os.remove(fpath)

In [29]:
# copy models to WORK
# Every directory under pretraining/ that holds a model.safetensors is copied
# (top-level files only, checkpoints excluded) to $WORK/models/perso/<renamed>.
models_work = os.path.join(os.getenv("WORK"),"models","perso")
for subp in os.listdir("pretraining"):
    # entries containing a dot (hidden dirs, plain files) are not model dirs
    if '.' in subp:continue
    # e.g. "h-index_top25%" -> "bert-bio_hind-top25"
    renamed = "bert-bio_" + subp.replace("_","-").replace("%","").replace("h-index","hind")
    out_dir = os.path.join(models_work,renamed)
    src_dir = f"pretraining/{subp}"
    # only directories holding a final model are exported
    if not os.path.exists(f"{src_dir}/model.safetensors"):
        continue
    # makedirs also creates models/perso itself when it does not exist yet
    os.makedirs(out_dir, exist_ok=True)
    for f in os.listdir(src_dir):
        src_path = f"{src_dir}/{f}"
        # skip checkpoints, and any other sub-directory (shutil.copy only handles files)
        if "checkpoint" in f or not os.path.isfile(src_path):
            continue
        shutil.copy(src_path,out_dir)