# Set up hyperparameter tuning for the Atlas fine-tuning on ParaRel

In [20]:
import os
import random
# Fix the seed of Python's global random generator so that module-level
# `random.*` calls give the same sequence on every fresh top-to-bottom run.
random.seed(42)

In [12]:
# Search space for the random hyperparameter search: each key maps to the list
# of candidate values one setting can take.
#   batch_size     - turned into the SLURM node count (batch_size / 4) when the run files are written
#   lr             - (value for --lr, value for --lr_retriever) pairs
#   retriever_temp - value for --temperature_score
#   n_context      - used for both --n_context and --retriever_n_context
#   training_data  - names (without ".jsonl") of the training files to combine
# "train_steps" is not part of the search space; it is covered in the script.
possible_params = dict(
    batch_size=[4, 32, 64],
    lr=[(5e-5, 1e-5), (4e-5, 4e-5)],
    retriever_temp=[0.1, 0.01],
    n_context=[10, 20, 30],
    training_data=[
        ("P138",),
        ("P138", "P127"),
        ("P138", "P127", "P1412"),
    ],
)

In [17]:
# Draw `nbr_settings` random configurations from the search space, picking one
# candidate value per hyperparameter for each configuration.
#
# A dedicated generator with a fixed seed (same value as the notebook-level
# `random.seed(42)`) is used instead of the global one, so this cell gives the
# same settings every time it is run, regardless of what ran before it.
#
# Note: every draw is independent, so the same configuration may be sampled
# more than once.
nbr_settings = 22
sampling_rng = random.Random(42)
setting_params_list = []
for _ in range(nbr_settings):
    setting_params = {
        key: sampling_rng.choice(candidates)
        for key, candidates in possible_params.items()
    }
    print(setting_params)
    setting_params_list.append(setting_params)

{'batch_size': 32, 'lr': (5e-05, 1e-05), 'retriever_temp': 0.1, 'n_context': 20, 'training_data': ('P138',)}
{'batch_size': 4, 'lr': (5e-05, 1e-05), 'retriever_temp': 0.01, 'n_context': 20, 'training_data': ('P138', 'P127', 'P1412')}
{'batch_size': 4, 'lr': (4e-05, 4e-05), 'retriever_temp': 0.1, 'n_context': 30, 'training_data': ('P138', 'P127', 'P1412')}
{'batch_size': 32, 'lr': (4e-05, 4e-05), 'retriever_temp': 0.01, 'n_context': 10, 'training_data': ('P138',)}
{'batch_size': 64, 'lr': (4e-05, 4e-05), 'retriever_temp': 0.01, 'n_context': 10, 'training_data': ('P138',)}
{'batch_size': 32, 'lr': (5e-05, 1e-05), 'retriever_temp': 0.01, 'n_context': 10, 'training_data': ('P138', 'P127', 'P1412')}
{'batch_size': 32, 'lr': (4e-05, 4e-05), 'retriever_temp': 0.1, 'n_context': 10, 'training_data': ('P138',)}
{'batch_size': 64, 'lr': (5e-05, 1e-05), 'retriever_temp': 0.1, 'n_context': 20, 'training_data': ('P138', 'P127', 'P1412')}
{'batch_size': 64, 'lr': (5e-05, 1e-05), 'retriever_temp': 0.0

Create the corresponding run files starting from a template file

In [18]:
# Load the template run script into one string. The generation step further
# down fills in its placeholders (the word "template" and the NULL values).
template_file_path = "/cephyr/users/lovhag/Alvis/projects/atlas/alvis_scripts/pararel_training_hyperparam_search/train_template.sh"

# Read the whole file in one call instead of concatenating it line by line.
with open(template_file_path) as f:
    template_file = f.read()
print(template_file)

#!/usr/bin/env bash
#SBATCH -p alvis
#SBATCH -A SNIC2022-22-1040
#SBATCH -N NULL
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-node=A40:4
#SBATCH --job-name=train-atlas-pararel-hyperparam-search-template
#SBATCH -o /mimer/NOBACKUP/groups/snic2021-23-309/project-data/atlas/logs/pararel_train_hyperparam_search_template.out
#SBATCH -t 0-04:00:00

# COMMENT: this script relies on a previous passage encoding (load_index_path)

set -eo pipefail

module load PyTorch/1.11.0-foss-2021a-CUDA-11.3.1
source venv/bin/activate

size=base
YEAR=${1:-"2017"}

PASSAGES="data/corpora/wiki/enwiki-dec${YEAR}/text-list-100-sec.jsonl data/corpora/wiki/enwiki-dec${YEAR}/infobox.jsonl"

EXPERIMENT_NAME=template-${SLURM_JOB_ID}

port=$(shuf -i 15000-16000 -n 1)
TRAIN_FILES="/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas/NULL"
EVAL_FILES="/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas/P17_100.jsonl /cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas/P101_100.jsonl /cephyr

In [None]:
# For each hyperparameter, the text fragment(s) in the run script that it controls
# (an SBATCH directive, command-line flags, or a variable assignment).
# NOTE(review): nothing in the visible part of the notebook reads this mapping, and the
# TRAIN_FILES entry uses single quotes whereas the printed template uses double
# quotes - confirm whether this cell is still needed.
script_names = dict(
    batch_size=["#SBATCH -N"],
    lr=["--lr", "--lr_retriever"],
    retriever_temp=["--temperature_score"],
    n_context=["--n_context", "--retriever_n_context"],
    training_data=[
        "TRAIN_FILES='/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas/'"
    ],
)

In [27]:
# Write one run script per sampled setting by filling in the template.
#
# For run number NN (zero-padded index into `setting_params_list`):
#   * every occurrence of the word "template" in the script becomes NN,
#   * each "<flag> NULL" placeholder receives the value sampled for that setting,
#   * the result is saved next to the template, with "template" in the file
#     name replaced by NN.

# Loop-invariant pieces, computed once.
train_dir = "/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas/"
train_files_placeholder = f'TRAIN_FILES="{train_dir}NULL"'
# Only the file name is rewritten, so a directory that happens to contain the
# word "template" cannot redirect the output.
template_dir, template_name = os.path.split(template_file_path)

# Placeholders that were expected but never found in the template.
unfilled_placeholders = set()

for ix, setting_params in enumerate(setting_params_list):
    exp_num = f"{ix:02}"
    lr, lr_retriever = setting_params["lr"]
    train_files = " ".join(
        os.path.join(train_dir, f"{train_file}.jsonl")
        for train_file in setting_params["training_data"]
    )

    # Placeholder text -> replacement text, applied in this order.
    substitutions = {
        # Node count is batch_size / 4; integer division avoids a float round trip.
        "#SBATCH -N NULL": f"#SBATCH -N {setting_params['batch_size'] // 4}",
        "--lr NULL": f"--lr {lr}",
        "--lr_retriever NULL": f"--lr_retriever {lr_retriever}",
        "--temperature_score NULL": f"--temperature_score {setting_params['retriever_temp']}",
        "--n_context NULL": f"--n_context {setting_params['n_context']}",
        "--retriever_n_context NULL": f"--retriever_n_context {setting_params['n_context']}",
        train_files_placeholder: f'TRAIN_FILES="{train_files}"',
    }

    new_file = template_file.replace("template", exp_num)
    for placeholder, replacement in substitutions.items():
        # str.replace is a silent no-op when the placeholder is absent, which
        # would leave a literal NULL in the script; remember such cases.
        if placeholder not in new_file:
            unfilled_placeholders.add(placeholder)
        new_file = new_file.replace(placeholder, replacement)

    filename = os.path.join(template_dir, template_name.replace("template", exp_num))
    with open(filename, "w") as f:
        f.write(new_file)

if unfilled_placeholders:
    print(
        "WARNING: placeholders not found in the template "
        f"(the corresponding settings were NOT applied): {sorted(unfilled_placeholders)}"
    )