diff --git a/evaluation/installers/g-leaderboard/README.md b/evaluation/installers/g-leaderboard/README.md
new file mode 100644
index 00000000..12192a3b
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/README.md
@@ -0,0 +1,86 @@
+# LLM Evaluation using g-leaderboard (GENIAC Official Evaluation)
+
+This repository contains scripts for evaluating LLMs using [g-leaderboard](https://github.com/wandb/llm-leaderboard/tree/g-leaderboard).
+
+## Usage
+
+### Build
+
+Clone this repository and move to the installer directory.
+
+```bash
+git clone https://github.com/llm-jp/scripts
+cd scripts/evaluation/installers/g-leaderboard
+```
+
+Then, run the installation script.
+The following command will set up the installation directory at the specified path (here, `~/g-leaderboard`).
+
+```bash
+# NOTE: Using a CPU node is recommended, as the installation process doesn't require GPUs
+
+# For a cluster with SLURM
+sbatch --partition {partition} install.sh ~/g-leaderboard
+
+# For a cluster without SLURM
+bash install.sh ~/g-leaderboard > logs/install.out 2> logs/install.err
+```
+
+After the installation is complete, log in to your wandb and huggingface accounts.
+
+```shell
+cd ~/g-leaderboard
+source environment/venv/bin/activate
+wandb login
+huggingface-cli login
+```
+
+### Contents of the installed directory (~/g-leaderboard)
+
+The following directory structure will be created after installation.
+
+```
+~/g-leaderboard/
+    run_g-leaderboard.sh        Script for running g-leaderboard
+    logs/                       Log files written by SLURM jobs
+    resources/
+        config_base.yaml        Configuration file template
+    environment/
+        installer_envvar.log    List of environment variables recorded during installation
+        install.sh              Installation script
+        python/                 Python built from source
+        scripts/                Scripts for environment settings
+        src/                    Downloaded libraries
+        venv/                   Python virtual environment (linked to python/)
+```
+
+### Evaluation
+
+The evaluation script takes the model path and the wandb run name as arguments.
+For the other settings, edit the configuration file `resources/config_base.yaml` and/or `run_g-leaderboard.sh`:
+
+- To change the tokenizer, wandb entity, and/or wandb project: edit `run_g-leaderboard.sh`.
+- Otherwise: edit `resources/config_base.yaml` and `run_g-leaderboard.sh`.
+
+```shell
+cd ~/g-leaderboard
+
+# For a cluster with SLURM
+AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx sbatch --partition {partition} run_g-leaderboard.sh {path/to/model} {wandb.run_name}
+
+# For a cluster without SLURM
+CUDA_VISIBLE_DEVICES={gpu_ids} AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx bash run_g-leaderboard.sh {path/to/model} {wandb.run_name}
+```
+
+#### Sample code
+
+```shell
+# For a cluster with SLURM
+AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx sbatch --partition {partition} run_g-leaderboard.sh llm-jp/llm-jp-13b-v2.0 g-leaderboard-$(whoami)
+
+# For a cluster without SLURM
+AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx bash run_g-leaderboard.sh llm-jp/llm-jp-13b-v2.0 g-leaderboard-$(whoami)
+```
+
+### About the Azure OpenAI API
+
+To conduct an evaluation, you must configure the Azure OpenAI API by setting the endpoint and key for the deployment named `gpt-4`, which corresponds to `gpt-4-0613`. Please contact the administrator to obtain the necessary endpoint and key.
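+
+For reference, the credentials can also be exported once per shell session instead of being passed inline; `sbatch` propagates the submission environment to the job by default. This is a minimal sketch, and the endpoint below only illustrates the usual Azure URL format, so substitute the values provided by the administrator.
+
+```shell
+# Hypothetical values; use the endpoint and key of the `gpt-4` deployment
+export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
+export AZURE_OPENAI_KEY=xxx
+
+sbatch --partition {partition} run_g-leaderboard.sh {path/to/model} {wandb.run_name}
+```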
diff --git a/evaluation/installers/g-leaderboard/install.sh b/evaluation/installers/g-leaderboard/install.sh
new file mode 100644
index 00000000..9b0747fb
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/install.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# g-leaderboard installation script
+#
+# This script uses only CPUs on a cluster.
+# - In a SLURM environment, it is recommended to use CPU nodes.
+#
+# Usage:
+#   On a cluster with SLURM:
+#     Run `sbatch --partition {partition} install.sh TARGET_DIR`
+#   On a cluster without SLURM:
+#     Run `bash install.sh TARGET_DIR > logs/install-eval.out 2> logs/install-eval.err`
+#   - TARGET_DIR: Installation directory
+#
+#SBATCH --job-name=install-g-leaderboard
+#SBATCH --partition={FIX_ME}
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --output=logs/%x-%j.out
+#SBATCH --error=logs/%x-%j.err
+
+set -eux -o pipefail
+
+if [ $# -ne 1 ]; then
+    set +x
+    >&2 echo "Usage: sbatch (or bash) install.sh TARGET_DIR"
+    exit 1
+fi
+
+INSTALLER_DIR=$(pwd)
+TARGET_DIR=$1
+INSTALLER_COMMON=$INSTALLER_DIR/../../../common/installers.sh
+
+>&2 echo INSTALLER_DIR=$INSTALLER_DIR
+>&2 echo TARGET_DIR=$TARGET_DIR
+>&2 echo INSTALLER_COMMON=$INSTALLER_COMMON
+source $INSTALLER_COMMON
+
+mkdir -p $TARGET_DIR
+pushd $TARGET_DIR
+
+# Copy basic scripts for g-leaderboard
+cp ${INSTALLER_DIR}/scripts/run_g-leaderboard.sh .
+mkdir resources
+cp ${INSTALLER_DIR}/resources/config_base.yaml resources/
+mkdir logs
+
+ENV_DIR=${TARGET_DIR}/environment
+mkdir $ENV_DIR
+pushd $ENV_DIR
+
+# Copy environment scripts
+cp ${INSTALLER_DIR}/install.sh .
+mkdir scripts
+
+# Create environment.sh
+BASE_ENV_SHELL=${INSTALLER_DIR}/scripts/environment.sh
+NEW_ENV_SHELL=scripts/environment.sh
+cp $BASE_ENV_SHELL $NEW_ENV_SHELL
+
+source $NEW_ENV_SHELL
+
+# Record current environment variables
+set > installer_envvar.log
+
+# src is used to store all resources for from-scratch builds
+mkdir src
+pushd src
+
+# Install Python (function defined in $INSTALLER_COMMON)
+install_python v${PYTHON_VERSION} ${ENV_DIR}/python
+popd  # back to $ENV_DIR
+
+# Prepare venv
+python/bin/python3 -m venv venv
+source venv/bin/activate
+
+# Install g-leaderboard
+pushd src
+git clone https://github.com/wandb/llm-leaderboard g-leaderboard -b g-leaderboard
+pushd g-leaderboard
+pip install --no-cache-dir -r requirements.txt
+
+# Deploy the blended run config
+BLENDED_RUN_CONFIG=${INSTALLER_DIR}/resources/blended_run_config.yaml
+cp $BLENDED_RUN_CONFIG blend_run_configs/config.yaml
+
+echo "Installation done." | tee >(cat >&2)
diff --git a/evaluation/installers/g-leaderboard/logs/.gitignore b/evaluation/installers/g-leaderboard/logs/.gitignore
new file mode 100644
index 00000000..5e7d2734
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/logs/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/evaluation/installers/g-leaderboard/resources/blended_run_config.yaml b/evaluation/installers/g-leaderboard/resources/blended_run_config.yaml
new file mode 100644
index 00000000..640656f1
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/resources/blended_run_config.yaml
@@ -0,0 +1,25 @@
+run_chain: false # Set to true if you want to reuse past evaluation results in a new run.
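+
+# For example, to carry over only the MT-bench results from a single past run,
+# list just "mtbench_ja" and "mtbench_en" under old_runs below and comment out
+# the jaster tasks.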
+
+new_run: # Settings for blending runs without running new evaluations. If run_chain is set to true, this section is disabled.
+  entity: "your/WANDB/entity"
+  project: "your/WANDB/project"
+  run_name: "your/WANDB/run_name"
+
+old_runs: # Specify the tasks you want to carry over from past runs. Multiple runs may be listed.
+  - run_path: "your/WANDB/run_path"
+    tasks: # The list of tasks to carry over. Comment out tasks that do not need to be carried over.
+      - jaster_ja_0_shot
+      - jaster_ja_4_shot
+      - jaster_en_0_shot
+      - jaster_en_4_shot
+      - mtbench_ja
+      - mtbench_en
+  # - run_path: "your/WANDB/run_path"
+  #   tasks:
+  #     - jaster_ja_0_shot
+  #     - jaster_ja_4_shot
+  #     - jaster_en_0_shot
+  #     - jaster_en_4_shot
+  #     - mtbench_ja
+  #     - mtbench_en
+
\ No newline at end of file
diff --git a/evaluation/installers/g-leaderboard/resources/config_base.yaml b/evaluation/installers/g-leaderboard/resources/config_base.yaml
new file mode 100644
index 00000000..2679945e
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/resources/config_base.yaml
@@ -0,0 +1,86 @@
+# Placeholders of the form <<VAR>> are filled in by run_g-leaderboard.sh.
+testmode: false # Set to true if you want to test with a small amount of data.
+model_name: "<<MODEL>>" # will be used in the leaderboard table
+
+wandb:
+  entity: "<<WANDB_ENTITY>>"
+  project: "<<WANDB_PROJECT>>"
+  run_name: "<<WANDB_RUN_NAME>>" # used as the run name in the leaderboard; can be changed later
+
+# Tasks to run
+run_llm_jp_eval_ja_0_shot: true
+run_llm_jp_eval_ja_few_shots: true
+run_llm_jp_eval_en_0_shot: true
+run_llm_jp_eval_en_few_shots: true
+run_mt_bench_ja: true
+run_mt_bench_en: true
+
+model:
+  api: false # if you don't use an API, set "api" to "false"; otherwise select from "openai", "anthoropic", "google", "cohere", "mistral", "amazon_bedrock"
+  use_wandb_artifacts: false # set to true if you use wandb artifacts
+  artifacts_path: null # if you use wandb artifacts, paste the link here; otherwise leave it as null
+  pretrained_model_name_or_path: "<<MODEL>>" # if you use the OpenAI API, put the model name here
+  device_map: "auto"
+  load_in_8bit: false
+  load_in_4bit: false
+
+# for llm-jp-eval
+llm_jp_eval:
+  max_seq_length: 4096
+  target_dataset: "all" # {all, jamp, janli, jcommonsenseqa, jemhopqa, jnli, jsem, jsick, jsquad, jsts, niilc, chabsa, mmlu_en}
+  ja_num_shots: 4 # number of few-shot examples used when run_llm_jp_eval_ja_few_shots is true; default is 4
+  en_num_shots: 4 # number of few-shot examples used when run_llm_jp_eval_en_few_shots is true; default is 4
+  torch_dtype: "bf16" # {fp16, bf16, fp32}
+  # Items that do not need to be changed unless specifically intended.
+  dataset_artifact: "wandb-japan/llm-leaderboard/jaster:v11"
+  dataset_dir: "/jaster/1.2.6/evaluation/test"
+  ja:
+    custom_prompt_template: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### 応答:\n"
+    custom_fewshots_template: "\n\n### 入力:\n{input}\n\n### 応答:\n{output}"
+  en:
+    custom_prompt_template: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### 応答:\n"
+    custom_fewshots_template: "\n\n### 入力:\n{input}\n\n### 応答:\n{output}"
+
+# for mtbench
+mtbench:
+  model_id: "<<WANDB_RUN_NAME>>" # cannot use '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.'
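+  # For example, a wandb run name such as "g-leaderboard-myuser" (hypothetical)
+  # satisfies this restriction, while an HF path like "llm-jp/llm-jp-13b-v2.0"
+  # would not, since it contains '/'.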
+  max_new_token: 1024
+  num_gpus_per_model: 8
+  num_gpus_total: 8
+  max_gpu_memory: null
+  dtype: bfloat16 # None or float32 or float16 or bfloat16
+  use_azure: true # set to true if you use the Azure OpenAI service for evaluation
+  # conversation template settings
+  custom_conv_template: true
+  # the following variables are used when custom_conv_template is set to true
+  conv_name: "custom"
+  conv_sep: "\n\n### "
+  conv_stop_token_ids: "[2]"
+  conv_stop_str: "###"
+  conv_role_message_separator: ":\n"
+  conv_role_only_separator: ":\n"
+  ja:
+    conv_system_message: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
+    conv_roles: "('指示', '応答')"
+  en:
+    conv_system_message: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
+    conv_roles: "('指示', '応答')"
+  dataset: # Items that do not need to be changed unless specifically intended.
+    ja:
+      question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_question:v3"
+      test_question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_question_small_for_test:v5"
+      referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_referenceanswer:v1"
+      test_referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_referenceanswer_small_for_test:v1"
+      judge_prompt_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_prompt:v1"
+      bench_name: "mt_bench_ja"
+    en:
+      question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_question:v0"
+      test_question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_question_small_for_test:v0"
+      referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_referenceanswer:v0"
+      test_referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_referenceanswer_small_for_test:v0"
+      judge_prompt_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_prompt:v0"
+      bench_name: "mt_bench_en"
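+
+# For reference, with custom_conv_template enabled above, a single turn is rendered
+# roughly as follows (a sketch assuming fastchat-style concatenation of the system
+# message, conv_sep, the role names, and conv_role_message_separator):
+#
+#   以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。
+#
+#   ### 指示:
+#   {question}
+#
+#   ### 応答:
+#
+# This mirrors the llm_jp_eval custom_prompt_template above.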
+
+#==================================================================
+# Items that do not need to be changed unless specifically intended.
+#==================================================================
+github_version: g-eval-v1.0 # for recording
diff --git a/evaluation/installers/g-leaderboard/scripts/environment.sh b/evaluation/installers/g-leaderboard/scripts/environment.sh
new file mode 100644
index 00000000..b031595d
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/scripts/environment.sh
@@ -0,0 +1,4 @@
+# List of environment variables and module loads for g-leaderboard
+
+export LANG=ja_JP.UTF-8
+export PYTHON_VERSION=3.10.14
diff --git a/evaluation/installers/g-leaderboard/scripts/run_g-leaderboard.sh b/evaluation/installers/g-leaderboard/scripts/run_g-leaderboard.sh
new file mode 100644
index 00000000..cf2de78b
--- /dev/null
+++ b/evaluation/installers/g-leaderboard/scripts/run_g-leaderboard.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#SBATCH --job-name=g-leaderboard
+#SBATCH --partition=
+#SBATCH --exclusive
+#SBATCH --nodes=1
+#SBATCH --gpus=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=logs/%x-%j.out
+#SBATCH --error=logs/%x-%j.err
+
+set -eux
+
+# Raise the open-file limit (bash's ulimit takes a single value)
+ulimit -n 65536
+
+ENV_DIR=environment
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+# Arguments
+MODEL=$1
+WANDB_RUN_NAME=$2
+
+# Semi-fixed vars
+CONFIG_TEMPLATE=resources/config_base.yaml
+TOKENIZER=$MODEL
+WANDB_ENTITY=llm-jp-eval
+WANDB_PROJECT=test
+
+# Fixed vars
+G_LEADERBOARD_DIR=${ENV_DIR}/src/g-leaderboard
+CONFIG_DIR=${G_LEADERBOARD_DIR}/configs
+
+# Config settings
+NEW_CONFIG=${CONFIG_DIR}/config.${WANDB_PROJECT}.${WANDB_RUN_NAME}.yaml
+REPLACE_VARS=("MODEL" "TOKENIZER" "WANDB_ENTITY" "WANDB_PROJECT" "WANDB_RUN_NAME")
+
+# Create a new config file so that the config of each run is kept
+cp $CONFIG_TEMPLATE $NEW_CONFIG
+
+# Replace the <<VAR>> placeholders in the template
+for VAR in "${REPLACE_VARS[@]}"; do
+    VALUE=${!VAR}  # indirect expansion: the value of the variable named by $VAR
+    sed -i "s|<<${VAR}>>|${VALUE}|g" $NEW_CONFIG
+done
+
+# Create a temporary copy of the project
+# NOTE: This is necessary to avoid using incorrect configurations when running multiple jobs at the same time.
+TMP_G_LEADERBOARD_DIR=$(mktemp -d "${ENV_DIR}/src/g-leaderboard.XXXXXXXX")
+cp -r $G_LEADERBOARD_DIR/* $TMP_G_LEADERBOARD_DIR
+cp $NEW_CONFIG $TMP_G_LEADERBOARD_DIR/configs/config.yaml
+
+# Run g-leaderboard
+SCRIPT_PATH=scripts/run_eval.py
+pushd $TMP_G_LEADERBOARD_DIR
+python $SCRIPT_PATH
+
+# Clean up
+popd
+rm -rf $TMP_G_LEADERBOARD_DIR
+
+echo "Done"
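+
+# Usage example (model and run name below are the README's sample values):
+#   cd ~/g-leaderboard
+#   AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx \
+#     sbatch --partition {partition} run_g-leaderboard.sh llm-jp/llm-jp-13b-v2.0 g-leaderboard-$(whoami)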