86 changes: 86 additions & 0 deletions evaluation/installers/g-leaderboard/README.md
@@ -0,0 +1,86 @@
# LLM Evaluation using g-leaderboard (GENIAC Official Evaluation)

This repository contains scripts for evaluating LLMs using [g-leaderboard](https://github.com/wandb/llm-leaderboard/tree/g-leaderboard).

## Usage

### Build

Clone this repository and move to the installer directory.

```bash
git clone https://github.com/llm-jp/scripts
cd scripts/evaluation/installers/g-leaderboard
```

Then, run the installation script.
The following command installs everything into the specified directory (here, `~/g-leaderboard`).

```bash
# NOTE: Using a CPU node is recommended as the installation process doesn't require GPUs

# For a cluster with SLURM
sbatch --partition {partition} install.sh ~/g-leaderboard

# For a cluster without SLURM
bash install.sh ~/g-leaderboard > logs/install.out 2> logs/install.err
```

After the installation is complete, log in to your wandb and Hugging Face accounts.

```shell
cd ~/g-leaderboard
source environment/venv/bin/activate
wandb login
huggingface-cli login
```

### Contents of the installed directory (~/g-leaderboard)

The following directory structure will be created after installation.

```
~/g-leaderboard/
    run_g-leaderboard.sh        Script for running g-leaderboard
    logs/                       Log files written by SLURM jobs
    resources/
        config_base.yaml        Configuration file template
    environment/
        installer_envvar.log    List of environment variables recorded during installation
        install.sh              Installation script
        python/                 Python built from source
        scripts/                Scripts for environment settings
        src/                    Downloaded libraries
        venv/                   Python virtual environment (linked to python/)
```

### Evaluation

The evaluation script takes the model path and wandb run name as arguments.
For the other settings, edit the configuration file `resources/config_base.yaml` and/or `run_g-leaderboard.sh`.
- To change the tokenizer, wandb entity, or wandb project: edit `run_g-leaderboard.sh` (see the snippet below).
- For all other settings: edit `resources/config_base.yaml` and `run_g-leaderboard.sh`.
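For reference, the variables usually edited in `run_g-leaderboard.sh` are the semi-fixed ones near the top of the script. The values below are the defaults placed by this installer; replace the entity and project with your own:

```shell
# Semi-fixed vars in run_g-leaderboard.sh (installer defaults)
TOKENIZER=$MODEL          # tokenizer path; defaults to the model passed as the first argument
WANDB_ENTITY=llm-jp-eval  # replace with your wandb entity
WANDB_PROJECT=test        # replace with your wandb project
```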

```shell
cd ~/g-leaderboard

# For a cluster with SLURM
AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx sbatch --partition {partition} run_g-leaderboard.sh {path/to/model} {wandb.run_name}

# For a cluster without SLURM
CUDA_VISIBLE_DEVICES=<num> AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx bash run_g-leaderboard.sh {path/to/model} {wandb.run_name}
```

#### Sample code

```shell
# For a cluster with SLURM
AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx sbatch --partition {partition} run_g-leaderboard.sh llm-jp/llm-jp-13b-v2.0 g-leaderboard-$(whoami)

# For a cluster without SLURM
AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_KEY=xxx bash run_g-leaderboard.sh llm-jp/llm-jp-13b-v2.0 g-leaderboard-$(whoami)
```

### About Azure OpenAI API

To conduct an evaluation, you must configure the Azure OpenAI API by setting the endpoint and key for the deployment named `gpt-4`, which corresponds to `gpt-4-0613`. Please contact the administrator to obtain the necessary endpoint and key.
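If you prefer not to prefix every command with the credentials, you can also export them once per shell session, as in this minimal sketch (the endpoint and key are placeholders; obtain the actual values from the administrator):

```shell
# Export the Azure OpenAI credentials once per session (placeholder values)
export AZURE_OPENAI_ENDPOINT="https://<your-resource>.openai.azure.com/"
export AZURE_OPENAI_KEY="<your-api-key>"

# Then run the evaluation as usual
sbatch --partition {partition} run_g-leaderboard.sh {path/to/model} {wandb.run_name}
```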
89 changes: 89 additions & 0 deletions evaluation/installers/g-leaderboard/install.sh
@@ -0,0 +1,89 @@
#!/bin/bash
#
# g-leaderboard installation script
#
# This script uses only CPUs on a cluster.
# - In a SLURM environment, it is recommended to use CPU nodes.
#
# Usage:
# On a cluster with SLURM:
# Run `sbatch --partition {partition} install.sh TARGET_DIR`
# On a cluster without SLURM:
# Run `bash install.sh TARGET_DIR > logs/install-eval.out 2> logs/install-eval.err`
# - TARGET_DIR: Installation directory
#
#SBATCH --job-name=install-g-leaderboard
#SBATCH --partition={FIX_ME}
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --output=logs/%x-%j.out
#SBATCH --error=logs/%x-%j.err

set -eux -o pipefail

if [ $# -ne 1 ]; then
    set +x
    >&2 echo Usage: sbatch \(or bash\) install.sh TARGET_DIR
    exit 1
fi

INSTALLER_DIR=$(pwd)
TARGET_DIR=$1
INSTALLER_COMMON=$INSTALLER_DIR/../../../common/installers.sh

>&2 echo INSTALLER_DIR=$INSTALLER_DIR
>&2 echo TARGET_DIR=$TARGET_DIR
>&2 echo INSTALLER_COMMON=$INSTALLER_COMMON
source $INSTALLER_COMMON

mkdir -p $TARGET_DIR
pushd $TARGET_DIR

# Copy basic scripts for g-leaderboard
cp ${INSTALLER_DIR}/scripts/run_g-leaderboard.sh .
mkdir resources
cp ${INSTALLER_DIR}/resources/config_base.yaml resources/
mkdir logs

ENV_DIR=${TARGET_DIR}/environment
mkdir $ENV_DIR
pushd $ENV_DIR

# Copy environment scripts
cp ${INSTALLER_DIR}/install.sh .
mkdir scripts

# Create environment.sh
BASE_ENV_SHELL=${INSTALLER_DIR}/scripts/environment.sh
NEW_ENV_SHELL=scripts/environment.sh
cp $BASE_ENV_SHELL $NEW_ENV_SHELL

source $NEW_ENV_SHELL

# Record current environment variables
set > installer_envvar.log

# src is used to store all resources for from-scratch builds
mkdir src
pushd src

# Install Python (function in $INSTALLER_COMMON)
install_python v${PYTHON_VERSION} ${ENV_DIR}/python
popd # $ENV_DIR

# Prepare venv
python/bin/python3 -m venv venv
source venv/bin/activate

# Install g-leaderboard
pushd src
git clone https://github.com/wandb/llm-leaderboard g-leaderboard -b g-leaderboard
pushd g-leaderboard
pip install --no-cache-dir -r requirements.txt

# Deploy blended run config
BLENDED_RUN_CONFIG=${INSTALLER_DIR}/resources/blended_run_config.yaml
cp $BLENDED_RUN_CONFIG blend_run_configs/config.yaml

echo "Installation done." | tee >(cat >&2)
4 changes: 4 additions & 0 deletions evaluation/installers/g-leaderboard/logs/.gitignore
@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
25 changes: 25 additions & 0 deletions evaluation/installers/g-leaderboard/resources/blended_run_config.yaml
@@ -0,0 +1,25 @@
run_chain: false # If you want to reuse past evaluation results in a new run, please set it to true.

new_run: # This setting is for blending runs without running new evaluations. If run_chain is set to true, this setting is disabled.
  entity: "your/WANDB/entity"
  project: "your/WANDB/project"
  run_name: "your/WANDB/run_name"

old_runs: # Please specify the tasks you want to carry over from past runs. Multiple runs are permissible.
  - run_path: "your/WANDB/run_path"
    tasks: # The list of tasks to take over. Please comment out tasks that do not need to be taken over.
      - jaster_ja_0_shot
      - jaster_ja_4_shot
      - jaster_en_0_shot
      - jaster_en_4_shot
      - mtbench_ja
      - mtbench_en
  # - run_path: "your/WANDB/run_path"
  #   tasks:
  #     - jaster_ja_0_shot
  #     - jaster_ja_4_shot
  #     - jaster_en_0_shot
  #     - jaster_en_4_shot
  #     - mtbench_ja
  #     - mtbench_en

86 changes: 86 additions & 0 deletions evaluation/installers/g-leaderboard/resources/config_base.yaml
@@ -0,0 +1,86 @@
testmode: false # If you want to test with a small amount of data, please set it to true.
model_name: "<<WANDB_RUN_NAME>>" # will be used in Table

wandb:
  entity: "<<WANDB_ENTITY>>"
  project: "<<WANDB_PROJECT>>"
  run_name: "<<WANDB_RUN_NAME>>" # this run_name will be used as the run name in the leaderboard; it can be changed later

# Tasks to run
run_llm_jp_eval_ja_0_shot: true
run_llm_jp_eval_ja_few_shots: true
run_llm_jp_eval_en_0_shot: true
run_llm_jp_eval_en_few_shots: true
run_mt_bench_ja: true
run_mt_bench_en: true

model:
  api: false # if you don't use an API, set "api" to "false". If you do, select one of "openai", "anthoropic", "google", "cohere", "mistral", "amazon_bedrock"
  use_wandb_artifacts: false # if you use wandb artifacts, set this to true.
  artifacts_path: null # if you use wandb artifacts, paste the artifact link here; otherwise leave it unchanged.
  pretrained_model_name_or_path: "<<MODEL>>" # if you use the openai api, put the model name here
  device_map: "auto"
  load_in_8bit: false
  load_in_4bit: false

# for llm-jp-eval
llm_jp_eval:
  max_seq_length: 4096
  target_dataset: "all" # {all, jamp, janli, jcommonsenseqa, jemhopqa, jnli, jsem, jsick, jsquad, jsts, niilc, chabsa, mmlu_en}
  ja_num_shots: 4 # if run_llm_jp_eval_ja_few_shots is true, set the number of few-shot examples. Default is 4
  en_num_shots: 4 # if run_llm_jp_eval_en_few_shots is true, set the number of few-shot examples. Default is 4
  torch_dtype: "bf16" # {fp16, bf16, fp32}
  # Items that do not need to be changed unless specifically intended.
  dataset_artifact: "wandb-japan/llm-leaderboard/jaster:v11"
  dataset_dir: "/jaster/1.2.6/evaluation/test"
  ja:
    custom_prompt_template: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### 応答:\n"
    custom_fewshots_template: "\n\n### 入力:\n{input}\n\n### 応答:\n{output}"
  en:
    custom_prompt_template: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n### 指示:\n{instruction}\n\n### 入力:\n{input}\n\n### 応答:\n"
    custom_fewshots_template: "\n\n### 入力:\n{input}\n\n### 応答:\n{output}"

# for mtbench
mtbench:
  model_id: "<<WANDB_RUN_NAME>>" # cannot use '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.'
  max_new_token: 1024
  num_gpus_per_model: 8
  num_gpus_total: 8
  max_gpu_memory: null
  dtype: bfloat16 # None or float32 or float16 or bfloat16
  use_azure: true # if you use azure openai service for evaluation, set true
  # for conv template
  custom_conv_template: true
  # the following variables will be used when custom_conv_template is set as true
  conv_name: "custom"
  conv_sep: "\n\n### "
  conv_stop_token_ids: "[2]"
  conv_stop_str: "###"
  conv_role_message_separator: ":\n"
  conv_role_only_separator: ":\n"
  ja:
    conv_system_message: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
    conv_roles: "('指示', '応答')"
  en:
    conv_system_message: "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"
    conv_roles: "('指示', '応答')"
  dataset: # Items that do not need to be changed unless specifically intended.
    ja:
      question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_question:v3"
      test_question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_question_small_for_test:v5"
      referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_referenceanswer:v1"
      test_referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_referenceanswer_small_for_test:v1"
      judge_prompt_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_ja_prompt:v1"
      bench_name: "mt_bench_ja"
    en:
      question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_question:v0"
      test_question_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_question_small_for_test:v0"
      referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_referenceanswer:v0"
      test_referenceanswer_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_referenceanswer_small_for_test:v0"
      judge_prompt_artifacts_path: "wandb-japan/llm-leaderboard/mtbench_en_prompt:v0"
      bench_name: "mt_bench_en"

#==================================================================
# Items that do not need to be changed unless specifically intended.
#==================================================================
github_version: g-eval-v1.0 #for recording
4 changes: 4 additions & 0 deletions evaluation/installers/g-leaderboard/scripts/environment.sh
@@ -0,0 +1,4 @@
# List of environment variables and module loads for g-leaderboard

export LANG=ja_JP.UTF-8
export PYTHON_VERSION=3.10.14
62 changes: 62 additions & 0 deletions evaluation/installers/g-leaderboard/scripts/run_g-leaderboard.sh
@@ -0,0 +1,62 @@
#!/bin/bash
#SBATCH --job-name=g-leaderboard
#SBATCH --partition=<partition>
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --gpus=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=logs/%x-%j.out
#SBATCH --error=logs/%x-%j.err

set -eux

# Open file limit
ulimit -n 65536 1048576

ENV_DIR=environment
source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

# Arguments
MODEL=$1
WANDB_RUN_NAME=$2

# Semi-fixed vars
CONFIG_TEMPLATE=resources/config_base.yaml
TOKENIZER=$MODEL
WANDB_ENTITY=llm-jp-eval
WANDB_PROJECT=test

# Fixed vars
G_LEADERBOARD_DIR=${ENV_DIR}/src/g-leaderboard
CONFIG_DIR=${G_LEADERBOARD_DIR}/configs

# Config settings
NEW_CONFIG=${CONFIG_DIR}/config.${WANDB_PROJECT}.${WANDB_RUN_NAME}.yaml
REPLACE_VARS=("MODEL" "TOKENIZER" "WANDB_ENTITY" "WANDB_PROJECT" "WANDB_RUN_NAME")

# Create a new config file to save the config file of each run
cp $CONFIG_TEMPLATE $NEW_CONFIG

# Replace variables
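# NOTE: Indirect expansion via eval: each <<VAR>> placeholder in the new config is
# substituted with the value of the shell variable of the same name.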
for VAR in "${REPLACE_VARS[@]}"; do
    VALUE=$(eval echo \${$VAR})
    sed -i "s|<<${VAR}>>|${VALUE}|g" $NEW_CONFIG
done

# Create a temporary copy of the project
# NOTE: This is necessary to avoid using incorrect configurations when running multiple jobs at the same time.
TMP_G_LEADERBOARD_DIR=$(mktemp -d "${ENV_DIR}/src/g-leaderboard.XXXXXXXX")
cp -r $G_LEADERBOARD_DIR/* $TMP_G_LEADERBOARD_DIR
cp $NEW_CONFIG $TMP_G_LEADERBOARD_DIR/configs/config.yaml

# Run g-leaderboard
SCRIPT_PATH=scripts/run_eval.py
pushd $TMP_G_LEADERBOARD_DIR
python $SCRIPT_PATH

# Clean up
popd
rm -rf $TMP_G_LEADERBOARD_DIR

echo "Done"