In [1]:
%%capture
# `%%capture` has to stay the first line of the cell; it hides the pip output
# produced by the install commands below.
import torch
# Compute capability of the current CUDA device as (major, minor); only the
# major number is used below to pick the package set.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # Compared with the other branch this also installs packaging, ninja,
    # einops and flash-attn. `--no-deps` stops pip from pulling in the
    # dependencies of the listed packages.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [4]:
# Weights & Biases client; `main` only uses it when a wandb run id is supplied.
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.3-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.7.0-py2.py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.1/300.1 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [3]:
# Evaluation harness that provides the `evaluator` and `TaskManager` imported below.
!pip install lm_eval

Collecting lm_eval
  Downloading lm_eval-0.4.2-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from lm_eval)
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting jsonlines (from lm_eval)
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Collecting pybind11>=2.6.2 (from lm_eval)
  Downloading pybind11-2.13.1-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.8/238.8 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytablewriter (from lm_eval)
  Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.1/111.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score>=0.0.4 (from lm_eval)
  Dow

In [2]:
# Install required packages
# (`-i` points pip at the public PyPI simple index.)
!pip install accelerate bitsandbytes -i https://pypi.org/simple/

import torch

# Check for GPU
# Stop right away with an explicit error when CUDA is not available.
if not torch.cuda.is_available():
    raise RuntimeError("No GPU found. A GPU is needed for quantization.")

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, Dataset, DatasetDict
from unsloth import FastLanguageModel
from trl import SFTTrainer
from collections import Counter
import re
import json
import os
import numpy as np
from pathlib import Path
from dataclasses import dataclass, field
import time
import yaml





Looking in indexes: https://pypi.org/simple/
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.

ModuleNotFoundError: No module named 'lm_eval'

In [5]:
from lm_eval import evaluator
from lm_eval.tasks import TaskManager

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

In [9]:
import wandb

In [6]:
import lm_eval

# Environment flag read by the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# One entry per benchmark task:
#   name       - task name handed to the evaluation harness
#   num_shots  - few-shot count forwarded as `num_fewshot` (None is passed as-is)
#   is_gen     - True when the score counts toward the generation average
#   in_openllm - True when the score counts toward the OpenLLM average
#   metric     - key in the task's result dict used as its headline score
TASKS_WE_USE = [
    dict(
        name="bbh_cot_fewshot_logical_deduction_three_objects",
        num_shots=None,
        is_gen=True,
        in_openllm=False,
        metric="exact_match,get-answer",
    ),
    dict(
        name="bbh_cot_fewshot_reasoning_about_colored_objects",
        num_shots=None,
        is_gen=True,
        in_openllm=False,
        metric="exact_match,get-answer",
    ),
    dict(
        name="bbh_cot_fewshot_temporal_sequences",
        num_shots=None,
        is_gen=True,
        in_openllm=False,
        metric="exact_match,get-answer",
    ),
]

# Lookup tables derived from TASKS_WE_USE.
ALL_TASKS = [spec["name"] for spec in TASKS_WE_USE]
TASK_TO_METRIC = {spec["name"]: spec["metric"] for spec in TASKS_WE_USE}
TASK_TO_NUM_SHOT = {spec["name"]: spec["num_shots"] for spec in TASKS_WE_USE}
GEN_TASKS = {spec["name"] for spec in TASKS_WE_USE if spec["is_gen"]}
OPENLLM_TASKS = {spec["name"] for spec in TASKS_WE_USE if spec["in_openllm"]}

@dataclass
class LMEvalArguments:
    """Settings for one evaluation run: model, tokenizer, batching, output and wandb."""

    # --- model ---------------------------------------------------------------
    # Model type name; `main` looks it up in the lm_eval model registry.
    model: str = field(default="hf", metadata=dict(help="The model TYPE"))
    model_name_or_path: str = field(
        default="unsloth/llama-2-7b-bnb-4bit",
        metadata=dict(help="The model name or path."),
    )
    model_revision: str = field(default="main", metadata=dict(help="The model revision."))

    # --- tokenizer -----------------------------------------------------------
    tokenizer_name_or_path: str = field(
        default="",
        metadata=dict(
            help=(
                "In some rare occasion, you may want to manually specify the tokenizer name or path. "
                "If empty, set to model_name_or_path."
            )
        ),
    )
    tokenizer_revision: str = field(
        default="main", metadata=dict(help="The tokenizer revision.")
    )

    # --- loading options -----------------------------------------------------
    attn_implementation: str = field(
        default=None, metadata=dict(help="The attention implementation.")
    )
    torch_dtype: str = field(default="auto", metadata=dict(help="The torch dtype."))
    trust_remote_code: bool = field(
        default=True, metadata=dict(help="Whether to trust remote code.")
    )

    # --- evaluation ----------------------------------------------------------
    batch_size: int = field(default=8, metadata=dict(help="The batch size."))
    output_path: str = field(
        default="output", metadata=dict(help="The output path for the results.")
    )
    log_samples: bool = field(default=True, metadata=dict(help="Whether to log samples."))
    verbosity: str = field(default="DEBUG", metadata=dict(help="The verbosity level."))

    # --- Weights & Biases ----------------------------------------------------
    to_wandb: bool = field(default=False, metadata=dict(help="Whether to log to wandb."))
    wandb_project: str = field(default="", metadata=dict(help="The wandb project."))
    wandb_id: str = field(
        default="",
        metadata=dict(
            help=(
                "The wandb run id to upload results to. "
                "If empty, we will check the model_name_or_path/run_args.yaml"
            )
        ),
    )

In [7]:
def __post_init__(self):
        if self.output_path:
            path = Path(self.output_path)
            if (
                path.is_file()
                or Path(self.output_path).joinpath("results.json").is_file()
            ):
                print(f"File already exists at {path}. Results will be overwritten.")
                assert not path.is_file(), "File already exists"
            elif path.suffix in (".json", ".jsonl"):
                raise NotImplementedError("Not implemented")
            else:
                path.mkdir(parents=True, exist_ok=True)
        elif self.log_samples and not self.output_path:
            assert self.output_path, "Specify --output_path"

        if self.tokenizer_name_or_path == "":
            self.tokenizer_name_or_path = self.model_name_or_path

        if self.torch_dtype == "bfloat16":
            self.torch_dtype = torch.bfloat16
        elif self.torch_dtype == "float32":
            self.torch_dtype = torch.float32
        elif self.torch_dtype == "auto":
            self.torch_dtype = "auto"
        else:
            raise NotImplementedError("Torch dtype not implemented")

        if self.to_wandb and self.wandb_id == "":
            path = Path(self.model_name_or_path)
            assert path.joinpath(
                "run_args.yaml"
            ).is_file(), f"File not found at {path.joinpath('run_args.yaml')}"
            with open(path.joinpath("run_args.yaml"), "r", encoding="utf-8") as fread:
                all_args = yaml.load(fread, Loader=yaml.Loader)
            self.wandb_id = all_args["wandb_id"]
            self.wandb_project = all_args["wandb_project"]
            print(
                f"Read wandb info from {path.joinpath('run_args.yaml')}: {self.wandb_project}/{self.wandb_id}"
            )
        return

def _handle_non_serializable(o):
    if isinstance(o, np.int64) or isinstance(o, np.int32):
        return int(o)
    elif isinstance(o, set):
        return list(o)
    else:
        return str(o)

def save_results(
    args: LMEvalArguments, new_results: dict, new_tasks: list, prev_results=None
):
    """Merge one evaluation result into the running results and write them to disk.

    ``new_results`` is modified in place and returned, so the caller can pass
    it back as ``prev_results`` on the next call.

    Merge rules for every key of ``prev_results``:
      * key absent from ``new_results``: the previous value is copied over;
      * previous value is a dict: merged, with new entries winning;
      * previous value is a str/float/int: the previous value is kept;
      * anything else (e.g. ``None``): left as in ``new_results`` and reported.

    Files written under ``args.output_path`` (created when missing):
      * ``results.json``: the merged results without per-sample records;
      * ``<task>.jsonl``: one indented JSON document per task in ``new_tasks``,
        only when ``args.log_samples`` is set and samples are available.

    Args:
        args: Run settings; ``output_path`` and ``log_samples`` are read.
        new_results: Result dict of the latest evaluation call.
        new_tasks: Task names evaluated in that call.
        prev_results: Merged results of earlier calls, or ``None``.

    Returns:
        The merged ``new_results`` dict, without the ``"samples"`` entry.
    """
    out_dir = Path(args.output_path)
    # Make sure the target exists so a fresh output_path does not fail on write.
    out_dir.mkdir(parents=True, exist_ok=True)

    if prev_results is not None:
        for key, old_value in prev_results.items():
            if key not in new_results:
                new_results[key] = old_value
            elif isinstance(old_value, dict):
                new_results[key] = {**old_value, **new_results[key]}
            elif isinstance(old_value, (str, float, int)):
                new_results[key] = old_value
            else:
                print("skipping", key, old_value)

    # Per-sample records never belong in results.json. Remove them even when
    # sample logging is off, and tolerate results that carry none.
    samples = new_results.pop("samples", None)

    dumped = json.dumps(
        new_results, indent=2, default=_handle_non_serializable, ensure_ascii=False
    )
    # write_text closes the file, unlike a bare open(...).write(...).
    out_dir.joinpath("results.json").write_text(dumped, encoding="utf-8")

    if args.log_samples and samples is not None:
        for task_name in new_results["configs"]:
            if task_name not in new_tasks:
                # Merged configs also list earlier tasks; their samples were
                # written by the call that evaluated them.
                print(f"Task {task_name} not in new_tasks: {new_tasks}")
                continue
            samples_dumped = json.dumps(
                samples[task_name],
                indent=2,
                default=_handle_non_serializable,
                ensure_ascii=False,
            )
            out_dir.joinpath(f"{task_name}.jsonl").write_text(
                samples_dumped, encoding="utf-8"
            )
    return new_results

def _mean_or_nan(values):
    """Return the arithmetic mean of ``values`` as a float, or NaN when it is empty."""
    # np.mean([]) also yields NaN but emits RuntimeWarnings on the way.
    return float(np.mean(values)) if values else float("nan")


def get_performance(args: LMEvalArguments, all_results, all_tasks):
    """Collect per-task metrics plus group averages and write ``performance.json``.

    For every task in ``all_results["results"]`` that is listed in
    ``all_tasks``, the result dict is copied without its ``"alias"`` entry and
    with ``",none"`` stripped from the metric keys. The value stored under the
    task's headline metric (``TASK_TO_METRIC``) feeds the averages:
    ``all_average`` always, ``openllm_average`` for tasks in ``OPENLLM_TASKS``,
    and ``generation_average`` or ``classification_average`` depending on
    membership in ``GEN_TASKS``. A group without any score averages to NaN.

    Args:
        args: Run settings; only ``output_path`` is read.
        all_results: Merged evaluation results containing a ``"results"`` dict.
        all_tasks: Names of the tasks to report.

    Returns:
        Dict mapping each task name to its cleaned metrics, plus the four
        ``*_average`` entries.
    """
    metrics = {}
    all_scores = []
    openllm_scores = []
    classification_scores = []
    generation_scores = []

    for task, task_result in all_results["results"].items():
        if task not in all_tasks:
            continue
        cleaned = {}
        for key, value in task_result.items():
            if key == "alias":
                continue
            key = key.replace(",none", "")
            cleaned[key] = value

            if key != TASK_TO_METRIC[task]:
                continue
            # Only the headline metric contributes to the averages.
            all_scores.append(value)
            if task in OPENLLM_TASKS:
                openllm_scores.append(value)
            if task in GEN_TASKS:
                generation_scores.append(value)
            else:
                classification_scores.append(value)
        metrics[task] = cleaned

    metrics["openllm_average"] = _mean_or_nan(openllm_scores)
    metrics["classification_average"] = _mean_or_nan(classification_scores)
    metrics["generation_average"] = _mean_or_nan(generation_scores)
    metrics["all_average"] = _mean_or_nan(all_scores)

    dumped = json.dumps(
        metrics, indent=2, default=_handle_non_serializable, ensure_ascii=False
    )
    # write_text closes the file, unlike a bare open(...).write(...).
    Path(args.output_path).joinpath("performance.json").write_text(
        dumped, encoding="utf-8"
    )
    return metrics


In [12]:
def main(args: LMEvalArguments):
    """Evaluate one model on every task in ``ALL_TASKS`` and report the scores.

    Steps:
      1. Load the model and tokenizer named by ``args.model_name_or_path``
         through ``FastLanguageModel`` (4-bit, ``max_seq_length=2048``).
      2. Wrap them in the lm_eval model class registered under ``args.model``.
      3. Run each task in its own ``simple_evaluate`` call and merge/write the
         results with ``save_results`` after every task.
      4. Compute the summary with ``get_performance`` and print it together
         with the elapsed time.
      5. If ``args.wandb_id`` is non-empty, log the summary to that wandb run.

    Only ``model``, ``model_name_or_path``, ``batch_size``, ``verbosity``,
    ``log_samples``, ``output_path`` (via the helpers), ``wandb_id`` and
    ``wandb_project`` are consulted here.

    Args:
        args: Settings for the run.
    """
    print(f"Eval parameters {args}")

    task_manager = TaskManager(args.verbosity, include_path=None)

    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name_or_path,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )

    if "stablelm" in loaded_tokenizer.name_or_path:
        print(
            "Setting pad token id to 100288 assuming you are using StableLM tokenizer"
        )
        loaded_tokenizer.pad_token_id = 100288

    loaded_model = loaded_model.eval()

    # Hand the already-loaded model/tokenizer objects to the harness wrapper
    # instead of letting it load them from a name.
    lm = lm_eval.api.registry.get_model(args.model).create_from_arg_string(
        "",
        {
            "pretrained": loaded_model,
            "tokenizer": loaded_tokenizer,
            "trust_remote_code": True,
            "batch_size": args.batch_size,
            "max_batch_size": None,
            "device": None,
        },
    )

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long run.
    start_time = time.perf_counter()

    prev_results = None
    for task in ALL_TASKS:
        num_shots = TASK_TO_NUM_SHOT[task]
        print(f"Running task {task} with {num_shots} shots")
        new_results = evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            batch_size=args.batch_size,
            num_fewshot=num_shots,
            # Follow the user's setting: save_results decides from the same
            # flag whether per-sample records are split out of the results.
            log_samples=args.log_samples,
            gen_kwargs=None,
            task_manager=task_manager,
        )
        # Results are merged and written after every task, so partial runs
        # still leave files behind.
        prev_results = save_results(
            args, new_results=new_results, new_tasks=[task], prev_results=prev_results
        )
    time_taken = time.perf_counter() - start_time

    performance = get_performance(args, prev_results, ALL_TASKS)
    print(json.dumps(performance, indent=2, ensure_ascii=False))
    print(f"Time taken: {time_taken} seconds")

    if args.wandb_id != "":
        wandb.init(project=args.wandb_project, id=args.wandb_id, resume=True)
        wandb_perf = {f"lm_eval/{k}": v for k, v in performance.items()}
        wandb_perf["lm_eval/time_taken"] = time_taken
        wandb.log(wandb_perf)
        wandb.finish()

if __name__ == "__main__":
    # Every setting is spelled out for this run, grouped by topic.
    args = LMEvalArguments(
        # What to evaluate.
        model_name_or_path="unsloth/llama-2-7b-bnb-4bit",
        model_revision="main",
        model="hf",
        # Tokenizer.
        tokenizer_revision="main",
        tokenizer_name_or_path="",
        # Loading options.
        torch_dtype="auto",
        attn_implementation=None,
        trust_remote_code=True,
        # Evaluation and output; the empty output_path makes the result files
        # relative paths.
        batch_size=1,
        log_samples=True,
        output_path="",
        verbosity="DEBUG",
        # Weights & Biases: an empty wandb_id makes `main` skip the upload.
        wandb_id="",
        wandb_project="",
        to_wandb=False,
    )

    main(args)

Eval parameters LMEvalArguments(model='hf', model_name_or_path='unsloth/llama-2-7b-bnb-4bit', model_revision='main', tokenizer_name_or_path='', tokenizer_revision='main', attn_implementation=None, torch_dtype='auto', trust_remote_code=True, batch_size=1, output_path='', log_samples=True, verbosity='DEBUG', to_wandb=False, wandb_project='', wandb_id='')


config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

Running generate_until requests:  74%|███████▍  | 186/250 [46:55<16:08, 15.14s/it]


generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

INFO:lm-eval:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234


Running task bbh_cot_fewshot_logical_deduction_three_objects with None shots


INFO:lm-eval:Building contexts for bbh_cot_fewshot_logical_deduction_three_objects on rank 0...
100%|██████████| 250/250 [00:00<00:00, 848.31it/s]
INFO:lm-eval:Running generate_until requests
Running generate_until requests: 100%|██████████| 250/250 [30:21<00:00,  7.28s/it]
INFO:lm-eval:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234


Running task bbh_cot_fewshot_reasoning_about_colored_objects with None shots


Downloading data:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

INFO:lm-eval:Building contexts for bbh_cot_fewshot_reasoning_about_colored_objects on rank 0...
100%|██████████| 250/250 [00:00<00:00, 791.99it/s]
INFO:lm-eval:Running generate_until requests
Running generate_until requests: 100%|██████████| 250/250 [31:58<00:00,  7.67s/it]
INFO:lm-eval:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234


skipping git_hash None
skipping upper_git_hash None
Task bbh_cot_fewshot_logical_deduction_three_objects not in new_tasks: ['bbh_cot_fewshot_reasoning_about_colored_objects']
Running task bbh_cot_fewshot_temporal_sequences with None shots


Downloading data:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

INFO:lm-eval:Building contexts for bbh_cot_fewshot_temporal_sequences on rank 0...
100%|██████████| 250/250 [00:00<00:00, 563.37it/s]
INFO:lm-eval:Running generate_until requests
Running generate_until requests: 100%|██████████| 250/250 [31:56<00:00,  7.67s/it]


skipping git_hash None
skipping upper_git_hash None
Task bbh_cot_fewshot_logical_deduction_three_objects not in new_tasks: ['bbh_cot_fewshot_temporal_sequences']
Task bbh_cot_fewshot_reasoning_about_colored_objects not in new_tasks: ['bbh_cot_fewshot_temporal_sequences']
{
  "bbh_cot_fewshot_logical_deduction_three_objects": {
    "exact_match,get-answer": 0.5,
    "exact_match_stderr,get-answer": 0.031686212526223896
  },
  "bbh_cot_fewshot_reasoning_about_colored_objects": {
    "exact_match,get-answer": 0.38,
    "exact_match_stderr,get-answer": 0.030760116042626042
  },
  "bbh_cot_fewshot_temporal_sequences": {
    "exact_match,get-answer": 0.136,
    "exact_match_stderr,get-answer": 0.02172334261705207
  },
  "openllm_average": NaN,
  "classification_average": NaN,
  "generation_average": 0.33866666666666667,
  "all_average": 0.33866666666666667
}
Time taken: 5679.502562046051 seconds


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
