In [None]:
from separability import Model
from datasets import load_dataset

# Shared model instance: the later cells read this global `m` for generation,
# for the BigCode evaluation and for the tokenizer inspection.
m = Model("facebook/galactica-6.7b")

In [None]:
import tempfile
from tqdm import tqdm
from human_eval.data import write_jsonl
from human_eval.evaluation import evaluate_functional_correctness

def mktemp(name:str):
    """Create an empty temporary file and return its path.

    Args:
        name: Suffix the temporary file name must end with.

    Returns:
        The path of the newly created file. The file is closed but NOT
        deleted (``delete=False``), so the caller owns its cleanup.
    """
    handle = tempfile.NamedTemporaryFile(suffix=name, delete=False)
    try:
        path = handle.name
    finally:
        # Close right away; only the reserved path is needed by the caller.
        handle.close()
    return path

def gen_temp_jsonl_files():
    """Create two temporary files and return their paths as a list.

    Returns:
        ``[problems_path, samples_path]`` - the first path ends with
        ``-problems.jsonl`` and the second with ``-samples.jsonl``.
    """
    suffixes = ("-problems.jsonl", "-samples.jsonl")
    return [mktemp(suffix) for suffix in suffixes]

def load_problems(n=None):
    """Load the ``openai_humaneval`` test split as a dict keyed by task id.

    Args:
        n: If given, keep only the first ``n`` problems. A value larger than
            the number of problems in the split is clamped, so the whole
            split is returned instead of failing on out-of-range indices.
            ``None`` (default) keeps every problem.

    Returns:
        ``{task_id: problem_record}`` in dataset order, which is the mapping
        shape the human-eval helpers in this notebook are fed with.
    """
    dataset = load_dataset("openai_humaneval")["test"]

    if n is not None:
        # Never ask `select` for indices beyond the end of the split.
        n = min(n, len(dataset))
        dataset = dataset.select(indices=list(range(0, n)))

    # Load problems in human-eval dict format
    return {d["task_id"]: d for d in dataset}


def generate_one_completion(prompt):
    """Generate a completion for ``prompt`` with the notebook-global model ``m``.

    ``m.generate`` is called with ``num=100`` and ``temperature=None`` and is
    expected to yield exactly two items; the first is discarded and the second
    is returned unchanged.
    """
    # NOTE(review): presumably the pair is (input text, generated text) and
    # `num` is the number of new tokens - confirm against Model.generate.
    _, completion = m.generate(prompt, num=100, temperature=None)
    # Disabled post-processing, kept for reference:
    # completion = completion.split("\n\ndef")[0]
    return completion
 
def generate_samples(problems, num_samples_per_task: int = 1):
    """Generate completions for every problem, in human-eval sample format.

    Args:
        problems: Mapping ``task_id -> problem`` where each problem has a
            ``"prompt"`` entry (the shape returned by ``load_problems``).
        num_samples_per_task: How many completions to generate per task.
            Defaults to 1, the value that used to be hard-coded here.

    Returns:
        A list of ``{"task_id": ..., "completion": ...}`` dicts. All tasks are
        visited once per round, and rounds are repeated
        ``num_samples_per_task`` times.
    """
    samples = []
    total = num_samples_per_task * len(problems)
    # The context manager closes the progress bar even if generation raises.
    with tqdm(desc="human-eval, gen", total=total) as pbar:
        for _ in range(num_samples_per_task):
            for task_id, problem in problems.items():
                samples.append({
                    "task_id": task_id,
                    "completion": generate_one_completion(problem["prompt"]),
                })
                pbar.update(1)
    return samples

# Generate one completion for each of the first 100 HumanEval problems.
problems = load_problems(n=100)
samples = generate_samples(problems)

# Write both the (restricted) problem set and the samples to temporary JSONL
# files, since the evaluator below takes file paths.
# NOTE(review): these temp files are created with delete=False and are never
# removed here.
f_problems, f_samples = gen_temp_jsonl_files()
write_jsonl(f_problems, list(problems.values()))
write_jsonl(f_samples, samples)

# Switch TOKENIZERS_PARALLELISM off while the evaluation runs and back on
# afterwards (presumably to avoid tokenizer fork warnings from the evaluator's
# worker processes - confirm). The try/finally guarantees it is switched back
# on even when the evaluation raises.
from os import environ
environ["TOKENIZERS_PARALLELISM"] = "false"
try:
    # NOTE(review): only one sample per task is generated above, so pass@10
    # cannot be estimated from this data; human-eval appears to report only the
    # k values every task has enough samples for - confirm.
    out = evaluate_functional_correctness(
        sample_file=f_samples,
        problem_file=f_problems,
        k = [1, 10],
    )
finally:
    environ["TOKENIZERS_PARALLELISM"] = "true"

print(out)

In [None]:
import fnmatch
from accelerate import Accelerator
from typing import Optional, List
import json

from lm_eval.evaluator import Evaluator as BigCodeEvaluator
from lm_eval.tasks import ALL_TASKS as BIG_CODE_ALL_TASKS
from dataclasses import dataclass


@dataclass
class BigCodeEvalArgs:
    """Argument bundle handed to the BigCode evaluator by ``evaluate_bigcode``.

    Within this file only ``model``, ``tasks``, ``generation_only``,
    ``save_generations_path``, ``save_references``, ``metric_output_path``,
    ``revision``, ``temperature`` and ``n_samples`` are read directly; the
    remaining fields are presumably consumed by the ``lm_eval`` evaluator
    itself - confirm against that package.
    """
    model: str = "codeparrot/codeparrot-small"
    peft_model: Optional[str] = None
    revision: Optional[str] = None
    use_auth_token: bool = False
    trust_remote_code: bool = False
    # Comma-separated task-name patterns (split on "," and fnmatch-ed against
    # the known task list in evaluate_bigcode); None selects every task.
    tasks: Optional[str] = None
    instruction_tokens: Optional[str] = None
    batch_size: int = 1
    max_length_generation: int = 512
    precision: str = "fp32"
    load_in_8bit: bool = False
    load_in_4bit: bool = False
    limit: Optional[int] = None
    postprocess: bool = True
    allow_code_execution: bool = False
    # When True, evaluate_bigcode only generates and saves text instead of scoring.
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    # File that evaluate_bigcode writes the JSON results to.
    metric_output_path: str = "evaluation_results.json"
    save_generations: bool = False
    save_generations_path: str = "generations.json"
    save_references: bool = False

    # New fields
    model_ckpt: str = ""
    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.2
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 1
    eos: str = ""
    seed: int = 0

def pattern_match(patterns, source_list):
    """Return every value of ``source_list`` that matches at least one pattern.

    Args:
        patterns: Iterable of ``fnmatch``-style patterns (e.g. ``"mbpp"``,
            ``"humaneval*"``).
        source_list: Candidate names to match against.

    Returns:
        A de-duplicated list of matching names in a deterministic order:
        patterns are processed in the order given, and within one pattern the
        matches keep their ``source_list`` order. (Building the result through
        a ``set`` made the order vary between interpreter runs.)
    """
    # dict keys act as an insertion-ordered set.
    matched = {}
    for pattern in patterns:
        matched.update(dict.fromkeys(fnmatch.filter(source_list, pattern)))
    return list(matched)

def evaluate_bigcode(opt: Model, tasks: Optional[str] = "mbpp", eos_token: Optional[str] = "</s>"):
    """Run BigCode evaluation-harness tasks against ``opt`` and save the results.

    Args:
        opt: Model wrapper; its ``model_repo``, ``tokenizer`` and ``predictor``
            attributes are used.
        tasks: Comma-separated ``fnmatch`` patterns selecting task names from
            ``BIG_CODE_ALL_TASKS``; ``None`` selects every task.
        eos_token: End-of-sequence token forced onto the tokenizer before
            evaluating. The default ``"</s>"`` is the value that used to be
            hard-coded. Pass ``None`` to keep the tokenizer's own eos token,
            falling back to its bos token when it has none.

    Raises:
        ValueError: If the tokenizer ends up with neither an eos nor a bos
            token (only possible with ``eos_token=None``).

    Side effects:
        * Overwrites ``opt.tokenizer.eos_token`` / ``pad_token`` in place, so
          the change stays visible on the shared model after this call.
        * Prints the JSON results (main process only) and writes them to
          ``args.metric_output_path``.
    """
    args = BigCodeEvalArgs(
        model = opt.model_repo,
        allow_code_execution = True,
        tasks = tasks
    )

    if args.tasks is None:
        task_names = BIG_CODE_ALL_TASKS
    else:
        task_names = pattern_match(args.tasks.split(","), BIG_CODE_ALL_TASKS)

    accelerator = Accelerator()
    results = {}

    tokenizer = opt.tokenizer
    if eos_token is not None:
        tokenizer.eos_token = eos_token
    # Only reachable when no eos token was forced above.
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    # Pad with the eos token.
    tokenizer.pad_token = tokenizer.eos_token

    evaluator = BigCodeEvaluator(accelerator, opt.predictor, tokenizer, args)

    for task in task_names:
        # NOTE(review): `args` is built above without generation_only, so with
        # the dataclass default (False) this branch is not taken here.
        if args.generation_only:
            if accelerator.is_main_process:
                print("generation mode only")
            generations, references = evaluator.generate_text(task)
            if accelerator.is_main_process:
                with open(args.save_generations_path, "w") as fp:
                    json.dump(generations, fp)
                    print(f"generations were saved at {args.save_generations_path}")
                if args.save_references:
                    with open("references.json", "w") as fp:
                        json.dump(references, fp)
                        print("references were saved")
        else:
            results[task] = evaluator.evaluate(task)

    # Record the settings next to the per-task metrics.
    results["config"] = {
        "model": args.model,
        "revision": args.revision,
        "temperature": args.temperature,
        "n_samples": args.n_samples,
    }

    dumped = json.dumps(results, indent=2)
    if accelerator.is_main_process:
        print(dumped)

    # NOTE(review): unlike the print above, this write is not restricted to the
    # main process, so every process writes the same file.
    with open(args.metric_output_path, "w") as f:
        f.write(dumped)
        
# Evaluate the shared model with the default task pattern ("mbpp"); results are
# printed and written to the metric output file by evaluate_bigcode.
evaluate_bigcode(m)

In [None]:

# Special tokens known to the tokenizer, in plain and extended form.
print(m.tokenizer.all_special_tokens)
print(m.tokenizer.all_special_tokens_extended)

# Tokens behind the first 50 vocabulary ids ...
for token_id in range(50):
    print(token_id, m.tokenizer.convert_ids_to_tokens([token_id]))

# ... and behind the last 100 ids, using the model config's vocabulary size.
vocab_size = m.cfg.d_vocab
for token_id in range(vocab_size - 100, vocab_size):
    print(token_id, m.tokenizer.convert_ids_to_tokens([token_id]))