In [1]:
"""
Objective -
Generate completions from a model on the selected benchmark (HumanEval or MBPP), evaluate the generations, and write the results to a file.
"""

# --- Model source -----------------------------------------------------------
model_path = "Meta-Llama-3-8B-Instruct-oss-full-2e-4bs"  # local path / S3 name / Hugging Face name of the model
model_name = "Meta-Llama-3-8B-Instruct-oss-full-2e-4bs"  # embedded in the results file name (do NOT put any '/' or '.' in it)
tokeniser = None  # Hugging Face name / local path of the tokeniser; keep None if the model already has one
isS3 = True  # True if the model is to be loaded from S3 (make sure the ./model directory is empty)
isHF = False  # True if the model is to be loaded from Hugging Face
isLocal = False  # True if the model is to be loaded from local disk
benchmark = 'humaneval'  # choose from ["humaneval", "mbpp"]
home_path = '/home/ec2-user/SageMaker'  # absolute path of the directory that contains CodeLLMRnD

# --- Generation settings ----------------------------------------------------
run_generations = True  # set to False to skip generation and only evaluate
greedy = True  # evalplus -> greedy=True; set greedy=False to use the custom n_samples / temperature below
n_samples = 50  # evalplus -> n_samples=1, bigcode -> n_samples=50
temperature = 0.2  # evalplus -> temperature=0, bigcode -> temperature=0.2

# Name under which the merged results are uploaded at the end of the notebook.
merged_results_file_name = f"evalplusrepo_{benchmark}_results_{model_name}.jsonl"

In [2]:
!pip install -e .
!pip install -q -r requirements.txt
!pip install -q -r codegen/requirements.txt

Obtaining file:///home/ec2-user/SageMaker/CodeLLMRnD/evalplus
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: evalplus
  Building editable for evalplus (pyproject.toml) ... [?25ldone
[?25h  Created wheel for evalplus: filename=evalplus-0.1.0.dev699-0.editable-py3-none-any.whl size=12239 sha256=634fa8d7ba00440f5e3651dc69fda2cad29163d79a6f7ca9817e9f2f38b0ebeb
  Stored in directory: /tmp/pip-ephem-wheel-cache-401pw8oi/wheels/73/d0/bb/c6ae10827b44a29c868345417fd245beb475210ed86aaf5cf8
Successfully built evalplus
Installing collected packages: evalplus
  Attempting uninstall: evalplus
    Found existing installation: evalplus 0.3.0.dev25
    Uninstalling evalplus-0.3.0.dev25:
      Successfully un

In [3]:
# Local directory where generations and evaluation results are saved.
# The path must end with "/" because later cells append to it by plain string formatting.
inference_results_dir = f"{benchmark}_inference/"

In [4]:
import shlex
import sys

sys.path.append('../human_eval')
from run_bash_command import run_bash_command

# Free disk space before a new model is fetched (skipped for local models and
# for evaluate-only runs).
# WARNING: this deletes EVERY regular file larger than 500 MB under home_path,
# not only old checkpoints - make sure nothing else of value lives there.
if not isLocal and run_generations:
    # shlex.quote keeps a home_path containing spaces or shell metacharacters as
    # a single argument. `xargs -r` skips `rm` when find matches nothing; without
    # it GNU xargs runs a bare `rm`, which fails with "missing operand".
    run_bash_command(
        f"sudo find {shlex.quote(home_path)} -type f -size +500M -print0 | xargs -0 -r rm"
    )

In [5]:
import os
import shutil

# When regenerating, discard the output of any previous run so stale samples
# are not evaluated; in evaluate-only mode the existing directory is kept.
if run_generations and os.path.isdir(inference_results_dir):
    shutil.rmtree(inference_results_dir)

# Create the directory itself rather than its dirname: the result is identical
# when the path ends with "/", and it no longer raises on an empty dirname when
# the trailing "/" is missing.
os.makedirs(inference_results_dir, exist_ok=True)

In [6]:
import sys

# Fetch the model from S3 when requested. The value returned by download_model
# replaces model_path for every following cell (in the recorded run it became
# ./model/<model name>).
# NOTE: model_path is overwritten in place, so re-running this cell passes the
# already-rewritten path back to download_model - re-run the config cell first.
if isS3:
    sys.path.append('../fine_tuning/utils')
    from modelComm import ModelComm

    modelComm = ModelComm()
    model_path = modelComm.download_model(model_path)

In [None]:
# Anything that is not pulled from Hugging Face is handed to the generator as a local model.
local = (isHF == False)

# Greedy decoding needs only the --greedy flag; otherwise pass the custom sampling settings.
sampling_flags = "--greedy" if greedy else f"--temperature {temperature} --n_samples {n_samples}"
command = f"python codegen/generate.py --model {model_path} --bs 1 {sampling_flags} --root {inference_results_dir} --dataset {benchmark}"
if local:
    command = command + " --local"

# Always show the command; execute it only when generations are requested.
print(command)
if run_generations:
    run_bash_command(command)

python codegen/generate.py --model ./model/Meta-Llama-3-8B-Instruct-oss-full-2e-4bs --bs 1 --greedy --root humaneval_inference/ --dataset humaneval --local
There was a problem when trying to write in your cache folder (/JawTitan/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.
Initializing a decoder model: ./model/Meta-Llama-3-8B-Instruct-oss-full-2e-4bs ...
INFO 04-19 21:18:52 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='./model/Meta-Llama-3-8B-Instruct-oss-full-2e-4bs', tokenizer='./model/Meta-Llama-3-8B-Instruct-oss-full-2e-4bs', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
Special tokens have been added in the vocabulary, make sure the a

In [None]:
# Greedy runs are stored under a "_temp_0.0" directory, so force the temperature
# used in the samples path to 0.0 in that case.
temperature = 0.0 if greedy else temperature

samples_dir = f"{inference_results_dir}{benchmark}/{model_path}_temp_{temperature}/"
command = f"evalplus.evaluate --dataset {benchmark} --samples {samples_dir} --i-just-wanna-run"
print(command)
run_bash_command(command)

In [None]:
# Pick the EvalPlus test file that matches the selected benchmark.
if benchmark == "mbpp":
    test_file = os.path.join('data', 'MbppPlus-v0.1.0.jsonl')
else:
    test_file = os.path.join('data', 'HumanEvalPlus-v0.1.9.jsonl')

# The evaluation step writes eval_results.json into the "_temp_<temperature>"
# samples directory (0.0 for greedy runs). Derive the same suffix here instead
# of hard-coding 0.0, which pointed at the wrong directory for non-greedy runs.
eval_temperature = 0.0 if greedy else temperature
eval_results_file = f"{inference_results_dir}{benchmark}/{model_path}_temp_{eval_temperature}/eval_results.json"

command = f"python merge_results.py {eval_results_file} {test_file} {inference_results_dir}"
print(command)
run_bash_command(command)

In [None]:
# Location of the merged results file inside the results directory.
merged_results_path = os.path.join(inference_results_dir, 'final_results.jsonl')

# Make the shared fine-tuning utilities importable, then upload the merged
# results under the file name chosen in the config cell.
import sys
sys.path.append(f"{home_path}/CodeLLMRnD/fine_tuning/utils")
from resultsComm import ResultsComm

resultsComm = ResultsComm()
resultsComm.upload_results(merged_results_path, merged_results_file_name)