In [None]:
"""
Objective -
Generate completions from a model on a code benchmark (HumanEval or MBPP), evaluate the generations with EvalPlus, merge the results into a single file and upload it.
"""

# ---- Model selection ----
# Model identifier: a local path, an S3 name or a Hugging Face name.
model_path = "Meta-Llama-3-8B-Instruct-oss-full-2e-4bs"
# Label used in the uploaded results file name (do NOT put any '/' or '.' in it).
model_name = "Meta-Llama-3-8B-Instruct-oss-full-2e-4bs"
# Hugging Face name / local path of the tokeniser; leave as None if the model already ships one.
tokeniser = None

# ---- Model source flags ----
isS3 = True      # True -> download the model from S3 (make sure the ./model directory is empty)
isHF = False     # True -> load the model from Hugging Face
isLocal = False  # True -> load the model from a local path

# ---- Benchmark / environment ----
benchmark = "humaneval"  # one of ["humaneval", "mbpp"]
home_path = "/home/ec2-user/SageMaker"  # absolute path of the directory that contains CodeLLMRnD

# ---- Generation settings ----
run_generations = True  # set to False to skip generation and only evaluate
# evalplus -> greedy=True. n_samples and temperature below are only passed to the
# generation command when greedy is False, so set greedy=False to use custom values.
greedy = True
n_samples = 50     # evalplus -> n_samples=1, bigcode -> n_samples=50
temperature = 0.2  # evalplus -> temperature=0, bigcode -> temperature=0.2

# Name under which the merged results file is uploaded.
merged_results_file_name = f"evalplusrepo_{benchmark}_results_{model_name}.jsonl"

In [None]:
# Install this repository in editable mode, then its requirements and the
# code-generation requirements. -q keeps pip's output quiet.
# NOTE(review): the versions are whatever the requirements files specify;
# `!pip` uses the pip found on the shell PATH - confirm it matches the
# `python` used by the later shell commands.
!pip install -q -e .
!pip install -q -r requirements.txt
!pip install -q -r codegen/requirements.txt

In [None]:
# Local directory where generations and evaluation results are saved.
# Later cells concatenate onto this string, so it must end with a "/".
inference_results_dir = f"{benchmark}_inference/"

In [None]:
import sys
sys.path.append('../human_eval')
from run_bash_command import run_bash_command

# WARNING: destructive. Deletes EVERY regular file larger than 500 MB under
# home_path (presumably to free disk space for the model about to be fetched -
# confirm nothing else of value lives there). Only runs when the model is not
# local and generations are going to be produced.
# `xargs -r` (--no-run-if-empty) skips `rm` when `find` matches nothing;
# without it `rm` is invoked with no operands and fails with "missing operand".
if not isLocal and run_generations:
    run_bash_command(f"sudo find {home_path} -type f -size +500M -print0 | xargs -0 -r rm")

In [None]:
import os
import shutil

# Start from a clean results directory when generations are re-run; when only
# evaluating (run_generations is False) existing generations are kept.
if os.path.isdir(inference_results_dir) and run_generations:
    shutil.rmtree(inference_results_dir)

# Create the directory itself. The previous os.path.dirname() form only worked
# when the path ended with "/"; without it, dirname returned the parent (or ""
# for a single-component path, which makes os.makedirs raise).
os.makedirs(inference_results_dir, exist_ok=True)

In [None]:
import sys

# When the model lives on S3, download it and point model_path at whatever
# download_model returns, so later cells use that value.
# NOTE(review): model_path is overwritten in place, so re-running this cell
# passes the previous return value back into download_model - re-run the
# configuration cell first.
if isS3:
    utils_dir = '../fine_tuning/utils'
    sys.path.append(utils_dir)
    from modelComm import ModelComm

    modelComm = ModelComm()
    model_path = modelComm.download_model(model_path)

In [None]:
# Every model that does not come from Hugging Face is passed with --local
# (isHF is a boolean flag from the configuration cell).
local = not isHF

# Assemble the generation command from its shared and decoding-specific parts.
base_args = f"python codegen/generate.py --model {model_path} --bs 1"
if greedy:
    # Greedy decoding: temperature and n_samples are not passed.
    decoding_args = "--greedy"
else:
    decoding_args = f"--temperature {temperature} --n_samples {n_samples}"
output_args = f"--root {inference_results_dir} --dataset {benchmark}"

command = " ".join([base_args, decoding_args, output_args])
if local:
    command = f"{command} --local"

# Always show the command; only execute it when generations are requested.
print(command)
if run_generations:
    run_bash_command(command)

In [None]:
# The samples directory name ends with the decoding temperature; greedy runs
# use 0.0. Note that this overwrites the configured temperature when greedy.
temperature = 0.0 if greedy else temperature

samples_dir = f"{inference_results_dir}{benchmark}/{model_path}_temp_{temperature}/"
command = f"evalplus.evaluate --dataset {benchmark} --samples {samples_dir} --i-just-wanna-run"
print(command)
run_bash_command(command)

In [None]:
# Benchmark data file passed to merge_results.py: the MBPP+ file for "mbpp",
# the HumanEval+ file for anything else.
if benchmark == "mbpp":
    test_file = os.path.join('data', 'MbppPlus-v0.1.0.jsonl')
else:
    test_file = os.path.join('data', 'HumanEvalPlus-v0.1.9.jsonl')

# Use the same temperature suffix as the evaluation command (0.0 for greedy,
# the configured temperature otherwise). The suffix used to be hard-coded to
# 0.0, which pointed at the wrong directory for non-greedy runs.
eval_temperature = 0.0 if greedy else temperature
# presumably evalplus.evaluate writes eval_results.json into the samples directory - TODO confirm
eval_results_file = f"{inference_results_dir}{benchmark}/{model_path}_temp_{eval_temperature}/eval_results.json"

command = f"python merge_results.py {eval_results_file} {test_file} {inference_results_dir}"
print(command)
run_bash_command(command)

In [None]:
import sys

# Make the fine-tuning utilities importable via their absolute location under home_path.
utils_dir = f"{home_path}/CodeLLMRnD/fine_tuning/utils"
sys.path.append(utils_dir)
from resultsComm import ResultsComm

# Upload final_results.jsonl from the results directory under the configured file name.
merged_results_path = os.path.join(inference_results_dir, 'final_results.jsonl')
resultsComm = ResultsComm()
resultsComm.upload_results(merged_results_path, merged_results_file_name)