## Test type-inf dataset on LLMs

In [1]:
%load_ext autoreload
%autoreload 2
    
# # set cuda visible device
# !export CUDA_VISIBLE_DEVICES=0
    
!export TRANSFORMERS_CACHE=/work/arjunguha-research-group/franlucc

In [2]:
root = "/work/arjunguha-research-group"
codellama7b = f"{root}/models/CodeLlama-7b-hf"
codellama13b = f"{root}/models/CodeLlama-13b-hf"
starcoder = f"{root}/arjun/models/starcoderbase"

from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel
import torch

def clear_gpu():
    destroy_model_parallel()
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    torch.distributed.destroy_process_group()

In [16]:
clear_gpu()

UnboundLocalError: local variable 'llm' referenced before assignment

## CodeLlama 7b

CodeLlama 7b and 13b do code infilling using `<PRE> {prefix} <SUF>{suffix} <MID>`:
```
<PRE>def remove_non_ascii(s: str) -> str:
    """<SUF>
    return result
<MID>
```
or placeholder
```
def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result
```
then split and join with actual special tokens

In [3]:
from vllm import LLM, SamplingParams

llm = LLM(model=codellama7b)

INFO 01-17 20:12:18 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/models/CodeLlama-7b-hf', tokenizer='/work/arjunguha-research-group/models/CodeLlama-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-17 20:12:25 llm_engine.py:275] # GPU blocks: 7226, # CPU blocks: 512
INFO 01-17 20:12:26 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-17 20:12:26 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-17 20:12:28 model_runner.py:5

In [20]:
from build_dataset import *

codellama_fim = FimObj("<PRE> ", " <SUF>"," <MID>", "<FILL>") # codellama is finnicky with these spaces
prompts = get_prompts(targets=["bible"])

### Single token Generations

In [21]:
generate_test(prompts, llm, "codellama7b", 1, {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, codellama_fim) 

Processed prompts: 100%|██████████| 2/2 [00:00<00:00, 10.52it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  4.13it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  4.17it/s]


### Multi token Generations

In [22]:
generate_test(prompts, llm, "codellama7b", 50,  {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, codellama_fim) 

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  2.86it/s]
Processed prompts: 100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
Processed prompts: 100%|██████████| 2/2 [00:01<00:00,  1.40it/s]


### Evaluate

In [30]:
verbose=False
print(json.dumps(type_inf_eval("generations/codellama7b/singletok", verbose), indent=4))
print(json.dumps(type_inf_eval("generations/codellama7b/multitok", verbose), indent=4))

{
    "bible_fim_BookTitle_greedy_6_n1": false,
    "bible_fim_BookTitle_temp_0.2_8_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_10_n20": 0.0,
    "bible_fim_unknown_greedy_0_n1": false,
    "bible_fim_unknown_greedy_7_n1": false,
    "bible_fim_unknown_temp_0.2_1_n20": 0.0,
    "bible_fim_unknown_temp_0.2_9_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_11_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_2_n20": 0.0,
    "date_fim_DateQuery_greedy_0_n1": false,
    "date_fim_DateQuery_temp_0.2_2_n20": 0.0,
    "date_fim_DateQuery_temp_0.8-top_p_0.95_4_n20": 0.0,
    "date_fim_ValidDateQuery_greedy_1_n1": false,
    "date_fim_ValidDateQuery_temp_0.2_3_n20": 0.0,
    "date_fim_ValidDateQuery_temp_0.8-top_p_0.95_5_n20": 0.0
}
{
    "bible_fim_BookTitle_greedy_12_n1": false,
    "bible_fim_BookTitle_temp_0.2_14_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_16_n20": 0.0,
    "bible_fim_unknown_greedy_13_n1": false,
    "bible_fim_unknown_greedy_3_n1": false

### FIM sanity check
since Codellama is getting all 0s, sanity check with a FIM example from CodeLlama tut: 
[codellama infill example](https://github.com/facebookresearch/codellama/blob/main/example_infilling.py)

**EDIT** the problem was CodeLlama wants spaces in its special fim tokens


In [10]:
cont = '''def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result'''

prompts = [placeholder_to_std_fmt(cont, codellama_fim)]
print(prompts)
output = llm.generate(prompts, greedy_params)
print(output[0].outputs[0].text)

['<PRE> def remove_non_ascii(s: str) -> str:\n    """  <SUF>\n    return result <MID>']


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]

Remove non-ASCII characters from a string.

    Args:
        s (str): The string to remove non-ASCII characters from.

    Returns:
        str: The string with non-ASCII characters





## CodeLlama 13b

CodeLlama 7b and 13b do code infilling using `<PRE> {prefix} <SUF>{suffix} <MID>`:
```
<PRE>def remove_non_ascii(s: str) -> str:
    """<SUF>
    return result
<MID>
```
or placeholder
```
def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result
```
then split and join with actual special tokens

In [3]:
from vllm import LLM, SamplingParams

llm = LLM(model=codellama13b)

INFO 01-17 20:31:53 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/models/CodeLlama-13b-hf', tokenizer='/work/arjunguha-research-group/models/CodeLlama-13b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-17 20:32:01 llm_engine.py:275] # GPU blocks: 3631, # CPU blocks: 327
INFO 01-17 20:32:03 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-17 20:32:03 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-17 20:32:06 model_runner.py

In [4]:
from build_dataset import *

codellama_fim = FimObj("<PRE> ", " <SUF>"," <MID>", "<FILL>")
prompts = get_prompts(targets=["bible"])

### Single token Generations

In [5]:
generate_test(prompts, llm, "codellama13b", 1, {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, codellama_fim) 

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  6.54it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  3.29it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  3.28it/s]


### Multi token Generations

In [6]:
generate_test(prompts, llm, "codellama13b", 50,  {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, codellama_fim) 

Processed prompts: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]
Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


### Evaluate

In [10]:
verbose=False
print(json.dumps(type_inf_eval("generations/codellama13b/singletok", verbose), indent=4))
print(json.dumps(type_inf_eval("generations/codellama13b/multitok", verbose), indent=4))

{
    "bible_fim_BookTitle_greedy_0_n1": false,
    "bible_fim_BookTitle_temp_0.2_2_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_4_n20": 0.0,
    "bible_fim_unknown_greedy_1_n1": false,
    "bible_fim_unknown_temp_0.2_3_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_5_n20": 0.0,
    "date_fim_DateQuery_greedy_0_n1": false,
    "date_fim_DateQuery_temp_0.2_2_n20": 0.0,
    "date_fim_DateQuery_temp_0.8-top_p_0.95_4_n20": 0.0,
    "date_fim_ValidDateQuery_greedy_1_n1": false,
    "date_fim_ValidDateQuery_temp_0.2_3_n20": 0.0,
    "date_fim_ValidDateQuery_temp_0.8-top_p_0.95_5_n20": 0.0
}
{
    "bible_fim_BookTitle_greedy_6_n1": false,
    "bible_fim_BookTitle_temp_0.2_8_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_10_n20": 0.0,
    "bible_fim_unknown_greedy_7_n1": false,
    "bible_fim_unknown_temp_0.2_9_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_11_n20": 0.0,
    "date_fim_DateQuery_greedy_6_n1": true,
    "date_fim_DateQuery_temp_0.2_8_n20": 1.0,


## StarcoderBase

In [4]:
from vllm import LLM, SamplingParams

llm = LLM(model=starcoder)

INFO 01-17 20:26:39 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/arjun/models/starcoderbase', tokenizer='/work/arjunguha-research-group/arjun/models/starcoderbase', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-17 20:27:17 llm_engine.py:275] # GPU blocks: 131427, # CPU blocks: 13107
INFO 01-17 20:27:19 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-17 20:27:19 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-17 20:27:22 model_r

In [5]:
from build_dataset import *

prompts = get_prompts(targets=["bible"])
starcoder_fim = FimObj("<fim_prefix>", "<fim_suffix>","<fim_middle>", "<FILL>")
generate_test(prompts, 
              llm, 
              "starcoder", 
              1, 
              {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, 
              starcoder_fim) 
generate_test(prompts, 
              llm, 
              "starcoder", 
              20, 
              {"greedy":greedy_params, "temp_0.2":sampling_eval, "temp_0.8-top_p_0.95": sampling_creative}, 
              starcoder_fim) 

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  5.91it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  2.76it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  2.71it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  5.86it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  2.52it/s]
Processed prompts: 100%|██████████| 2/2 [00:01<00:00,  1.71it/s]


### Evaluation

In [9]:
verbose = False
print(json.dumps(type_inf_eval("generations/starcoder/singletok", verbose), indent=4))
print(json.dumps(type_inf_eval("generations/starcoder/multitok", verbose), indent=4))

{
    "bible_fim_BookTitle_greedy_0_n1": false,
    "bible_fim_BookTitle_temp_0.2_2_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_4_n20": 0.0,
    "bible_fim_unknown_greedy_1_n1": true,
    "bible_fim_unknown_temp_0.2_3_n20": 1.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_5_n20": 1.0,
    "date_fim_DateQuery_greedy_0_n1": false,
    "date_fim_DateQuery_temp_0.2_2_n20": 0.0,
    "date_fim_DateQuery_temp_0.8-top_p_0.95_4_n20": 0.0,
    "date_fim_ValidDateQuery_greedy_1_n1": false,
    "date_fim_ValidDateQuery_temp_0.2_3_n20": 0.0,
    "date_fim_ValidDateQuery_temp_0.8-top_p_0.95_5_n20": 0.0
}
{
    "bible_fim_BookTitle_greedy_6_n1": true,
    "bible_fim_BookTitle_temp_0.2_8_n20": 1.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_10_n20": 0.6000000000000001,
    "bible_fim_unknown_greedy_7_n1": true,
    "bible_fim_unknown_temp_0.2_9_n20": 1.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_11_n20": 1.0,
    "date_fim_DateQuery_greedy_6_n1": true,
    "date_fim_DateQuery_temp_0.2_8