## Test type-inf dataset on LLMs

In [1]:
%load_ext autoreload
%autoreload 2
    
# Restrict this kernel to GPU 3. The variable is set through os.environ, ahead
# of the vllm import below. (A `!export ...` line is not used: `!` commands run
# in a temporary subshell, so an export there never reaches this process.)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    
# !export TRANSFORMERS_CACHE=/work/arjunguha-research-group/franlucc
# root = "/work/arjunguha-research-group"

root = "/home/arjun/models/"
# NOTE: the three names below are still used by the "CodeLlama 7b",
# "CodeLlama 13b" and "StarcoderBase" sections further down. Their paths look
# relative to the commented-out cluster `root` above, so re-point and
# uncomment them before running those sections.
# codellama7b = f"{root}/models/CodeLlama-7b-hf"
# codellama13b = f"{root}/models/CodeLlama-13b-hf"
# starcoder = f"{root}/arjun/models/starcoderbase"

# os.path.join copes with the trailing slash on `root`; plain f-string
# concatenation produced ".../models//starcoderbase-7b".
starcoder7b = os.path.join(root, "starcoderbase-7b")
starcoder1b = os.path.join(root, "starcoderbase-1b")

import json
from vllm import LLM, SamplingParams
from build_dataset import *

# FIM (fill-in-the-middle) special tokens, one FimObj per model family.
# FimObj is not defined in this notebook (presumably it comes from the
# `build_dataset` star-import); its arguments look like
# (prefix token, suffix token, middle token, placeholder) -- TODO confirm.
# CodeLlama is finicky about the spaces around its tokens: "<PRE> {prefix} <SUF>{suffix} <MID>" or "<PRE> {prefix} <SUF> {suffix} <MID>"
codellama_fim = FimObj("<PRE> ", " <SUF> "," <MID>", "<FILL>")
starcoder_fim = FimObj("<fim_prefix>", "<fim_suffix>","<fim_middle>", "<FILL>")

## REPL

In [2]:
from evaluate import *
import datasets
# Load the "train" split of the stenotype evaluation dataset.
ds = datasets.load_dataset("franlucc/stenotype-eval-dataset", split="train")
# fim_dataset is a project helper (star-imported); presumably it turns each
# dataset row into fill-in-the-middle examples -- TODO confirm.
fim_examples = fim_dataset(ds)

# Start a vLLM engine on the local StarCoderBase-7b checkpoint (`starcoder7b`
# is defined in the setup cell).
llm = LLM(model=starcoder7b)

100%|██████████| 338/338 [00:02<00:00, 167.42it/s]

INFO 01-19 20:20:40 llm_engine.py:72] Initializing an LLM engine with config: model='/home/arjun/models//starcoderbase-7b', tokenizer='/home/arjun/models//starcoderbase-7b', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)





INFO 01-19 20:21:04 llm_engine.py:207] # GPU blocks: 176864, # CPU blocks: 12483


In [None]:
from evaluate import *
import subprocess
import tempfile
# Deterministic decoding (temperature 0), capped at 10 new tokens.
params = SamplingParams(temperature=0, max_tokens=10)
# Read the benchmark program inside a `with` block so the file handle is
# closed as soon as the text has been read.
with open("ts-benchmark/curried_renamed_no_examples/tree_postorder.ts") as src:
  prog = src.read()
prompts = [
  fim_prog(prog)[-2], # some FIM'd variation
]

for p in prompts:
  # Replace the generic placeholder with StarCoder's FIM special tokens.
  p = placeholder_to_std_fmt(p, starcoder_fim)
  out = llm.generate([p], params)
  generated_text = out[0].outputs[0].text
  
  # # find index of <fim_suffix> in prompt
  # suffix_idx = p.index("<fim_suffix>")
  # # display index as (row, column)
  # suffix_idx = (p[:suffix_idx].count("\n"), suffix_idx - p[:suffix_idx].rfind("\n"))
  # print(f"suffix_idx: {suffix_idx}")
  # print(f"PROG:\n{p!r}")
  # print(f"GEN:\n{generated_text}")
  
  # Write the prompt plus completion (passed through unfim) to temp.ts.
  # NOTE: every prompt writes to the same file, so with more than one prompt
  # only the last result is kept.
  with open("temp.ts", "w") as f:
    f.write(unfim(p+generated_text, starcoder_fim))
  

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  6.58it/s]


## CodeLlama 7b

CodeLlama 7b and 13b do code infilling using `<PRE> {prefix} <SUF>{suffix} <MID>`:
```
<PRE>def remove_non_ascii(s: str) -> str:
    """<SUF>
    return result
<MID>
```
or, equivalently, written with a single `<FILL>` placeholder marking the hole:
```
def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result
```
The placeholder form is then split at `<FILL>` and re-joined using the model's actual special tokens.

In [2]:
# NOTE(review): `codellama7b` is only assigned on a commented-out line in the
# setup cell at the top of the notebook, so on a fresh kernel this line fails
# with NameError (unless a star-import happens to provide the name). Restore
# that assignment before running this section.
llm = LLM(model=codellama7b)

INFO 01-18 15:45:38 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/models/CodeLlama-7b-hf', tokenizer='/work/arjunguha-research-group/models/CodeLlama-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-18 15:45:43 llm_engine.py:275] # GPU blocks: 7226, # CPU blocks: 512
INFO 01-18 15:45:44 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-18 15:45:44 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-18 15:45:47 model_runner.py:5

### Single token Generations

In [3]:
# Build the default prompt set, then generate with CodeLlama-7b under three
# decoding settings. The positional `1` is presumably the generation length in
# tokens (this is the single-token section) -- confirm against generate_test.
prompts = get_prompts()
generate_test(
    prompts,
    llm,
    "codellama7b",
    1,
    {
        "greedy": greedy_params,
        "temp_0.2": sampling_eval,
        "temp_0.8-top_p_0.95": sampling_creative,
    },
    codellama_fim,
)

Processed prompts: 100%|██████████| 15/15 [00:01<00:00, 10.51it/s]
Processed prompts: 100%|██████████| 15/15 [00:04<00:00,  3.56it/s]
Processed prompts: 100%|██████████| 15/15 [00:04<00:00,  3.47it/s]


### Multi token Generations

In [8]:
# Same three decoding settings as the single-token run, but with 50 as the
# fourth argument (presumably the generation length -- confirm against
# generate_test). Reuses `prompts` from the single-token cell.
generate_test(
    prompts,
    llm,
    "codellama7b",
    50,
    {
        "greedy": greedy_params,
        "temp_0.2": sampling_eval,
        "temp_0.8-top_p_0.95": sampling_creative,
    },
    codellama_fim,
)

Processed prompts: 100%|██████████| 15/15 [00:02<00:00,  7.49it/s]
Processed prompts: 100%|██████████| 15/15 [00:05<00:00,  2.59it/s]
Processed prompts: 100%|██████████| 15/15 [00:06<00:00,  2.49it/s]


### Evaluate

In [14]:
# Passed to type_inf_eval by the evaluation statements in this cell.
verbose = True

def pretty_print(dict, targets = None):
    """Print a results mapping as 4-space-indented JSON.

    When `targets` is non-empty, only entries whose key contains every
    string in `targets` are printed; otherwise the mapping is printed as is.
    """
    shown = dict
    if targets:
        shown = {}
        for key, value in dict.items():
            # Keep the entry only if each target substring occurs in the key.
            if all([needle in key for needle in targets]):
                shown[key] = value
    print(json.dumps(shown, indent=4))

# Only keys containing "greedy" are printed.
targ = ["greedy"]
# Evaluate the current generations first, then the `v0_` directory
# (presumably an earlier run). Only the multi-token results are printed.
for singletok_dir, multitok_dir in (
    ("generations/codellama7b/singletok", "generations/codellama7b/multitok"),
    ("generations/v0_codellama7b/singletok", "generations/v0_codellama7b/multitok"),
):
    st = type_inf_eval(singletok_dir, verbose)
    mt = type_inf_eval(multitok_dir, verbose)
    # pretty_print(st, targ)
    pretty_print(mt, targ)

{
    "bible_fim_BookTitle_greedy_45_n1": [
        false,
        [
            "BookTitle | unknown"
        ],
        "BookTitle"
    ],
    "bible_fim_unknown_greedy_46_n1": [
        true,
        [
            "unknown"
        ],
        "unknown"
    ],
    "bookmark_fim_null_greedy_47_n1": [
        true,
        [
            "null"
        ],
        "null"
    ],
    "date_fim_DateQuery_greedy_48_n1": [
        false,
        [
            "DateQuery {\n\tif (readableDate=='') return empty()\n\tif (readableDate.match(/^(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)T(\\d\\"
        ],
        "DateQuery"
    ],
    "date_fim_ValidDateQuery_greedy_49_n1": [
        true,
        [
            "ValidDateQuery"
        ],
        "ValidDateQuery"
    ],
    "election_fim_T[]_greedy_50_n1": [
        false,
        [
            "T[]): boolean {\n  const sortedA = sortedArrayOfAnyElectionObjects(a);\n  const sortedB = sortedArrayOfAnyElectionObjects(b);\n  if (sortedA.length !== sortedB.leng

### FIM sanity check
Since CodeLlama is scoring 0 on every example, sanity-check FIM with the infilling example from the CodeLlama repository:
[codellama infill example](https://github.com/facebookresearch/codellama/blob/main/example_infilling.py)

**EDIT:** the problem was that CodeLlama expects spaces around its special FIM tokens (see `codellama_fim` in the setup cell).


In [10]:
# Docstring-infilling example; <FILL> marks the hole the model should fill.
cont = '''def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result'''

# Convert the placeholder form into CodeLlama's FIM prompt format and print it
# so the exact string sent to the model can be inspected.
# NOTE(review): this rebinds the notebook-global `prompts`, replacing the list
# built by get_prompts(); re-run that cell before calling generate_test again.
prompts = [placeholder_to_std_fmt(cont, codellama_fim)]
print(prompts)
# `greedy_params` is not defined in this notebook (presumably star-imported).
output = llm.generate(prompts, greedy_params)
# Text of the first completion for the single prompt.
print(output[0].outputs[0].text)

['<PRE> def remove_non_ascii(s: str) -> str:\n    """  <SUF>\n    return result <MID>']


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]

Remove non-ASCII characters from a string.

    Args:
        s (str): The string to remove non-ASCII characters from.

    Returns:
        str: The string with non-ASCII characters





## CodeLlama 13b

CodeLlama 7b and 13b do code infilling using `<PRE> {prefix} <SUF>{suffix} <MID>`:
```
<PRE>def remove_non_ascii(s: str) -> str:
    """<SUF>
    return result
<MID>
```
or, equivalently, written with a single `<FILL>` placeholder marking the hole:
```
def remove_non_ascii(s: str) -> str:
    """ <FILL>
    return result
```
The placeholder form is then split at `<FILL>` and re-joined using the model's actual special tokens.

In [3]:
# NOTE(review): `codellama13b` is only assigned on a commented-out line in the
# setup cell at the top of the notebook, so on a fresh kernel this line fails
# with NameError (unless a star-import happens to provide the name). Restore
# that assignment before running this section.
llm = LLM(model=codellama13b)

INFO 01-17 20:31:53 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/models/CodeLlama-13b-hf', tokenizer='/work/arjunguha-research-group/models/CodeLlama-13b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-17 20:32:01 llm_engine.py:275] # GPU blocks: 3631, # CPU blocks: 327
INFO 01-17 20:32:03 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-17 20:32:03 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-17 20:32:06 model_runner.py

### Single token Generations

In [5]:
# Restrict the prompt set to the "bible" target, then generate with
# CodeLlama-13b under three decoding settings. The positional `1` is
# presumably the generation length in tokens -- confirm against generate_test.
prompts = get_prompts(targets=["bible"])
generate_test(
    prompts,
    llm,
    "codellama13b",
    1,
    {
        "greedy": greedy_params,
        "temp_0.2": sampling_eval,
        "temp_0.8-top_p_0.95": sampling_creative,
    },
    codellama_fim,
)

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  6.54it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  3.29it/s]
Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  3.28it/s]


### Multi token Generations

In [6]:
# Same three decoding settings as the single-token run, but with 50 as the
# fourth argument (presumably the generation length -- confirm against
# generate_test). Reuses `prompts` from the single-token cell.
generate_test(
    prompts,
    llm,
    "codellama13b",
    50,
    {
        "greedy": greedy_params,
        "temp_0.2": sampling_eval,
        "temp_0.8-top_p_0.95": sampling_creative,
    },
    codellama_fim,
)

Processed prompts: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]
Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
Processed prompts: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


### Evaluate

In [10]:
# Evaluate the single-token results, then the multi-token results, and print
# each result mapping as indented JSON.
verbose = False
for results_dir in (
    "generations/codellama13b/singletok",
    "generations/codellama13b/multitok",
):
    print(json.dumps(type_inf_eval(results_dir, verbose), indent=4))

{
    "bible_fim_BookTitle_greedy_0_n1": false,
    "bible_fim_BookTitle_temp_0.2_2_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_4_n20": 0.0,
    "bible_fim_unknown_greedy_1_n1": false,
    "bible_fim_unknown_temp_0.2_3_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_5_n20": 0.0,
    "date_fim_DateQuery_greedy_0_n1": false,
    "date_fim_DateQuery_temp_0.2_2_n20": 0.0,
    "date_fim_DateQuery_temp_0.8-top_p_0.95_4_n20": 0.0,
    "date_fim_ValidDateQuery_greedy_1_n1": false,
    "date_fim_ValidDateQuery_temp_0.2_3_n20": 0.0,
    "date_fim_ValidDateQuery_temp_0.8-top_p_0.95_5_n20": 0.0
}
{
    "bible_fim_BookTitle_greedy_6_n1": false,
    "bible_fim_BookTitle_temp_0.2_8_n20": 0.0,
    "bible_fim_BookTitle_temp_0.8-top_p_0.95_10_n20": 0.0,
    "bible_fim_unknown_greedy_7_n1": false,
    "bible_fim_unknown_temp_0.2_9_n20": 0.0,
    "bible_fim_unknown_temp_0.8-top_p_0.95_11_n20": 0.0,
    "date_fim_DateQuery_greedy_6_n1": true,
    "date_fim_DateQuery_temp_0.2_8_n20": 1.0,


## StarcoderBase

In [2]:
# NOTE(review): `starcoder` is only assigned on a commented-out line in the
# setup cell at the top of the notebook, so on a fresh kernel this line fails
# with NameError (unless a star-import happens to provide the name). Restore
# that assignment before running this section.
llm = LLM(model=starcoder)

INFO 01-18 13:44:40 llm_engine.py:70] Initializing an LLM engine with config: model='/work/arjunguha-research-group/arjun/models/starcoderbase', tokenizer='/work/arjunguha-research-group/arjun/models/starcoderbase', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-18 13:45:20 llm_engine.py:275] # GPU blocks: 131427, # CPU blocks: 13107
INFO 01-18 13:45:22 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-18 13:45:22 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-18 13:45:25 model_r

In [3]:
# Generate with StarcoderBase twice over the same prompts: first with 1, then
# with 20 as the fourth argument (presumably the generation length in tokens --
# confirm against generate_test). Each call gets a fresh settings dict.
prompts = get_prompts()
for gen_len in (1, 20):
    generate_test(
        prompts,
        llm,
        "starcoder",
        gen_len,
        {
            "greedy": greedy_params,
            "temp_0.2": sampling_eval,
            "temp_0.8-top_p_0.95": sampling_creative,
        },
        starcoder_fim,
    )

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


### Evaluation

In [8]:
# Passed to type_inf_eval by the evaluation statements in this cell.
verbose = True

def pretty_print(dict, targets = None):
    """Print a results mapping as 4-space-indented JSON.

    When `targets` is non-empty, only entries whose key contains every
    string in `targets` are printed; otherwise the mapping is printed as is.
    """
    shown = dict
    if targets:
        shown = {}
        for key, value in dict.items():
            # Keep the entry only if each target substring occurs in the key.
            if all([needle in key for needle in targets]):
                shown[key] = value
    print(json.dumps(shown, indent=4))

# Show only entries whose key contains both "greedy" and "transaction".
targ = ["greedy", "transaction"]

st = type_inf_eval("generations/starcoder/singletok", verbose)
mt = type_inf_eval("generations/starcoder/multitok", verbose)

# Single-token results first, then multi-token.
for results in (st, mt):
    pretty_print(results, targ)

{
    "transaction_fim_any_greedy_54_n1": [
        true,
        [
            "any"
        ],
        "any"
    ],
    "transaction_fim_void_greedy_55_n1": [
        false,
        [
            "Promise"
        ],
        "void"
    ]
}
{
    "transaction_fim_any_greedy_60_n1": [
        true,
        [
            "any"
        ],
        "any"
    ],
    "transaction_fim_void_greedy_61_n1": [
        false,
        [
            "Promise<void>"
        ],
        "void"
    ]
}
