In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports

In [30]:
import re
import datasets
import builtins
import pandas as pd
from vllm import LLM, SamplingParams
import os
from typing import List
import sys
sys.path.append("/home/franlucc/projects/codetrace")
from codetrace.parsing_utils import placeholder_to_std_fmt, DEEPSEEK_FIM

# Codellama completions

In [3]:
# Root directory that the experiment paths in this notebook are built from.
DIR="/home/franlucc/projects/codetrace"
# Completions dataset stored on disk under the codellama_7b experiment folder.
ds = datasets.load_from_disk(f"{DIR}/experiments/codellama_7b/py_completions")
# Train split of the "nuprl-staging/py_typeinf_fim" dataset.
ds_sc1_ = datasets.load_dataset("nuprl-staging/py_typeinf_fim", split="train")
# Display both dataset summaries as the cell output.
ds, ds_sc1_

(Dataset({
     features: ['key', 'prefix', 'suffix', 'middle', 'correct', 'model', 'fim_type', 'fim_program', 'hexsha', 'generated_text'],
     num_rows: 125507
 }),
 Dataset({
     features: ['key', 'prefix', 'suffix', 'middle', 'correct', 'model', 'fim_type', 'fim_program', 'hexsha'],
     num_rows: 265879
 }))

In [4]:
# Set of every `fim_program` string present in the completions dataset.
# NOTE(review): despite its name, `hexshas` holds `fim_program` values, not the
# `hexsha` column.
hexshas = set(ds["fim_program"])
# Keep only the rows of the full dataset whose `fim_program` also occurs in `ds`.
ds_sc1 = ds_sc1_.filter(lambda x: x["fim_program"] in hexshas, num_proc=30)

In [5]:
# Materialise the filtered, full and completions datasets as DataFrames.
df_sc1 = ds_sc1.to_pandas()
df_sc1_ = ds_sc1_.to_pandas()
df = ds.to_pandas()

# For each frame (same order as above) print the `correct` value counts
# followed by the mean of the column.
for _frame in (df_sc1, df_sc1_, df):
    _labels = _frame["correct"]
    print(_labels.value_counts(), _labels.mean())

correct
True     118953
False      6554
Name: count, dtype: int64 0.9477798051104719
correct
True     134511
False    131368
Name: count, dtype: int64 0.5059105833856754
correct
False    113228
True      12279
Name: count, dtype: int64 0.09783518050786012


In [6]:
def get_pattern(fim_type:str) -> str:
    """Build the regex source used to look for an import of ``fim_type``.

    The pattern has two alternatives, each capturing the type name in its own
    group and each requiring the name to be followed directly by a newline:

    * ``from <something> import <fim_type>``
    * ``import <anything> <fim_type>``, where whitespace precedes the name

    Args:
        fim_type: the type name to search for; it is matched literally.

    Returns:
        The regex source string, ready to be passed to ``re.search``.
    """
    raw_pattern = r'from\s+.+?\s+import\s+({fim_type})\n|import\s*.*?\s+?({fim_type})\n'
    # Escape the name: without this, regex metacharacters in a type such as
    # "a.b" or "List[int]" would match unintended text or fail to compile.
    return raw_pattern.format(fim_type=re.escape(fim_type))

def is_imported(fim_type:str, fim_program:str) -> bool:
    """Report whether ``fim_program`` contains an import naming ``fim_type``.

    Performs one ``re.search`` with the pattern built by ``get_pattern`` and
    returns True only if a capture group of that first match equals
    ``fim_type``; returns False when nothing matches.
    """
    found = re.search(get_pattern(fim_type), fim_program)
    if found is None:
        return False
    # groups() holds every capture group of the match (None for unused ones).
    return fim_type in found.groups()

# Hand-written sample text with several import spellings (plain, dotted module,
# parenthesised multi-line, comma-separated list) for a manual check of
# is_imported.
prog = '''from  oop import strop
from oop.dilup import strop
import Strofhsugf
import igfhi from str

from j import (
    strop
)

from j.model import Integration, stroop

'''
# "stroop" occurs only in the comma-separated `from j.model import ...` line.
is_imported("stroop", prog)

True

In [7]:
# The names exposed by `builtins` are the same for every row, so collect them
# once instead of rebuilding and linearly scanning `dir(builtins)` per example.
_BUILTIN_NAMES = frozenset(dir(builtins))


def _annotate_fim_type(x):
    """Return the example extended with two boolean flags.

    ``is_imported_fim`` is the result of ``is_imported`` on the example's
    ``fim_type`` and ``fim_program``; ``is_builtin`` is True when ``fim_type``
    equals one of the names listed by ``dir(builtins)``.
    """
    fim_type = x["fim_type"]
    return {
        **x,
        "is_imported_fim": is_imported(fim_type, x["fim_program"]),
        "is_builtin": fim_type in _BUILTIN_NAMES,
    }


ds = ds.map(_annotate_fim_type, num_proc=30)

In [8]:
# Move the annotated dataset into pandas and add the combined flags:
# the element-wise OR and AND of `is_builtin` and `is_imported_fim`.
df = ds.to_pandas().assign(
    imported_fim_or_builtin=lambda d: d["is_builtin"] | d["is_imported_fim"],
    imported_fim_and_builtin=lambda d: d["is_builtin"] & d["is_imported_fim"],
)

# Split the rows on the boolean `correct` column.
_correct_mask = df["correct"]
df_correct = df[_correct_mask]
df_incorrect = df[~_correct_mask]

print(df["is_imported_fim"].value_counts())
print(df_correct["is_imported_fim"].value_counts())
print(df["correct"].mean())

is_imported_fim
False    106442
True      19065
Name: count, dtype: int64
is_imported_fim
False    9998
True     2281
Name: count, dtype: int64
0.09783518050786012


In [9]:
def ignore_types(d, types):
    """Drop the rows of ``d`` whose ``fim_type`` equals ``str(t)`` for any ``t`` in ``types``.

    Returns the remaining rows after ``reset_index()``, i.e. with a fresh
    index and the previous index kept as a column.
    """
    excluded = [str(t) for t in types]
    kept = d[~d["fim_type"].isin(excluded)]
    return kept.reset_index()
 
# Rows whose `is_imported_fim` flag is set, taken separately from the correct
# and the incorrect completions.
df_corr_imported = df_correct.loc[df_correct["is_imported_fim"]].reset_index()
df_incorr_imported = df_incorrect.loc[df_incorrect["is_imported_fim"]].reset_index()

# Mean of each individual flag over the correct rows.
for _flag in ("is_imported_fim", "is_builtin"):
    print(df_correct[_flag].mean())

# Value counts of every flag (individual and combined) over the correct rows.
for _flag in (
    "is_imported_fim",
    "is_builtin",
    "imported_fim_or_builtin",
    "imported_fim_and_builtin",
):
    print(df_correct[_flag].value_counts())

# Each flag's count relative to the rows where at least one flag is set,
# then the mean of the combined OR flag over all correct rows.
_n_either = df_correct["imported_fim_or_builtin"].sum()
print(df_correct["is_imported_fim"].sum() / _n_either)
print(df_correct["is_builtin"].sum() / _n_either)
print(df_correct["imported_fim_or_builtin"].mean())

# Correct rows where both flags are set, and the types they involve.
item = df_correct[df_correct["imported_fim_and_builtin"]].reset_index()
print(item["fim_type"].value_counts())

0.18576431305480903
0.7217200097727828
is_imported_fim
False    9998
True     2281
Name: count, dtype: int64
is_builtin
True     8862
False    3417
Name: count, dtype: int64
imported_fim_or_builtin
True     11138
False     1141
Name: count, dtype: int64
imported_fim_and_builtin
False    12274
True         5
Name: count, dtype: int64
0.20479439755790985
0.7956545160711079
0.9070771235442625
fim_type
bool    3
int     2
Name: count, dtype: int64


In [10]:
# Read the CSV log stored under the codellama_7b experiment directory.
muts = pd.read_csv(f"{DIR}/experiments/codellama_7b/nov7_test_all_v2_incorrect_log.csv")

In [11]:
# Spot-check one row of the log: its `generated_text`,
# `mutated_generated_text` and `fim_type` values, shown as a tuple.
k=67
muts["generated_text"][k],muts["mutated_generated_text"][k],muts["fim_type"][k]

(nan, '\n', 'float')

In [12]:
# Frequency of each distinct `mutated_generated_text` value in the log.
muts["mutated_generated_text"].value_counts()

mutated_generated_text
\n    1353
""      21
Name: count, dtype: int64

In [13]:
# Frequency of each distinct `generated_text` value among the incorrect rows.
df_incorrect["generated_text"].value_counts()

generated_text
            109052
""            2164
#              314
"""            229
~~~~~~~~       168
             ...  
any              1
subset           1
tp               1
sound            1
Se               1
Name: count, Length: 211, dtype: int64

In [14]:
# Mean of the `correct` column together with the number of non-null entries.
df["correct"].mean(), df["correct"].count()

(0.09783518050786012, 125507)

# Deepseek completions

In [15]:
# Load "nuprl-staging/py_typeinf_fim" without selecting a split; the cell
# output shows the result is a DatasetDict with a single `train` split.
py_data = datasets.load_dataset("nuprl-staging/py_typeinf_fim")
py_data

DatasetDict({
    train: Dataset({
        features: ['key', 'prefix', 'suffix', 'middle', 'correct', 'model', 'fim_type', 'fim_program', 'hexsha'],
        num_rows: 265879
    })
})

In [None]:
# Build the vLLM engine from the local deepseek-coder-6.7b-base checkpoint,
# requesting device "cuda:2".
model = LLM("/mnt/ssd/franlucc/models/deepseek-coder-6.7b-base", device="cuda:2")

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


INFO 11-08 13:36:23 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 11-08 13:36:23 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/mnt/ssd/franlucc/models/deepseek-coder-6.7b-base', speculative_config=None, tokenizer='/mnt/ssd/franlucc/models/deepseek-coder-6.7b-base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=65536, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/mnt/ssd/franlucc/models/deepseek-coder-6.7b-base, use_v2_block_manager=Fal

INFO 11-08 13:36:24 model_runner.py:720] Starting to load model /mnt/ssd/franlucc/models/deepseek-coder-6.7b-base...


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:07<00:07,  7.55s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:29<00:00, 15.85s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:29<00:00, 14.61s/it]



INFO 11-08 13:36:54 model_runner.py:732] Loading model weights took 12.5708 GB
INFO 11-08 13:36:54 gpu_executor.py:102] # GPU blocks: 7446, # CPU blocks: 512
INFO 11-08 13:36:56 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-08 13:36:56 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-08 13:37:06 model_runner.py:1225] Graph capturing finished in 10 secs.


In [39]:
def generate(llm: LLM, prompts:List[str], max_tokens:int=1)->List[str]:
    """Complete each prompt with ``llm`` and return the generated texts.

    Sampling uses ``temperature=0`` and ``n=1``.

    Args:
        llm: the vLLM engine to generate with.
        prompts: the prompts to complete.
        max_tokens: maximum number of tokens to generate per prompt. Defaults
            to 1, which is the value this function previously hard-coded.

    Returns:
        A flat list with the text of every output of every request, in the
        order ``llm.generate`` returns them.
    """
    params = SamplingParams(max_tokens=max_tokens, temperature=0, n=1)
    outputs = llm.generate(prompts, sampling_params=params)
    return [output.text for req_output in outputs for output in req_output.outputs]

In [40]:
# Small prompt for a manual check of the pipeline; `<FILL>` presumably marks
# the position the model should complete (confirm against
# placeholder_to_std_fmt).
prompt = '''
def factorial(n: <FILL>):
    pass
'''.strip()

# Convert the placeholder prompt using DEEPSEEK_FIM and generate a completion.
# NOTE(review): generate() annotates `prompts` as List[str]; confirm what
# placeholder_to_std_fmt returns here.
generate(model, placeholder_to_std_fmt(prompt, DEEPSEEK_FIM))

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 24.20it/s, est. speed input: 364.56 toks/s, output: 24.28 toks/s]


['int']