Imports

In [43]:
import pandas as pd
import general_adapter
from openai import OpenAI
import re

Args

In [44]:
class Args():
    file_path_base = 'CoderEval4Java.json'
    file_path_label = 'CEJavaHumanLabel.jsonl'

    # prompt stuff
    prompt_num = 37

    # model stuff
    model = 'gpt-4-turbo'
    key = ''

args=Args()

Load Dataset and Print Columns

In [45]:


with open(args.file_path_base, 'r') as file:
    df_base = pd.json_normalize(pd.read_json(file)['RECORDS'])

df_label = pd.read_json(args.file_path_label, lines=True)

print(df_base.columns)
print(df_label.columns)

Index(['_id', 'all_context', 'class_name', 'code', 'docstring', 'end_lineno',
       'file_content', 'file_name', 'human_label', 'level', 'lineno', 'name',
       'oracle_context', 'package', 'project'],
      dtype='object')
Index(['question_id', 'input', 'signature', 'docstring'], dtype='object')


Filter Dataframes To Only Include All Self-Contained Prompts

In [46]:
# truncate df_base to only include prompts with level as "self_contained"
df_base_filtered = df_base[df_base['level'] == 'self_contained']
# reset index
df_base_filtered.reset_index(drop=True, inplace=True)
print(df_base_filtered.shape)
# saved all the question_ids in df_base_filtered to a list
self_contained_ids = df_base_filtered['_id'].tolist()

# filter df_label to only include rows with question_id in question_ids
df_label_filtered = df_label[df_label['question_id'].isin(self_contained_ids)]
# reset index
df_label_filtered.reset_index(drop=True, inplace=True)
print(df_label_filtered.shape)

(55, 15)
(55, 4)


Print Content of Selected Prompt

In [47]:
# for col in df_base.columns:
#     print(f'{col}: {df_base[col][args.prompt_num]}')

for col in df_label_filtered.columns:
    print(f'{col}: {df_label_filtered[col][args.prompt_num]}')

question_id: 636767de1a6d9265ec01871e
input: /**Check whether the key contains null.**/
private void check(String modelName) throws IllegalStateException {
signature: private void check(String modelName) throws IllegalStateException {
docstring: Check whether the key contains null.


Printing Delta Components

In [48]:
function_header = df_label_filtered['signature'][args.prompt_num]
docstring = df_label_filtered['docstring'][args.prompt_num]
print('Prompt: ', args.prompt_num)
print('-'*50)
print('FUNCTION HEADER')
print(function_header)
print('-'*50)
print('DOCSTRING')
print(docstring)

Prompt:  37
--------------------------------------------------
FUNCTION HEADER
private void check(String modelName) throws IllegalStateException {
--------------------------------------------------
DOCSTRING
Check whether the key contains null.


Make Deltas (Combinations)

In [49]:

pattern = r"\b(public|private|protected)?\s*(static)?\s*([\w\[\]<>, ]+)\s+(\w+)\s*\(([^)]*)\)\s*(throws\s+[\w\., ]+)?\s*{"

def extract_entry_point(function_header):
    match = re.search(pattern, function_header)
    if match:
        return match.group(4)

def normalized_function_header(function_header):
    entry_point = extract_entry_point(function_header)
    if entry_point:
        return re.sub(entry_point, "func", function_header)
    return function_header

def del_underscore_and_caps(entry_point):
    return entry_point.capitalize()

def capitalize_first_letter(input_str):
    if not input_str:
        return input_str
    first_char = input_str[0].upper()
    rest_of_string = input_str[1:]
    return first_char + rest_of_string

In [50]:
df_entry_point = pd.DataFrame(columns=['prompt', 'function_header', 'entry_point'])

for i in range(df_label_filtered.shape[0]):
    function_header = df_label_filtered['signature'][i]
    entry_point = extract_entry_point(function_header)
    row = pd.DataFrame([{'prompt': i, 'function_header': function_header, 'entry_point': entry_point}])
    df_entry_point = pd.concat([df_entry_point, row], ignore_index=True)
    # export as csv
df_entry_point.to_csv('entry_points.csv', index=False)

In [51]:
def create_deltas(function_header, docstring):
    deltas_dict = {}

    # Pre-Transfromation Deltas
    deltas_dict['delta_1'] = f'{function_header}\n"""\n{docstring}\n"""\n'
    deltas_dict['delta_2'] = f'{docstring}\nCreate a Java method named {extract_entry_point(function_header)}'
    deltas_dict['delta_3'] = f'{normalized_function_header(function_header)}\n"""\n{docstring}\n"""\n'
    # deltas_dict['delta_4'] = f'{docstring}\n{function_header}'
    deltas_dict['delta_4'] = f'{function_header}'

    # Post-Transformation Deltas
    docstring_transform = docstring.title() # capitalize the first letter of each word in the docstring
    deadcode_transform = f'{function_header}\n\tif (False) {{\n\t\tint[] x = new int[42];\n\t\tfor (int i = 0; i < 42; i++) {{\n\t\t\tx[i] = i;\n\t\t}}\n\t}}'
    entry_point_transform = del_underscore_and_caps(extract_entry_point(function_header)) # transform the function name
    entry_point_function_header_transform = function_header.replace(extract_entry_point(function_header), entry_point_transform) # replace the function name with a new name
    entry_point_docstring_transform = docstring.replace(extract_entry_point(function_header), entry_point_transform) # replace the function name with a new name

    deltas_dict['delta_5'] = f'{function_header}\n"""\n{docstring_transform}\n"""\n'
    deltas_dict['delta_6'] = f'{deadcode_transform}\n"""\n{docstring}\n"""\n'
    deltas_dict['delta_7'] = f'{entry_point_function_header_transform}\n"""\n{entry_point_docstring_transform}\n"""\n'

    # New Deltas
    prefix_docstring = f'DOCSTRING: {docstring}'
    entry_point = extract_entry_point(function_header)
    prefix_entry_point = f'method{capitalize_first_letter(entry_point)}'
    print(prefix_entry_point)
    function_header_prefix = function_header.replace(entry_point, prefix_entry_point)
    deltas_dict['delta_8'] = f'{function_header}\n"""\n{prefix_docstring}\n"""\n'
    deltas_dict['delta_9'] = f'{function_header_prefix}\n"""\n{docstring.replace(entry_point, prefix_entry_point)}\n"""\n'

    return deltas_dict

LLM Inference Helper Functions

In [52]:
def generate_openai_output(delta):
    question = f"{delta}"
    client = OpenAI(api_key=args.key)
    response = client.chat.completions.create(
        model=args.model,
        messages=[{'role': 'user', 'content': question}],
        max_tokens=2048,
        temperature=0
    )
    answer = response.choices[0].message.content.strip()
    return answer

In [53]:
df_new_deltas = pd.DataFrame(columns=['_id', 'delta_num', 'delta_val'])

for i in range(len(df_label_filtered)):
    id = self_contained_ids[i]
    function_header = df_label_filtered['signature'][i]
    docstring = df_label_filtered['docstring'][i]
    deltas_dict = create_deltas(function_header, docstring)
    row1 = pd.DataFrame([{'_id': id, 'delta_num': '8', 'delta_val': deltas_dict['delta_8']}])
    row2 = pd.DataFrame([{'_id': id, 'delta_num': '9', 'delta_val': deltas_dict['delta_9']}])
    df_new_deltas = pd.concat([df_new_deltas, row1], ignore_index=True)
    df_new_deltas = pd.concat([df_new_deltas, row2], ignore_index=True)

# export the new deltas to a jsonl file
df_new_deltas.to_json('codereval-java_delta8_delta9.jsonl', orient='records', lines=True)

methodTrimArrayElements
methodIsSameLength
methodPop
methodToPrimitive
methodShouldPrintMessage
methodToString
methodVisitFrameStart
methodParseEndOfLine
methodToBoolean
methodAccept
methodNullToEmpty
methodNullToEmpty
methodBuildTimeRanges
methodToPrimitive
methodComputeUTF8Size
methodRemove
methodStartsWithIgnoreCase
methodEndsWith
methodNullToEmpty
methodNullToEmpty
methodSwap
methodIsEmpty
methodCopyStrings
methodEnlarge
methodInsert
methodIsNotTrue
methodToPrimitive
methodToString
methodToChar
methodDigitValue
methodComputeBinaryLog
methodIsAllZeros
methodReadLabel
methodToBoolean
methodToString
methodFactorial
methodCompareSigned
methodCheck
methodToPrimitive
methodIsAsciiControl
methodToByteArray
methodReload
methodCountOccurrencesOf
methodPop
methodOverrideName
methodToPrimitive
methodStartsWith
methodIsAsciiAlphaUpper
methodLength
methodIsHex
methodIsSameLength
methodClone
methodArrayequals
methodIsAscii
methodEncodeTemplateNames


Generate LLM Output

In [54]:
df_results = pd.DataFrame(columns=['prompt', 'delta', 'code', 'llm_output'])

for prompt_index in range(df_label_filtered.shape[0]):
    print(f'Prompt: {prompt_index}')
    function_header = df_label_filtered['signature'][prompt_index]
    docstring = df_label_filtered['docstring'][prompt_index]
    deltas_dict = create_deltas(function_header, docstring)

    # temporary: remove all deltas except delta_8 and delta_9
    deltas_dict = {key: deltas_dict[key] for key in ['delta_8', 'delta_9']}

    for delta_index, delta in enumerate(deltas_dict):
        print(f"Generating Output for Delta {delta_index + 1} of {len(deltas_dict)}")
        llm_output = generate_openai_output(deltas_dict[delta])
        code = general_adapter.extract_java_code(llm_output)
        row = pd.DataFrame([{'prompt': prompt_index, 'delta': delta, 'code': code, 'llm_output': llm_output}])
        df_results = pd.concat([df_results, row], ignore_inndex=True)

Prompt: 0
methodTrimArrayElements
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 1
methodIsSameLength
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 2
methodPop
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 3
methodToPrimitive
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 4
methodShouldPrintMessage
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 5
methodToString
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 6
methodVisitFrameStart
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 7
methodParseEndOfLine
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 8
methodToBoolean
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 9
methodAccept
Generating Output for Delta 1 of 2
Generating Output for Delta 2 of 2
Prompt: 10
method

Save Results to JSON

In [55]:
# Save results to JSONL file
df_results.to_json(f"{args.model}_codereval-java_delta8_delta9.jsonl", orient='records', lines=True)