Imports

In [None]:
import os
import re

import pandas as pd
from openai import OpenAI

import general_adapter

Args

In [None]:
class Args():
    """Notebook-wide configuration: input files, preview row, and model settings."""

    # input files
    file_path_base = 'CoderEval4Python.json'
    file_path_label = 'CEPythonHumanLabel.jsonl'

    # prompt stuff
    # row of the filtered label frame shown by the debug/preview cells
    prompt_num = 10

    # model stuff
    model = 'gpt-4-turbo'
    # Taken from the environment so the secret never has to be pasted into the
    # notebook; stays '' (the previous default) when the variable is not set.
    key = os.environ.get('OPENAI_API_KEY', '')

args=Args()

Load Dataset and Print Columns

In [None]:


# Base dataset: the records live under the top-level 'RECORDS' key and are
# flattened into one row per record. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's locale default.
with open(args.file_path_base, 'r', encoding='utf-8') as file:
    df_base = pd.json_normalize(pd.read_json(file)['RECORDS'])

# Label dataset: JSON Lines, one record per line.
df_label = pd.read_json(args.file_path_label, lines=True)

print(df_base.columns)
print(df_label.columns)

Filter Dataframes To Only Include All Self-Contained Prompts

In [None]:
# Keep only the base rows whose level is "self_contained", renumbered from 0.
is_self_contained = df_base['level'] == 'self_contained'
df_base_filtered = df_base[is_self_contained].reset_index(drop=True)
print(df_base_filtered.shape)

# Ids of the retained base rows, in base-frame order.
self_contained_ids = df_base_filtered['_id'].tolist()
for task_id in self_contained_ids:
    print(task_id)

# Keep only the label rows whose question_id is one of those ids, renumbered from 0.
has_self_contained_id = df_label['question_id'].isin(self_contained_ids)
df_label_filtered = df_label[has_self_contained_id].reset_index(drop=True)
print(df_label_filtered.shape)

Print Content of Selected Prompt (DEBUG)

In [None]:
# To inspect the unfiltered base dataset instead, run the same loop over
# df_base.columns and read df_base[col][args.prompt_num].

# Dump every field of the selected row of the filtered label frame.
selected_row = args.prompt_num
for column_name in df_label_filtered.columns:
    field_value = df_label_filtered[column_name][selected_row]
    print(f'{column_name}: {field_value}')

Printing Delta Components

In [None]:
# Pull the two building blocks of every delta for the selected row.
function_header = df_label_filtered.at[args.prompt_num, 'signature']
docstring = df_label_filtered.at[args.prompt_num, 'docstring']

divider = '-'*50
print('Prompt: ', args.prompt_num)
for section_title, section_body in (
    ('FUNCTION HEADER', function_header),
    ('DOCSTRING', docstring),
):
    print(divider)
    print(section_title)
    print(section_body)

Make Deltas (Combinations)

In [None]:
# Strict form of a ``def`` statement: name, a parameter list without nested
# parentheses, an optional simple return annotation, and the trailing colon.
pattern = r"\bdef\s+(\w+)\s*\(([^)]*)\)(\s*->\s*[\w\[\],\s]*)?\s*:"

# Lenient fallback used only when the strict pattern finds nothing, e.g. for
# defaults containing calls, dotted/quoted annotations, or a missing colon.
_def_name_pattern = r"\bdef\s+(\w+)"

def extract_entry_point(function_header):
    """Return the name of the function defined in ``function_header``.

    The strict ``pattern`` is tried first; if it does not match anywhere, the
    identifier following the first ``def`` keyword is used instead.

    Raises:
        ValueError: if ``function_header`` contains no ``def <name>`` at all.
    """
    match = re.search(pattern, function_header) or re.search(_def_name_pattern, function_header)
    if match is None:
        raise ValueError(f'No function definition found in header: {function_header!r}')
    return match.group(1)

def normalized_function_header(function_header):
    """Return ``function_header`` with the defined function renamed to ``func``.

    Only the identifier that directly follows ``def`` is rewritten, so
    parameter names (or the ``def`` keyword itself) that merely contain the
    function name as a substring are left untouched.
    """
    entry_point = extract_entry_point(function_header)
    return re.sub(
        rf'(\bdef\s+){re.escape(entry_point)}\b',
        r'\g<1>func',
        function_header,
        count=1,
    )

def del_underscore_and_caps(entry_point):
    """Drop underscores from ``entry_point`` and capitalize each resulting piece.

    Each piece goes through ``str.capitalize``, so its first character is
    upper-cased and the remaining characters are lower-cased. A name without
    underscores is a single piece and is capitalized as a whole.
    """
    pieces = entry_point.split('_')
    return ''.join(piece.capitalize() for piece in pieces)

In [None]:
def _rename_definition(function_header, old_name, new_name):
    """Rename only the identifier that directly follows ``def`` in ``function_header``."""
    return re.sub(
        rf'(\bdef\s+){re.escape(old_name)}\b',
        lambda match: match.group(1) + new_name,
        function_header,
        count=1,
    )


def _rename_mentions(text, old_name, new_name):
    """Replace whole-word occurrences of ``old_name`` in ``text`` with ``new_name``."""
    return re.sub(rf'\b{re.escape(old_name)}\b', lambda _match: new_name, text)


def create_deltas(function_header, docstring):
    """Build the prompt variants ("deltas") for one function header/docstring pair.

    Args:
        function_header: the ``def ...`` line of the target function.
        docstring: the natural-language description of the function.

    Returns:
        dict mapping ``'delta_1'`` .. ``'delta_9'`` to prompt strings.

    Renames applied for delta_7 and delta_9 touch only the defined name in the
    header and only whole-word mentions in the docstring, so parameters or
    words that merely contain the function name are not altered.
    """
    entry_point = extract_entry_point(function_header)
    deltas_dict = {}

    # Pre-Transformation Deltas
    deltas_dict['delta_1'] = f'{function_header}\n"""\n{docstring}\n"""\n'
    deltas_dict['delta_2'] = f'{docstring}\nCreate a function named {entry_point}'
    deltas_dict['delta_3'] = f'{normalized_function_header(function_header)}\n"""\n{docstring}\n"""\n'
    # deltas_dict['delta_4'] = f'{docstring}\n{function_header}'
    deltas_dict['delta_4'] = f'{function_header}'

    # Post-Transformation Deltas
    # str.title(): upper-case the first letter of each word, lower-case the rest
    docstring_transform = docstring.title()
    # header followed by a branch that never runs (guarded by ``if False``)
    deadcode_transform = f'{function_header}\n\tif False:\n\t\tx=[_ for i in range(42)]'
    # function name with underscores removed and pieces capitalized
    entry_point_transform = del_underscore_and_caps(entry_point)
    entry_point_function_header_transform = _rename_definition(function_header, entry_point, entry_point_transform)
    entry_point_docstring_transform = _rename_mentions(docstring, entry_point, entry_point_transform)

    deltas_dict['delta_5'] = f'{function_header}\n"""\n{docstring_transform}\n"""\n'
    deltas_dict['delta_6'] = f'{deadcode_transform}\n"""\n{docstring}\n"""\n'
    deltas_dict['delta_7'] = f'{entry_point_function_header_transform}\n"""\n{entry_point_docstring_transform}\n"""\n'

    # New Deltas
    prefix_docstring = f'DOCSTRING: {docstring}'
    prefix_entry_point = f'func_{entry_point}'
    function_header_prefix = _rename_definition(function_header, entry_point, prefix_entry_point)
    docstring_prefix = _rename_mentions(docstring, entry_point, prefix_entry_point)
    deltas_dict['delta_8'] = f'{function_header}\n"""\n{prefix_docstring}\n"""\n'
    deltas_dict['delta_9'] = f'{function_header_prefix}\n"""\n{docstring_prefix}\n"""\n'

    return deltas_dict


LLM Inference Helper Functions

In [None]:
# One client per API key, so repeated calls reuse it instead of building a new
# client for every request.
_openai_clients = {}


def _get_openai_client(api_key):
    """Return the cached OpenAI client for ``api_key``, creating it on first use."""
    if api_key not in _openai_clients:
        # An empty key is passed as None so the SDK can fall back to its own
        # default (the OPENAI_API_KEY environment variable).
        _openai_clients[api_key] = OpenAI(api_key=api_key or None)
    return _openai_clients[api_key]


def generate_openai_output(delta):
    """Send ``delta`` as a single user message and return the model's reply text.

    Uses ``args.model`` and ``args.key``; the request is made with
    ``temperature=0`` and at most 2048 completion tokens.

    Returns:
        The reply with surrounding whitespace stripped, or ``''`` when the
        response carries no text content.
    """
    question = f"{delta}"
    client = _get_openai_client(args.key)
    response = client.chat.completions.create(
        model=args.model,
        messages=[{'role': 'user', 'content': question}],
        max_tokens=2048,
        temperature=0
    )
    # message.content may be None; treat that as an empty answer rather than
    # failing on None.strip() in the middle of a long run.
    content = response.choices[0].message.content
    answer = (content or '').strip()
    return answer

In [None]:
# Build one record per (question, delta) pair for delta_8 and delta_9.
# The id is read from the same df_label_filtered row as the signature and
# docstring: self_contained_ids follows df_base_filtered's order and length,
# which need not match df_label_filtered row for row.
delta_records = []
for question_id, function_header, docstring in zip(
    df_label_filtered['question_id'],
    df_label_filtered['signature'],
    df_label_filtered['docstring'],
):
    deltas_dict = create_deltas(function_header, docstring)
    for delta_num in ('8', '9'):
        delta_records.append({
            '_id': question_id,
            'delta_num': delta_num,
            'delta_val': deltas_dict[f'delta_{delta_num}'],
        })

# Construct the frame once instead of concatenating inside the loop.
df_new_deltas = pd.DataFrame(delta_records, columns=['_id', 'delta_num', 'delta_val'])

# export the new deltas to a jsonl file
df_new_deltas.to_json('codereval-python_delta8_delta9.jsonl', orient='records', lines=True)


Generate LLM Output

In [None]:
result_columns = ['prompt', 'delta', 'code', 'llm_output']
# temporary: only delta_8 and delta_9 are sent to the model
selected_deltas = ['delta_8', 'delta_9']

# Rows are collected in a list and turned into a frame once. The ``finally``
# clause builds df_results even if a request fails or the run is interrupted,
# so everything generated up to that point stays available.
result_records = []
try:
    for prompt_index in range(df_label_filtered.shape[0]):
        print(f'Prompt: {prompt_index}')
        function_header = df_label_filtered['signature'][prompt_index]
        docstring = df_label_filtered['docstring'][prompt_index]
        deltas_dict = create_deltas(function_header, docstring)
        deltas_dict = {key: deltas_dict[key] for key in selected_deltas}

        for delta_index, delta in enumerate(deltas_dict):
            print(f"Generating Output for Delta {delta_index + 1} of {len(deltas_dict)}")
            llm_output = generate_openai_output(deltas_dict[delta])
            code = general_adapter.extract_python_code(llm_output)
            result_records.append({'prompt': prompt_index, 'delta': delta, 'code': code, 'llm_output': llm_output})
finally:
    df_results = pd.DataFrame(result_records, columns=result_columns)

Save Results to JSON

In [None]:
# Write the collected results as JSON Lines (one record per line); the output
# file name is prefixed with the model name.
results_path = f"{args.model}_codereval-python_delta8_delta9.jsonl"
df_results.to_json(results_path, orient='records', lines=True)