<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
Generate code from MTPB's "prompt" instructions one step at a time: each prompt is given to the model together with all the code generated so far, as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [1]:
import pandas as pd
import json

def read_json_line_format(path_to_file):
    """
        Read a JSON Lines file and store it into a dataframe.

        Each non-blank line of the file is parsed as one JSON document.
        Nested objects are flattened by ``pd.json_normalize`` (dotted
        column names).

        path_to_file: path of the JSON Lines file to read.
        Returns a pandas DataFrame with one row per parsed line.
        Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform default
    with open(path_to_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no record and would make json.loads fail, so they are skipped
        data = [json.loads(line) for line in f if line.strip()]

    return pd.json_normalize(data)

# Path of the MTPB JSON Lines file, and the dataframe loaded from it
mtbp_path = 'data/mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)

In [None]:
def get_keys(input_list):
    """
        Collect every key used by the dicts of ``input_list`` and return
        them, without duplicates, as one comma-separated string.
    """
    unique_keys = {key for entry in input_list for key in entry.keys()}
    # alphabetical order keeps the result stable from one call to the next
    ordered_keys = list(unique_keys)
    ordered_keys.sort()
    return ','.join(ordered_keys)

def processing_name(name):
    """
        Turn a problem name into a valid, lowercase Python function name.

        Every run of characters other than ASCII letters and digits is
        replaced by a single underscore, e.g. 'Two-sum' -> 'two_sum'.
        A result that starts with a digit or collides with a Python keyword
        is prefixed with 'f_' so that it can follow ``def``.

        name: the problem name (converted with ``str`` before processing).
        Returns the identifier, or None when the name contains no usable
        character.
    """
    import keyword
    import re

    identifier = re.sub(r'[^0-9a-zA-Z]+', '_', str(name)).strip('_').lower()
    if not identifier:
        return None
    if identifier[0].isdigit() or keyword.iskeyword(identifier):
        identifier = 'f_' + identifier
    return identifier

def create_signature_for_function(data):
    """
        Build one function signature line per row of ``data``.

        For every row, the function name comes from ``processing_name``
        applied to the 'name' column and the parameters come from
        ``get_keys`` applied to the 'inputs' column, giving a string of the
        form 'def <name>(<key1>,<key2>):'.

        data: dataframe with a 'name' and an 'inputs' column.
        Returns the list of signature strings, in row order.
    """
    signatures = []
    # walk the two needed columns in lockstep instead of indexing row by row
    for raw_name, inputs in zip(data['name'], data['inputs']):
        name = processing_name(raw_name)
        if name is None:
            # processing_name could not provide a name: keep the raw one
            # rather than emitting the literal text 'None'
            name = raw_name
        # comma-separated, sorted list of the input keys
        arguments = get_keys(inputs)
        signatures.append(f'def {name}({arguments}):')

    return signatures

In [43]:
mtbp_data.iloc[6]['inputs']

[{'a1': 3, 'a2': 5, 'a3': 0, 'a4': 4},
 {'a1': 5, 'a2': 3, 'a3': 0, 'a4': 9},
 {'a1': 9, 'a2': 3, 'a3': 0, 'a4': 2},
 {'a1': 2, 'a2': 4, 'a3': 0, 'a4': 7},
 {'a1': 2, 'a2': 4, 'a3': 4, 'a4': 7}]

In [37]:
mtbp_data.iloc[18]

prompts           [Initialize a list of integers with {a1} and a...
inputs            [{'a1': '[0,1,2,3]', 'a2': '4'}, {'a1': '[1, 1...
outputs                    [[1, 3], [1, 2], [0, 3], [2, 3], [2, 3]]
max_gen_length                                                128.0
category                                                  algorithm
name                                                        Two-sum
description       Implement the two-sum problem on a given input...
id                                                               19
Name: 18, dtype: object

In [34]:
mtbp_data.iloc[14]['prompts']

["Create a function encrypt that takes a string as an argument and returns a string encrypted with the alphabet being rotated. The alphabet should be rotated in a manner such that the letters shift down by two places. For example: encrypt('hi') returns 'jk', encrypt('asdfghjkl') returns 'cufhijlmn', encrypt('gf') returns 'ih'.",
 'Create a function decrypt that decodes the encrypted string from encrypt() back into the original text.',
 'Assign "{a1}" to a variable named "original_text".',
 "Call the function encrypt with original_text as argument and assign the result to a variable named 'encrypted_text'.",
 "Call the function decrypt with encrypted_text as argument and assign the result to a variable named 'restored_text'.",
 'Create a list named "my_result" containing restored_text and encrypted_text as elements.',
 'Print the list.']

In [11]:
import re

def extract_bracket_content(text):
    """
        Return the text found between the first '{' ... '}' pair of
        ``text`` (shortest possible content), or None when there is no
        such pair.
    """
    found = re.search("{(.*?)}", text)
    return None if found is None else found.group(1)
    
def managing_prompts_with_input(prompts, input):
    """
        Add an example value next to every ``{key}`` placeholder of the prompts.

        Each placeholder whose key is present in ``input`` is rewritten as
        '{key} for example : key = <value> '. Prompts without placeholder,
        and placeholders whose key is missing from ``input``, are kept as is.

        prompts: list of prompt strings.
        input: dict mapping placeholder keys to their example value.
        Returns the list of processed prompts, in the same order as ``prompts``.
    """
    def add_example(match):
        # Called once per '{...}' occurrence found in a prompt
        input_key = match.group(1)
        if input_key not in input:
            # no example value available: leave the placeholder untouched
            return match.group(0)
        return '{' + input_key + '}' + f' for example : {input_key} = {input[input_key]} '

    # Substitute on the whole '{key}' placeholder, not on the bare key text:
    # replacing the bare key would also rewrite unrelated occurrences
    # (e.g. the 'A' of 'Assign' for the key 'A')
    return [re.sub(r"\{(.*?)\}", add_example, prompt) for prompt in prompts]

prompts = mtbp_data.iloc[0]['prompts']

In [12]:
prompts

['Assign the string "{A}" to a variable named "my_string".',
 'Lowercase the given string "my_string".',
 'Assign the distinct characters of the string to a variable named "chars".',
 'Sort these characters in alphabetical order.',
 'Print the resulting list of characters.']

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *


def generating_step_by_step_with_context(model, data, max_problems=3):
    """
        Generate code prompt by prompt, feeding the code generated so far
        back to the model as context.

        For every processed problem the code starts empty; each prompt is
        appended to it as a '#' comment line, the model completes the text,
        and the completion is added to the code used for the next prompt.
        The prompts and the final code of each problem are printed.

        model: object providing ``tokenizer.encode``, ``forward`` and
            ``decode_output`` as they are called below.
        data: dataframe with a 'prompts' column holding, for each problem,
            the list of its prompts.
        max_problems: number of leading rows of ``data`` to process
            (None processes every row).
        Returns the list of generated code strings, one per processed problem.
    """
    prompt_lists = data['prompts']
    if max_problems is not None:
        prompt_lists = prompt_lists.iloc[:max_problems]

    generated_codes = []
    for prompts in prompt_lists:
        # Problems are independent: start each one with an empty piece of
        # code so that a previous problem does not leak into the context
        code = ''

        for i, prompt in enumerate(prompts):
            # show which prompt is currently being used
            print(f"Prompt {i+1}: {prompt}")

            # Add the prompt, as a comment, to the previously generated code
            input_text = code + '\n' + '#' + prompt

            # Encode the input text
            input_ids = model.tokenizer.encode(input_text, return_tensors='pt')

            # Generate the output
            output_ids = model.forward(input_ids)

            # Decode the output
            output_text = model.decode_output(output_ids[0])

            # Extract the newly generated code
            # NOTE(review): assumes the decoded text starts with input_text
            # verbatim - confirm against the model wrapper
            new_code = output_text[len(input_text):].strip()

            # Append the newly generated code to the existing code
            code += '\n' + new_code

        print("Final generated code:\n", code)
        generated_codes.append(code)

    return generated_codes

model = MySantaCoder('GrdS', 128)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [None]:
generating_step_by_step_with_context(model, mtbp_data)

In [None]:
model = MySantaCoder('GrdS', 128)

In [32]:
# Build a single input made of a function signature followed by the
# instruction written as an indented comment, then run it through the model
generated_code_signature = 'def assign(A):\n' 
example_of_prompt = '\t# Assign the string "{A}" to a variable named "my_string".'
code_to_prompt = generated_code_signature + example_of_prompt
# Encode the combined text with the model's tokenizer
input_ids = model.tokenizer.encode(code_to_prompt, return_tensors='pt')

# Generate the output
output_ids = model.forward(input_ids)

# Decode the output (first element of what forward returned)
output_text = model.decode_output(output_ids[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.




In [33]:
print(output_text)

def assign(A):
	# Assign the string "{A}" to a variable named "my_string".
	my_string = A
	# Print the value of my_string.
	print(my_string)

# Call the function assign.
assign("Hello")

# Print the value of my_string.
print(my_string)

# Print the value of my_string.
print(my_string)

# Print the value of my_string.
print(my_string)

# Print the value of my_string.

