<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
Generate code using MTPB's "prompt" instructions, alternating between prompts and code as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [1]:
import pandas as pd
import json
import re

def read_json_line_format(path_to_file):
    """
        Read a JSON Lines file and store it into a dataframe.

        Each non-blank line is parsed as one JSON value. Blank or
        whitespace-only lines (for example a trailing empty line) are skipped
        instead of raising a decoding error. The file is read as UTF-8 so the
        result does not depend on the platform default encoding.

        Args:
            path_to_file: Path of the JSON Lines file to read.

        Returns:
            The dataframe built by ``pd.json_normalize`` from the parsed
            records (nested objects are flattened into dotted column names).

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path_to_file, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]

    return pd.json_normalize(records)
########################### Not used yet ##############################
def extract_bracket_content(text):
    """
        Return the text found between the first '{' and the nearest
        following '}' in ``text``, or None when no such pair exists.
    """
    # Non-greedy group: stop at the first closing brace after the opening one.
    found = re.search("{(.*?)}", text)
    return found.group(1) if found is not None else None
    
def managing_prompts_with_input(prompts, input):
    """
        Add an example value next to the input placeholder of each prompt.

        For every prompt, the first ``{key}`` placeholder is located with
        ``extract_bracket_content``. When ``key`` is present in ``input``, each
        occurrence of ``{key}`` is replaced by
        ``{key} for example : key = <value> ``. Prompts without a placeholder,
        or whose placeholder key is not in ``input``, are kept unchanged.

        Args:
            prompts: Iterable of prompt strings.
            input: Mapping from placeholder key to an example value,
                e.g. ``{'A': [1, 2, 3]}``.

        Returns:
            A list with one (possibly rewritten) prompt per input prompt, in
            the same order.
    """
    processed_prompts = []
    for prompt in prompts:
        # extract the input key whose placeholder will receive the example
        input_key = extract_bracket_content(prompt)
        if input_key is None or input_key not in input:
            processed_prompts.append(prompt)
            continue

        # Replace the whole '{key}' placeholder rather than the bare key, so
        # other occurrences of the key text inside the prompt stay untouched.
        placeholder = '{' + input_key + '}'
        added_prompt = placeholder + f' for example : {input_key} = {input[input_key]} '
        processed_prompts.append(prompt.replace(placeholder, added_prompt))

    return processed_prompts
#######################################################################

def get_keys(input_list):
    """Collect every distinct key used by the dicts of ``input_list`` and
    return them as one comma-separated string, sorted for stable output.
    """
    unique_keys = {key for mapping in input_list for key in mapping.keys()}
    return ','.join(sorted(unique_keys))

def processing_name(name):
    """Turn a problem name into a function-style identifier: lowercase it,
    drop every character that is not a-z, 0-9 or a space, then replace each
    space with an underscore.
    """
    lowered = name.lower()
    # keep only lowercase letters, digits and spaces
    cleaned = re.sub('[^a-z0-9 ]', '', lowered)
    # every single space becomes one underscore (runs of spaces are not merged)
    return cleaned.replace(' ', '_')

def create_signature_for_function(data):
    """Build one function signature per row and store it in a new column.

    For each row, the 'name' value is normalised with ``processing_name`` and
    the 'inputs' value is reduced to a comma-separated key list with
    ``get_keys``; the two are combined as ``def <name>(<keys>):``.

    The 'signature' column is written onto ``data`` itself, and the same
    object is returned.
    """
    signatures = [
        f"def {processing_name(row['name'])}({get_keys(row['inputs'])}):"
        for _, row in data.iterrows()
    ]

    data['signature'] = signatures
    return data

# Call the functions: read the JSON Lines file into a dataframe, then add a
# 'signature' column holding one function signature per row.
mtbp_path = 'data/mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)
# create_signature_for_function returns the dataframe it was given, now with
# the extra 'signature' column.
mtbp_data = create_signature_for_function(mtbp_data)

In [46]:
from santaC import *

# Passed as the second argument of MySantaCoder; presumably the maximum
# number of tokens generated per call — TODO confirm against santaC.
max_token_to_generate = 128
# MySantaCoder is presumably provided by the `santaC` star import; the meaning
# of 'SmplM' is not visible in this notebook — TODO confirm.
model = MySantaCoder('SmplM', max_token_to_generate)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *


def get_code_for_prompt(code_text, prompt_index, keep_context=True):
    """Cut ``code_text`` right after the prompt number ``prompt_index``.

    A prompt is any line whose stripped text starts with '#'. Prompts are
    counted from 0 in order of appearance. The returned text ends one line
    after the target prompt line (the prompt plus the single line that
    follows it). If the target prompt is never reached, the text runs to the
    last line.

    With ``keep_context=True`` the result starts at the first line. With
    ``keep_context=False`` it starts at the last prompt line counted before
    the target (or at the first line when none was counted).
    """
    rows = code_text.split('\n')
    seen = 0
    first = 0
    pos = 0
    for pos, row in enumerate(rows):
        if not row.strip().startswith('#'):
            continue  # ordinary code line, not a prompt
        if seen == prompt_index:
            break  # target prompt found: `pos` is its line index
        seen += 1
        if not keep_context:
            first = pos
    # `pos + 2` keeps the prompt line and the line right after it
    return '\n'.join(rows[first:pos + 2])

def get_code_for_prompt_old(code_text, prompt_index):
    """Return ``code_text`` from its first line up to one line past the
    prompt number ``prompt_index``.

    A prompt is any line whose stripped text starts with '#'; prompts are
    counted from 0. If the target prompt is never reached, the whole text is
    returned.
    """
    rows = code_text.split('\n')
    seen = 0
    pos = 0
    for pos, row in enumerate(rows):
        is_prompt = row.strip().startswith('#')
        if is_prompt and seen == prompt_index:
            break  # `pos` now points at the target prompt line
        if is_prompt:
            seen += 1
    # keep the prompt line and the single line that follows it
    return '\n'.join(rows[:pos + 2])

def remove_context(code):
    """Drop every line of ``code`` that begins with a tab immediately
    followed by '#', and return the remaining lines joined by newlines.
    """
    kept = []
    for row in code.split('\n'):
        if row.startswith('\t#'):
            continue  # prompt comment at one-tab indentation: discard
        kept.append(row)
    return '\n'.join(kept)


def generating_step_by_step_with_context(model, data, keep_context=True, max_problems=3):
    """Generate code step by step, one prompt at a time, for each problem.

    For every processed row the running code starts as the row's 'signature'.
    Each prompt of the row is appended to that code as a tab-indented '#'
    comment, the text is encoded with ``model.tokenizer.encode``, passed to
    ``model.forward`` and decoded with ``model.decode_output``. The decoded
    text is then cut with ``get_code_for_prompt_old`` and becomes the running
    code for the next prompt.

    Args:
        model: Object exposing ``tokenizer.encode``, ``forward`` and
            ``decode_output`` (presumably a MySantaCoder instance — confirm).
        data: Dataframe with 'signature' and 'prompts' columns.
        keep_context: When falsy, prompt comments are stripped from the final
            code of each problem with ``remove_context``.
        max_problems: Number of leading rows to process. The default of 3
            keeps the previous hard-coded cut-off; pass None to process
            every row.

    Returns:
        A list with the final generated code of each processed problem, in
        row order.
    """
    codes = []
    n_problems = len(data) if max_problems is None else min(len(data), max_problems)
    for j in range(n_problems):
        row = data.iloc[j]
        # start with the signature for the incoming problem
        code = row['signature']
        for i, prompt in enumerate(row['prompts']):
            # show which prompt is currently being used
            print(f"Prompt {i+1}: {prompt}")

            # Add the prompt to the previously generated code
            input_text = code + '\n\t' + '#' + prompt

            # Encode the input text
            input_ids = model.tokenizer.encode(input_text, return_tensors='pt')

            # Generate the output
            output_ids = model.forward(input_ids)

            # Decode the output
            output_text = model.decode_output(output_ids[0])

            # keep the decoded text only up to one line past prompt number i
            code = get_code_for_prompt_old(output_text, i)

        if not keep_context:
            # remove the prompt comments if context is not wanted
            code = remove_context(code)

        codes.append(code)

    # The list used to be built and then discarded; hand it back to the caller.
    return codes


# Run the step-by-step generation on the loaded data, keeping the prompt
# comments in the generated code (keep_context=True).
generating_step_by_step_with_context(model, mtbp_data, True)

In [16]:
# Hand-written sample: every statement is preceded by a '#' prompt comment.
trys = 'def function(a):\n\t#Assign the value a to "my_string"\n\tmy_string=a\n\t#print the value ones\n\tprint(my_string)\n\t#print the value twice\n\tprint(my_string, my_string)\n\t#return the string\n\treturn my_string'
# Same sample with the first three comments removed; only '#return the string' remains.
trys_wocon = 'def function(a):\n\tmy_string=a\n\tprint(my_string)\n\tprint(my_string, my_string)\n\t#return the string\n\treturn my_string'
print(trys_wocon)

def function(a):
	my_string=a
	print(my_string)
	print(my_string, my_string)
	#return the string
	return my_string


In [4]:
def get_code_for_prompt(code_text, prompt_index, keep_context=True):
    """Cut ``code_text`` right after the prompt number ``prompt_index``.

    A prompt is any line whose stripped text starts with '#'. Prompts are
    counted from 0 in order of appearance. The returned text ends one line
    after the target prompt line (the prompt plus the single line that
    follows it). If the target prompt is never reached, the text runs to the
    last line.

    With ``keep_context=True`` the result starts at the first line. With
    ``keep_context=False`` it starts at the last prompt line counted before
    the target (or at the first line when none was counted).
    """
    rows = code_text.split('\n')
    seen = 0
    first = 0
    pos = 0
    for pos, row in enumerate(rows):
        if not row.strip().startswith('#'):
            continue  # ordinary code line, not a prompt
        if seen == prompt_index:
            break  # target prompt found: `pos` is its line index
        seen += 1
        if not keep_context:
            first = pos
    # `pos + 2` keeps the prompt line and the line right after it
    return '\n'.join(rows[first:pos + 2])

In [5]:
def get_code_for_prompt_old(code_text, prompt_index):
    """Return ``code_text`` from its first line up to one line past the
    prompt number ``prompt_index``.

    A prompt is any line whose stripped text starts with '#'; prompts are
    counted from 0. If the target prompt is never reached, the whole text is
    returned.
    """
    rows = code_text.split('\n')
    seen = 0
    pos = 0
    for pos, row in enumerate(rows):
        is_prompt = row.strip().startswith('#')
        if is_prompt and seen == prompt_index:
            break  # `pos` now points at the target prompt line
        if is_prompt:
            seen += 1
    # keep the prompt line and the single line that follows it
    return '\n'.join(rows[:pos + 2])

In [22]:
# trys_wocon contains a single '#' line ('#return the string'), so prompt 0 is
# that line and the cut keeps everything up to the line after it.
output = get_code_for_prompt_old(trys_wocon, 0)
print(output)

def function(a):
	my_string=a
	print(my_string)
	print(my_string, my_string)
	#return the string
	return my_string


In [35]:
# Inspect how the tokenizer encodes a lone '#', the marker placed in front of
# every prompt when building the model input.
input_ids = model.tokenizer.encode('#', return_tensors='pt')
print(input_ids)

tensor([[2]])


In [51]:
# Display the first row of the dataframe, including the added 'signature' field.
mtbp_data.iloc[0]

prompts           [Assign the string "{A}" to a variable named "...
inputs            [{'A': 'abcde'}, {'A': 'abcdecadeCADE'}, {'A':...
outputs           [[a, b, c, d, e], [a, b, c, d, e], [a], [ , e,...
max_gen_length                                                128.0
category                                                     string
name                                                Sandwich string
description        Append a string in the middle of another string.
id                                                                1
signature                                   def sandwich_string(A):
Name: 0, dtype: object

In [49]:
# Row of mtbp_data used for this single-prompt experiment.
indice = 1
# Model input: the row's signature followed by its first prompt written as a
# tab-indented '#' comment.
code_to_prompt = mtbp_data.iloc[indice]['signature'] + "\n\t" + '#' + mtbp_data.iloc[indice]['prompts'][0]
input_ids = model.tokenizer.encode(code_to_prompt, return_tensors='pt')

# Generate the output
output_ids = model.forward(input_ids)

# Decode the output (only the first element of output_ids is decoded)
output_text = model.decode_output(output_ids[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


In [50]:
# Show the full decoded text, before any cutting with get_code_for_prompt.
print(output_text)

def normalize_integer_list(numbers):
	#Define a list of integers named "numbers" with the values {numbers}.
	return [int(i) for i in numbers]

def is_int(number):
	#Check whether the input "number" is an integer.
	return isinstance(number, int)

def is_float(number):
	#Check whether the input "number" is a float.
	return isinstance(number, float)

def is_string(string):
	#Check whether the input "string" is a string.
	return isinstance(string, str)

def is_tuple(t):
	#Check whether the input "t"


In [18]:
# Cut the decoded text one line after its second '#' comment (prompt index 1),
# keeping everything from the first line (keep_context defaults to True).
# NOTE(review): this cell's execution counter (18) is lower than that of the
# cell that assigns output_text above (49), so the displayed value presumably
# comes from an earlier output_text — confirm by re-running.
output = get_code_for_prompt(output_text, 1)
output

'def sandwich_string(A):\n\t#Assign the string "{A}" to a variable named "my_string".\n\tmy_string = "{A}"\n\t#Print the string "sandwich" to the screen.\n\tprint(my_string)'