<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
generate code using MTPB's "prompt" instructions, alternating between prompts and code as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [1]:
import pandas as pd
import json
import re

def read_json_line_format(path_to_file):
    """
        Read a JSON Lines format and store it into a dataframe.
    """
    data = []
    with open(path_to_file, 'r') as f:
        for line in f:
            data.append(json.loads(line))

    df = pd.json_normalize(data)
    return df
########################### Unused yet ###############################
def extract_bracket_content(text):
    pattern = "{(.*?)}"
    match = re.search(pattern, text)
    if match is not None:
        return match.group(1)
    else:
        return None
    
def managing_prompts_with_input(prompts, input):
    """
        This function gives an example of the architecture of the input
    """
    # we will simply add an example of the architecture of the first input. eg {input} for example : 'input' = [1,2,3]
    processed_prompts = []
    # Look for the prompt to change
    for prompt in prompts:
        # extract the input key to replace with the for example
        input_key = extract_bracket_content(prompt)
        if input_key is None:
            processed_prompts.append(prompt)
        else:
            added_prompt = '{' + input_key + '}' + f' for example : {input_key} = {input[input_key]} '
            processed_prompt = prompt.replace(input_key,added_prompt)

    return processed_prompt
#######################################################################

def get_keys(input_list):
    """Get the list of unique input keys and list it (comma separated).
    """
    keys = set()
    for d in input_list:
        keys.update(d.keys())
    keys = sorted(list(keys))  # sort keys for consistent output
    return ','.join(keys)

def processing_name(name):
    """Processing the name of the problem to match the syntax of a function
    """
    name = name.lower()  # convert to lowercase
    name = re.sub('[^a-z0-9 ]', '', name)  # remove any non-alphanumeric characters (except spaces)
    name = re.sub(' ', '_', name)  # replace spaces with underscores
    return name

def create_signature_for_function(data):
    """Create the function signature for each problem.
    """
    # initiate a list of signature
    signatures = []
    # loop over all the rows
    for i in range(len(data)):
        # extract the name of the according problem
        name = data.iloc[i]['name']
        # process the name
        name = processing_name(name)
        # get the input
        inputs = data.iloc[i]['inputs']
        # extract the name
        input_keys = get_keys(inputs)
        # create the function signature architecture
        signature = f'def {name}({input_keys}):'
        # adding the signature to the list
        signatures.append(signature)
    
    data['signature'] = signatures
    return data

# Call the functions
mtbp_path = 'data/mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)
mtbp_data = create_signature_for_function(mtbp_data)

In [2]:
from santaC import *

max_token_to_generate = 128
model = MySantaCoder('GrdS', max_token_to_generate)

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *


def get_code_for_prompt(code_text, prompt_index):
    lines = code_text.split('\n')
    prompt_count = 0
    for i, line in enumerate(lines):
        if line.strip().startswith('#'):  # we have a new prompt
            if prompt_count == prompt_index:  # we have reached the desired prompt
                break
            prompt_count += 1
    return '\n'.join(lines[:i+2])  # include the next line after the prompt

def remove_context(code):
    #TODO: remove all lines starting with '\t#'
    lines = code.split('\n')
    # keep only lines that don't start with '\t#'
    lines = [line for line in lines if not line.startswith('\t#')]
    return '\n'.join(lines)


def generating_step_by_step_with_context(model, data, keep_context = True):
    """Generating code step by step 
    """
    # Adding it to 
    for j in range(len(data)):
        # start with the signature for the incoming problem
        code = data.iloc[j]['signature']
        if j > 2 : 
            break
        else:
            # initiate the list of prompt to generate
            prompts = data.iloc[j]['prompts']
            # Iterate over each prompt
            for i, prompt in enumerate(prompts):
                # show what prompts is currently beeing used
                print(f"Prompt {i+1}: {prompt}")
                
                # Add the prompt to the previously generated code
                input_text = code + '\n\t' + '#' + prompt
                
                # Encode the input text
                input_ids = model.tokenizer.encode(input_text, return_tensors='pt')

                # Generate the output
                output_ids = model.forward(input_ids)

                # Decode the output
                output_text = model.decode_output(output_ids[0])

                # Process output
                code = get_code_for_prompt(output_text, i)

                # remove context if set to False
                if keep_context == False:
                    code = remove_context(code)

        print("Final generated code:\n", code)


generating_step_by_step_with_context(model, mtbp_data, False)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 1: Assign the string "{A}" to a variable named "my_string".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 2: Lowercase the given string "my_string".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 3: Assign the distinct characters of the string to a variable named "chars".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 4: Sort these characters in alphabetical order.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 5: Print the resulting list of characters.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Final generated code:
 def sandwich_string(A):
	my_string = "{A}"
	my_string = my_string.lower()
	my_list = my_string.split()
	chars = set(my_list)
	sandwich = "".join(chars)
	return sandwich

#Create a function named "sandwich_string" that takes a string as an argument and returns the same string, but with all the characters in the string in lower case.

Prompt 1: Define a list of integers named "numbers" with the values {numbers}.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 2: Calculate the sum of the elements in variable "numbers" and store the result to variable "total".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 3: Divide each element of the list by the total and multiply by 100, store the result to variable "normalized".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 4: Convert each element in variable "normalized" into a formatted string with single decimal point and store the result into "formatted".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 5: Print the variable "formatted".


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Final generated code:
 def normalize_integer_list(numbers):
	total = sum(numbers)
	normalized_numbers = []
	for number in numbers:
		normalized_numbers.append(number/total*100)
	return normalized_numbers

def normalize_float_list(numbers):
	total = sum(numbers)
	normalized_numbers = []
	for number in numbers:
		normalized_numbers.append(number/total*100)
	return normalized_numbers

def normalize_string_list(numbers):
	total = sum
	formatted = []
	for number in numbers:
		formatted.append(str(number/total*100))
	return formatted

def normalize_list(numbers):
	if type(numbers) == list:
		if type(numbers[0]) == int:
			return normalize_integer_list(numbers)
		elif type(numbers[0]) == float:
			return normalize_float_list(numbers)
		elif type(numbers[0]) == str:
			return normalize_string_list(numbers)
	else:
		return numbers

def normalize_list_of_lists(lists):
	if type(lists) == list:
		if type(lists[0]) == list:
			return normalize_list(lists)
		else:
			return lists
	else:
		return lis

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 2: Write a function that takes an integer hours and converts it to seconds.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Prompt 3: Print the total seconds of {a1} hours and {a2} minutes.
Final generated code:
 def convert_time(a1,a2):
	return a1*3600+a2*60

def convert_time_to_seconds(a1,a2):
	return a1*3600+a2*60

def convert_time_to_minutes(a1,a2):
	return a1*60+a2


In [6]:
trys = 'def function(a):\n\t#Assign the value a to "my_string"\n\tmy_string=a\n\t#print the value ones\n\tprint(my_string)\n\t#print the value twice\n\tprint(my_string, my_string)\n\t#return the string\n\treturn my_string'

print(trys)

def function(a):
	#Assign the value a to "my_string"
	my_string=a
	#print the value ones
	print(my_string)
	#print the value twice
	print(my_string, my_string)
	#return the string
	return my_string


In [None]:
def get_code_for_prompt(code_text, prompt_index):
    lines = code_text.split('\n')
    prompt_count = 0
    for i, line in enumerate(lines):
        if line.strip().startswith('#'):  # we have a new prompt
            if prompt_count == prompt_index:  # we have reached the desired prompt
                break
            prompt_count += 1
    return '\n'.join(lines[:i+2])  # include the next line after the prompt

In [11]:
output = get_code_for_prompt(trys, 3)
print(output)

def function(a):
	#Assign the value a to "my_string"
	my_string=a
	#print the value ones
	print(my_string)
	#print the value twice
	print(my_string, my_string)
	#return the string
	return my_string


In [12]:
indice = 0
code_to_prompt = mtbp_data.iloc[indice]['signature'] + "\n\t" + "#" + mtbp_data.iloc[indice]['prompts'][0]
input_ids = model.tokenizer.encode(code_to_prompt, return_tensors='pt')

# Generate the output
output_ids = model.forward(input_ids)

# Decode the output
output_text = model.decode_output(output_ids[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


In [13]:
print(output_text)

def sandwich_string(A):
	#Assign the string "{A}" to a variable named "my_string".
	my_string = "{A}"
	#Print the string "sandwich" to the screen.
	print(my_string)
	#Return the string "sandwich" to the function.
	return my_string

#Call the function sandwich_string with the value of 10.
sandwich_string(10)

#Assign the string "sandwich" to a variable named "my_string".
my_string = "sandwich"
#Print the string "sandwich" to the screen.
print(my_string)
#


In [18]:
output = get_code_for_prompt(output_text, 1)
output

'def sandwich_string(A):\n\t#Assign the string "{A}" to a variable named "my_string".\n\tmy_string = "{A}"\n\t#Print the string "sandwich" to the screen.\n\tprint(my_string)'