<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
generate code using MTPB's "prompt" instructions, alternating between prompts and code as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [2]:
import pandas as pd
import json
import re

def read_json_line_format(path_to_file):
    """
        Read a JSON Lines format and store it into a dataframe.
    """
    data = []
    with open(path_to_file, 'r') as f:
        for line in f:
            data.append(json.loads(line))

    df = pd.json_normalize(data)
    return df
########################### Unused yet ###############################
def extract_bracket_content(text):
    pattern = "{(.*?)}"
    match = re.search(pattern, text)
    if match is not None:
        return match.group(1)
    else:
        return None
    
def managing_prompts_with_input(prompts, input):
    """
        This function gives an example of the architecture of the input
    """
    # we will simply add an example of the architecture of the first input. eg {input} for example : 'input' = [1,2,3]
    processed_prompts = []
    # Look for the prompt to change
    for prompt in prompts:
        # extract the input key to replace with the for example
        input_key = extract_bracket_content(prompt)
        if input_key is None:
            processed_prompts.append(prompt)
        else:
            added_prompt = '{' + input_key + '}' + f' for example : {input_key} = {input[input_key]} '
            processed_prompt = prompt.replace(input_key,added_prompt)

    return processed_prompt
#######################################################################

def get_keys(input_list):
    """Get the list of unique input keys and list it (comma separated).
    """
    keys = set()
    for d in input_list:
        keys.update(d.keys())
    keys = sorted(list(keys))  # sort keys for consistent output
    return ','.join(keys)

def processing_name(name):
    """Processing the name of the problem to match the syntax of a function
    """
    name = name.lower()  # convert to lowercase
    name = re.sub('[^a-z0-9 ]', '', name)  # remove any non-alphanumeric characters (except spaces)
    name = re.sub(' ', '_', name)  # replace spaces with underscores
    return name

def create_signature_for_function(data):
    """Create the function signature for each problem.
    """
    # initiate a list of signature
    signatures = []
    # loop over all the rows
    for i in range(len(data)):
        # extract the name of the according problem
        name = data.iloc[i]['name']
        # process the name
        name = processing_name(name)
        # get the input
        inputs = data.iloc[i]['inputs']
        # extract the name
        input_keys = get_keys(inputs)
        # create the function signature architecture
        signature = f'def {name}({input_keys}):'
        # adding the signature to the list
        signatures.append(signature)
    
    data['signature'] = signatures
    return data


In [3]:
from santaC import *

max_token_to_generate = 128
model = MySantaCoder('SmplM', max_token_to_generate)



  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [68]:
# Load the data
mtbp_path = 'data/mtpb.jsonl'
converted_mtbp_data = 'data/converted_mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)
converted_mtbp_data = read_json_line_format(converted_mtbp_data)

# mtbp_data = create_signature_for_function(mtbp_data)
mtbp_data['signature'] = converted_mtbp_data['signature']

In [73]:
mtbp_data.iloc[0]['prompts']

['Assign the string "{A}" to a variable named "my_string".',
 'Lowercase the given string "my_string".',
 'Assign the distinct characters of the string to a variable named "chars".',
 'Sort these characters in alphabetical order.',
 'Print the resulting list of characters.']

In [63]:
def keep_code_until_after_first_comment(code):
    # Split the code into lines
    code_lines = code.split('\n')
    # Initialize a list to store the output lines
    output_lines = []
    # Initialize a variable to keep track if a comment has been found
    comment_found = False
    # Loop over the lines
    for line in code_lines:
        # Add the line to the output
        output_lines.append(line)
        # If the line starts with a tab followed by a hash and a comment hasn't been found before,
        # mark that a comment has been found
        if line.startswith('\t#') and not comment_found:
            comment_found = True
        # If a comment has been found and the current line does not start with a comment, stop adding lines to the output
        elif comment_found and not line.startswith('\t#'):
            # need to make a function that keep the structure
            break
    # Join the output lines back together with newlines and return the result
    return '\n'.join(output_lines)

def get_code_for_prompt_old(code_text, prompt_index):
    lines = code_text.split('\n')
    prompt_count = 0
    for i, line in enumerate(lines):
        if line.strip().startswith('#'):  # we have a new prompt
            if prompt_count == prompt_index:  # we have reached the desired prompt
                break
            prompt_count += 1
    return '\n'.join(lines[:i+2])  # include the next line after the prompt

def remove_context(code):
    """remove all lines starting with '\t#' 
    """
    lines = code.split('\n')
    # keep only lines that don't start with '\t#'
    lines = [line for line in lines if not line.startswith('\t#')]
    return '\n'.join(lines)

# we want to build a function that : 
#     - stop generates after the instruction is full filled after the last prompt :
#         * only line after is kept ( current naive approach )
#         * the rest is kept if the first line contains : 
#             - an 'if' statement is found : keep all the if structure until new indentation
#                 ex :  \n\tif (...):\n\t\t#code\n\t#after {all the 'after' needs to be deleted}
#             - a 'def' is found : keep all the def function until the 'return' associated with this function
#             - a 'for' loop is found : keep all the structure of the 'for' loop ( the same way as the 'if' statement works )
#             - a 'while' loop is found : keep all the structure of the 'while' loop ( the same way as the 'if and the 'for')
#     - when find a module 'import' place it before the first function signature it finds ( might need to be tricky )
#         - either doing afterward meaning once everything has been generated
#         - doing it before the generation starts

stop_words = ['def', 'if', 'for', 'while']

def check_if_start_with(stop_words, line):
    # check if a string begins with some stop words
    for word in stop_words:
        if line.strip().startswith(word) and count_indentation(line) == 1:
            return True   
    return False

def count_indentation(line):
    # count the number of indentation in a line
    count = 0
    for char in line:
        if char == '\t':
            count += 1
        else:
            break
    return count

def identify_what_step_instruction(lines, index_prompt, keep_context):
    """
        function that spot the line at wich the current step is being considered ( the index of the last prompt )

        Input:
            lines
        Output:
            index of the line where the generation starts
    """
    # initialise the promp cursor
    prompt_count = 0
    # initialise the index at which the generation has started
    index_to_start = 0
    # iterate through the lines
    for i,line in enumerate(lines):
        # if the context as been kept (we count the number of comments generated)
        if keep_context:
            # we need to know according to what prompt we are trying to generate code
            if line.strip().startswith('#') and count_indentation(line) == 1:  # we have a new prompt
                if prompt_count == index_prompt:  # we have reached the desired prompt
                    index_to_start = i
                prompt_count += 1
        else:
            # the first prompt encountered is the actual step
            if line.strip().startswith('#'):
                index_to_start = i
                   
    return index_to_start

def identify_chunks_of_code(list_lines, stop_words):
    """
        Truncate the chunk of code that is to be generated with the current step

        Input:
            list_lines: the list of lines after the current step
    """
    # initialise the level of indentation of the current chunks of code and the number of line within it
    indentation_reference = 0
    nb_line_of_code = 1
    # initialise the chunk of codes that needs to be saved
    chunk_of_code = []
    try:
        # first line is the line right after the last
        first_line = list_lines[0]
        
        if check_if_start_with(stop_words, first_line):
            indentation_reference = count_indentation(first_line) + 1 # number of '\t' found in the line ( needs to count it )
            # keep track of the structure and check the indentation level
            for _, step_in in enumerate(list_lines[1:]): 
                # check the indentation level and as soon as line looses their indentation cut off
                indentation_level = count_indentation(step_in)
                if indentation_level >= indentation_reference:
                    nb_line_of_code+=1
                else:
                    break
            # append all of the lines
            for i in range(nb_line_of_code):
                chunk_of_code.append(list_lines[i])         
        else:
            chunk_of_code.append(list_lines[0])
    except NameError:
        print(f'{NameError} for this one...')

    return chunk_of_code

def generation_cut_off(gen_code, stop_words, keep_context = False, index_prompt = None):
    """
        A function that cut off the code which let only the instructed code that was generated

        Input:
            gen_code: a text of generated code
            stop_words: a list of stop words
        Output:
            processed_gen_code: the processed code
    """
    
    # First we have to split the code as a list of lines
    lines = gen_code.split('\n')

    # we identify the last prompts we are interested in. 
    index_last_prompt = identify_what_step_instruction(lines, index_prompt, keep_context)

    # we then keep only the code generated to come
    begining = index_last_prompt + 1
    steps = lines[begining:]
    
    # we extract the chunks of codes that is generated out for the last prompt
    chunks_of_code = identify_chunks_of_code(steps, stop_words)

    # we need to keep the code previously generated and the last piece of code that has been generated
    codes = lines[:index_last_prompt] + chunks_of_code
            
    return '\n'.join(codes)


In [67]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *

def generating_step_by_step(model, data, stop_words, keep_context = True, early_stopping = None):
    """Generating code step by step 
    """
    codes = []
    for j in range(len(data)):
        if early_stopping is not None and j > early_stopping:
            break
        # start with the signature for the incoming problem
        code = data.iloc[j]['signature']
        # initiate the list of prompt to generate
        prompts = data.iloc[j]['prompts']
        # Iterate over each prompt
        for i, prompt in enumerate(prompts):
            # Add the prompt to the previously generated code
            input_text = code + '\n\t' + '#' + prompt
            
            # Encode the input text
            input_ids = model.tokenizer.encode(input_text, return_tensors='pt')

            # Generate the output
            output_ids = model.forward(input_ids)

            # Decode the output
            output_text = model.decode_output(output_ids[0])

            code = generation_cut_off(output_text, stop_words, keep_context, i)

            # keep only the last code generated after the output
            if keep_context==False:
                # remove context if set to False
                code = remove_context(code)

        # print("Final generated code:\n", code)
        codes.append(code)

    return codes
    


codes = generating_step_by_step(model=model, data=mtbp_data, stop_words=stop_words, keep_context = True, early_stopping=2)
#mtbp_data['gen_code'] = codes
#mtbp_data.to_csv('data/step_by_step/structural_mtbp_with_context_Sampling.csv')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generati

Testing the generation cut off function to see if it works properly:


In [1]:
test1 = 'def count_zeros(ars):\n\t# Initialize counter to zero\n\tcounter = 0\n\t# Loop over each element in the list\n\tfor i in ars:\n\t\t# Increment counter if the element is zero\n\t\tif i == 0:\n\t\t\tcounter += 1\n\t# Return the count of zeros\n\treturn counter'
test2 = 'def first_element(ars):\n\t# Assign the first element of the list to a variable\n\tfirst = ars[0]\n\t# Return the first element\n\treturn first'
test3 = 'def sum_while(ars):\n\t# Initialize variables\n\ttotal = 0\n\ti = 0\n\t# While loop until reaching the end of the list\n\twhile i < len(ars):\n\t\t# Add current value to total\n\t\ttotal += ars[i]\n\t\t# Increment the index\n\t\ti += 1\n\t# Return the total sum\n\treturn total'
test4 = 'def filter_negatives(ars):\n\t# Initialize an empty list for positive numbers\n\tpositives = []\n\t# Loop over each element in the list\n\tfor num in ars:\n\t\t# Add number to the list if it\'s non-negative\n\t\tif num >= 0:\n\t\t\tpositives.append(num)\n\t# Return the list of non-negative numbers\n\treturn positives'
test5 = 'def flatten_list(ars):\n\t# Initialize an empty list for the result\n\tflattened = []\n\t# Loop over each element in the list\n\tfor sublist in ars:\n\t\t# Loop over each element in the sublist\n\t\tfor item in sublist:\n\t\t\t# Add item to the flattened list\n\t\t\tflattened.append(item)\n\t# Return the flattened list\n\treturn flattened'


In [60]:
codes = generation_cut_off(test5, stop_words, True, 2)

In [61]:
print(test5)
print('=============')
print(codes)
print('==========')

def flatten_list(ars):
	# Initialize an empty list for the result
	flattened = []
	# Loop over each element in the list
	for sublist in ars:
		# Loop over each element in the sublist
		for item in sublist:
			# Add item to the flattened list
			flattened.append(item)
	# Return the flattened list
	return flattened
def flatten_list(ars):
	# Initialize an empty list for the result
	flattened = []
	# Loop over each element in the list
	for sublist in ars:
		# Loop over each element in the sublist
		for item in sublist:
			# Add item to the flattened list
			flattened.append(item)
	return flattened


In [5]:
import pandas as pd
mtbp_without = pd.read_csv('data/step_by_step/line_by_line_without_context_Sampling.csv')
mtbp_with = pd.read_csv('data/step_by_step/line_by_line_with_context_Sampling.csv')
coverted_data = read_json_line_format('data/converted_mtpb.jsonl')

In [13]:
coverted_data.iloc[10]['code']

'def invert_dict(a1,a2,a3):\r\npass'

In [16]:
trys = 'def function(a):\n\t#Assign the value a to "my_string"\n\tmy_string=a\n\t#print the value ones\n\tprint(my_string)\n\t#print the value twice\n\tprint(my_string, my_string)\n\t#return the string\n\treturn my_string'
trys_wocon = 'def function(a):\n\tmy_string=a\n\tprint(my_string)\n\tprint(my_string, my_string)\n\t#return the string\n\treturn my_string'
print(trys_wocon)

def function(a):
	my_string=a
	print(my_string)
	print(my_string, my_string)
	#return the string
	return my_string


In [None]:
# def generation_cut_off(gen_code, stop_words, keep_context = False, index_prompt = None):
#     """
#         A function that cut off the code which let only the instructed code that was generated

#         Input:
#             gen_code: a text of generated code
#             stop_words: a list of stop words
#         Output:
#             processed_gen_code: the processed code
#     """
#     codes = []
#     begining = 0
#     chunks_of_code = 0
#     prompt_count = 0
#     # First we have to split the code as a list of lines
#     lines = gen_code.split('\n')
#     # output_lines = []
#     # We go through each line except for the first one which is 'def my_signature_function()'
#     for i,line in enumerate(lines):
#         # if the context as been kept 
#         if keep_context:
#             # we need to know according to what prompt we are trying to generate code
#             if line.strip().startswith('#'):  # we have a new prompt
#                 if prompt_count == index_prompt:  # we have reached the desired prompt
#                     # we need to keep the step that is generated
#                     # the first line of code after the last step to generate
#                     begining = i+1
#                     steps = lines[begining:]
#                     # if it begins with any stopwords
#                     if check_if_start_with(stop_words, steps[0]):
#                         indentation_reference = count_indentation(steps[0]) + 1 # number of '\t' found in the line ( needs to count it )
#                         for k, step_in in enumerate(steps[1:]): 
#                             # check the indentation level and as soon as line looses their indentation cut off
#                             indentation_level = count_indentation(step_in)
#                             if indentation_level == indentation_reference:
#                                 chunks_of_code+=1
#                             else:
#                                 break                                     
#                 prompt_count += 1
#         else:
#             # the first prompt encountered is the actual step
            
#             # then follow up with the structure we need to keep
            
#     return None

In [None]:
stop_words = ['def', 'if', 'for', 'while']

def check_if_start_with(stop_words, line):
    # check if a string begins with some stop words
    for word in stop_words:
        if line.strip().startwith(word):
            return True   
    return False

def count_indentation(line):
    # count the number of indentation in a line
    count = 0
    for char in line:
        if char == '\t':
            count += 1
        else:
            break
    return count

def identify_what_step_instruction(lines, index_prompt, keep_context):
    """
        function that spot the line at wich the current step is being considered ( the index of the last prompt )

        Input:
            lines
        Output:
            index of the line where the generation starts
    """
    # initialise the promp cursor
    prompt_count = 0
    # initialise the index at which the generation has started
    index_to_start = 0
    # iterate through the lines
    for i,line in enumerate(lines):
        # if the context as been kept (we count the number of comments generated)
        if keep_context:
            # we need to know according to what prompt we are trying to generate code
            if line.strip().startswith('#'):  # we have a new prompt
                if prompt_count == index_prompt:  # we have reached the desired prompt
                    index_to_start = i
                prompt_count += 1
        else:
            # the first prompt encountered is the actual step
            if line.strip().startswith('#'):
                index_to_start = i
                   
    return index_to_start

def identify_chunks_of_code(list_lines, stop_words):
    """
        Truncate the chunk of code that is to be generated with the current step

        Input:
            list_lines: the list of lines after the current step
    """
    # initialise the level of indentation of the current chunks of code and the number of line within it
    indentation_reference = 0
    nb_line_of_code = 1
    # initialise the chunk of codes that needs to be saved
    chunk_of_code = []

    if check_if_start_with(stop_words, list_lines[0]):
        indentation_reference = count_indentation(list_lines[0]) + 1 # number of '\t' found in the line ( needs to count it )
        # keep track of the structure and check the indentation level
        for _, step_in in enumerate(list_lines[1:]): 
            # check the indentation level and as soon as line looses their indentation cut off
            indentation_level = count_indentation(step_in)
            if indentation_level == indentation_reference:
                nb_line_of_code+=1
            else:
                break

        for i in range(nb_line_of_code):
            chunk_of_code.append(list_lines[i])         
    else:
        chunk_of_code.append(list_lines[0])

    return chunk_of_code

def generation_cut_off(gen_code, stop_words, keep_context = False, index_prompt = None):
    """
        A function that cut off the code which let only the instructed code that was generated

        Input:
            gen_code: a text of generated code
            stop_words: a list of stop words
        Output:
            processed_gen_code: the processed code
    """
    
    # First we have to split the code as a list of lines
    lines = gen_code.split('\n')

    # we identify the last prompts we are interested in. 
    index_last_prompt = identify_what_step_instruction(lines, index_prompt, keep_context)

    # we then keep only the code generated to come
    begining = index_last_prompt+1
    steps = lines[begining:]

    # we extract the chunks of codes that is generated out for the last prompt
    chunks_of_code = identify_chunks_of_code(steps, stop_words)

    # we need to keep the code previously generated and the last piece of code that has been generated
    codes = lines[:index_last_prompt] + chunks_of_code
            
    return '\n'.join(codes)

In [86]:
from santaC import *

model = MySantaCoder(generation_method = 'SmplM', num_sol=10)

data = pd.read_csv('data/mbpp_test.csv')

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [28]:
line = ['\tfor i in range(k):', '\tvalue+=i']

In [32]:
line[0].strip().startswith('for')

True

<h1>Exploration of the outputs<h1>

In [1]:
import pandas as pd

lbl_c_mtbp = pd.read_csv("data/step_by_step/line_by_line_with_context_Sampling.csv")

In [4]:
def calculate_comment_ratio(func_code):
    # Split the code into lines
    lines = func_code.strip().split('\n')
    # Initialize counters
    code_lines = 0
    comment_lines = 0
    # Iterate through lines
    for line in lines:
        stripped_line = line.strip()
        # Ignore empty lines
        if stripped_line == "":
            continue
        
        # If the line starts with '#', it is a comment line
        if stripped_line[0] == '#':
            comment_lines += 1
        else:
            # Otherwise, it is a line of code
            code_lines += 1

    # Check if there are no lines of code to prevent division by zero
    if code_lines == 0:
        return "No lines of code present"
    
    # Calculate and return the ratio
    return comment_lines / code_lines

def comment_ratio(df):
    #Initialize counter
    ratio = 0
    for i in range(len(df)):
        ratio+=calculate_comment_ratio(df.iloc[i]['gen_code'])
    return ratio/len(df)



In [5]:
ratio = comment_ratio(lbl_c_mtbp)

In [12]:
print(lbl_c_mtbp.iloc[4]['prompts'])

['Assign the list of numbers "{A}" to a variable named "my_numbers".', 'Count the number of negative numbers in the list as "n_neg".', 'Count the number of positive numbers in the list as "n_pos".', 'Print out the larger number of those two.']
