<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
Generate code from MTPB's step-by-step "prompt" instructions, alternating between prompts and generated code as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [1]:
# Initialisation

from santaC import *
import pandas as pd
import json
import re
from get_data import read_json_line_format

# Value passed as the second argument of MySantaCoder.
# NOTE(review): presumably the cap on newly generated tokens per call — confirm in santaC.
max_token_to_generate = 248
# NOTE(review): 'GrdS' presumably selects a greedy-search decoding mode — confirm in santaC.
model = MySantaCoder('GrdS', max_token_to_generate)

# Load the data
mtbp_path = 'data/mtpb.jsonl'
# This name holds the file path only until the read below rebinds it to the loaded data.
converted_mtbp_data = 'data/converted_mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)
converted_mtbp_data = read_json_line_format(converted_mtbp_data)
# Copy the 'signature' column of the converted set onto the original problems.
# NOTE(review): assumes both files list the problems in the same order/index — confirm.
mtbp_data['signature'] = converted_mtbp_data['signature']

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [2]:
# Hand-written commented functions used as fixtures; each one is kept as a
# single string with explicit "\n" / "\t" escapes.
test1 = 'def count_zeros(ars):\n\t# Initialize counter to zero\n\tcounter = 0\n\t# Loop over each element in the list\n\tfor i in ars:\n\t\t# Increment counter if the element is zero\n\t\tif i == 0:\n\t\t\tcounter += 1\n\t# Return the count of zeros\n\treturn counter'
test2 = 'def first_element(ars):\n\t# Assign the first element of the list to a variable\n\tfirst = ars[0]\n\t# Return the first element\n\treturn first'
test3 = 'def sum_while(ars):\n\t# Initialize variables\n\ttotal = 0\n\ti = 0\n\t# While loop until reaching the end of the list\n\twhile i < len(ars):\n\t\t# Add current value to total\n\t\ttotal += ars[i]\n\t\t# Increment the index\n\t\ti += 1\n\t# Return the total sum\n\treturn total'
test4 = 'def filter_negatives(ars):\n\t# Initialize an empty list for positive numbers\n\tpositives = []\n\t# Loop over each element in the list\n\tfor num in ars:\n\t\t# Add number to the list if it\'s non-negative\n\t\tif num >= 0:\n\t\t\tpositives.append(num)\n\t# Return the list of non-negative numbers\n\treturn positives'
test5 = 'def flatten_list(ars):\n\t# Initialize an empty list for the result\n\tflattened = []\n\t# Loop over each element in the list\n\tfor sublist in ars:\n\t\t# Loop over each element in the sublist\n\t\tfor item in sublist:\n\t\t\t# Add item to the flattened list\n\t\t\tflattened.append(item)\n\t# Return the flattened list\n\treturn flattened'

# Break every fixture into its individual source lines in one pass.
lines1, lines2, lines3, lines4, lines5 = (
    snippet.split('\n') for snippet in (test1, test2, test3, test4, test5)
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *
from generation_processing import *

def generating_step_by_step(model, data, stop_words, keep_context = True, early_stopping = None):
    """Generate one solution per problem by feeding its prompts to the model in turn.

    For every row of ``data`` the running code starts as the row's
    ``'signature'``. Each entry of the row's ``'prompts'`` is appended to the
    running code as a tab-indented ``#`` comment; the result is tokenized,
    passed through ``model.forward`` and decoded, and ``generation_cut_off``
    turns the decoded text into the running code used for the next prompt.

    Args:
        model: object exposing ``tokenizer.encode``, ``forward`` and
            ``decode_output``.
        data: table indexed positionally through ``.iloc`` whose rows carry
            the keys ``'signature'`` and ``'prompts'``.
        stop_words: forwarded unchanged to ``generation_cut_off``.
        keep_context: forwarded to ``generation_cut_off``; when it compares
            equal to ``False`` the result is also passed through
            ``remove_context``.
        early_stopping: optional row position; rows located after it are not
            processed (the row at ``early_stopping`` itself still is).

    Returns:
        A list holding the final running code of every processed row, in
        row order.
    """
    generated = []
    for position in range(len(data)):
        # Inclusive bound: the row at `early_stopping` is the last one handled.
        if early_stopping is not None and position > early_stopping:
            break

        problem = data.iloc[position]
        # The signature seeds the context for the first prompt.
        running_code = problem['signature']

        for step, instruction in enumerate(problem['prompts']):
            # The next instruction goes in as a comment inside the function body.
            prompt_text = running_code + '\n\t' + '#' + instruction
            token_ids = model.tokenizer.encode(prompt_text, return_tensors='pt')

            # Only the first returned sequence is decoded.
            decoded = model.decode_output(model.forward(token_ids)[0])

            running_code = generation_cut_off(decoded, stop_words, keep_context, step)
            if keep_context == False:
                # Without context, carry forward only what remove_context keeps.
                running_code = remove_context(running_code)

        generated.append(running_code)

    return generated
    

# Stop list that generating_step_by_step forwards to generation_cut_off.
stop_words = ['def', 'if', 'for', 'while']
# Run over every row of mtbp_data (no early_stopping) with keep_context enabled.
codes = generating_step_by_step(model=model, data=mtbp_data, stop_words=stop_words, keep_context = True)
# Store the final generation of each problem next to its prompts.
mtbp_data['gen_code'] = codes
# NOTE(review): the exploration cell reads "data/MTBP/step_by_step/structural_mtbp_with_context_Sampling.csv",
# which is not the path written here — confirm which file is intended.
mtbp_data.to_csv('data/step_by_step/structural1_mtbp_with_context_Sampling.csv')

Testing the generation cut off function to see if it works properly:


<h1>Exploration of the outputs</h1>

In [66]:
import pandas as pd

# Load previously saved generations; later cells read its 'gen_code' and 'prompts' columns.
# NOTE(review): the generation cell writes to 'data/step_by_step/structural1_mtbp_with_context_Sampling.csv',
# which differs from the path read here — confirm which file is intended.
lbl_c_mtbp = pd.read_csv("data/MTBP/step_by_step/structural_mtbp_with_context_Sampling.csv")
code_sampling_step_step = ['def sandwich_string(A):\n\t#Assign the string "{A}" to a variable named "my_string".\n\tmy_string = "{A}"\n\t#Lowercase the given string "my_string".\n\tmy_string = my_string.lower()\n\t#Assign the distinct characters of the string to a variable named "chars".\n\tchars = my_string\n\t#Sort these characters in alphabetical order.\n\tchars = sorted(chars)\n\t#Print the resulting list of characters.\n\tprint(chars)',
 'def normalize_integer_list(numbers):\n\t#Define a list of integers named "numbers" with the values {numbers}.\n\t#Return the list of normalized integers.\n\t#Calculate the sum of the elements in variable "numbers" and store the result to variable "total".\n\t#Divide each element of the list by the total and multiply by 100, store the result to variable "normalized".\n\t#Convert each element in variable "normalized" into a formatted string with single decimal point and store the result into "formatted".\n\t#Print the variable "formatted".',
 'def convert_time(a1,a2):\n\t#Write a function that takes an integer minutes and converts it to seconds.\n\ta = int(a1)\n\t#Write a function that takes an integer hours and converts it to seconds.\n\tb = int(a2)\n\t#Print the total seconds of {a1} hours and {a2} minutes.\n\tprint(f"Total seconds: {a*60+b}")',
 'def squared_fibonacci(a1):\n\t#Implement a function which returns the n-th Fibonacci number.\n\ta2 = 0\n\t#Implement a function that computes the square of an integer argument.\n\tdef square(a3):\n\t\treturn a3**2\n\t#Print out the square of {a1}-th Fibonacci number.\n\tdef fibonacci(n):\n\t\tif n == 0:\n\t\t\treturn 0\n\t\telif n == 1:\n\t\t\treturn 1\n\t\telse:\n\t\t\treturn fibonacci(n-1) + fibonacci(n-2)',
 'def count_negative_numbers(A):\n\t#Assign the list of numbers "{A}" to a variable named "my_numbers".\n\tmy_numbers = A\n\t#Count the number of negative numbers in the list as "n_neg".\n\tn_neg = 0\n\t#Count the number of positive numbers in the list as "n_pos".\n\tn_pos = 0\n\t#Print out the larger number of those two.\n\tif my_numbers[0] > my_numbers[1]:\n\t\tn_neg = n_neg + 1',
 'def pandas_mean(a1,a2,a3,a4):\n\t#Import the pandas library.\n\timport pandas as pd\n\t#Create a dataframe with a column labeled "Yes" with values [{a1}, {a2}] and a column named "No" with values [{a3}, {a4}].\n\tdf = pd.DataFrame({\'Yes\': [a1, a2], \'No\': [a3, a4]})\n\t#Compute the mean per column and store the value in a variable named means.\n\tmeans = df.mean()\n\t#Print the variable means.\n\tprint(means)',
 'def fizz_buzz(a1,a2,a3,a4):\n\t#Write a function that returns a number, for numbers multiple of {a1} print "fizz" instead of a number, for numbers multiple of {a2} print "buzz", for numbers which are multiples of both {a1} and {a2} "fizzbuzz".\n\t#Do not use division operator, %.\n\t#Create a list of integers ranging from {a3} to {a4}.\n\t#Call the written function for each element in the list and store the result as "new_list".\n\t#Print out the list "new_list".',
 'def bi-grams(a1):\n\t#Write a function that can take a string and return a list of word bigrams as pairs.\n\t#For example: bi-grams(\'cat\') should return [(\'c\', \'a\'), (\'a\', \'t\'), (\'a\', \'c\')]\n\t#Assign the string "{a1}" to a variable named sentence.\n\t#Print out the bi-grams for the variable named sentence.',
 'def top_note(a1,a2,a3,a4):\n\t#Assign the names ["Kevin", "John", "Mike", "Mitch"] as keys and corresponding notes [{a1}, {a2}, {a3}, {a4}] as values to a dictionary named "my_notes".\n\tmy_notes = {"Kevin":a1, "John":a2, "Mike":a3, "Mitch":a4}\n\t#Create a function that takes a dictionary of objects like {{ "name": "John", "notes": [3, 5, 4] }} and returns a dictionary of objects like {{ "name": "John", "top_note": 5 }}.\n\treturn my_notes\n\t#For each name in the dictionary get the top_note and store the pairs of names and top_notes as "my_list".\n\t#return my_list\n\t#Find the name with the highest top_note and assign it to "top_name".\n\t#Print the variable top_name.',
 'def hex_to_binary(a1):\n\t#Create a function that will take a HEX number and returns the binary equivalent (as a string). E.g., to_binary(0xFF) = "11111111".\n\t#A list of the 8 bit integers in the HEX number is returned, in the same order as the string.\n\t#Create a function that will take the output of the above function and return the HEX number. E.g., to_hex("11111111") = 0xFF.\n\t#Assign the value {a1} to a variable named "my_hex".\n\t#Convert the variable "my_hex" into the binary equivalent as string named "my_binary".\n\t#Convert "my_binary" back to a HEX number named "result".\n\t#Print the result.']

In [79]:
# Position of the problem to inspect.
i = 9
# Print the generation stored in the CSV, then the entry at the same position
# of the hard-coded list, so the two can be compared by eye.
print(lbl_c_mtbp.iloc[i]['gen_code'])
print(code_sampling_step_step[i])

	a2 = bin(int(a1, 16))[2:]
def hex_to_binary(a1):
	#Create a function that will take a HEX number and returns the binary equivalent (as a string). E.g., to_binary(0xFF) = "11111111".
	#A list of the 8 bit integers in the HEX number is returned, in the same order as the string.
	#Create a function that will take the output of the above function and return the HEX number. E.g., to_hex("11111111") = 0xFF.
	#Assign the value {a1} to a variable named "my_hex".
	#Convert the variable "my_hex" into the binary equivalent as string named "my_binary".
	#Convert "my_binary" back to a HEX number named "result".
	#Print the result.


In [53]:
# Take the generation stored for row 3 and keep only its first two lines.
test = lbl_c_mtbp.iloc[3]['gen_code']
list_output = test.split('\n')
# NOTE(review): the first two lines are presumably the signature and the first prompt comment — confirm.
output_first = '\n'.join(list_output[:2])

In [54]:
# Tokenize `output_first`; return_tensors='pt' asks the tokenizer for PyTorch tensors.
input_ids = model.tokenizer.encode(output_first, return_tensors='pt')

# Generate the output
output_ids = model.forward(input_ids)

# Decode the output (only the first returned sequence)
output_text = model.decode_output(output_ids[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


In [4]:
def calculate_comment_ratio(func_code):
    """Return the number of comment lines divided by the number of code lines.

    Blank lines are ignored. A line counts as a comment when its first
    non-whitespace character is ``#``; every other non-blank line counts as
    code (inline comments after code do not count as comment lines).

    Args:
        func_code: source text to analyse.

    Returns:
        The comment-to-code ratio as a float, or the string
        ``"No lines of code present"`` when the text has no code line. The
        string sentinel is kept for backward compatibility with existing
        callers.
    """
    code_lines = 0
    comment_lines = 0
    for line in func_code.strip().split('\n'):
        stripped_line = line.strip()
        # Ignore empty lines
        if not stripped_line:
            continue
        if stripped_line.startswith('#'):
            comment_lines += 1
        else:
            code_lines += 1

    # A ratio is undefined without code lines; report it instead of dividing by zero.
    if code_lines == 0:
        return "No lines of code present"

    return comment_lines / code_lines


def comment_ratio(df):
    """Return the mean comment-to-code ratio over the ``'gen_code'`` column of ``df``.

    Rows that cannot yield a ratio are skipped rather than crashing the
    average: non-string cells (e.g. the NaN that ``read_csv`` produces for an
    empty cell) and snippets for which ``calculate_comment_ratio`` returns its
    "no code" string sentinel. The mean is taken over the remaining rows.

    Args:
        df: table with a ``'gen_code'`` column holding source text.

    Returns:
        The mean ratio as a float, or ``nan`` when no row yields a ratio
        (including an empty ``df``).
    """
    ratios = []
    for code in df['gen_code']:
        # Missing generations are loaded as NaN (a float), not as text.
        if not isinstance(code, str):
            continue
        value = calculate_comment_ratio(code)
        # The string sentinel means "no code lines"; it cannot be summed.
        if isinstance(value, str):
            continue
        ratios.append(value)

    if not ratios:
        return float('nan')
    return sum(ratios) / len(ratios)



In [5]:
# Average comment-to-code ratio over the generations loaded from the CSV.
ratio = comment_ratio(lbl_c_mtbp)

In [12]:
# Show the prompts stored for row 4, to read next to its generated code.
print(lbl_c_mtbp.iloc[4]['prompts'])

['Assign the list of numbers "{A}" to a variable named "my_numbers".', 'Count the number of negative numbers in the list as "n_neg".', 'Count the number of positive numbers in the list as "n_pos".', 'Print out the larger number of those two.']
