<h1> MTPB Data: step-by-step instructions --> incremental code generation </h1>

<p>
Generate code from MTPB's step-by-step "prompt" instructions, feeding all previously generated code back to the model together with each new prompt, as shown below:

Input: prompt1
Output: code1

Input: code1 + prompt2
Output: code2

Input: code1 + code2 + prompt3
Output: code3</p>

In [1]:
# Initialisation

from santaC import *
import pandas as pd
import json
import re
from get_data import read_json_line_format

# Generation budget passed as the second argument of the model wrapper.
max_token_to_generate = 248
# NOTE(review): 'GrdS' is an opaque mode string interpreted by MySantaCoder
# (defined in santaC) -- presumably a decoding strategy; confirm there.
model = MySantaCoder('GrdS', max_token_to_generate)

# Load the data
mtbp_path = 'data/mtpb.jsonl'
# Keep the path and the loaded table under distinct names: the original cell
# bound `converted_mtbp_data` first to the path string and then to the table.
converted_mtbp_path = 'data/converted_mtpb.jsonl'
mtbp_data = read_json_line_format(mtbp_path)
converted_mtbp_data = read_json_line_format(converted_mtbp_path)
# Attach the function signatures from the converted file to the main table.
# NOTE(review): assumes both tables list the problems in the same order/index.
mtbp_data['signature'] = converted_mtbp_data['signature']

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [2]:
# Hand-written, commented reference functions used as fixtures in this notebook.
test1 = 'def count_zeros(ars):\n\t# Initialize counter to zero\n\tcounter = 0\n\t# Loop over each element in the list\n\tfor i in ars:\n\t\t# Increment counter if the element is zero\n\t\tif i == 0:\n\t\t\tcounter += 1\n\t# Return the count of zeros\n\treturn counter'
test2 = 'def first_element(ars):\n\t# Assign the first element of the list to a variable\n\tfirst = ars[0]\n\t# Return the first element\n\treturn first'
test3 = 'def sum_while(ars):\n\t# Initialize variables\n\ttotal = 0\n\ti = 0\n\t# While loop until reaching the end of the list\n\twhile i < len(ars):\n\t\t# Add current value to total\n\t\ttotal += ars[i]\n\t\t# Increment the index\n\t\ti += 1\n\t# Return the total sum\n\treturn total'
test4 = 'def filter_negatives(ars):\n\t# Initialize an empty list for positive numbers\n\tpositives = []\n\t# Loop over each element in the list\n\tfor num in ars:\n\t\t# Add number to the list if it\'s non-negative\n\t\tif num >= 0:\n\t\t\tpositives.append(num)\n\t# Return the list of non-negative numbers\n\treturn positives'
test5 = 'def flatten_list(ars):\n\t# Initialize an empty list for the result\n\tflattened = []\n\t# Loop over each element in the list\n\tfor sublist in ars:\n\t\t# Loop over each element in the sublist\n\t\tfor item in sublist:\n\t\t\t# Add item to the flattened list\n\t\t\tflattened.append(item)\n\t# Return the flattened list\n\treturn flattened'

# Break every snippet into its individual source lines (one list per snippet).
lines1, lines2, lines3, lines4, lines5 = (
    snippet.split('\n') for snippet in (test1, test2, test3, test4, test5)
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from santaC import *
from generation_processing import *

def generating_step_by_step(model, data, stop_words, keep_context = True, early_stopping = None):
    """Generate code for each problem in ``data``, one prompt at a time.

    For every row, the running program starts as the row's ``'signature'``.
    Each entry of the row's ``'prompts'`` is appended to the running program
    as a tab-indented ``#`` comment line, the result is sent through the
    model, and the decoded output (after ``generation_cut_off``) becomes the
    new running program for the next prompt.

    Args:
        model: object exposing ``tokenizer.encode``, ``forward`` and
            ``decode_output``.
        data: table supporting ``len()`` and ``.iloc[i]`` whose rows have
            ``'signature'`` and ``'prompts'`` entries.
        stop_words: forwarded unchanged to ``generation_cut_off``.
        keep_context: forwarded to ``generation_cut_off``; when it compares
            equal to ``False`` the result is additionally passed through
            ``remove_context`` after every step.
        early_stopping: if not ``None``, rows whose position is greater than
            this value are skipped (positions ``0..early_stopping`` are
            processed, i.e. the bound is inclusive).

    Returns:
        list: the final running program of every processed row, in row order.
    """
    results = []
    for row_idx in range(len(data)):
        # Inclusive upper bound on the row position.
        if early_stopping is not None and row_idx > early_stopping:
            break

        problem = data.iloc[row_idx]
        # The running program starts out as the bare signature.
        running_code = problem['signature']

        for step, instruction in enumerate(problem['prompts']):
            # Next model input: everything so far plus the new instruction as a comment.
            model_input = running_code + '\n\t#' + instruction

            token_ids = model.tokenizer.encode(model_input, return_tensors='pt')
            generated_ids = model.forward(token_ids)
            decoded = model.decode_output(generated_ids[0])

            running_code = generation_cut_off(decoded, stop_words, keep_context, step)

            # Kept as an equality test (not `not keep_context`) so that values such
            # as None are still treated as "keep the context".
            if keep_context == False:
                running_code = remove_context(running_code)

        results.append(running_code)

    return results
    

# Keywords handed to generation_cut_off via generating_step_by_step.
# NOTE(review): presumably generation is truncated at these tokens -- confirm
# in generation_processing.
stop_words = ['def', 'if', 'for', 'while']

# Run the step-by-step generation over the whole table, keeping the context.
codes = generating_step_by_step(
    model=model,
    data=mtbp_data,
    stop_words=stop_words,
    keep_context=True,
)

# Store the generations next to their problems and persist the table.
mtbp_data['gen_code'] = codes
mtbp_data.to_csv('data/step_by_step/structural1_mtbp_with_context_Sampling.csv')

Testing the generation cut off function to see if it works properly:


<h1>Exploration of the outputs</h1>

In [1]:
import pandas as pd
from generation_processing import *
# Load the baseline solutions CSV.
# NOTE(review): the file name says "Greedy" while the variable is called
# `sampling_data` -- confirm which decoding strategy this table holds.
path_to_data = 'data/MTBP/baseline/MTBP_Greedy_solutions.csv'
sampling_data = pd.read_csv(path_to_data)

In [2]:
# Two-stage post-processing of the loaded baseline table. Both helpers come
# from the star-import of generation_processing; their exact behaviour is not
# visible in this notebook.
data = cut_off_function_baselines(sampling_data)
formatted_data = format_for_testing(data)

In [7]:
# Persist the processed table next to the raw baseline file.
# NOTE(review): if this is a pandas DataFrame, to_csv's default also writes the
# index as an unnamed first column.
formatted_data.to_csv('data/MTBP/baseline/MTBP_Processed_Greedy_solutions.csv')

In [21]:
import ast
# Parse the 'test_list' cell of the first row back into a Python object; it is
# read from the CSV as text, hence ast.literal_eval.
test = ast.literal_eval(sampling_data.iloc[0]['test_list'])

In [23]:
# Load the evaluation reports for the "structural, with context" run.
reports = pd.read_csv('data/MTBP/reports/reports_structural_mtbp_with_context_Sampling.csv')

In [39]:
# Inspect the 'Pass_one' entry of report row 13. The displayed value is a
# string holding a list literal, so it must be parsed before numeric use.
reports.iloc[13]['Pass_one']

'[0.0, 0.0, 0.0, 0.0, 0.0]'

In [4]:
def calculate_comment_ratio(func_code):
    """Return the ratio of comment lines to code lines in ``func_code``.

    The text is stripped and split on newlines. Blank lines are ignored, a
    line whose first non-whitespace character is ``#`` counts as a comment,
    and every other line counts as code (including lines with a trailing
    inline comment).

    Args:
        func_code (str): source text to analyse.

    Returns:
        float: ``comment_lines / code_lines`` when there is at least one
        code line.
        str: the message ``"No lines of code present"`` when there are no
        code lines at all (this avoids dividing by zero).
    """
    # Strip every line and drop the ones that end up empty.
    stripped = [raw.strip() for raw in func_code.strip().split('\n')]
    non_blank = [text for text in stripped if text != ""]

    n_comments = sum(1 for text in non_blank if text.startswith('#'))
    n_code = len(non_blank) - n_comments

    # Nothing to divide by: report it with the sentinel message instead.
    if n_code == 0:
        return "No lines of code present"

    return n_comments / n_code

def comment_ratio(df):
    """Average comment-to-code ratio over the ``'gen_code'`` column of ``df``.

    ``calculate_comment_ratio`` returns a string message instead of a number
    for snippets that contain no code lines. Those snippets are left out of
    the average; adding the message to the running total would raise a
    ``TypeError``.

    Args:
        df: table whose ``'gen_code'`` column holds source-code strings.

    Returns:
        float: mean of the numeric ratios. ``nan`` when ``df`` is empty or no
        snippet has a defined ratio (instead of a ``ZeroDivisionError``).
    """
    ratios = []
    for snippet in df['gen_code']:
        value = calculate_comment_ratio(snippet)
        # A string result is the "no code lines" sentinel, not a ratio.
        if isinstance(value, str):
            continue
        ratios.append(value)

    # No usable snippet: the mean is undefined.
    if not ratios:
        return float('nan')

    return sum(ratios) / len(ratios)

